1. Scenario: When a Spark Streaming program exits unexpectedly, data keeps being pushed into Kafka. Because Kafka by default starts reading from the latest offset, the records produced while the program was down are lost. To avoid this, we need to record the offset of every consumed batch so that, on the next start, we can look it up and resume reading from that specified offset.
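In other words, the offsets recorded at the end of one run become the starting positions of the next run. Below is a minimal, self-contained sketch of that idea using the same kafka.common.TopicAndPartition class as the full listing in section 3; the object name and the offset values are only illustrative:

import kafka.common.TopicAndPartition

object OffsetMapExample {
  def main(args: Array[String]): Unit = {
    // (topic, partition, next offset to read) as recorded when the previous job stopped
    val recorded = List(("20161218a", 0, 22753623L), ("20161218a", 1, 327041L))
    // The direct Kafka API takes the starting positions as a Map[TopicAndPartition, Long]
    val fromOffsets: Map[TopicAndPartition, Long] = recorded.map { case (topic, partition, offset) =>
      TopicAndPartition(topic, partition) -> offset
    }.toMap
    // Passing this map to KafkaUtils.createDirectStream makes the stream resume exactly here
    fromOffsets.foreach { case (tp, o) => println(s"${tp.topic}-${tp.partition} starts at offset $o") }
  }
}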

2. Environment: Kafka 0.9.0, Spark 1.6.0, JDK 1.7, Scala 2.10.5, IntelliJ IDEA 16

3. Implementation:

1. Add the Spark- and Kafka-related dependencies. The Maven project's pom.xml is as follows:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.ngaa</groupId>
    <artifactId>test-my</artifactId>
    <version>1.0-SNAPSHOT</version>
    <inceptionYear>2008</inceptionYear>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <!--add maven release-->
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <!--Scala version-->
        <scala.version>2.10.5</scala.version>
        <!--Scala version on the test machine-->
        <test.scala.version>2.11.7</test.scala.version>

        <jackson.version>2.3.0</jackson.version>
        <!--slf4j version-->
        <slf4j-version>1.7.20</slf4j-version>
        <!--CDH Spark-->
        <spark.cdh.version>1.6.0-cdh5.8.0</spark.cdh.version>
        <spark.streaming.cdh.version>1.6.0-cdh5.8.0</spark.streaming.cdh.version>
        <kafka.spark.cdh.version>1.6.0-cdh5.8.0</kafka.spark.cdh.version>
        <!--CDH Hadoop-->
        <hadoop.cdh.version>2.6.0-cdh5.8.0</hadoop.cdh.version>
        <!--httpclient must be compatible with the Hadoop version shipped in CDH (cd /opt/cloudera/parcels/CDH/lib/hadoop/lib)-->
        <httpclient.version>4.2.5</httpclient.version>

        <!--httpcore-->
        <httpcore.version>4.2.5</httpcore.version>
        <!--fastjson-->
        <fastjson.version>1.1.39</fastjson.version>

    </properties>

    <repositories>
        <repository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </repository>
        <!--Repository used to resolve the CDH dependency jars-->
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>

        <!--fastjson-->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <!--httpclient-->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>${httpclient.version}</version>
        </dependency>

        <!--httpcore-->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>${httpcore.version}</version>
        </dependency>

        <!--slf4j-->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j-version}</version>
        </dependency>
        <!--hadoop-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!--Spark / Scala-->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>

        <!--Spark Streaming and Kafka packages-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>${spark.streaming.cdh.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>${kafka.spark.cdh.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>

        <!--Spark assembly jar from the local Windows filesystem-->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-assembly_2.10</artifactId>
            <version>${spark.cdh.version}</version>
            <scope>system</scope>
            <systemPath>D:/crt_send_document/spark-assembly-1.6.0-cdh5.8.0-hadoop2.6.0-cdh5.8.0.jar</systemPath>
        </dependency>

        <!--Spark assembly jar from the local filesystem of the Linux test environment-->
        <!--<dependency>-->
        <!--<groupId>org.apache.spark</groupId>-->
        <!--<artifactId>spark-assembly_2.10</artifactId>-->
        <!--<version>${spark.cdh.version}</version>-->
        <!--<scope>system</scope>-->
        <!--<systemPath>/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples-1.6.0-cdh5.8.0-hadoop2.6.0-cdh5.8.0.jar-->
        <!--</systemPath>-->
        <!--</dependency>-->

        <!--Spark package from the central repository-->
        <!--<dependency>-->
        <!--<groupId>org.apache.spark</groupId>-->
        <!--<artifactId>spark-assembly_2.10</artifactId>-->
        <!--<version>${spark.cdh.version}</version>-->
        <!--</dependency>-->

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-server-web-proxy -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-web-proxy</artifactId>
            <version>2.6.0-cdh5.8.0</version>
        </dependency>

    </dependencies>

    <!--Maven packaging-->
    <build>
        <finalName>test-my</finalName>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.7</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <buildcommands>
                        <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <additionalProjectnatures>
                        <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                    </additionalProjectnatures>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                        <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass></mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <reporting>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </reporting>

</project>

2. Create the test class that consumes Kafka starting from the specified offsets:

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.slf4j.LoggerFactory

/**
  * Created by yangjf on 2016/12/18
  * Update date:
  * Time: 11:10
  * Description: read Kafka data starting from specified offsets
  * Result of Test:
  * Command:
  * Email: jifei.yang@ngaa.com.cn
  */
object ReadBySureOffsetTest {
  val logger = LoggerFactory.getLogger(ReadBySureOffsetTest.getClass)

  def main(args: Array[String]) {
    // Set the log levels
    Logger.getLogger("org.apache.kafka").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    logger.info("Main program for consuming Kafka from specified offsets starts")
    if (args.length < 1) {
      System.err.println("Your arguments were " + args.mkString(","))
      System.exit(1)
      logger.info("Main program exited unexpectedly")
    }
    // e.g. hdfs://hadoop1:8020/user/root/spark/checkpoint
    val Array(checkpointDirectory) = args
    logger.info("checkpoint directory: " + checkpointDirectory)
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => {
        createContext(checkpointDirectory)
      })
    logger.info("Starting the streaming context")
    ssc.start()
    ssc.awaitTermination()
  }

  def createContext(checkpointDirectory: String): StreamingContext = {
    // Configuration
    val brokers = "hadoop3:9092,hadoop4:9092"
    val topics = "20161218a"

    // Batch interval in seconds (the default here was 5 seconds)
    val split_rdd_time = 8
    // Create the context
    val sparkConf = new SparkConf()
      .setAppName("SendSampleKafkaDataToApple").setMaster("local[2]")
      .set("spark.app.id", "streaming_kafka")

    val ssc = new StreamingContext(sparkConf, Seconds(split_rdd_time))

    ssc.checkpoint(checkpointDirectory)

    // Create a direct Kafka stream for the given brokers and topics
    val topicsSet = topics.split(",").toSet
    // Kafka configuration parameters
    val kafkaParams: Map[String, String] = Map[String, String](
      "metadata.broker.list" -> brokers,
      "group.id" -> "apple_sample",
      "serializer.class" -> "kafka.serializer.StringEncoder"
      //      "auto.offset.reset" -> "largest"   // automatically reset to the latest offset (default)
      //      "auto.offset.reset" -> "earliest"  // automatically reset to the earliest offset
      //      "auto.offset.reset" -> "none"      // throw an exception to the consumer if no previous offset is found for the consumer group
    )
    /**
      * Read Kafka data starting from the specified positions.
      * Note: because of the exactly-once mechanism, each record is consumed only once, in any case!
      *       Once the starting offsets are specified, Kafka data is read from where the previous streaming program stopped.
      */
    val offsetList = List((topics, 0, 22753623L), (topics, 1, 327041L)) // specify topic, partition_no, offset
    // Build the fromOffsets parameter
    val fromOffsets = setFromOffsets(offsetList)
    // Build the MessageAndMetadata handler
    val messageHandler = (mam: MessageAndMetadata[String, String]) => (mam.topic, mam.message())
    // Consume from the specified offsets with the direct API; for details see
    // "http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$"
    val messages: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)

    // Process the data
    messages.foreachRDD(mess => {
      // Get the offset ranges of this batch
      val offsetsList = mess.asInstanceOf[HasOffsetRanges].offsetRanges
      mess.foreachPartition(lines => {
        lines.foreach(line => {
          val o: OffsetRange = offsetsList(TaskContext.get.partitionId)
          logger.info("++++++++++++++++++++++++++++++ record the offsets here +++++++++++++++++++++++++++++++++++++++")
          logger.info(s"${o.topic}  ${o.partition}  ${o.fromOffset}  ${o.untilOffset}")
          logger.info("+++++++++++++++++++++++++++++++ consume the data here ++++++++++++++++++++++++++++++++++++++")
          logger.info("The kafka line is " + line)
        })
      })
    })
    ssc
  }

  // Build the Map[TopicAndPartition, Long] expected by createDirectStream
  def setFromOffsets(list: List[(String, Int, Long)]): Map[TopicAndPartition, Long] = {
    var fromOffsets: Map[TopicAndPartition, Long] = Map()
    for (offset <- list) {
      val tp = TopicAndPartition(offset._1, offset._2) // topic and partition number
      fromOffsets += (tp -> offset._3)                 // offset position
    }
    fromOffsets
  }
}
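
The listing above only logs the offset ranges and starts from hard-coded positions. To complete the record-and-resume loop described in section 1, the offset ranges of each batch have to be written somewhere durable on the driver and read back when the job restarts. The following is a minimal sketch of one way to do that; saveOffsets, loadOffsets, and the path /tmp/kafka_offsets.txt are hypothetical names, and a plain local file is used only for illustration (a production job would more likely keep the offsets in ZooKeeper, HBase, or HDFS):

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}
import scala.collection.JavaConverters._
import org.apache.spark.streaming.kafka.OffsetRange

object OffsetStoreSketch {
  // Overwrite the offset file with one "topic partition untilOffset" line per partition
  def saveOffsets(path: String, ranges: Array[OffsetRange]): Unit = {
    val lines = ranges.map(o => s"${o.topic} ${o.partition} ${o.untilOffset}").toSeq.asJava
    Files.write(Paths.get(path), lines, StandardCharsets.UTF_8)
  }

  // Read the file back into the (topic, partition, offset) list expected by setFromOffsets
  def loadOffsets(path: String): List[(String, Int, Long)] = {
    if (Files.exists(Paths.get(path)))
      Files.readAllLines(Paths.get(path), StandardCharsets.UTF_8).asScala.toList.map { line =>
        val Array(topic, partition, offset) = line.split(" ")
        (topic, partition.toInt, offset.toLong)
      }
    else
      List() // first run: fall back to whatever default the job chooses
  }
}

Wired into the listing above, createContext would call setFromOffsets(OffsetStoreSketch.loadOffsets("/tmp/kafka_offsets.txt")) instead of using the hard-coded list, and the foreachRDD block would call OffsetStoreSketch.saveOffsets("/tmp/kafka_offsets.txt", offsetsList) on the driver after the batch has been processed, so that a restart replays at most the last unfinished batch.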