1. Scenario: When a Spark Streaming job exits unexpectedly, data keeps being pushed into Kafka. Because Kafka consumers read from the latest offset by default, the messages produced while the job was down are lost on restart. To avoid this data loss, we need to record the offset of each batch we consume, so that on the next start we can check it and resume reading from the specified offset.
2. Environment: kafka-0.9.0, spark-1.6.0, jdk-1.7, Scala-2.10.5, IDEA 16
3. Implementation:
1) Add the Spark and Kafka dependencies: create a Maven project with the following pom.xml.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.ngaa</groupId>
    <artifactId>test-my</artifactId>
    <version>1.0-SNAPSHOT</version>
    <inceptionYear>2008</inceptionYear>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <!-- add maven release -->
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <!-- Scala version -->
        <scala.version>2.10.5</scala.version>
        <!-- Scala version on the test machine -->
        <test.scala.version>2.11.7</test.scala.version>

        <jackson.version>2.3.0</jackson.version>
        <!-- slf4j version -->
        <slf4j-version>1.7.20</slf4j-version>
        <!-- CDH Spark -->
        <spark.cdh.version>1.6.0-cdh5.8.0</spark.cdh.version>
        <spark.streaming.cdh.version>1.6.0-cdh5.8.0</spark.streaming.cdh.version>
        <kafka.spark.cdh.version>1.6.0-cdh5.8.0</kafka.spark.cdh.version>
        <!-- CDH Hadoop -->
        <hadoop.cdh.version>2.6.0-cdh5.8.0</hadoop.cdh.version>
        <!-- The HTTP client must be compatible with the Hadoop version shipped with CDH (see /opt/cloudera/parcels/CDH/lib/hadoop/lib) -->
        <httpclient.version>4.2.5</httpclient.version>

        <!-- httpcore -->
        <httpcore.version>4.2.5</httpcore.version>
        <!-- fastjson -->
        <fastjson.version>1.1.39</fastjson.version>

    </properties>

    <repositories>
        <repository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </repository>
        <!-- Repository used to resolve the CDH artifacts -->
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-Tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>

        <!-- fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        <!-- httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>${httpclient.version}</version>
        </dependency>

        <!-- httpcore -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpcore</artifactId>
            <version>${httpcore.version}</version>
        </dependency>

        <!-- slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j-version}</version>
        </dependency>
        <!-- hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.cdh.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Spark / Scala -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>${jackson.version}</version>
        </dependency>

        <!-- Spark Streaming and its Kafka integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>${spark.streaming.cdh.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>${kafka.spark.cdh.version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>

        <!-- Spark assembly taken from the local Windows file system -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-assembly_2.10</artifactId>
            <version>${spark.cdh.version}</version>
            <scope>system</scope>
            <systemPath>D:/crt_send_document/spark-assembly-1.6.0-cdh5.8.0-hadoop2.6.0-cdh5.8.0.jar</systemPath>
        </dependency>

        <!-- Spark assembly taken from the local file system of the Linux test environment -->
        <!--<dependency>-->
        <!--<groupId>org.apache.spark</groupId>-->
        <!--<artifactId>spark-assembly_2.10</artifactId>-->
        <!--<version>${spark.cdh.version}</version>-->
        <!--<scope>system</scope>-->
        <!--<systemPath>/opt/cloudera/parcels/CDH/lib/spark/lib/spark-examples-1.6.0-cdh5.8.0-hadoop2.6.0-cdh5.8.0.jar-->
        <!--</systemPath>-->
        <!--</dependency>-->

        <!-- Spark assembly from the central repository -->
        <!--<dependency>-->
        <!--<groupId>org.apache.spark</groupId>-->
        <!--<artifactId>spark-assembly_2.10</artifactId>-->
        <!--<version>${spark.cdh.version}</version>-->
        <!--</dependency>-->

        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-server-web-proxy -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-web-proxy</artifactId>
            <version>2.6.0-cdh5.8.0</version>
        </dependency>

    </dependencies>

    <!-- Maven packaging -->
    <build>
        <finalName>test-my</finalName>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.7</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <buildcommands>
                        <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <additionalProjectnatures>
                        <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                    </additionalProjectnatures>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                        <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass></mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <reporting>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </reporting>

</project>
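With the maven-assembly-plugin configured above, running mvn clean package should produce a fat jar (typically target/test-my-jar-with-dependencies.jar, given the finalName test-my and the jar-with-dependencies descriptor). That jar can be run locally from IDEA, or submitted with spark-submit together with a single program argument, the checkpoint directory (for example hdfs://hadoop1:8020/user/root/spark/checkpoint), which is what the test class below expects.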
2) Create the test class that consumes Kafka starting from the specified offsets:

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, TaskContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.slf4j.LoggerFactory

/**
  * Created by yangjf on 2016/12/18
  * Update date:
  * Time: 11:10
  * Describe: read Kafka data starting from specified offsets
  * Result of Test:
  * Command:
  * Email: jifei.yang@ngaa.com.cn
  */
object ReadBySureOffsetTest {
  val logger = LoggerFactory.getLogger(ReadBySureOffsetTest.getClass)

  def main(args: Array[String]) {
    // Set log levels
    Logger.getLogger("org.apache.kafka").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.zookeeper").setLevel(Level.ERROR)
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    logger.info("Main program for consuming Kafka from specified offsets starts")
    if (args.length < 1) {
      System.err.println("Your arguments were " + args.mkString(","))
      logger.info("Main program exits unexpectedly")
      System.exit(1)
    }
    // e.g. hdfs://hadoop1:8020/user/root/spark/checkpoint
    val Array(checkpointDirectory) = args
    logger.info("checkpoint directory: " + checkpointDirectory)
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => {
        createContext(checkpointDirectory)
      })
    logger.info("Starting the streaming context")
    ssc.start()
    ssc.awaitTermination()
  }

  def createContext(checkpointDirectory: String): StreamingContext = {
    // Configuration
    val brokers = "hadoop3:9092,hadoop4:9092"
    val topics = "20161218a"

    // Batch interval in seconds (defaults to 5; 8 is used here)
    val split_rdd_time = 8
    // Create the context
    val sparkConf = new SparkConf()
      .setAppName("SendSampleKafkaDataToApple").setMaster("local[2]")
      .set("spark.app.id", "streaming_kafka")

    val ssc = new StreamingContext(sparkConf, Seconds(split_rdd_time))

    ssc.checkpoint(checkpointDirectory)

    // Create a direct Kafka stream for the given brokers and topics
    val topicsSet: Set[String] = topics.split(",").toSet
    // Kafka parameters
    val kafkaParams: Map[String, String] = Map[String, String](
      "metadata.broker.list" -> brokers,
      "group.id" -> "apple_sample",
      "serializer.class" -> "kafka.serializer.StringEncoder"
      // "auto.offset.reset" -> "largest"  // automatically reset to the latest offset (default)
      // "auto.offset.reset" -> "earliest" // automatically reset to the earliest offset
      // "auto.offset.reset" -> "none"     // throw an exception if no previous offset exists for the consumer group
    )
    /**
      * Read Kafka data starting from the specified positions.
      * Note: the direct stream reads each record exactly once per batch.
      * Once the starting offsets are specified, reading resumes from where the previous streaming run stopped.
      */
    val offsetList = List((topics, 0, 22753623L), (topics, 1, 327041L)) // (topic, partition number, offset)
    val fromOffsets = setFromOffsets(offsetList) // Build the starting-offset map
    // Build a (topic, message) pair from each MessageAndMetadata
    val messageHandler = (mam: MessageAndMetadata[String, String]) => (mam.topic, mam.message())
    // Consume from the specified offsets using the direct API; for details see
    // http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.streaming.kafka.KafkaUtils$
    val messages: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)

    // Process the data
    messages.foreachRDD(mess => {
      // Collect the offset ranges of this batch
      val offsetsList = mess.asInstanceOf[HasOffsetRanges].offsetRanges
      mess.foreachPartition(lines => {
        lines.foreach(line => {
          val o: OffsetRange = offsetsList(TaskContext.get.partitionId)
          logger.info("++++++++++++++++++++++++++ record the offset here ++++++++++++++++++++++++++")
          logger.info(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
          logger.info("++++++++++++++++++++++++++ consume the data here +++++++++++++++++++++++++++")
          logger.info("The kafka line is " + line)
        })
      })
    })
    ssc
  }

  // Build the Map[TopicAndPartition, Long] expected by createDirectStream
  def setFromOffsets(list: List[(String, Int, Long)]): Map[TopicAndPartition, Long] = {
    var fromOffsets: Map[TopicAndPartition, Long] = Map()
    for (offset <- list) {
      val tp = TopicAndPartition(offset._1, offset._2) // topic and partition
      fromOffsets += (tp -> offset._3)                 // offset position
    }
    fromOffsets
  }
}
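The loop above only logs the offset range of each partition; to actually resume from where the last run stopped, the untilOffset of every partition has to be persisted somewhere and fed back into setFromOffsets on the next start. Below is a minimal sketch of that idea, assuming a driver-local file as the store; the helper name OffsetStore, the file format, and the path are made up for illustration (in production the offsets would more commonly go to ZooKeeper, HBase, or HDFS):

import java.io.{File, PrintWriter}
import scala.io.Source
import kafka.common.TopicAndPartition
import org.apache.spark.streaming.kafka.OffsetRange

// Hypothetical helper: persists the offsets of one batch and loads them back on restart.
object OffsetStore {
  // Write one "topic,partition,untilOffset" line per partition of the batch.
  def save(path: String, ranges: Array[OffsetRange]): Unit = {
    val writer = new PrintWriter(new File(path))
    try {
      ranges.foreach(o => writer.println(s"${o.topic},${o.partition},${o.untilOffset}"))
    } finally {
      writer.close()
    }
  }

  // Read the stored offsets back into the shape expected by createDirectStream.
  def load(path: String): Map[TopicAndPartition, Long] = {
    val file = new File(path)
    if (!file.exists()) Map()
    else Source.fromFile(file).getLines().map { line =>
      val Array(topic, partition, offset) = line.split(",")
      TopicAndPartition(topic, partition.toInt) -> offset.toLong
    }.toMap
  }
}

Under these assumptions, the driver would call OffsetStore.save("/tmp/kafka_offsets", offsetsList) inside foreachRDD once the batch has been processed, and on startup the hard-coded offsetList could be replaced by OffsetStore.load("/tmp/kafka_offsets"), falling back to the hard-coded values when the file does not exist yet.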
111. }