Kafka, a popular distributed publish-subscribe messaging system, is known for its high throughput, low latency, and high reliability.
Put plainly, the approach suggested by the official documentation is this: cast each batch's RDD (obtained from the (Java)InputDStream) to HasOffsetRanges to get its array of OffsetRange objects, which record, for every partition of the topic, the range of offsets covered by that batch; Spark Streaming produces a fresh set of these for every batch. All you then need is a convenient place to persist them (for example HBase or HDFS), and you can manage offsets yourself.
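For reference, the basic pattern from the Spark Streaming + Kafka 0.10 integration guide looks roughly like the following sketch. Here the offsets are committed back to Kafka itself with commitAsync; the sections below simply replace that call with writes to an external store (MySQL, HBase, Redis, ZooKeeper).
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}

// `stream` is the InputDStream[ConsumerRecord[String, String]] returned by
// KafkaUtils.createDirectStream, as in the full examples below.
stream.foreachRDD { rdd =>
  // every KafkaRDD carries the offset ranges of the batch it was built from
  val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... process rdd here ...
  // commit the offsets once the batch has been processed; the sections below
  // replace this call with a write to MySQL / HBase / Redis / ZooKeeper
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}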
1. Reading Kafka with the Spark Streaming direct approach and saving offsets in MySQL
Create an offset table in the database (called mysqloffset in the code below); the table structure is shown in the figure and sketched in the DDL below.
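The figure is not reproduced here. A table structure consistent with the replace into mysqloffset values (?,?,?,?) statement used in the code below might look like this (column names and types are assumptions, written as a DDL string so it can be executed through the same JDBC connection):
// assumed structure of the offset table; the original figure may differ
// run once, e.g. conn.createStatement().execute(createOffsetTableSql)
val createOffsetTableSql =
  """CREATE TABLE mysqloffset (
    |  topic       VARCHAR(128) NOT NULL,
    |  `partition` INT          NOT NULL,
    |  untilOffset BIGINT       NOT NULL,
    |  groupId     VARCHAR(128) NOT NULL,
    |  PRIMARY KEY (topic, groupId, `partition`)
    |)""".stripMargin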
import java.sql.{DriverManager, ResultSet}

import com.typesafe.config.{Config, ConfigFactory}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable

/* Kafka offsets are stored in the database; before pulling data from Kafka,
   Spark first reads the saved offsets from the database. */
object StreamingKafkaMysqlOffset {
// set the log level
Logger.getLogger("org").setLevel(Level.WARN)
def main(args: Array[String]): Unit = {
// conf for running locally
val conf: SparkConf = new SparkConf()
.setMaster("local[*]")
.setAppName(this.getClass.getSimpleName)
// maximum number of messages pulled from Kafka per partition per second
.set("spark.streaming.kafka.maxRatePerPartition", "100")
// serialization
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// enabling RDD compression is recommended
.set("spark.rdd.compress", "true")
// StreamingContext with a 1-second batch interval
val ssc: StreamingContext = new StreamingContext(conf, Seconds(1))
// group, topic and subscription list (defined before kafkaParams, which references groupId)
val groupId = "topic_group0"
val topic = "order"
val topics = Array(topic)
// Kafka parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "hadoop:9092,hadoop-01:9092,hadoop-02:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupId,
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean) // we maintain the offsets ourselves
)
// seed offsets for the direct stream, keyed by topic partition
val offsets: mutable.HashMap[TopicPartition, Long] = mutable.HashMap[TopicPartition, Long]()
val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?characterEncoding=utf-8", "root", "123456")
// alternative: load the JDBC settings from a config file instead of hard-coding them
// val config: Config = ConfigFactory.load()
// val conn = DriverManager.getConnection(config.getString("db.url"), config.getString("db.user"), config.getString("db.password"))
val pstm = conn.prepareStatement("select * from mysqloffset where groupId = ? and topic = ? ")
pstm.setString(1, groupId)
pstm.setString(2, topic)
val result: ResultSet = pstm.executeQuery()
while (result.next()) {
// load the offsets previously saved in the database
val p = result.getInt("partition")
val f = result.getLong("untilOffset")
// offsets += (new TopicPartition(topic,p)-> f)
val partition: TopicPartition = new TopicPartition(topic, p)
offsets.put(partition, f)
}
val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
Subscribe[String, String](topics, kafkaParams, offsets)
)
// process each batch
stream.foreachRDD(rdd => {
// get the offset ranges carried by this batch's RDD
val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
println("number of offset ranges = " + ranges.length)
ranges.foreach(println)
// word count over the message values  (result: RDD[(String, Int)])
val result = rdd.map(_.value()).flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _)
result.foreach(println)
// result.foreachPartition(p => {
// val jedis: Jedis = ToolsRedisMysql.getJedis()
// // val jedis = RedisUtils.getJedis
// p.foreach(zookeeper => {
// jedis.hincrBy("wc1", zookeeper._1, zookeeper._2)
// })
// jedis.close()
// })
// write this batch's offset ranges to MySQL
ranges.foreach(range => {
// what needs to be saved? not the starting offset, but the groupId must be included
val pstm = conn.prepareStatement("replace into mysqloffset values (?,?,?,?)")
pstm.setString(1, range.topic)
pstm.setInt(2, range.partition)
pstm.setLong(3, range.untilOffset)
pstm.setString(4, groupId)
pstm.execute()
pstm.close()
})
})
ssc.start()
ssc.awaitTermination()
}
}
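Note that the example above writes the word counts and the offsets in separate statements, so a crash between the two can lead to duplicates on restart. A minimal sketch of doing both writes in a single JDBC transaction, reusing stream, conn and groupId from the example above (the wordcount result table here is hypothetical):
stream.foreachRDD { rdd =>
  val ranges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // collect the small aggregated result to the driver so it can share conn's transaction
  val counts = rdd.map(_.value()).flatMap(_.split(",")).map((_, 1)).reduceByKey(_ + _).collect()
  conn.setAutoCommit(false)
  try {
    // hypothetical result table wordcount(word VARCHAR primary key, cnt INT)
    val upsert = conn.prepareStatement(
      "insert into wordcount(word, cnt) values (?, ?) on duplicate key update cnt = cnt + values(cnt)")
    counts.foreach { case (w, c) => upsert.setString(1, w); upsert.setInt(2, c); upsert.addBatch() }
    upsert.executeBatch()
    upsert.close()
    val off = conn.prepareStatement("replace into mysqloffset values (?,?,?,?)")
    ranges.foreach { r =>
      off.setString(1, r.topic); off.setInt(2, r.partition)
      off.setLong(3, r.untilOffset); off.setString(4, groupId)
      off.addBatch()
    }
    off.executeBatch()
    off.close()
    conn.commit() // results and offsets become visible together
  } catch {
    case e: Exception => conn.rollback(); throw e
  } finally {
    conn.setAutoCommit(true)
  }
}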
2. Saving offsets to HBase
import scala.collection.mutable
/** Single consumer group case.
 * Manual offset management:
 * 1. Read the offsets from HBase, then pull data from Kafka starting at them.
 *    (There is no per-group consumption here, so no group information is kept.)
 *    htable:  hbase_consumer_offset
 *    family:  topic_partition_offset
 *    columns: topic
 *             partition
 *             offset
 *    rowkey:  topic_groupid_partition
 * 2. After the data has been processed, save the until offset back to HBase.
 * 3. If Kafka has been down for a long time, reading restarts from Kafka's earliest offset (still to be handled here).
 */
object OffsetOperate {
var hbaseProp = PropertiesUtil.getProperties("hbase")
var kafkaconsumePro = PropertiesUtil.getProperties("kafkaconsume")
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("sparkStreaming - offset operate")
.setMaster("local[2]") // --master local[2] | spark://xx:7077 | yarn
.set("spark.testing.memory", "2147480000")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc,Seconds(5))
// Kafka configuration
val groupid = kafkaconsumePro.getProperty("group")
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> kafkaconsumePro.getProperty("bootstrap.servers"),
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
"auto.offset.reset" -> "earliest", // on the first run, read from the beginning of the topic
"enable.auto.commit" -> (false: java.lang.Boolean) // Kafka does not store the consumed offsets
)
// topics to listen to
val topics = Array(kafkaconsumePro.getProperty("topics"))
// get an HBase connection
val hbaseConf = HBaseConfiguration.create()
hbaseConf.set("hbase.zookeeper.quorum", hbaseProp.getProperty("quorum")) // ZooKeeper quorum
hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
hbaseConf.set("hbase.master", hbaseProp.getProperty("hbase_master"))
hbaseConf.set("hbase.defaults.for.version.skip", "true")
// connection and admin objects
val conn = ConnectionFactory.createConnection(hbaseConf)
val admin = conn.getAdmin
val tn = TableName.valueOf("hbase_consumer_offset") // HBase table name
val isExist = admin.tableExists(tn)
val streams : InputDStream[ConsumerRecord[String,String]]= {
if (isExist) {
val table = new HTable(hbaseConf, "hbase_consumer_offset")
// prefix filter on rowkeys of the form topic_groupid_partition
val filter = new RowFilter(CompareOp.EQUAL, new BinaryPrefixComparator(Bytes.toBytes(topics(0) + "_" + groupid + "_")))
println("============ filter created ==========")
val s = new Scan()
s.setFilter(filter)
val rs = table.getScanner(s)
// rebuild the offsets to start from
val fromOffsets = scala.collection.mutable.Map[TopicPartition, Long]()
var s1 = ""
var s2 = 0
var s3: Long = 0
for (r: Result <- rs.next(200)) {
println("rowKey : " + new String(r.getRow))
for (keyvalue: KeyValue <- r.raw()) {
if ("topic".equals(new String(keyvalue.getQualifier))) {
s1 = new String(keyvalue.getValue)
println("columnFamily :" + new String(keyvalue.getFamily) + " column :" +new String( keyvalue.getQualifier) + s1)
} else if ("partition".equals(new String(keyvalue.getQualifier))){
s2 = Bytes.toInt(keyvalue.getValue)
println("columnFamily :" + new String(keyvalue.getFamily) + " column :" + new String( keyvalue.getQualifier) + s2)
} else if("offset".equals(new String(keyvalue.getQualifier))) { //if("offset".equals(new String(keyvalue.getQualifier)))
s3 = Bytes.toLong(keyvalue.getValue)
println("columnFamily :" + new String(keyvalue.getFamily) + " column :" + new String( keyvalue.getQualifier) + s3)
}
}
fromOffsets.put(new TopicPartition(s1, s2), s3)
}
println("fromOffset is : "+fromOffsets)
KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
ConsumerStrategies.Assign[String, String](fromOffsets.keySet, kafkaParams, fromOffsets))
} else { // the offset table does not exist in HBase yet: consume from the beginning of the topic
val htable = new HTableDescriptor(TableName.valueOf("hbase_consumer_offset"))
htable.addFamily(new HColumnDescriptor("topic_partition_offset"))
admin.createTable(htable)
println("======== offset table created ========" + htable)
KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
}
}
// val dstream = streams.map(x => URLDecoder.decode(x.value()))
// after the batch has been processed successfully, update the offsets
streams.foreachRDD { rdd =>
// if (!rdd.isEmpty()) {
// ideally the business computation and the offset save are wrapped in one transaction,
// so they succeed or fail together, giving exactly-once consumption
import scala.collection.JavaConversions._
val table = new HTable(hbaseConf, "hbase_consumer_offset")
table.setAutoFlush(false, false)
var putList: List[Put] = List()
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges // cast the RDD[ConsumerRecord[String, String]] to HasOffsetRanges to get its offset ranges
for(offsetRange <- offsetRanges){
println("the topic is "+offsetRange.topic)
println("the partition is "+offsetRange.partition)
println("the fromOffset is "+offsetRange.fromOffset)
println("the untilOffset is "+offsetRange.untilOffset)
println("the object is "+offsetRange)
// val table = new HTable(hbaseConf,"hbase_consumer_offset")
// table.setAutoFlush(false, false)
val put = new Put(Bytes.toBytes(offsetRange.topic + "_" + groupid + "_" + offsetRange.partition)) // rowkey: topic_groupid_partition
put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("topic"),Bytes.toBytes(offsetRange.topic))
put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("partition"),Bytes.toBytes(offsetRange.partition))
put.add(Bytes.toBytes("topic_partition_offset"),Bytes.toBytes("offset"),Bytes.toBytes(offsetRange.untilOffset))
putList = put+:putList
// println("add data success !")
}
println("the RDD records are "+rdd.map{x =>URLDecoder.decode(x.value())}.collect.foreach(println)) // 程序的计算逻辑
// }
table.put(putList)
table.flushCommits()
println("add and compute data success !")
}
ssc.start()
ssc.awaitTermination()
}
}
The Spark Streaming implementation is shown above (note that the ConsumerRecord class is not serializable; take care not to ship records of this class to other worker nodes, for example by collecting or shuffling them directly, or you will get serialization errors).
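In practice this means extracting plain fields from each ConsumerRecord before anything that has to serialize records (collect, shuffles, serialized caching); a minimal sketch, assuming stream is a direct stream created as in the examples:
import org.apache.spark.streaming.kafka010.HasOffsetRanges

stream.foreachRDD { rdd =>
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // map each ConsumerRecord to its value first: the later shuffle and collect()
  // then only move serializable Strings, never the ConsumerRecord itself
  rdd.map(_.value())
    .flatMap(_.split(" "))
    .map((_, 1))
    .reduceByKey(_ + _)
    .collect()
    .foreach(println)
  // ... save offsetRanges to the external store here ...
}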
3. Storing offsets in Redis (in-memory, so reads and writes are faster)
With multiple partitions and multiple consumer groups, design the key as topic plus group, the hash field as the partition, and the value as the offset.
Here gtKey = groupid/topic serves as the unique identifier:
conn.hset(gtKey, partition.toString, offset.toString)
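In isolation, the save/restore round trip for this layout looks like the following sketch (host, topic and the literal values are placeholders):
import java.util
import org.apache.kafka.common.TopicPartition
import redis.clients.jedis.Jedis
import scala.collection.JavaConversions._

// hash layout: key = "<groupid>/<topic>", field = partition, value = offset
val jedis = new Jedis("hadoop01", 6379)
val gtKey = "GB01/wordcount3"

// save: one hash field per partition
jedis.hset(gtKey, "0", "1234")

// restore: rebuild the Map[TopicPartition, Long] expected by createDirectStream
val saved: util.Map[String, String] = jedis.hgetAll(gtKey)
val restored: Map[TopicPartition, Long] = saved.toMap.map {
  case (p, o) => new TopicPartition("wordcount3", p.toInt) -> o.toLong
}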
object KafkaDricteRedis {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("redis").setMaster("local[*]")
val ssc = new StreamingContext(conf, new Duration(5000))
val groupid = "GB01" //组名
val topic = "wordcount3"
//topic 名
//在redis中以 groupid/topic作为唯一标识 ,存储分区偏移量
//在Reids 使用的时hash类型来存储
val gtKey = groupid + "/" + topic
//topic
val topics = Set(topic)
//zk地址
val zkQuorum = "hadoop01:2181,hadoop02:2181,hadoop03:2181"
//brokerList
val brokerList = "hadoop01:9092,hadoop03:9092"
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> brokerList,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
// start from the earliest offsets when none have been saved yet
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// record the offset of each topic partition; this map is passed in when the
// InputDStream is created so that reading starts from the saved offsets
val offsets = scala.collection.mutable.HashMap[TopicPartition, Long]()
var kafkaDStream: InputDStream[ConsumerRecord[String, String]] = null
// get a Jedis connection
val conn = getConnection()
// conn.flushDB()
// jd.hget(groupid + topic, "")
// get all the keys currently stored in Redis
val values: util.Set[String] = conn.keys("*")
// println(values)
// [GB01/wordcount3]  partition -> offset
// if the keys contain GB01/wordcount3, offsets for this group/topic were saved before
if (values.contains(gtKey)) {
// get all the (field, value) pairs stored under the key GB01/wordcount3
val allKey: util.Map[String, String] = conn.hgetAll(gtKey)
// this import lets Java collections be used as Scala collections
import scala.collection.JavaConversions._
val list: List[(String, String)] = allKey.toList
// iterate over the (field, value) pairs:
// the field is the partition, the value is the offset
for (key <- list) { // key is a tuple (partition, offset)
val p = key._1.toInt
val f = key._2.toLong
// offsets += (new TopicPartition(topic, p) -> f)
val partition: TopicPartition = new TopicPartition(topic, p)
offsets.put(partition, f)
}
// create the stream starting from the offsets restored from Redis
// (with the 0.10 API the records are ConsumerRecords; the key is null by default and unused)
kafkaDStream = KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
Subscribe[String, String](topics, kafkaParams, offsets)
)
} else {
// nothing was saved before: create a new InputDStream without initial offsets
kafkaDStream = KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
Subscribe[String, String](topics, kafkaParams))
}
// used to update the offsets; each OffsetRange exposes a partition and its offsets
var offsetRanges = Array[OffsetRange]()
kafkaDStream.foreachRDD(kafkaRDD => {
// the underlying RDD is a KafkaRDD and can be cast to HasOffsetRanges
offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
// take the value of each record (the key is null by default and not used here)
val map: RDD[String] = kafkaRDD.map(_.value())
map.foreach(x => println(x + "==========================="))
// update the offsets
for (o <- offsetRanges) {
// the until offset of this batch
val offset = o.untilOffset
// the partition
val partition = o.partition
println("partition: " + partition)
println("offset: " + offset)
// write the partition and its offset into Redis via hset
conn.hset(gtKey, partition.toString, offset.toString)
}
})
ssc.start()
ssc.awaitTermination()
}
// Jedis connection-pool helper
def getConnection(): Jedis = {
// the pool configuration must be set up before the pool is created
val conf = new JedisPoolConfig()
// maximum number of connections
conf.setMaxTotal(20)
// maximum number of idle connections
conf.setMaxIdle(20)
// note: in a real job the pool should be created once and reused, not per call
val pool = new JedisPool(conf, "hadoop01", 6379)
val jedis = pool.getResource()
// password
jedis.auth("123")
jedis
}
}
object direct_offset_redis {
// filter out verbose logs
Logger.getLogger("org").setLevel(Level.WARN)
def main(args: Array[String]): Unit = {
val Array(topic, brokers, group, sec) = args
val conf = new SparkConf().setAppName("direct_offset_redis").setMaster("local[2]")
// maximum number of messages pulled from Kafka per partition per second
.set("spark.streaming.kafka.maxRatePerPartition", "100")
// serialization
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// batch interval: every `sec` seconds the received data is cut into one RDD
val ssc = new StreamingContext(conf, Seconds(sec.toInt))
val topics = Array(topic)
// Kafka connection parameters
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> brokers,
// key and value deserializers
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> group,
// consume from the beginning when no offset has been saved
"auto.offset.reset" -> "earliest", // or "latest"
"enable.auto.commit" -> (false: lang.Boolean)
)
// fetch the Kafka offsets previously saved in Redis for this group
val formdbOffset: Map[TopicPartition, Long] = JedisOffset(group)
// pull data from Kafka
val stream: InputDStream[ConsumerRecord[String, String]] =
// first check whether this is the first time we consume this data (no offsets saved yet)
if (formdbOffset.isEmpty) {
KafkaUtils.createDirectStream[String, String](
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
)
} else {
KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
// ConsumerStrategies.Assign[String, String](formdbOffset.keys, kafkaParams, formdbOffset)
ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, formdbOffset) // consumer strategy; the source code strongly recommends Subscribe
)
}
// process the data and the offsets of each batch
stream.foreachRDD({
rdd =>
// array of offset ranges for this batch
val offsetRange: Array[OffsetRange] =
rdd.asInstanceOf[HasOffsetRanges].offsetRanges
// business logic
rdd.map(_.value())
.map((_, 1))
.reduceByKey(_ + _).foreach(println)
// save the offsets to Redis
val jedis: Jedis = JedisConnectionPool.getConnection()
for (or <- offsetRange) {
jedis.hset(group, or.topic + "-" + or.partition, or.untilOffset.toString)
}
jedis.close()
})
// start the Streaming application
ssc.start()
ssc.awaitTermination()
}
object JedisOffset {
def apply(groupId: String) = {
// build a Map of TopicPartition -> offset
var formdbOffset = Map[TopicPartition, Long]()
// get a Jedis connection
val jedis1 = JedisConnectionPool.getConnection()
// read all topic-partition fields saved in Redis for this group
val topicPartitionOffset: util.Map[String, String] = jedis1.hgetAll(groupId)
// import the implicit Java-to-Scala collection conversions
import scala.collection.JavaConversions._
// turn the (topic-partition -> offset) entries into a List
val topicPartitionOffsetlist: List[(String, String)] =
topicPartitionOffset.toList
// loop over all entries
for (topicPL <- topicPartitionOffsetlist) {
val split: Array[String] = topicPL._1.split("[-]")
formdbOffset += (
new TopicPartition(split(0), split(1).toInt) -> topicPL._2.toLong)
}
formdbOffset
}
}
}
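The JedisConnectionPool helper referenced by the code above is not shown in the original; a minimal sketch of what it could look like (host, port and pool sizes are assumptions):
import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

// minimal connection-pool helper assumed by the examples above
object JedisConnectionPool {
  private val config = new JedisPoolConfig()
  config.setMaxTotal(20) // max simultaneous connections
  config.setMaxIdle(10)  // max idle connections kept in the pool
  // host/port are placeholders; adjust to your Redis deployment
  private val pool = new JedisPool(config, "hadoop01", 6379)

  def getConnection(): Jedis = pool.getResource
}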
4. Saving Kafka offsets to ZooKeeper
object othersUtil {
// TODO: save Kafka offsets to ZooKeeper
def kafkaAndZookeeper(ssc: StreamingContext): DStream[String] = {
val group = "DirectAndZk"
val topic = "apkmsg"
val brokerList = "hadoop1:9092"
val zkQuorum = "hadoop1:2181,hadoop2:2181,hadoop3:2181"
val topics: Set[String] = Set(topic)
val topicDirs = new ZKGroupTopicDirs(group, topic)
val zkTopicPath = s"${topicDirs.consumerOffsetDir}"
val kafkaParams = Map(
"metadata.broker.list" -> brokerList,
"group.id" -> group,
"auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString
)
val zkClient = new ZkClient(zkQuorum)
val children = zkClient.countChildren(zkTopicPath)
var kafkaStream: InputDStream[(String, String)] = null
var fromOffsets: Map[TopicAndPartition, Long] = Map()
if (children > 0) {
for (i <- 0 until children) {
val partitionOffset = zkClient.readData[String](s"$zkTopicPath/${i}")
val tp = TopicAndPartition(topic, i)
fromOffsets += (tp -> partitionOffset.toLong)
}
val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())
kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
} else {
kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
}
var offsetRanges = Array[OffsetRange]()
kafkaStream.foreachRDD { kafkaRDD =>
offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
for (o <- offsetRanges) {
val zkPath = s"${topicDirs.consumerOffsetDir}/${o.partition}"
ZkUtils.updatePersistentPath(zkClient, zkPath, o.untilOffset.toString)
}
}
val streamrdd = kafkaStream.map(_._2)
streamrdd
}
}
object kafkazookeeper {
def main(args: Array[String]): Unit = {
Logger.getLogger("org").setLevel(Level.WARN)
val conf = new SparkConf().setAppName("KafkaDirectWordCount")
.setMaster("local[6]")
val ssc = new StreamingContext(conf, Seconds(5))
othersUtil.kafkaAndZookeeper(ssc)
.flatMap(_.split(" ")).map(x => (x, 1)).reduceByKey(_ + _)
.foreachRDD(x => {
println("****************************************")
println(x.collect().mkString("\n"))
println("****************************************")
})
ssc.start()
ssc.awaitTermination()
}
}
Kafka source data entered by hand:
369 963 666
5 5 5
6 6 6
0 0 0
5 5 5
6 6 6
0 0 0
zw zw zw zw
zw zw zw zw
55 55 55
Result:
(0,6)
(5,6)
(6,6)
(zw,8)
(55,3)
(963,1)
(666,1)
(369,1)
package com.kafka_mysql
/** Uses the Spark-Kafka 0-10 integration, commits offsets manually, and maintains them in ZooKeeper.
 */
// manually control the offsets at which Spark consumes from Kafka
// so that no data is lost when the job exits unexpectedly and is restarted
object direct_offset_zookeeper {
Logger.getLogger("org").setLevel(Level.WARN)
// instantiate the ZkWork helper, used for all ZooKeeper operations below
val zk = ZkWork
def main(args: Array[String]): Unit = {
val Array(brokers, topic, group, sec) = args
val conf = new SparkConf().setAppName("direct_offset_zookeeper").setMaster("local[*]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(sec.toInt))
val topics = Array(topic)
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> brokers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> group,
"auto.offset.reset" -> "earliest", //"latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
// if processing a batch takes too long you can increase the poll interval (max.poll.interval.ms)
// or reduce the maximum number of records returned by one poll() call (max.poll.records):
//"max.poll.interval.ms" -> "KAFKA_MAX_POLL_INTERVAL_MS",
//"max.poll.records" -> "KAFKA_MAX_POLL_RECORDS"
)
// Check whether offsets for this job have already been saved in zk.
// If not, run without initial offsets and save them once the batch is done.
// The key is the KafkaUtils.createDirectStream call:
// by default it is KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams)), without an offsets argument;
// after digging through the source I found that Subscribe also accepts a map of starting offsets.
val stream = if (zk.znodeIsExists(s"${topic}offset")) {
val nor = zk.znodeDataGet(s"${topic}offset")
val newOffset = Map(new TopicPartition(nor(0).toString, nor(1).toInt) -> nor(2).toLong) // map of (topic, partition) -> offset
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
println(s"[ DealFlowBills2 ] topic ${nor(0).toString}")
println(s"[ DealFlowBills2 ] Partition ${nor(1).toInt}")
println(s"[ DealFlowBills2 ] offset ${nor(2).toLong}")
println(s"[ DealFlowBills2 ] zk中取出来的kafka偏移量★★★ $newOffset")
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams, newOffset))
} else {
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
println(s"[ DealFlowBills2 ] 第一次计算,没有zk偏移量文件")
println(s"[ DealFlowBills2 ] 手动创建一个偏移量文件 ${topic}offset 默认从0号分区 0偏移度开始计算")
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
zk.znodeCreate(s"${topic}offset", s"$topic,0,0")
val nor = zk.znodeDataGet(s"${topic}offset")
val newOffset = Map(new TopicPartition(nor(0).toString, nor(1).toInt) -> nor(2).toLong)
KafkaUtils.createDirectStream[String, String](ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams, newOffset))
}
// business logic
val lines = stream.map(_.value)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
wordCounts.print()
// saving the offsets:
// if a batch fails, the new offsets are not saved and the batch is recomputed from the previous ones;
// offsets are only saved after the batch has been processed successfully
stream.foreachRDD {
rdd =>
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
rdd.foreachPartition {
iter =>
val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
println(s"[ DealFlowBills2 ] topic: ${o.topic}")
println(s"[ DealFlowBills2 ] partition: ${o.partition} ")
println(s"[ DealFlowBills2 ] fromOffset 开始偏移量: ${o.fromOffset} ")
println(s"[ DealFlowBills2 ] untilOffset 结束偏移量: ${o.untilOffset} 需要保存的偏移量,供下次读取使用★★★")
println(s"[ DealFlowBills2 ] --------------------------------------------------------------------")
// 写zookeeper
zk.offsetWork(s"${o.topic}offset", s"${o.topic},${o.partition},${o.untilOffset}")
// 写本地文件系统
// val fw = new FileWriter(new File("/home/hadoop1/testjar/test.log"), true)
// fw.write(offsetsRangerStr)
// fw.close()
}
}
// the final result could also be saved to HDFS
// result.saveAsTextFiles(output + s"/output/" + "010")
// start the Spark Streaming job
ssc.start()
ssc.awaitTermination()
}
}
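The ZkWork helper is also not shown in the original. A sketch of one possible implementation, here based on Apache Curator, with the method names and the topic,partition,offset data format inferred from the calls above (connect string and znode prefix are assumptions):
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry

// possible implementation of the ZkWork helper used above; the ZooKeeper
// connect string is a placeholder, and znode data is stored as "topic,partition,offset"
object ZkWork {
  private val client = CuratorFrameworkFactory.newClient(
    "hadoop1:2181,hadoop2:2181,hadoop3:2181", new ExponentialBackoffRetry(1000, 3))
  client.start()

  private def path(znode: String) = s"/sparkstreaming/$znode"

  def znodeIsExists(znode: String): Boolean =
    client.checkExists().forPath(path(znode)) != null

  // returns Array("topic", "partition", "offset") as stored by znodeCreate/offsetWork
  def znodeDataGet(znode: String): Array[String] =
    new String(client.getData.forPath(path(znode)), "UTF-8").split(",")

  def znodeCreate(znode: String, data: String): Unit =
    client.create().creatingParentsIfNeeded().forPath(path(znode), data.getBytes("UTF-8"))

  def offsetWork(znode: String, data: String): Unit =
    if (znodeIsExists(znode)) client.setData().forPath(path(znode), data.getBytes("UTF-8"))
    else znodeCreate(znode, data)
}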
Accumulating counts across batches with updateStateByKey and a custom updateFunc
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.Seconds
import org.apache.spark.HashPartitioner
object kafkacount {
// Function that adds the counts from the current batch to the previously accumulated count.
// updateStateByKey already delivers the data grouped by key:
// String is the word; since Spark Streaming works batch by batch, Seq[Int] holds the 1s produced
// for that word in the current batch (one 1 per occurrence);
// Option[Int] is the count accumulated so far; it is an Option because there is no previous value
// on the first batch, so getOrElse(0) supplies the default.
val updateFunc = (it: Iterator[(String, Seq[Int], Option[Int])]) => {
// for each (word, ones-in-this-batch, previous count): new count = sum of this batch + previous
it.map { case (word, ones, previous) => (word, ones.sum + previous.getOrElse(0)) }
}
def main(args: Array[String]) {
// set the log level
LoggerLevels.setStreamingLogLevels()
// read the input arguments
val Array(zkQuorum, group, topics, numThreads) = args
val conf = new SparkConf().setAppName("kafkacount").setMaster("local[2]")
val sc = new StreamingContext(conf, Seconds(5))
// updateStateByKey requires a checkpoint directory
sc.checkpoint("/home/hadoop/Data")
// there may be several input topics separated by commas; map each topic to the number of threads
val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
// KafkaUtils.createStream pulls the data of the Kafka topics
// (receiver-based API: StreamingContext, zk address, group name, topic map)
val data = KafkaUtils.createStream(sc, zkQuorum, group, topicMap)
// the value produced by the Kafka producer is the second element of the tuple (the key is not used),
// so take _._2 and split it into words
val word = data.map(_._2).flatMap(_.split(" "))
// map each word to (word, 1) and let updateStateByKey accumulate the counts across batches,
// not only the current one:
// the first argument is the update function defined above,
// the second is a HashPartitioner with sc.sparkContext.defaultParallelism partitions,
// the third says whether to remember this partitioner
val wordcount = word.map((_, 1)).updateStateByKey(updateFunc, new HashPartitioner(sc.sparkContext.defaultParallelism), true)
wordcount.print()
sc.start()
sc.awaitTermination()
}
}