前言

代码亲测

Streaming-kafka-0-8            mysql、zookeeper

Streaming-kafka-0-10          kafka、redis

以上内容都是翻阅前辈们分享的代码后总结汇总在这里,供自己参考。需要注意的是,生产环境中 kafka 的 offset 一般都维护在 HBase、kafka、mysql 中;用 zookeeper 维护需要消耗 zookeeper 资源,且 zookeeper 资源还是比较昂贵的。

Kafka作为一款流行的分布式发布订阅消息系统,以高吞吐、低延时、高可靠的特点著称,已经成为Spark Streaming常用的流数据来源。

官方提供的思路就是,把JavaInputDStream转换为OffsetRange对象,该对象具有topic对应的分区的所有信息,每次batch处理完,Spark Streaming都会自动更新该对象,所以你只需要找个合适的地方保存该对象(比如HBase、HDFS、Mysql),就可以操纵offset了。

一、offset 保存在 mysql 中

首先在mysql中建立一张表用于存放 offset

kafka spark入门项目 sparkstreaming kafka offset_kafka

数据库相关配置信息

kafka spark入门项目 sparkstreaming kafka offset_kafka_02

KafkaProducer 代码

package Utils

import java.util.Properties
import kafka.producer.{KeyedMessage, Producer, ProducerConfig}

/**
 * Demo producer: forever picks one of five sample sentences at random, sends it to the
 * `word` topic, echoes it to stdout and then sleeps for two seconds.
 */
object kafka_producer {
  def main(args: Array[String]): Unit = {

    val targetTopic = "word"
    val brokerList = "hadoop01:9092,hadoop02:9092,hadoop03:9092"

    // Producer settings: the broker list and the String encoder class.
    val settings = new Properties()
    settings.put("metadata.broker.list", brokerList)
    settings.put("serializer.class", "kafka.serializer.StringEncoder")

    val sender = new Producer[String, String](new ProducerConfig(settings))

    // Fixed pool of sample sentences to publish.
    val samples: Array[String] = Array(
      "kafka kafka produce",
      "kafka produce message",
      "hello world hello",
      "wordcount topK topK",
      "hbase spark kafka"
    )

    // Endless loop: this program only stops when the process is killed.
    while (true) {
      val sentence = samples((math.random * 5).toInt)
      sender.send(new KeyedMessage[String, String](targetTopic, sentence))
      println(sentence)
      Thread.sleep(2000)
    }
  }
}
<!-- scalikejdbc -->
    <dependency>
      <groupId>org.scalikejdbc</groupId>
      <artifactId>scalikejdbc-core_2.11</artifactId>
      <version>2.5.0</version>
    </dependency>
    <dependency>
        <groupId>org.scalikejdbc</groupId>
        <artifactId>scalikejdbc_2.11</artifactId>
        <version>2.5.0</version>
    </dependency>
    <dependency>
        <groupId>org.scalikejdbc</groupId>
        <artifactId>scalikejdbc-config_2.11</artifactId>
        <version>2.5.0</version>
    </dependency>

SparkStreaming 代码对结果的处理这里只控制台打印,将offset维护在 mysql 中

package SparkStreaming

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.{Err, LeaderOffset}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scalikejdbc.{DB, SQL}
import scalikejdbc.config.DBs

/**
 * Spark Streaming job (Kafka 0-8 direct stream) that prints every message value of the
 * `word` topic and persists the consumed offsets in a MySQL table named `offset`
 * (columns used here: groupid, topic, partitions, offset).
 *
 * On start-up the saved offsets are read back; if none exist the stream starts according to
 * `auto.offset.reset`, otherwise it resumes from the saved position, clamped to the earliest
 * offset still available on the brokers.
 */
object SparkStreamingOffsetMySql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SparkStreamingOffsetMysql")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(2))

    // Basic settings
    val groupid = "GPMMCC"
    val brokerList = "hadoop01:9092,hadoop02:9092,hadoop03:9092"
    val topic = "word"
    val topics = Set(topic)
    val kafkaParams = Map(
      "metadata.broker.list" -> brokerList,
      "group.id" -> groupid,
      "auto.offset.reset" -> kafka.api.OffsetRequest.SmallestTimeString
    )

    // Set up the scalikejdbc connection pools.
    DBs.setupAll()

    // Offsets previously saved for this consumer group. The group id is bound as a
    // parameter instead of being interpolated into the SQL text.
    val fromdbOffset: Map[TopicAndPartition, Long] = DB.readOnly { implicit session =>
      SQL("select * from offset where groupid = ?")
        .bind(groupid)
        .map(row =>
          (TopicAndPartition(row.string("topic"), row.string("partitions").toInt), row.string("offset").toLong))
        .list()
        .apply()
        .toMap
    }

    // Build the input stream as a single expression so no null placeholder is needed.
    val kafkaDStream: InputDStream[(String, String)] =
      if (fromdbOffset.isEmpty) {
        // First start: nothing saved yet.
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      } else {
        val kafkaCluster = new KafkaCluster(kafkaParams)

        // Fail fast when the brokers cannot be queried. Continuing with an empty offset map
        // would create a stream that reads no partition at all.
        val earliestOffsets: Map[TopicAndPartition, LeaderOffset] =
          kafkaCluster.getEarliestLeaderOffsets(fromdbOffset.keySet) match {
            case Right(found) => found
            case Left(errs) =>
              throw new IllegalStateException(s"get earliest leader offsets failed: $errs")
          }

        // Resume from the saved offset unless the broker's earliest offset is already past it,
        // in which case start from the broker's earliest offset.
        val checkOffset: Map[TopicAndPartition, Long] = fromdbOffset.map { case (tp, saved) =>
          tp -> earliestOffsets.get(tp).fold(saved)(leader => math.max(saved, leader.offset))
        }

        val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
          ssc, kafkaParams, checkOffset, messageHandler)
      }

    kafkaDStream.foreachRDD { kafkaRDD =>
      val offsetRanges: Array[OffsetRange] = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
      val values: RDD[String] = kafkaRDD.map(_._2)
      values.foreach(println)

      // Save one row per (group, topic, partition) in a single transaction.
      // The previous `update ... where groupid = ? and topic = ?` never created the first row
      // and, lacking a partition predicate, made every partition overwrite the same rows.
      // NOTE(review): `replace into` only upserts if the table has a primary/unique key on
      // (groupid, topic, partitions) - confirm against the table definition.
      DB.localTx { implicit session =>
        offsetRanges.foreach { range =>
          SQL("replace into offset(groupid,topic,partitions,offset) values(?,?,?,?)")
            .bind(groupid, range.topic, range.partition, range.untilOffset)
            .update()
            .apply()
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}

二、offset 保存在 HBase 中

三、offset 维护在 zookeeper 中

spark-streaming-kafka-0-8

(更详细的内容源于:【spark开发】SparkStreaming+Kafka - 简书)

spark-streaming 消费 kafka 主类

package newSparkSteaming

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import org.apache.spark.SparkException
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaCluster, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka.KafkaCluster.{Err, LeaderOffset}

/**
 * Create by Jerry on 2020/1/10
 *
 * Kafka offset manager built on the integrated [[KafkaCluster]] API. The original author
 * describes the offsets as being kept in ZooKeeper and notes that a ZooKeeper client API
 * could be used instead.
 *
 * @param kafkaParams Kafka parameters; must contain the consumer group id under the key
 *                    "groupid" or the standard "group.id"
 */
class KafkaManager(val kafkaParams: Map[String, String]) extends Serializable {
  private val kc = new KafkaCluster(kafkaParams)

  /**
   * Consumer group id taken from `kafkaParams`. The historical key "groupid" is tried first
   * for backward compatibility, then the standard Kafka key "group.id".
   */
  private def consumerGroup: String =
    kafkaParams.get("groupid").orElse(kafkaParams.get("group.id")).getOrElse(
      throw new NoSuchElementException("kafkaParams must define the consumer group as 'groupid' or 'group.id'")
    )

  /** Unwraps a [[KafkaCluster]] result, turning an error into a SparkException prefixed with `what`. */
  private def orFail[A](result: Either[Err, A], what: String): A = result match {
    case Right(value) => value
    case Left(errs) => throw new SparkException(s"$what: $errs")
  }

  /**
   * Creates the direct stream, starting from the consumer offsets currently stored for the group.
   *
   * @param ssc    streaming context the stream is attached to
   * @param topics topics to consume
   * @return a stream of (key, message) pairs
   * @throws SparkException if partitions or consumer offsets cannot be fetched
   */
  def createDirectStream(ssc: StreamingContext, topics: Set[String]): InputDStream[(String, String)] = {
    val groupid: String = consumerGroup
    // Bring the stored offsets up to date before reading them back.
    setOrUpdateOffsets(topics, groupid)

    val partitions: Set[TopicAndPartition] = orFail(kc.getPartitions(topics), "get kafka partition failed")
    val consumerOffsets: Map[TopicAndPartition, Long] =
      orFail(kc.getConsumerOffsets(groupid, partitions), "get kafka consumer offsets failed")
    println(consumerOffsets)

    // The decoder types are spelled out: left to inference they resolve to Nothing, which
    // cannot be instantiated when the stream runs.
    KafkaUtils.createDirectStream[
      String,
      String,
      _root_.kafka.serializer.StringDecoder,
      _root_.kafka.serializer.StringDecoder,
      (String, String)](
      ssc,
      kafkaParams,
      consumerOffsets,
      (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message))
  }

  /**
   * Prepares the stored consumer offsets of every topic before the stream is created.
   *
   * If the group already has offsets, any partition whose stored offset is smaller than the
   * earliest offset on the leader is moved forward to that earliest offset (the author's
   * notes say this avoids kafka.common.OffsetOutOfRangeException after Kafka's retention
   * clean-up). Only the out-of-date partitions are rewritten.
   *
   * If the consumer offsets cannot be read, the group is treated as consuming for the first
   * time: offsets are initialised to the earliest leader offsets when `auto.offset.reset`
   * is "smallest", otherwise to the latest leader offsets.
   *
   * @throws SparkException if partitions or leader offsets cannot be fetched
   */
  def setOrUpdateOffsets(topics: Set[String], groupid: String): Unit = {
    topics.foreach { topic =>
      val partitions: Set[TopicAndPartition] =
        orFail(kc.getPartitions(Set(topic)), "get kafka partition failed")

      kc.getConsumerOffsets(groupid, partitions) match {
        case Right(consumerOffsets) =>
          // Consumed before: fix up only the partitions whose offsets are out of date.
          val earliestLeaderOffsets =
            orFail(kc.getEarliestLeaderOffsets(partitions), "get earliest leader offsets failed")
          val outdated: Map[TopicAndPartition, Long] = consumerOffsets.flatMap { case (tp, n) =>
            val earliestLeaderOffset: Long = earliestLeaderOffsets(tp).offset
            if (n < earliestLeaderOffset) {
              println(s"consumer group:$groupid,topic:${tp.topic},partition:${tp.partition} offsets已经过时,更新为:$earliestLeaderOffset")
              Some(tp -> earliestLeaderOffset)
            } else {
              None
            }
          }
          if (outdated.nonEmpty) {
            kc.setConsumerOffsets(groupid, outdated)
          }

        case Left(_) =>
          // First consumption of this topic by the group.
          println(groupid + "第一次消费topic:" + topic)
          val reset: Option[String] = kafkaParams.get("auto.offset.reset").map(_.toLowerCase())
          val leaderOffsets: Map[TopicAndPartition, LeaderOffset] =
            if (reset.contains("smallest")) {
              orFail(kc.getEarliestLeaderOffsets(partitions), "get earliest leader offset failed")
            } else {
              // Previously this branch also asked for the earliest offsets.
              orFail(kc.getLatestLeaderOffsets(partitions), "get latest leader offsets failed")
            }
          val offsets = leaderOffsets.map { case (tp, leader) => (tp, leader.offset) }
          kc.setConsumerOffsets(groupid, offsets)
      }
    }
  }

  /**
   * Stores the end offset (`untilOffset`) of every processed range for the consumer group.
   * A failed update is reported on stdout and does not stop the remaining updates.
   *
   * @param offsetRanges ranges of the batch that has just been processed
   */
  def updateOffsets(offsetRanges: Array[OffsetRange]): Unit = {
    val groupid = consumerGroup
    offsetRanges.foreach { range =>
      val topicAndPartition = TopicAndPartition(range.topic, range.partition)
      val result: Either[Err, Map[TopicAndPartition, Short]] =
        kc.setConsumerOffsets(groupid, Map(topicAndPartition -> range.untilOffset))
      result.left.foreach(errs => println(s"Error updating the offset to Kafka cluster: $errs"))
    }
  }
}

Kafka offset管理类,使用zookeeper维护offset。除以下使用集成的kafka API去维护还可以使用zk client API去实现。

package newSparkSteaming

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Driver that consumes the topic `topic` through [[KafkaManager]], prints every record
 * value and then asks the manager to store the offsets of the processed batch.
 */
object DirectKafkameterData {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectKafkameterData")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // Build the streaming context on the existing SparkContext. Passing `conf` here would
    // try to create a second SparkContext in the same JVM.
    val ssc = new StreamingContext(sc, Seconds(10))

    // Kafka brokers
    val broker_list = "hadoop01:9092,hadoop02:9092,hadoop03:9092"
    val groupid = "groupid"
    val topics = Set("topic")

    /*
     * AUTO_OFFSET_RESET_CONFIG values, per the original author's notes:
     *  smallest: resume from the committed offset when one exists, otherwise start from the beginning
     *  largest:  resume from the committed offset when one exists, otherwise read only new data
     *  disable:  resume when every partition has a committed offset, otherwise raise an error
     */
    val kafkaParams = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> broker_list,
      ConsumerConfig.GROUP_ID_CONFIG -> groupid,
      // KafkaManager reads the group id from the key "groupid", so provide it under that
      // key as well as under the standard "group.id".
      "groupid" -> groupid,
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "smallest"
    )
    val kafkaManager = new KafkaManager(kafkaParams)

    // Create the input stream.
    val kafkaStream: InputDStream[(String, String)] = kafkaManager.createDirectStream(ssc, topics)

    kafkaStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.map(msg => msg._2).foreachPartition { records =>
        records.foreach { record =>
          // Record-processing logic goes here.
          println(record)
        }
      }
      // Store offsets only after the batch has been processed.
      kafkaManager.updateOffsets(offsetRanges)
    }
    ssc.start()
    ssc.awaitTermination()
  }
}

四、offset 保存在 redis 中

JedisPoolUtil.scala 工具类

package Utils

import com.typesafe.config.{Config, ConfigFactory}
import redis.clients.jedis.{Jedis,JedisPool, JedisPoolConfig}

/**
 * Holds one shared [[JedisPool]] built from the application configuration keys
 * redis.host, redis.auth, redis.port, redis.maxConn and redis.maxIdle.
 */
object JedisPoolUtil {
  // Load the configuration and read the connection settings.
  private val settings: Config = ConfigFactory.load()
  private val redisHost: String = settings.getString("redis.host")
  private val redisAuth: String = settings.getString("redis.auth")
  private val redisPort: Int = settings.getInt("redis.port")

  // Pool limits: maximum total connections and maximum idle connections.
  private val poolSettings: JedisPoolConfig = {
    val limits = new JedisPoolConfig
    limits.setMaxTotal(settings.getInt("redis.maxConn"))
    limits.setMaxIdle(settings.getInt("redis.maxIdle"))
    limits
  }

  // 10000 is the fourth constructor argument (presumably the timeout in milliseconds - confirm against the Jedis docs).
  val pool = new JedisPool(poolSettings, redisHost, redisPort, 10000, redisAuth)

  /** Borrows a connection from the pool. */
  def getConnections(): Jedis = pool.getResource
}

StreamingOffsetToRedis.scala Streaming 类

package SpakStreaming

import Utils.JedisPoolUtil
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka010._
import scala.collection.mutable
import scala.util.Try

/**
 * Spark Streaming job (Kafka 0-10 direct stream) that keeps its consumed offsets in Redis
 * under keys of the form `offset_<groupid>_<topic>_<partition>`.
 */
object StreamingOffsetToRedis {
  /**
   * Reads the offsets saved in Redis for the given topics and consumer group.
   *
   * @param topics  topics whose saved offsets should be loaded
   * @param groupid consumer group the offsets belong to
   * @return saved offset per topic partition; empty when nothing has been saved yet
   */
  def getOffset(topics: Set[String], groupid: String): mutable.Map[TopicPartition, Long] = {
    import scala.collection.JavaConverters._

    val fromOffset: mutable.Map[TopicPartition, Long] = mutable.Map[TopicPartition, Long]()
    val jedis = JedisPoolUtil.getConnections()
    try {
      topics.foreach { topic =>
        val prefix = s"offset_${groupid}_${topic}_"
        // The trailing wildcard is required: keys are written with a partition suffix, so
        // the bare prefix never matched any of them.
        jedis.keys(prefix + "*").asScala.foreach { key =>
          // Skip keys whose suffix is not a partition number, e.g. keys that belong to
          // another topic sharing the same prefix.
          val partition = Try(key.stripPrefix(prefix).toInt).toOption
          // `get` returns null when the key vanished between `keys` and `get`.
          val saved = Option(jedis.get(key))
          for (p <- partition; value <- saved) {
            println(s"$p::$value")
            fromOffset.put(new TopicPartition(topic, p), value.toLong)
          }
        }
      }
    } finally {
      // Always return the connection to the pool.
      jedis.close()
    }
    fromOffset
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("StreamingOffsetToRedis").setMaster("local[2]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(2))
    // Kafka topic
    val topics = Set("offset-redis-01")
    // Kafka parameters. auto.offset.reset values, per the original author's notes:
    //  earliest: resume from the committed offset when one exists, otherwise start from the beginning
    //  latest:   resume from the committed offset when one exists, otherwise read only new data
    //  none:     resume when every partition has a committed offset, otherwise raise an error
    val kafkaParams: Map[String, Object] = Map(
      "bootstrap.servers" -> "node1:9092,node2:9092,node3:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "offSet-Redis-Test",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val groupid = kafkaParams("group.id").toString

    // Offsets saved by a previous run (empty on the first run).
    val offsets = getOffset(topics, groupid)

    // Location strategy used for reading the partitions.
    val locationStrategy: LocationStrategy = LocationStrategies.PreferConsistent
    val conumerStrategy: ConsumerStrategy[String, String] =
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets)

    val kafkaInputStream = KafkaUtils.createDirectStream[String, String](ssc, locationStrategy, conumerStrategy)
    kafkaInputStream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      if (!rdd.isEmpty()) {
        // Business logic placeholder: print the records of every partition.
        // The earlier code printed the characters of the reset-mode string instead.
        rdd.foreachPartition(records => records.foreach(println))

        // Save the offsets only after the batch has been processed, in one Redis
        // transaction, and release the transaction and connection even on failure.
        val jedis = JedisPoolUtil.getConnections()
        try {
          val transaction = jedis.multi()
          try {
            offsetRanges.foreach { range =>
              val key = s"offset_${groupid}_${range.topic}_${range.partition}"
              transaction.set(key, range.untilOffset.toString)
            }
            transaction.exec()
          } finally {
            transaction.close()
          }
        } finally {
          jedis.close()
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}

(参考别人的代码,自己写一遍,然后汇总在这里供以后参考)

五、offset 维护在 kafka 中

spark-streaming-kafka-0-10

package SpakStreaming

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.slf4j.{Logger, LoggerFactory}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
 * Spark Streaming job (Kafka 0-10 direct stream) that prints every record value and then
 * commits the batch's offset ranges back through the stream itself (`commitAsync`).
 */
object StremingOffsetToKafka {
  private val appName = "StreamingTest"
  private val LOG: Logger = LoggerFactory.getLogger(appName)

  /** Expects exactly one argument (a properties name); prints usage and exits with code 1 otherwise. */
  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      println("Usage:StreamingTest <propsName>")
      System.exit(1)
    }
    LOG.info("################ Streaming Start ##################")

    // Name of the properties file given on the command line (read but not used further here).
    val propName = args(0)

    val sparkConf = new SparkConf()
      .setAppName("123")
      .setMaster("local[2]")
      .set("spark.streaming.kafka.maxRatePerPartition", "200")

    val streamingContext = new StreamingContext(sparkConf, Seconds(4))

    // Kafka parameters
    val subscribedTopics = Set("topic")
    val consumerGroup = "groupid"
    val consumerSettings = Map[String, Object](
      "bootstrap.servers" -> "kafka01:9092,kafka02:9092,kafka03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> consumerGroup,
      // earliest: consume from the beginning on the very first start
      "auto.offset.reset" -> "earliest",
      // automatic commits are switched off
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Approach one: let Kafka keep the offsets. PreferConsistent spreads partitions evenly over executors.
    val directStream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](subscribedTopics, consumerSettings)
    )
    LOG.info("################## Create Streaming Success ####################")

    directStream.foreachRDD { batch =>
      val ranges = batch.asInstanceOf[HasOffsetRanges].offsetRanges
      batch.foreachPartition { records =>
        // Processing logic: print each record's value.
        for (record <- records) {
          println(record.value())
        }
      }
      // Commit after the batch's output action has been issued.
      directStream.asInstanceOf[CanCommitOffsets].commitAsync(ranges)
    }

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}