Table of Contents

  • 3. Connecting Spark Streaming with Kafka
  • Using a connection pool


3. Connecting Spark Streaming with Kafka

First, add the Spark Streaming Kafka dependency:

<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.11</artifactId>
            <version>1.6.3</version>
        </dependency>

You can use https://mvnrepository.com/ to find the artifact that matches your own Spark version; just search for "spark kafka".
You also need to add a dependency for the Kafka connection pool, Apache Commons Pool 2:

<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-pool2 -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-pool2</artifactId>
            <version>2.5.0</version>
        </dependency>
Using a connection pool
  • Design approach

    Use Apache's commons-pool2 so that, when Spark Streaming writes to Kafka, it obtains the Kafka connection from a connection pool (the pool is built in step 4 and used in steps 5 and 6).
    Deployment:
    1. Start ZooKeeper and Kafka
bin/kafka-server-start.sh -daemon ./config/server.properties
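If you run the ZooKeeper that ships with the Kafka distribution, it can be started first in the same way (the path to zookeeper.properties below assumes the default layout of the Kafka package):

bin/zookeeper-server-start.sh -daemon ./config/zookeeper.properties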

2. Create two topics, one named source and one named target

bin/kafka-topics.sh --create --zookeeper 192.168.2.160:2181,192.168.2.157:2181,192.168.2.158:2181 --replication-factor 1 --partitions 2 --topic source
bin/kafka-topics.sh --create --zookeeper 192.168.2.160:2181,192.168.2.157:2181,192.168.2.158:2181 --replication-factor 1 --partitions 2 --topic target

3. Start a Kafka console producer and write test messages into the source topic

bin/kafka-console-producer.sh --broker-list 192.168.2.160:9092,192.168.2.157:9092,192.168.2.158:9092 --topic source
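To verify the end-to-end flow, a console consumer can also be attached to the target topic in another terminal, using the same broker list:

bin/kafka-console-consumer.sh --bootstrap-server 192.168.2.160:9092,192.168.2.157:9092,192.168.2.158:9092 --topic target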

4. Create the KafkaPool

package kafka

import java.util.Properties

import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}
import org.apache.kafka.clients.producer.KafkaProducer

/**
  * A thin wrapper around a KafkaProducer so that instances can be pooled
  * @param broker the Kafka broker list
  */

//Wraps the producer client
class KafkaProxy(broker:String){
  val prop : Properties = new Properties()
  prop.put("bootstrap.servers", broker)
  prop.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  prop.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
  val kafkaClient = new KafkaProducer[String,String](prop)
}

//Factory that creates pooled KafkaProxy instances
class KafkaProxyFactory(broker:String) extends BasePooledObjectFactory[KafkaProxy]{

  //Create an instance
  override def create(): KafkaProxy = new KafkaProxy(broker)

  //Wrap an instance for the pool
  override def wrap(t: KafkaProxy): PooledObject[KafkaProxy] = new DefaultPooledObject[KafkaProxy](t)
}

object KafkaPool {

  private var kafkaPool : GenericObjectPool[KafkaProxy] = null

  def apply(broker:String): GenericObjectPool[KafkaProxy] = {
    if (kafkaPool == null){
      KafkaPool.synchronized{
        //check again inside the lock so that only one pool is ever created
        if (kafkaPool == null){
          this.kafkaPool = new GenericObjectPool[KafkaProxy](new KafkaProxyFactory(broker))
        }
      }
    }
    kafkaPool
  }
}
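For reference, this is how the pool is meant to be used on an executor. The helper below is only a sketch, not part of the article's code: KafkaPoolUsage and sendPartition are hypothetical names, and items stands for one partition's records. Wrapping the send loop in try/finally is an extra safeguard over the programs in steps 5 and 6, guaranteeing the producer is returned to the pool even if a send throws:

package kafka

import org.apache.kafka.clients.producer.ProducerRecord

object KafkaPoolUsage {
  //Hypothetical helper: send one partition's records through a pooled producer
  def sendPartition(brokers: String, toTopic: String, items: Iterator[String]): Unit = {
    val pool = KafkaPool(brokers)
    val proxy = pool.borrowObject()
    try {
      for (item <- items) {
        proxy.kafkaClient.send(new ProducerRecord[String, String](toTopic, item))
      }
    } finally {
      //always return the producer to the pool, even if a send failed
      pool.returnObject(proxy)
    }
  }
}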

5. Connect Spark Streaming to Kafka, process the incoming messages, and write the results back out

package kafka

import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object streaming2KafKa {
  def main(args: Array[String]): Unit = {
    val sc = new SparkConf().setAppName("app").setMaster("local[*]")

    val ssc = new StreamingContext(sc,Seconds(5))

    val fromTopic = "from1"
    val toTopic = "to1"

    val brokers = "linux01:9092,linux02:9092,linux03:9092"

    val kafkaPro = Map[String,String](
      //brokers used to bootstrap the connection to the Kafka cluster
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      //consumer group this consumer belongs to
      ConsumerConfig.GROUP_ID_CONFIG -> "kafka",
      //where to start when there is no initial offset, or the saved offset no longer exists on the server
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "largest"
    )

    //Creating the stream from previously saved offsets is covered in step 6

    //Connect to Kafka
    val stream = KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder](ssc,kafkaPro,Set(fromTopic))

    //Fetching the previously saved offsets is covered in step 6

    stream.map{
      case(key,value) => "ABC:"+value
    }.foreachRDD{
      rdd=>rdd.foreachPartition{
        items =>
          //Write back to Kafka, borrowing a producer from the connection pool
          val kafkaProxyPool = KafkaPool(brokers)
          val kafkaProxy = kafkaProxyPool.borrowObject()

          //Use the pooled producer to send each record to the target topic
          for (item <- items){
            kafkaProxy.kafkaClient.send(new ProducerRecord[String,String](toTopic,item))
          }
          kafkaProxyPool.returnObject(kafkaProxy)
      }
      //Updating the offsets is covered in step 6
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

6. The basic Spark Streaming to Kafka connection is now complete. The program below extends it to handle a sudden crash while the job is running, by saving the consumed offsets in ZooKeeper and restoring them on restart.

package kafka

import kafka.api.{OffsetRequest, PartitionOffsetRequestInfo, TopicMetadataRequest}
import kafka.common.TopicAndPartition
import kafka.consumer.SimpleConsumer
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Streaming2Kafka {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("kafka").setMaster("local[*]")

    val ssc = new StreamingContext(sparkConf,Seconds(5))

    //The source and target topics (created in step 2)
    val fromTopic = "source"
    val toTopic = "target"

    val brokers = "hadoop1:9092,hadoop2:9092,hadoop3:9092"
    //ZooKeeper connection string (used to store the offsets)
    val zookeeper = "hadoop1:2181"

    //Kafka consumer configuration
    val kafkaPro = Map[String,String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,   //brokers used to bootstrap the connection to the cluster
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      //consumer group this consumer belongs to
      ConsumerConfig.GROUP_ID_CONFIG -> "kafka",
      //where to start when there is no initial offset, or the saved offset no longer exists on the server
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "largest"
    )

    //ZooKeeper path where the offsets for this group/topic are stored
    val topicDirs = new ZKGroupTopicDirs("kafkaxy",fromTopic)
    val zkTopicPath = s"${topicDirs.consumerOffsetDir}"

    //Create a connection to ZooKeeper
    val zkClient = new ZkClient(zookeeper)

    //Number of child nodes under the offset directory (one per partition)
    val children = zkClient.countChildren(zkTopicPath)

    var stream:InputDStream[(String,String)] = null

    //>0 means offsets have been saved before
    if (children > 0){

      //Map holding the offset to resume from for each partition
      var fromOffsets : Map[TopicAndPartition, Long] = Map()

      //First, look up the leader broker of every partition of the topic
      val topicList = List(fromTopic)
      //Build a topic metadata request
      val request = new TopicMetadataRequest(topicList,0)
      //Open a SimpleConsumer connection to one of the brokers
      val getLeaderConsumer = new SimpleConsumer("hadoop1",9092,100000,10000,"OffsetLookUp")
      //Send the request and read the response
      val response = getLeaderConsumer.send(request)

      val topicMeteOption = response.topicsMetadata.headOption

      val partitions = topicMeteOption match {
        case Some(tm) => {
          tm.partitionsMetadata.map(pm => (pm.partitionId,pm.leader.get.host)).toMap[Int,String]
        }
        case None => {
          Map[Int,String]()
        }
      }
      getLeaderConsumer.close()
      println("partitions information is:" + partitions)
      println("children information is:" + children)
      for (i<- 0 until children){
        //Read the offset saved in ZK for this partition
        val partitionOffset = zkClient.readData[String](s"${topicDirs.consumerOffsetDir}/${i}")
        println(s"Partition[${i}] saved offset: ${partitionOffset}")

        val tp = TopicAndPartition(fromTopic,i)
        //Fetch the partition's earliest available offset (guards against data that has already expired in Kafka)
        val requestMin = OffsetRequest(Map(tp -> PartitionOffsetRequestInfo(OffsetRequest.EarliestTime,1)))
        val consumerMin = new SimpleConsumer(partitions(i),9092,100000,10000,"getMiniOffset")
        val response = consumerMin.getOffsetsBefore(requestMin)

        //Earliest offsets currently reported by the broker
        val curOffsets = response.partitionErrorAndOffsets(tp).offsets
        consumerMin.close()

        var nextOffset = partitionOffset.toLong
        if (curOffsets.length > 0 && curOffsets.head < nextOffset){
          nextOffset = curOffsets.head
        }
        println(s"Partition[${i}] 最小的偏移信息是:${curOffsets.head}")
        println(s"Partition[${i}] 修改后的偏移信息是:${nextOffset}")
        fromOffsets += (tp -> nextOffset)
      }

      val messageHandler = (mmd:MessageAndMetadata[String,String]) => (mmd.topic,mmd.message())
      println("从zk获取偏移量来创建DStream")
      zkClient.close()
      stream = KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder,(String,String)](ssc,kafkaPro,fromOffsets,messageHandler)
    }else{
      println("直接创键,没有从zk中获取偏移量")
      stream = KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder](ssc,kafkaPro,Set(fromTopic))
    }

    var offsetRanges = Array[OffsetRange]()
    //Capture the offset ranges of each batch as it is consumed
    val mapDStream = stream.transform{ rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }.map(_._2)

    //Process each batch and write the results back to Kafka through the pool
    mapDStream.map("ABC"+_).foreachRDD{ rdd =>
      rdd.foreachPartition{ items =>

        //Borrow a producer from the pool and write the records back to Kafka
        val kafkaProxyPool = KafkaPool(brokers)
        val kafkaProxy = kafkaProxyPool.borrowObject()
        for (item <- items){
          kafkaProxy.kafkaClient.send(new ProducerRecord[String,String](toTopic,item))
        }
        kafkaProxyPool.returnObject(kafkaProxy)
      }

      //Update the offsets in ZK (fromOffset is stored, so after a crash the most recent batch is replayed, i.e. at-least-once delivery)
      val updateTopicDirs = new ZKGroupTopicDirs("kafkaxy",fromTopic)
      val updateZkClient = new ZkClient(zookeeper)
      for (offset <- offsetRanges){
        println(offset)
        val zkPath = s"${updateTopicDirs.consumerOffsetDir}/${offset.partition}"
        ZkUtils.updatePersistentPath(updateZkClient,zkPath,offset.fromOffset.toString)
      }
      updateZkClient.close()
    }
    ssc.start()
    ssc.awaitTermination()
  }
}