I. Two modes for Spark Streaming to read from Kafka:

1. Receiver (continuous, real-time receiving)

This mode connects to Kafka through ZooKeeper and is implemented with Kafka's high-level consumer API. The receiver first pulls data out of Kafka and buffers it in executor memory, and the data is then processed on each batch interval. If the application goes down at that point and the offsets have not been handled properly, the buffered data is gone, so this mode can lose data when the program fails. Spark 1.2 introduced spark.streaming.receiver.writeAheadLog.enable (a write-ahead log for received data) to mitigate this risk.
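A minimal Receiver-mode sketch for comparison, using the same Kafka 0.8 integration as the Direct examples below; the checkpoint path, ZooKeeper quorum, consumer group id and topic name are placeholders:

package TestExamples
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

/**
  * Receiver mode with the write-ahead log enabled (sketch)
  */
object TestReceiverMode {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("TestReceiverMode")
      //persist received data to a write-ahead log so it can be replayed after a failure
      .set("spark.streaming.receiver.writeAheadLog.enable", "true")
    val ssc = new StreamingContext(conf, Seconds(2))
    //the WAL is stored in the checkpoint directory, so checkpointing must be enabled (placeholder path)
    ssc.checkpoint("hdfs://hadoop1:9000/spark/checkpoint")
    //connect through ZooKeeper with the high-level consumer API
    val lines = KafkaUtils.createStream(ssc,
      "hadoop1:2181,hadoop1:2182,hadoop1:2183",   //zkQuorum
      "receiver-demo-group",                      //consumer group id
      Map("mytopic" -> 1),                        //topic -> number of receiver threads
      StorageLevel.MEMORY_AND_DISK_SER)           //no in-memory replication needed: the WAL already persists the data
      .map(_._2)                                  //keep only the message payload
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}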

2. Direct (periodic batch reads)

This mode connects directly to the Kafka brokers to fetch data. It periodically queries Kafka for the latest offset of each topic+partition and uses them to define the offset range of each batch. When the job that processes a batch starts, it uses Kafka's simple consumer API to read the data in that batch's offset ranges from Kafka.

Compared with the traditional Receiver mode, the direct approach has three advantages: simplified parallelism (one RDD partition per Kafka partition), efficiency (no write-ahead log is needed to avoid data loss), and exactly-once semantics (offsets are tracked by Spark Streaming itself rather than by ZooKeeper).

II. Examples (Direct mode only)

1. A simple word-count example:

package TestExamples
import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

/**
  * Kafka integration: word count over a direct Kafka stream
  */
object TestSparkStreaming5 {
  def main(args: Array[String]): Unit = {
    //entry point: build the SparkConf/SparkContext and a StreamingContext with a 2-second batch interval
    val conf = new SparkConf().setMaster("local[2]").setAppName(s"${this.getClass.getSimpleName}")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc,Seconds(2))
    //data source: a direct stream from Kafka
    /**
    createDirectStream[K, V, KD <: Decoder[K], VD <: Decoder[V]]
    (ssc: StreamingContext,
    kafkaParams: Map[String, String],
    topics: Set[String])
      */
    val kafkaParams = Map("metadata.broker.list" -> "hadoop1:9092")
    val topics = Set("mytopic")
    //each element is (k, v): k is the Kafka message key, v is the message payload; we only keep v
    val dStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      .map(_._2)
    dStream.flatMap(_.split(","))
      .map((_,1))
      .reduceByKey(_+_)
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
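To compile and run these examples, the Spark Streaming Kafka 0.8 integration must be on the classpath. A minimal build.sbt sketch, assuming Spark 1.6.x (adjust the Spark and Scala versions to your cluster; on Spark 2.x the same API is published as the spark-streaming-kafka-0-8 artifact instead):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming" % "1.6.3",
  "org.apache.spark" %% "spark-streaming-kafka" % "1.6.3"
)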

2. Multi-threaded Kafka consumption (consuming Kafka data in parallel). There are two variants: without a thread pool and with a thread pool.

(1) Without a thread pool

package TestExamples.SparkStreaming_Kafka;
import kafka.producer.KeyedMessage;
import kafka.javaapi.producer.Producer;
import kafka.producer.ProducerConfig;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Producer for the no-thread-pool example: writes keyed messages to the "hahaha" topic
 */
public class ProducerDemo {
    public static void main(String[] args) {
        //props holds the producer configuration
        Properties props = new Properties();
        //metadata.broker.list: address and port of the Kafka broker(s); several brokers can be listed, separated by commas
        props.put("metadata.broker.list", "hadoop1:9092");
        //serializer used for the data written to Kafka
        props.put("serializer.class", "kafka.serializer.StringEncoder");
        //create a ProducerConfig from props
        ProducerConfig config = new ProducerConfig(props);
        //create a Producer
        Producer<String, String> producer = new Producer<String, String>(config);
        for (int j = 0; j < 3; j++) {
            List<KeyedMessage<String, String>> messageList = new ArrayList<KeyedMessage<String, String>>();
            for (int i = 200; i <= 300; i++) {
                //the message key (j as a string) is used by the default partitioner to choose a partition
                KeyedMessage<String, String> message =
                        new KeyedMessage<String, String>("hahaha", j + "", "producer-partition" + j + "->" + i);
                System.out.println("producer-partition" + j + "->" + i);
                messageList.add(message);
            }
            //send one batch of messages per outer iteration
            producer.send(messageList);
        }
        producer.close();
    }
}
package TestExamples.SparkStreaming_Kafka;
import kafka.consumer.Consumer;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import kafka.message.MessageAndMetadata;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
 * Consumer without a thread pool: a new Thread is started for each KafkaStream in the loop
 */
public class ConsumerDemo {
    private static final String topic="hahaha";
    private static final Integer threads=3;
    public static void main(String[] args) {
        Properties properties = new Properties();
        properties.put("zookeeper.connect","hadoop1:2181,hadoop1:2182,hadoop1:2183");
        //consumer group id
        properties.put("group.id", "1706");
        ConsumerConfig config = new ConsumerConfig(properties);
        ConsumerConnector consumer = Consumer.createJavaConsumerConnector(config);
         HashMap<String, Integer> topicCountMap = new HashMap<>();
        topicCountMap.put(topic,threads);

        Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
        List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(topic);
        for(final KafkaStream<byte[], byte[]> kafkaStream : streams){ //one stream per thread; the thread count is usually set to the topic's partition count
            new Thread(new Runnable() {
                @Override
                public void run() {
                    for(MessageAndMetadata<byte[], byte[]> mm : kafkaStream){
                        String msg = new String(mm.message());
                        final int partition = mm.partition();
                        System.out.println(Thread.currentThread().getId() + " partition " + partition + " message " + msg + " offset " + mm.offset());
                    }
                }
            }).start();
        }
    }
}

(2) With a thread pool

package TestExamples.SparkStreaming_Kafka;
import kafka.consumer.ConsumerConfig;
import kafka.consumer.KafkaStream;
import kafka.javaapi.consumer.ConsumerConnector;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Thread-pool consumer: submits one KafkaConsumerThread per stream to a fixed-size pool
 */
public class ThreadPoolTest implements Runnable {
    private ConsumerConfig consumerConfig;
    private static String topic="hahaha";
    Properties props;
    final int a_numThreads = 3;
    public ThreadPoolTest() {
        props = new Properties();
        props.put("zookeeper.connect", "hadoop1:2181,hadoop1:2182,hadoop1:2183");
        props.put("group.id", "1707");
        props.put("zookeeper.session.timeout.ms", "400");
        props.put("auto.commit.interval.ms", "1000");
        consumerConfig = new ConsumerConfig(props);
    }
    @Override
    public void run() {
        //request a_numThreads streams for the topic: one KafkaStream per worker thread
        Map<String, Integer> topicCountMap = new HashMap<String, Integer>();
        topicCountMap.put(topic, a_numThreads);
        //reuse the ConsumerConfig built in the constructor
        ConsumerConnector consumer = kafka.consumer.Consumer.createJavaConsumerConnector(consumerConfig);
        Map<String, List<KafkaStream<byte[], byte[]>>> consumerMap = consumer.createMessageStreams(topicCountMap);
        List<KafkaStream<byte[], byte[]>> streams = consumerMap.get(topic);
        ExecutorService executor = Executors.newFixedThreadPool(a_numThreads);
        for (final KafkaStream stream : streams) {
            executor.submit(new KafkaConsumerThread(stream));
        }
    }
    public static void main(String[] args) {            //test driver
        System.out.println(topic);
        Thread t = new Thread(new ThreadPoolTest());
        t.start();
    }
}
package TestExamples.SparkStreaming_Kafka;

import kafka.consumer.ConsumerIterator;
import kafka.consumer.KafkaStream;
import kafka.message.MessageAndMetadata;

/**
 * Worker task: drains one KafkaStream and prints each message with its partition and offset
 */
public class KafkaConsumerThread  implements  Runnable{
    private KafkaStream<byte[], byte[]> stream;

    public KafkaConsumerThread(KafkaStream<byte[], byte[]> stream) {
        this.stream = stream;
    }

    @Override
    public void run() {
        ConsumerIterator<byte[], byte[]> it = stream.iterator();
        while (it.hasNext()) {
            MessageAndMetadata<byte[], byte[]> mam = it.next();
            System.out.println(Thread.currentThread().getName() + ": partition[" + mam.partition() + "],"
                    + "offset[" + mam.offset() + "], " + new String(mam.message()));

        }
    }

}

III. Managing offsets so that no Kafka data is lost

With the Direct approach, offsets are tracked by Spark Streaming itself and nothing is written to ZooKeeper, so after a restart (without checkpointing) the application does not know where to resume. The example below saves each partition's offset to ZooKeeper at the end of every batch and, on startup, uses the saved offsets as the starting position of the direct stream.

package TestExamples.SparkStreaming_Kafka

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * No data loss: resume from offsets saved in ZooKeeper
  */
object KafkaDirectStream {
  def main(args: Array[String]): Unit = {
    val group="1708"
    val conf = new SparkConf().setAppName("KafkaDirectStream").setMaster("local[2]")
    val ssc = new StreamingContext(conf,Seconds(5))
    val topic="hahaha"
    val brokerList="hadoop1:9092"
    val zkQuorum="hadoop1:2181,hadoop1:2182,hadoop1:2183"
    val topics = Set(topic)
    //ZKGroupTopicDirs builds the ZooKeeper directory in which this group's offsets for the topic are stored
    val topicDirs = new ZKGroupTopicDirs(group,topic)
    //offset path in ZooKeeper, e.g. /consumers/1708/offsets/hahaha
    val zkPath = topicDirs.consumerOffsetDir

    val kafkaPrams = Map(
      "metadata.broker.list" -> brokerList,
      "group.id" -> group
    )

    val zKClient = new ZkClient(zkQuorum)
    //count the children of the offset path (one child znode per partition, each storing that partition's offset)
    val children = zKClient.countChildren(zkPath)
    /**
      * e.g.  /consumers/1708/offsets/hahaha/0
      *       /consumers/1708/offsets/hahaha/1
      *       /consumers/1708/offsets/hahaha/2
      */

    //if ZooKeeper already holds saved offsets, use them as the starting position of the kafkaStream
    var fromOffsets: Map[TopicAndPartition, Long] = Map()

    var kafkaStream:InputDStream[(String, String)]=null

    if(children > 0){
      for(i <- 0 until children){
        //read the offset saved for partition i
        val partitionOffset = zKClient.readData[String](s"${zkPath}/${i}")

        val tp = TopicAndPartition(topic,i)
        //e.g. partition 0 -> 1000: add each partition's saved offset to fromOffsets
        fromOffsets +=(tp -> partitionOffset.toLong)
      }

      /**
        *
        * [K, V, KD <: Decoder[K], VD <: Decoder[V], R]
        * (
        * ssc: StreamingContext,
        * kafkaParams: Map[String, String],
        * fromOffsets: Map[TopicAndPartition, Long],
        * messageHandler: (MessageAndMetadata[K, V]) ⇒ R
        * )
        */
      //the messageHandler transforms each Kafka record; here every record becomes a (topic_name, message) tuple
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaPrams, fromOffsets, messageHandler)
    }else{
      //if no offsets have been saved yet, start from the latest or earliest offset according to the kafkaParams configuration
      kafkaStream=KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaPrams, topics)
    }
    var offsetRanges = Array[OffsetRange]()
    kafkaStream.transform( rdd =>{
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }).map( msg => msg._2)
      .foreachRDD( rdd => {
        rdd.foreachPartition( partition =>{
          partition.foreach( record =>{
            println(record)
          })
        })
        for( o <- offsetRanges){
          val newZkPath = s"${zkPath}/${o.partition}"
          //save this partition's offset to ZooKeeper; note: saving o.fromOffset means the most recent batch is
          //re-read after a restart (at-least-once); to resume right after the last processed batch, save o.untilOffset
          ZkUtils.updatePersistentPath(zKClient, newZkPath, o.fromOffset.toString)
        }
      })
    ssc.start()
    ssc.awaitTermination()
    ssc.stop()
  }
}