OrderCount

基于 Spark 2 与 Kafka 0.10 版本;偏移量(offset)由 Kafka 自己管理。

package order

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Direct-stream order statistics job (Spark Streaming + Kafka 0.10).
  *
  * Consumes order lines from Kafka, computes per-batch totals (overall income,
  * per-item-category income, per-province income) via CalculateUtil, and
  * manually commits the Kafka offsets back to Kafka only after a batch has
  * been processed, so records are not lost on failure.
  */
object OrderCount {
  def main(args: Array[String]): Unit = {

    val group = "myorder"
    val topic = "test"
    val conf = new SparkConf().setAppName("DirectStream").setMaster("local[2]")
    val streamingContext = new StreamingContext(conf, Seconds(5))

    // Load the IP rules once on the Driver and broadcast them to the Executors.
    // (Fixed: explicit dot notation instead of discouraged infix application.)
    val broadcastRef = IPUtils.broadcastIpRules(streamingContext, "C:\\Users\\admin\\Desktop\\test\\ip.txt")

    // Kafka consumer settings. Auto-commit is disabled so offsets are committed
    // manually, only after a batch has been fully processed.
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "note01:9092,note02:9092,note03:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array(topic)
    val stream = KafkaUtils.createDirectStream[String, String](
      streamingContext,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    stream.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        // Only a KafkaRDD can be cast to HasOffsetRanges to expose its offsets.
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        // ============== batch processing logic ===================

        val lines = rdd.map(_.value())
        lines.foreach(println) // debug output; note this runs on the Executors
        val fields: RDD[Array[String]] = lines.map(_.split(" ", -1))
        // Total transaction amount for this batch
        CalculateUtil.calculateIncome(fields)
        // Amount per item category
        CalculateUtil.calculateItem(fields)
        // Amount per region (province, resolved via the broadcast IP rules)
        CalculateUtil.calculateZone(fields, broadcastRef)
        // Commit the consumed offsets back to Kafka only after processing succeeded.
        stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
      }
    }
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

CalculateUtil

计算累计总成交额、各商品分类成交额以及各省份成交额,并将结果累加写入 Redis。

package order

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

/**
  * Per-batch aggregation helpers: total income, income per item category and
  * income per province. All results are accumulated into Redis via incrByFloat.
  *
  * Expected record layout (space-separated): assumes arr(1) = client IP,
  * arr(2) = item category, arr(4) = order amount — TODO confirm against producer.
  */
object CalculateUtil {

  /** Convert a dotted-quad IPv4 string (e.g. "1.2.3.4") to its Long value. */
  def ip2Long(ip: String): Long = {
    val fragments = ip.split("[.]")
    var ipNum = 0L
    for (i <- 0 until fragments.length) {
      // Shift the accumulated value one byte left, then OR in the next octet.
      ipNum = fragments(i).toLong | ipNum << 8L
    }
    ipNum
  }

  /**
    * Binary search over sorted, non-overlapping (startIp, endIp, province)
    * ranges; returns the index of the range containing `ip`, or -1 if none.
    */
  def binarySearch(lines: Array[(Long, Long, String)], ip: Long): Int = {
    var low = 0
    var high = lines.length - 1
    while (low <= high) {
      val middle = (low + high) / 2
      if ((ip >= lines(middle)._1) && (ip <= lines(middle)._2))
        return middle
      if (ip < lines(middle)._1)
        high = middle - 1
      else {
        low = middle + 1
      }
    }
    -1
  }

  /**
    * Aggregate order amount per province (resolved from the record's IP via the
    * broadcast rules) and accumulate the totals into Redis.
    */
  def calculateZone(fields: RDD[Array[String]], broadcastRef: Broadcast[Array[(Long, Long, String)]]): Unit = {
    val provinceAndPrice = fields.map(arr => {
      val ip = arr(1)
      val price = arr(4).toDouble
      val ipNum = ip2Long(ip)
      // The broadcast value is read here, on the Executor side.
      val allRules = broadcastRef.value
      val index = binarySearch(allRules, ipNum)
      // BUG FIX: the original declared a new shadowing `val province` inside the
      // if-branch, so the outer value was never updated and every record was
      // reported as "未知" (unknown). Resolve the province in one expression.
      val province = if (index != -1) allRules(index)._3 else "未知"
      // (province, order amount)
      (province, price)
    })
    // Aggregate per province.
    val reduced = provinceAndPrice.reduceByKey(_ + _)
    reduced.foreachPartition(part => {
      // One connection per partition, obtained on the Executor.
      val conn = JedisConnectionPool.getConnection()
      try {
        part.foreach(t => conn.incrByFloat(t._1, t._2))
      } finally {
        // Always return the connection to the pool, even if Redis errors out.
        conn.close()
      }
    })
  }

  /** Aggregate order amount per item category and accumulate into Redis. */
  def calculateItem(fields: RDD[Array[String]]): Unit = {
    val itemAndPrice = fields.map(arr => {
      // item category
      val item = arr(2)
      // order amount (string in the source record, so convert)
      val price = arr(4).toDouble
      (item, price)
    })
    // Aggregate per category.
    val reduced = itemAndPrice.reduceByKey(_ + _)
    // foreachPartition is an action. The Jedis connection must be created inside
    // it so it is obtained on the Executor: Jedis is not serializable, so a
    // Driver-side connection could not be shipped to the Executors. The pool is
    // an `object` (singleton), so there is one pool per Executor JVM.
    reduced.foreachPartition(part => {
      val jedis = JedisConnectionPool.getConnection()
      try {
        part.foreach(x => jedis.incrByFloat(x._1, x._2))
      } finally {
        // BUG FIX: the original never returned this connection to the pool,
        // leaking one connection per partition per batch.
        jedis.close()
      }
    })
  }

  /** Compute this batch's total order amount and add it to the running total in Redis. */
  def calculateIncome(fields: RDD[Array[String]]): Unit = {
    val priceRDD = fields.map(arr => {
      // amount arrives as a string, so convert it
      val price = arr(4).toDouble
      price
    })
    // fold is an action: it returns the batch total to the Driver.
    // (fold instead of reduce so an empty batch yields 0.0 rather than throwing.)
    val sum: Double = priceRDD.fold(0.0)(_ + _)
    // This connection is obtained on the Driver side, where the total lives.
    val conn = JedisConnectionPool.getConnection()
    try {
      // incrByFloat accumulates the Double total (incr adds 1, incrBy takes a long).
      conn.incrByFloat("TOTAL_INCOME", sum)
    } finally {
      // Release the connection back to the pool even on failure.
      conn.close()
    }
  }

}

IPUtils

读取 IP 规则文件,整理成 (起始数字, 结束数字, 省份) 规则并广播到各个 Executor

package order

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.StreamingContext

/**
  * Loads the IP-rule file and broadcasts the parsed rules to the Executors.
  */
object IPUtils {

  /**
    * Read the rule file at `ipRulesPath` (fields separated by '|'; fields 2 and
    * 3 are the numeric start/end of the IP range, field 6 the province), collect
    * the parsed rules to the Driver, and broadcast them to every Executor.
    *
    * @return a broadcast of (startNum, endNum, province) tuples
    */
  def broadcastIpRules(ssc: StreamingContext, ipRulesPath: String): Broadcast[Array[(Long, Long, String)]] = {
    val sc = ssc.sparkContext
    // Parse every line of the rule file into a (start, end, province) tuple.
    val parsedRules = sc.textFile(ipRulesPath).map { line =>
      val parts = line.split("[|]", -1)
      (parts(2).toLong, parts(3).toLong, parts(6))
    }
    // Pull the rules (spread across Executors) back to the Driver, then ship
    // them to every Executor as a broadcast variable; the broadcast handle
    // itself stays on the Driver.
    sc.broadcast(parsedRules.collect())
  }

}

JedisConnectionPool

redis数据库连接池

package order

import redis.clients.jedis.{Jedis, JedisPool, JedisPoolConfig}

/**
  * Singleton Redis connection pool. Being an `object`, one pool exists per JVM
  * (i.e. one per Executor and one on the Driver).
  */
object JedisConnectionPool {

  /** Pool settings: at most 20 live connections, 10 idle, validated on borrow. */
  val config: JedisPoolConfig = {
    val cfg = new JedisPoolConfig()
    // maximum number of live connections
    cfg.setMaxTotal(20)
    // maximum number of idle connections kept in the pool
    cfg.setMaxIdle(10)
    // validate each connection when it is borrowed from the pool
    cfg.setTestOnBorrow(true)
    cfg
  }

  /** Pool against Redis at 192.168.18.100:6379; 10000 ms timeout, password "123". */
  val pool: JedisPool = new JedisPool(config, "192.168.18.100", 6379, 10000, "123")

  /** Borrow a connection from the pool; the caller is responsible for close(). */
  def getConnection(): Jedis = pool.getResource

}