Table of Contents
Basic transformation operations
map, flatMap, filter, repartition, union, count, reduce, countByValue, reduceByKey, join, cogroup, transform, updateStateByKey
Window transformation operations
window
countByWindow
reduceByKeyAndWindow
reduceByWindow
countByValueAndWindow
Output operations
print
saveAsTextFiles
saveAsObjectFiles
saveAsHadoopFiles
foreachRDD
Writing results to MySQL
Connection pool utility
package stream

import java.sql.{Connection, DriverManager}
import java.util.concurrent.ConcurrentLinkedQueue

object ConnectionPool {

  private var queue: ConcurrentLinkedQueue[Connection] = _

  // Register the MySQL JDBC driver once, when the object is first loaded
  Class.forName("com.mysql.jdbc.Driver")

  // Take a connection from the pool, creating a batch of 10 connections if the pool is empty
  def getConnection(): Connection = {
    if (queue == null) queue = new ConcurrentLinkedQueue[Connection]()
    if (queue.isEmpty) {
      for (i <- 1 to 10) {
        val conn = DriverManager.getConnection(
          "jdbc:mysql://note01:3306/test", "root", "123456")
        conn.setAutoCommit(false)
        queue.offer(conn)
      }
    }
    queue.poll()
  }

  // Give a connection back to the pool so it can be reused
  def returnConnection(conn: Connection): Unit = {
    queue.offer(conn)
  }
}
ConcurrentLinkedQueue is an unbounded, non-blocking queue provided by Java. It performs atomic operations on queue elements via CAS, so it is safe for multi-threaded access while still offering good concurrent performance.
The offer method appends a connection to the tail of the pool's queue.
The poll method removes and returns a connection from the head of the pool's queue.
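A minimal standalone sketch of the offer/poll behaviour the pool relies on (no database involved; the strings merely stand in for connections):

import java.util.concurrent.ConcurrentLinkedQueue

object QueueDemo {
  def main(args: Array[String]): Unit = {
    val queue = new ConcurrentLinkedQueue[String]()
    queue.offer("conn-1")        // append to the tail
    queue.offer("conn-2")
    println(queue.poll())        // removes and returns the head: conn-1
    println(queue.isEmpty)       // false, conn-2 is still pooled
  }
}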
package stream

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamTest2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_2")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(2))
    val lines = ssc.socketTextStream("note01", 9999)
    lines.map(_.split(",")).filter(_.length == 4)
      .foreachRDD(rdd => {
        // One connection and one prepared statement per partition,
        // created on the executor rather than serialized from the driver
        rdd.foreachPartition(p => {
          val conn = ConnectionPool.getConnection()
          val statement = conn.prepareStatement("insert into test(name, age) values (?, ?)")
          p.foreach(result => {
            statement.setString(1, result(0))
            statement.setLong(2, result(1).toLong)
            statement.addBatch()
          })
          statement.executeBatch()
          conn.commit()
          ConnectionPool.returnConnection(conn)
        })
      })
    ssc.start()
    ssc.awaitTermination()
  }
}
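The job above assumes a test table already exists in the test database on note01. The original does not give its schema, so the column types in the following one-off setup are only an assumption (name as a varchar, age as a bigint to match setLong):

import java.sql.DriverManager

object CreateTestTable {
  def main(args: Array[String]): Unit = {
    // Hypothetical schema; adjust the column types to whatever your real table uses
    val conn = DriverManager.getConnection("jdbc:mysql://note01:3306/test", "root", "123456")
    val stmt = conn.createStatement()
    stmt.execute("create table if not exists test(name varchar(50), age bigint)")
    stmt.close()
    conn.close()
  }
}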
join operation
package stream

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamTest3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_2")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(2))
    val lines1 = ssc.socketTextStream("note01", 9999)
    val lines2 = ssc.socketTextStream("note02", 9998)
    // Turn each stream into (key, value) pairs so the two streams can be joined by key
    val r1 = lines1.map(_.split(",")).filter(_.length > 3).map(arr => (arr(0), arr(1)))
    val r2 = lines2.map(_.split(",")).filter(_.length > 3).map(arr => (arr(0), arr(1)))
    // print() is required: a StreamingContext with no output operation will not start
    r1.join(r2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
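Like the RDD join, this joins the two streams by key, but only within the same batch: in each two-second interval the RDD produced from lines1 is joined with the RDD produced from lines2 in that interval, and records that arrive in different batches are never paired with each other.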
Categories of DStream transformations
Stateless transformations
Each batch of data is transformed independently: processing the current batch neither depends on earlier batches nor affects later ones. In practice, however, a stateless transformation can still produce a global aggregate over a continuous stream if an external store is used to hold the intermediate state (see the sketch after these steps):
(1) Store the result of the previous batch in Redis.
(2) When processing the current batch, first read the previous result from Redis, then add the current batch's result to it.
(3) Write the accumulated result back to Redis, overwriting the previous batch's value.
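A minimal sketch of this Redis-backed pattern, assuming a Redis instance on note01:6379, the Jedis client on the classpath, and a "wc:" key prefix (all of these are assumptions for the example, not details from the original):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis

object StatelessWithRedis {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("StatelessWithRedis")
    val ssc = new StreamingContext(new SparkContext(conf), Seconds(5))
    val words = ssc.socketTextStream("note01", 9999).flatMap(_.split(" "))
    // Per-batch (stateless) count ...
    words.map((_, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      rdd.foreachPartition(iter => {
        val jedis = new Jedis("note01", 6379) // one client per partition, created on the executor
        iter.foreach { case (word, count) =>
          // ... accumulated into a global count kept in Redis
          val previous = Option(jedis.get("wc:" + word)).map(_.toLong).getOrElse(0L)
          jedis.set("wc:" + word, (previous + count).toString)
        }
        jedis.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

Creating one Jedis client per partition mirrors the MySQL example above: the client is constructed on the executor instead of being serialized from the driver.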
Stateful transformations
Each batch can use the result of the previous batch as its state. That state is carried forward by the stateful operation and combined with the next batch's data according to the logic the developer defines, for example updateStateByKey and reduceByKeyAndWindow.
package stream

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Test03 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Test03")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // A checkpoint directory is mandatory for stateful transformations
    ssc.checkpoint("/test")
    val lines = ssc.socketTextStream("note01", 9999)

    // Merge the current batch's counts (Seq[Int]) with the previous batch's state (Option[Int])
    def updateStateFunc(iter: Iterator[(String, Seq[Int], Option[Int])]): Iterator[(String, Int)] = {
      iter.map {
        case (word, curWordCount, preWordCount) =>
          (word, curWordCount.sum + preWordCount.getOrElse(0))
      }
    }

    val func = updateStateFunc _
    lines.flatMap(_.split(" "))
      .map(x => (x, 1))
      .updateStateByKey[Int](func, new HashPartitioner(sc.defaultParallelism), true)
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
A checkpoint directory (ssc.checkpoint) must be set for any transformation that involves state.
updateStateFunc merges the previous batch's result with the current batch's data. In each tuple of the Iterator, the String is the current word; the Seq[Int] holds the values produced for that word in the current batch (if the map step emitted (a,1), (a,1), the Seq is Seq(1, 1)); and the Option[Int] is the count of that word carried over from the previous batch's state (None if the word has not been seen before).
val func = updateStateFunc _ converts the method into a function value and assigns it to func.
In Scala there is a difference between methods and functions: something defined with def is a method, not a value, so it cannot be passed directly as an argument to another method; it must first be converted to a function, as in the snippet below.
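A minimal illustration of the difference, independent of Spark (the names are made up for the example):

object MethodVsFunction {
  // A method: defined with def, not itself a value
  def double(x: Int): Int = x * 2

  def main(args: Array[String]): Unit = {
    // Eta-expansion: the trailing underscore turns the method into a Function1 value
    val doubleFn = double _
    // The function value can now be passed to higher-order methods
    println(List(1, 2, 3).map(doubleFn)) // List(2, 4, 6)
  }
}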