Table of Contents
Basic transformation operations
map, flatMap, filter, repartition, union, count, reduce, countByValue, reduceByKey, join, cogroup, transform, updateStateByKey
Window transformation operations
window
countByWindow
reduceByKeyAndWindow
reduceByWindow
countByValueAndWindow
Output operations
print
saveAsTextFiles
saveAsObjectFiles
saveAsHadoopFiles
foreachRDD
Writing results to MySQL
Connection pool utility
package stream

import java.sql.{Connection, DriverManager}
import java.util.concurrent.ConcurrentLinkedQueue

object ConnectionPool {

  private var queue: ConcurrentLinkedQueue[Connection] = _

  // Register the MySQL JDBC driver once, when the object is first loaded
  Class.forName("com.mysql.jdbc.Driver")

  // Take a connection from the pool, creating a batch of 10 connections if the pool is empty
  def getConnection(): Connection = {
    if (queue == null) queue = new ConcurrentLinkedQueue[Connection]()
    if (queue.isEmpty) {
      for (i <- 1 to 10) {
        val conn = DriverManager.getConnection(
          "jdbc:mysql://note01:3306/test", "root", "123456")
        conn.setAutoCommit(false)
        queue.offer(conn)
      }
    }
    queue.poll()
  }

  // Give a connection back to the pool so it can be reused
  def returnConnection(conn: Connection): Unit = {
    queue.offer(conn)
  }
}
ConcurrentLinkedQueue is an unbounded, non-blocking queue provided by Java. It performs atomic operations on queue elements via CAS, so it is safe for multi-threaded access while still offering good concurrent performance.
The offer method appends a connection to the tail of the pool's queue.
The poll method removes and returns a connection from the head of the pool's queue.
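A minimal standalone sketch of the offer/poll behaviour the pool relies on (no database involved; the strings merely stand in for connections):

import java.util.concurrent.ConcurrentLinkedQueue

object QueueDemo {
  def main(args: Array[String]): Unit = {
    val queue = new ConcurrentLinkedQueue[String]()
    queue.offer("conn-1")        // append to the tail
    queue.offer("conn-2")
    println(queue.poll())        // removes and returns the head: conn-1
    println(queue.isEmpty)       // false, conn-2 is still pooled
  }
}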
package stream

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamTest2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_2")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(2))
    val lines = ssc.socketTextStream("note01", 9999)
    lines.map(_.split(",")).filter(_.length == 4)
      .foreachRDD(rdd => {
        // One connection and one prepared statement per partition,
        // created on the executor rather than serialized from the driver
        rdd.foreachPartition(p => {
          val conn = ConnectionPool.getConnection()
          val statement = conn.prepareStatement("insert into test(name, age) values (?, ?)")
          p.foreach(result => {
            statement.setString(1, result(0))
            statement.setLong(2, result(1).toLong)
            statement.addBatch()
          })
          statement.executeBatch()
          conn.commit()
          ConnectionPool.returnConnection(conn)
        })
      })
    ssc.start()
    ssc.awaitTermination()
  }
}
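The job above assumes a test table already exists in the test database on note01. The original does not give its schema, so the column types in the following one-off setup are only an assumption (name as a varchar, age as a bigint to match setLong):

import java.sql.DriverManager

object CreateTestTable {
  def main(args: Array[String]): Unit = {
    // Hypothetical schema; adjust the column types to whatever your real table uses
    val conn = DriverManager.getConnection("jdbc:mysql://note01:3306/test", "root", "123456")
    val stmt = conn.createStatement()
    stmt.execute("create table if not exists test(name varchar(50), age bigint)")
    stmt.close()
    conn.close()
  }
}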
join operation
package stream

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamTest3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_2")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(2))
    val lines1 = ssc.socketTextStream("note01", 9999)
    val lines2 = ssc.socketTextStream("note02", 9998)
    // Turn each stream into (key, value) pairs so the two streams can be joined by key
    val r1 = lines1.map(_.split(",")).filter(_.length > 3).map(arr => (arr(0), arr(1)))
    val r2 = lines2.map(_.split(",")).filter(_.length > 3).map(arr => (arr(0), arr(1)))
    // print() is required: a StreamingContext with no output operation will not start
    r1.join(r2).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
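Like the RDD join, this joins the two streams by key, but only within the same batch: in each two-second interval the RDD produced from lines1 is joined with the RDD produced from lines2 in that interval, and records that arrive in different batches are never paired with each other.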
Categories of DStream transformations
Stateless transformations
Each batch of data is transformed independently: processing the current batch neither depends on earlier batches nor affects later ones. In practice, however, a stateless transformation can still produce a global aggregate over a continuous stream if an external store is used to hold the intermediate state (see the sketch after these steps):
(1) Store the result of the previous batch in Redis.
(2) When processing the current batch, first read the previous result from Redis, then add the current batch's result to it.
(3) Write the accumulated result back to Redis, overwriting the previous batch's value.
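A minimal sketch of this Redis-backed pattern, assuming a Redis instance on note01:6379, the Jedis client on the classpath, and a "wc:" key prefix (all of these are assumptions for the example, not details from the original):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import redis.clients.jedis.Jedis

object StatelessWithRedis {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("StatelessWithRedis")
    val ssc = new StreamingContext(new SparkContext(conf), Seconds(5))
    val words = ssc.socketTextStream("note01", 9999).flatMap(_.split(" "))
    // Per-batch (stateless) count ...
    words.map((_, 1)).reduceByKey(_ + _).foreachRDD(rdd => {
      rdd.foreachPartition(iter => {
        val jedis = new Jedis("note01", 6379) // one client per partition, created on the executor
        iter.foreach { case (word, count) =>
          // ... accumulated into a global count kept in Redis
          val previous = Option(jedis.get("wc:" + word)).map(_.toLong).getOrElse(0L)
          jedis.set("wc:" + word, (previous + count).toString)
        }
        jedis.close()
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

Creating one Jedis client per partition mirrors the MySQL example above: the client is constructed on the executor instead of being serialized from the driver.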
Stateful transformations
Each batch can use the result of the previous batch as its state. That state is carried forward by the stateful operation and combined with the next batch's data according to the logic the developer defines, for example updateStateByKey and reduceByKeyAndWindow.
package stream

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Test03 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Test03")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    // A checkpoint directory is mandatory for stateful transformations
    ssc.checkpoint("/test")
    val lines = ssc.socketTextStream("note01", 9999)

    // Merge the current batch's counts (Seq[Int]) with the previous batch's state (Option[Int])
    def updateStateFunc(iter: Iterator[(String, Seq[Int], Option[Int])]): Iterator[(String, Int)] = {
      iter.map {
        case (word, curWordCount, preWordCount) =>
          (word, curWordCount.sum + preWordCount.getOrElse(0))
      }
    }

    val func = updateStateFunc _
    lines.flatMap(_.split(" "))
      .map(x => (x, 1))
      .updateStateByKey[Int](func, new HashPartitioner(sc.defaultParallelism), true)
      .print()
    ssc.start()
    ssc.awaitTermination()
  }
}
A checkpoint directory (ssc.checkpoint) must be set for any transformation that involves state.
updateStateFunc merges the previous batch's result with the current batch's data. In each tuple of the Iterator, the String is the current word; the Seq[Int] holds the values produced for that word in the current batch (if the map step emitted (a,1), (a,1), the Seq is Seq(1, 1)); and the Option[Int] is the count of that word carried over from the previous batch's state (None if the word has not been seen before).
val func = updateStateFunc _ converts the method into a function value and assigns it to func.
In Scala there is a difference between methods and functions: something defined with def is a method, not a value, so it cannot be passed directly as an argument to another method; it must first be converted to a function, as in the snippet below.
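A minimal illustration of the difference, independent of Spark (the names are made up for the example):

object MethodVsFunction {
  // A method: defined with def, not itself a value
  def double(x: Int): Int = x * 2

  def main(args: Array[String]): Unit = {
    // Eta-expansion: the trailing underscore turns the method into a Function1 value
    val doubleFn = double _
    // The function value can now be passed to higher-order methods
    println(List(1, 2, 3).map(doubleFn)) // List(2, 4, 6)
  }
}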