Reading HDFS data in real time

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object TestStream {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestStream")
    val sc = new SparkContext(conf)
    // One micro-batch every 5 seconds
    val ssc = new StreamingContext(sc, Seconds(5))
    // textFileStream monitors a directory (not a single file); replace with your own HDFS path
    val lines = ssc.textFileStream("hdfs:///tmp/")
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

After the stream has started, a file that is merely moved into the monitored HDFS directory is generally not detected, because textFileStream selects files by modification time and an HDFS move keeps the original timestamp; new files that keep arriving in the directory while the stream is running are detected.
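If files that are already present (or moved in) also need to be processed, the lower-level fileStream API exposes a newFilesOnly flag; textFileStream is essentially fileStream[LongWritable, Text, TextInputFormat] followed by extracting the line text. The snippet below is only an illustrative sketch that reuses the ssc from the example above; the directory and the filter are placeholders.

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat

// Sketch: same ssc as above; directory is a placeholder.
val allFiles = ssc.fileStream[LongWritable, Text, TextInputFormat](
  "hdfs:///tmp/",                                  // directory to monitor
  (path: Path) => !path.getName.startsWith("."),   // skip hidden/in-progress files
  newFilesOnly = false)                            // may also pick up files already present (within Spark's remember window)
  .map(_._2.toString)                              // keep only the line text
allFiles.print()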

Reading an RDD queue

package stream

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object TestStream2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestStream2")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(1))

    // Thread-safe queue that the streaming context polls for new RDDs
    val rddQueue = new mutable.SynchronizedQueue[RDD[Int]]
    // Background thread that pushes a new RDD into the queue every 2 seconds
    val addQueueThread = new Thread(new Runnable {
      override def run(): Unit = {
        for (i <- 5 to 10) {
          rddQueue += sc.parallelize(1 to i, 2)
          Thread.sleep(2000)
        }
      }
    })

    val inputDStream = ssc.queueStream(rddQueue)
    inputDStream.print()

    ssc.start()
    addQueueThread.start()
    ssc.awaitTermination()
  }
}
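The queue stream behaves like any other DStream, so the usual transformations apply. Below is a minimal continuation sketch (reusing the ssc and rddQueue values defined above): queueStream also accepts an oneAtATime flag, and passing false lets a single batch drain every RDD currently in the queue instead of exactly one.

// Continuation sketch: assumes ssc and rddQueue from the example above.
// oneAtATime = false: each batch consumes all queued RDDs, not just the first.
val drainedStream = ssc.queueStream(rddQueue, oneAtATime = false)

// Sum every Int that arrived within the batch interval.
drainedStream.reduce(_ + _).print()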

Reading Flume data

Add the dependency:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-flume_2.11</artifactId>
    <version>2.3.1</version>
</dependency>

package stream

import java.net.InetSocketAddress

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TestFlume {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestFlume")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(1))
    // Address(es) of the Flume agent(s) that expose a Spark sink
    val flumeAddr = Seq(new InetSocketAddress("note01", 9999))
    // Pull-based (poll) stream: Spark actively fetches events from the Flume sink
    val flumeEvent: ReceiverInputDStream[SparkFlumeEvent] =
      FlumeUtils.createPollingStream(ssc, flumeAddr, StorageLevel.MEMORY_AND_DISK_SER_2)
    // Decode each event body into a String
    val flumeDstream = flumeEvent.map(s => new String(s.event.getBody.array()))
    // Count records keyed on the first comma-separated field
    flumeDstream.map(u => (u.split(",")(0), 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
  }
}

There are two ways to read data from Flume: poll and push. With poll, Spark actively pulls the data from the Flume agent; with push, the Flume agent pushes the data to Spark, and in push mode only a single Flume address can be used.
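For reference, a minimal sketch of the push variant, reusing the ssc from the example above (the hostname and port are the same placeholders as before): Spark starts an Avro receiver on that address and the Flume agent's avro sink pushes events to it, which is why only one address can be given.

// Push (Flume -> Spark) sketch: assumes the same ssc and imports as above.
// Spark listens on note01:9999; the Flume avro sink must point at this address.
val pushedEvents = FlumeUtils.createStream(ssc, "note01", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)
pushedEvents.map(e => new String(e.event.getBody.array())).print()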