Real-time HDFS Data
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object TestStream {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestStream")
    val sc = new SparkContext(conf)
    // One micro-batch every 5 seconds
    val ssc = new StreamingContext(sc, Seconds(5))
    // Watch an HDFS location for newly appearing text files
    val lines = ssc.textFileStream("hdfs://tmp/a.txt")
    lines.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
After the program starts, files that are merely moved into the HDFS directory are not detected by the streaming computation, but new files that are continuously and dynamically written into the directory are detected.
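One way to produce files the job actually picks up is to create them directly in the monitored directory so they carry a fresh modification timestamp. Below is a minimal sketch using the Hadoop FileSystem API; the namenode URI and the directory path are placeholders, not values from this article.

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object WriteTestFile {
  def main(args: Array[String]): Unit = {
    // Placeholder namenode address; replace with the real cluster URI
    val fs = FileSystem.get(new java.net.URI("hdfs://node01:8020"), new Configuration())
    // A new file name per run so textFileStream sees it as a brand-new file
    val out = fs.create(new Path("/tmp/stream/" + System.currentTimeMillis() + ".txt"))
    out.writeBytes("hello spark streaming\n")
    out.close()
    fs.close()
  }
}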
Reading an RDD Data Queue
package stream
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object TestStream2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("Chapter8_4_2")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(1))
    // Thread-safe queue that the streaming job will poll for new RDDs
    val rddQueue = new mutable.SynchronizedQueue[RDD[Int]]
    // Background thread that keeps feeding RDDs into the queue
    val addQueueThread = new Thread(new Runnable {
      override def run(): Unit = {
        for (i <- 5 to 10) {
          rddQueue += sc.parallelize(1 to i, 2)
          Thread.sleep(2000)
        }
      }
    })
    val inputDStream = ssc.queueStream(rddQueue)
    inputDStream.print()
    ssc.start()
    addQueueThread.start()
    ssc.awaitTermination()
  }
}
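By default, queueStream dequeues one RDD per batch interval. A minimal variation, reusing ssc and rddQueue from the example above: passing oneAtATime = false makes each batch consume every RDD currently in the queue, and the resulting DStream can be transformed like any other.

// Consume all queued RDDs in each batch and sum the values of that batch
val summed = ssc.queueStream(rddQueue, oneAtATime = false).reduce(_ + _)
summed.print()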
Reading Flume Data

Add the dependency:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-flume_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
package stream
import java.net.InetSocketAddress
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TestFlume {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("TestFlume")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(1))
    // Address of the Flume agent's sink that Spark will poll
    val flumeAddr = Seq(new InetSocketAddress("note01", 9999))
    val flumeEvent: ReceiverInputDStream[SparkFlumeEvent] = FlumeUtils.createPollingStream(ssc, flumeAddr, StorageLevel.MEMORY_AND_DISK_SER_2)
    // Decode the body of each Flume event into a string
    val flumeDstream = flumeEvent.map(s => new String(s.event.getBody.array()))
    // Count occurrences of the first comma-separated field in each batch
    flumeDstream.map(u => (u.split(",")(0), 1)).reduceByKey(_ + _).print()
    ssc.start()
    ssc.awaitTermination()
  }
}
There are two ways to read Flume data: poll and push. With poll, Spark actively pulls the data from Flume; with push, Flume pushes the data to Spark. Push mode allows only one Flume address.
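For the push approach, the counterpart API is FlumeUtils.createStream, which starts a receiver on a single host and port that the Flume agent sends events to. A minimal sketch, assuming the receiver runs on the host and port below (both are placeholders):

// Push mode: the Flume agent must be configured to send events to this host and port
val pushEvents = FlumeUtils.createStream(ssc, "note01", 9999, StorageLevel.MEMORY_AND_DISK_SER_2)
pushEvents.map(e => new String(e.event.getBody.array())).print()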