文章目录


Spark Streaming的核心DStream案例_spark

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.SparkStream</groupId>
<artifactId>SparkStreamspace</artifactId>
<version>1.0-SNAPSHOT</version>

<build>
  <!-- Both main and test sources are Scala, so the default Java source roots are overridden. -->
  <sourceDirectory>src/main/scala</sourceDirectory>
  <testSourceDirectory>src/test/scala</testSourceDirectory>
  <plugins>
    <!-- Compiles the Scala sources (main and test). -->
    <plugin>
      <groupId>net.alchim31.maven</groupId>
      <artifactId>scala-maven-plugin</artifactId>
      <version>3.2.2</version>
      <executions>
        <execution>
          <goals>
            <goal>compile</goal>
            <goal>testCompile</goal>
          </goals>
          <configuration>
            <args>
              <arg>-dependencyfile</arg>
              <arg>${project.build.directory}/.scala_dependencies</arg>
            </args>
          </configuration>
        </execution>
      </executions>
    </plugin>
    <!-- Builds a single shaded jar during the package phase. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-shade-plugin</artifactId>
      <version>2.4.3</version>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>shade</goal>
          </goals>
          <configuration>
            <filters>
              <!-- Drop signature files from every bundled artifact; stale signatures
                   would no longer match the contents of the merged jar. -->
              <filter>
                <artifact>*:*</artifact>
                <excludes>
                  <exclude>META-INF/*.SF</exclude>
                  <exclude>META-INF/*.DSA</exclude>
                  <exclude>META-INF/*.RSA</exclude>
                </excludes>
              </filter>
            </filters>
            <transformers>
              <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                <!-- NOTE(review): mainClass is left empty, so presumably no default entry
                     point is written to the manifest; set it to one of the demo objects
                     if the jar should be launched without naming a class. -->
                <mainClass></mainClass>
              </transformer>
            </transformers>
          </configuration>
        </execution>
      </executions>
    </plugin>
    <!-- Java language level. Spark 2.3.x requires Java 8 or later, and recent JDKs
         no longer accept source/target 6, so 1.8 is used here. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-compiler-plugin</artifactId>
      <configuration>
        <source>1.8</source>
        <target>1.8</target>
      </configuration>
    </plugin>
  </plugins>
</build>


<!-- Central version numbers; every Scala/Hadoop/Spark dependency below refers to these. -->
<properties>
  <scala.version>2.11.8</scala.version>
  <hadoop.version>2.7.4</hadoop.version>
  <spark.version>2.3.2</spark.version>
</properties>
<dependencies>
  <!-- Scala standard library (declared once; the duplicate entry was removed) -->
  <dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
  </dependency>
  <!-- Spark core (declared once; a second declaration pinned to 2.0.2 was removed
       because it conflicted with spark.version) -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>
  <!-- Hadoop client -->
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
  </dependency>
  <!-- Spark SQL -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>

  <!-- MySQL JDBC driver -->
  <dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.46</version>
  </dependency>
  <!-- Kafka client libraries -->
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.0.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-streams</artifactId>
    <version>2.0.0</version>
  </dependency>

  <!-- Spark Streaming: must be the same release as spark-core, so it follows
       spark.version instead of the previously hard-coded 2.0.2 -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>

  <!-- Spark Streaming connector for Kafka (0.8 API) -->
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>${spark.version}</version>
  </dependency>

</dependencies>

</project>

1、transform()方法

TransformTest.scala

import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demo of `DStream.transform`: reads text lines from a socket and, batch by
 * batch, splits every line into space-separated words.
 */
object TransformTest {
  def main(args: Array[String]): Unit = {
    // Local run configuration.
    val conf: SparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("TransformTest ")

    // The SparkContext is built explicitly so that the log level can be
    // lowered before the streaming context is created on top of it.
    val sparkContext: SparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("WARN")

    // Streaming context with a 5-second batch interval.
    val streamingContext: StreamingContext =
      new StreamingContext(sparkContext, Seconds(5))

    // Text lines received from the socket server (default storage level).
    val lines: ReceiverInputDStream[String] =
      streamingContext.socketTextStream("192.168.121.134", 9999)

    // RDD-to-RDD function applied to each batch: split every line on a single space.
    val words: DStream[String] = lines.transform { batch =>
      batch.flatMap(line => line.split(" "))
    }

    // Output the words of each batch.
    words.print()

    // Start the computation and block until it is stopped.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

执行命令nc -lk 9999启动服务端监听Socket服务,并输入内容。

Spark Streaming的核心DStream案例_big data_02

Spark Streaming的核心DStream案例_spark_03

2、updateStateByKey()方法

UpdateStateByKeyTest.scala

import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demo of `updateStateByKey`: keeps a running word count over all batches of
 * text lines read from a socket.
 */
object UpdateStateByKeyTest {
  /**
   * Folds the counts of one key from the current batch into its running total.
   *
   * @param newValues    all values collected for the key in the current batch
   * @param runningCount total accumulated for the key in earlier batches, if any
   * @return the updated total; always defined
   */
  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }

  def main(args: Array[String]): Unit = {
    // 1. Local run configuration.
    val sparkConf: SparkConf = new SparkConf()
      .setAppName("UpdateStateByKeyTest ").setMaster("local[2]")
    // 2. SparkContext, the entry point for all computation.
    val sc: SparkContext = new SparkContext(sparkConf)
    // 3. Reduce log noise.
    sc.setLogLevel("WARN")
    // 4. StreamingContext built from the SparkContext with a 5-second batch interval.
    val ssc: StreamingContext = new StreamingContext(sc, Seconds(5))
    // 5. updateStateByKey requires a checkpoint directory; the working directory is used.
    ssc.checkpoint("./")
    // 6. Text lines received from the socket server (default storage level).
    val dstream: ReceiverInputDStream[String] = ssc
      .socketTextStream("192.168.121.134", 9999)
    // 7. Split every line on a single space and pair each word with a count of 1.
    val wordAndOne: DStream[(String, Int)] = dstream.flatMap(_.split(" "))
      .map(word => (word, 1))
    // 8. Accumulate the global count of every word across batches.
    //    The reference is never reassigned, hence a val rather than a var.
    val result: DStream[(String, Int)] = wordAndOne
      .updateStateByKey(updateFunction)
    // 9. Output the running counts of each batch.
    result.print()
    // 10. Start the streaming computation.
    ssc.start()
    // 11. Block until the computation is stopped.
    ssc.awaitTermination()
  }
}

执行命令nc -lk 9999启动服务端监听Socket服务,并输入内容。

Spark Streaming的核心DStream案例_big data_04

Spark Streaming的核心DStream案例_spark_05

3、window()方法

WindowTest.scala

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

/**
 * Demo of `DStream.window`: reads text lines from a socket, splits them into
 * words and prints the words contained in a sliding window.
 */
object WindowTest {
  def main(args: Array[String]): Unit = {
    // Local run configuration.
    val conf: SparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("WindowTest ")

    // Explicit SparkContext so the log level is set before streaming starts.
    val sparkContext: SparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("WARN")

    // Streaming context with a 1-second batch interval.
    val streamingContext: StreamingContext =
      new StreamingContext(sparkContext, Seconds(1))

    // Text lines received from the socket server (default storage level).
    val lines: ReceiverInputDStream[String] =
      streamingContext.socketTextStream("192.168.121.134", 9999)

    // Split every line on a single space.
    val words: DStream[String] = lines.flatMap(line => line.split(" "))

    // Window of 3 seconds that slides forward every second.
    val windowLength = Seconds(3)
    val slideInterval = Seconds(1)
    val windowWords: DStream[String] = words.window(windowLength, slideInterval)

    // Output the words of each window.
    windowWords.print()

    // Start the computation and block until it is stopped.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

执行命令nc -lk 9999启动服务端监听Socket服务,并输入内容。

Spark Streaming的核心DStream案例_spark_06

4、reduceByKeyAndWindow()方法

ReduceByKeyAndWindowTest.scala

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

/**
 * Demo of `reduceByKeyAndWindow`: counts the words received from a socket
 * over a sliding window.
 */
object ReduceByKeyAndWindowTest {
  def main(args: Array[String]): Unit = {
    // Local run configuration.
    val conf: SparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("ReduceByKeyAndWindowTest ")

    // Explicit SparkContext so the log level is set before streaming starts.
    val sparkContext: SparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("WARN")

    // Streaming context with a 1-second batch interval.
    val streamingContext: StreamingContext =
      new StreamingContext(sparkContext, Seconds(1))

    // Text lines received from the socket server (default storage level).
    val lines: ReceiverInputDStream[String] =
      streamingContext.socketTextStream("192.168.121.134", 9999)

    // Split every line on a single space and pair each word with a count of 1.
    val wordAndOne: DStream[(String, Int)] = lines
      .flatMap(line => line.split(" "))
      .map(token => (token, 1))

    // Sum the counts per word over a 3-second window that slides every second.
    // The parameter types of the reduce function are spelled out because the
    // method is overloaded.
    val windowLength = Seconds(3)
    val slideInterval = Seconds(1)
    val windowWords: DStream[(String, Int)] = wordAndOne
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, windowLength, slideInterval)

    // Output the per-window counts.
    windowWords.print()

    // Start the computation and block until it is stopped.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

执行命令nc -lk 9999启动服务端监听Socket服务,并输入内容。

Spark Streaming的核心DStream案例_scala_07

5、saveAsTextFiles()方法

SaveAsTextFilesTest.scala

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ReceiverInputDStream

/**
 * Demo of `saveAsTextFiles`: writes every batch of text lines received from a
 * socket to HDFS.
 */
object SaveAsTextFilesTest {
  def main(args: Array[String]): Unit = {
    // Sets the HADOOP_USER_NAME system property to "root" for the local test run.
    System.setProperty("HADOOP_USER_NAME", "root")

    // Local run configuration.
    val conf: SparkConf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("SaveAsTextFilesTest ")

    // Explicit SparkContext so the log level is set before streaming starts.
    val sparkContext: SparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("WARN")

    // Streaming context with a 5-second batch interval.
    val streamingContext: StreamingContext =
      new StreamingContext(sparkContext, Seconds(5))

    // Text lines received from the socket server (default storage level).
    val lines: ReceiverInputDStream[String] =
      streamingContext.socketTextStream("192.168.121.134", 9999)

    // Save the received lines to HDFS using the given name prefix and suffix.
    val outputPrefix = "hdfs://hadoop02:9000/data/root/saveAsTextFiles/satf"
    val outputSuffix = "txt"
    lines.saveAsTextFiles(outputPrefix, outputSuffix)

    // Start the computation and block until it is stopped.
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

开启ZooKeeper和Hadoop集群,查看哪个节点上的NameNode处于active状态(见下图),并把代码中HDFS路径里的主机名(hadoop02)改成该节点。

Spark Streaming的核心DStream案例_scala_08

执行命令nc -lk 9999启动服务端监听Socket服务,并输入内容。

输入的内容会保存到HDFS中,保存结果的名称以satf开头、以txt结尾。

Spark Streaming的核心DStream案例_apache_09