pom:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.tzb.bigdata</groupId>
<artifactId>spark-test</artifactId>
<!--<packaging>pom</packaging>-->
<version>1.0</version>
<!--<modules>-->
<!--<module>hbase</module>-->
<!--</modules>-->
<properties>
<!-- Scala 2.11 to match the _2.11 Spark artifacts below (was 2.10.6, which conflicts with them) -->
<scala.version>2.11.8</scala.version>
<hadoop.version>2.6.0</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.apache.spark</groupId>-->
<!--<artifactId>spark-sql_2.10</artifactId>-->
<!--<version>1.6.0</version>-->
<!--</dependency>-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>com.typesafe.play</groupId>
<artifactId>play-mailer_2.11</artifactId>
<version>7.0.0</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.41</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<!--=========================spark-streaming-kafka===========================-->
<!-- 0.8 version -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<!-- 0.10, the newer version -->
<!--<dependency>-->
<!--<groupId>org.apache.spark</groupId>-->
<!--<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>-->
<!--<version>2.3.0</version>-->
<!--<exclusions>-->
<!--<exclusion>-->
<!--<artifactId>scala-library</artifactId>-->
<!--<groupId>org.scala-lang</groupId>-->
<!--</exclusion>-->
<!--</exclusions>-->
<!--</dependency>-->
<!--======================================================================-->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.11.0.2</version>
</dependency>
<!--<dependency>-->
<!--<groupId>org.scala-lang</groupId>-->
<!--<artifactId>scala-library</artifactId>-->
<!--<version>2.10.6</version>-->
<!--</dependency>-->
<!--<dependency>-->
<!--<groupId>org.apache.hadoop</groupId>-->
<!--<artifactId>hadoop-common</artifactId>-->
<!--</dependency>-->
<!-- Uncomment only when testing HBase; otherwise running locally from IDEA against the test environment will report errors -->
<!--<dependency>-->
<!--<groupId>org.apache.hbase</groupId>-->
<!--<artifactId>hbase-client</artifactId>-->
<!--<version>2.0.1</version>-->
<!--<exclusions>-->
<!--<exclusion>-->
<!--<groupId>com.fasterxml.jackson.core</groupId>-->
<!--<artifactId>jackson-databind</artifactId>-->
<!--</exclusion>-->
<!--</exclusions>-->
<!--</dependency>-->
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
<dependency>
<groupId>org.neo4j.driver</groupId>
<artifactId>neo4j-java-driver</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<!-- scope removed so the default compile scope applies: effective at compile, test and run time -->
<!--<scope>test</scope>-->
</dependency>
<dependency>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
<version>2.3</version>
</dependency>
<!-- email sending -->
<!--<dependency>-->
<!--<groupId>com.typesafe.play</groupId>-->
<!--<artifactId>play-mailer_2.11</artifactId>-->
<!--<version>7.0.0</version>-->
<!--</dependency>-->
<!--<dependency>-->
<!--<groupId>org.apache.poi</groupId>-->
<!--<artifactId>poi</artifactId>-->
<!--<version>3.12</version>-->
<!--</dependency>-->
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.10.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-catalyst -->
<!--<dependency>-->
<!--<groupId>org.apache.spark</groupId>-->
<!--<artifactId>spark-catalyst_2.11</artifactId>-->
<!--<version>2.3.0</version>-->
<!--<scope>test</scope>-->
<!--</dependency>-->
<!-- Chinese word segmentation -->
<dependency>
<groupId>com.huaban</groupId>
<artifactId>jieba-analysis</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.68</version>
</dependency>
<!-- Elasticsearch -->
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-spark-20_2.11</artifactId>
<version>6.2.4</version>
</dependency>
<!-- Apache POI (Excel) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.12</version>
</dependency>
</dependencies>
<build>
<finalName>spark-test</finalName>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<!--<version>3.0.0</version>-->
<configuration>
<archive>
<manifest>
<mainClass>WordCount</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
SparkStreaming04_KafkaSource:
package com.tzb.sparkstreaming
import java.net.URLEncoder
import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser
import org.apache.commons.httpclient.HttpClient
import org.apache.commons.httpclient.methods.GetMethod
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * SparkStreaming with Kafka 0.8
 * Note: this program connects SparkStreaming to Kafka, consumes data from Kafka, and passes the values as parameters to an HTTP interface.
 * SparkStreaming consumes from Kafka with stateless counting (word counts from different batches are not merged).
 * 1) Declare a receiver (not needed here because one is already defined in SparkStreaming03_MyReceiver)
 * 2) Override the onStart and onStop methods
 *
 * Tested successfully both locally in IDEA and on the 210 test machine:
 * open Kafka Tool and push data to a topic,
 * run the main method to start consuming,
 * and the extracted values are passed as parameters to the HTTP interface.
 * Packaged test (successful):
 * spark-submit --master yarn-client --conf spark.driver.memory=2g --class com.tzb.sparkstreaming.SparkStreaming04_KafkaSource --executor-memory 8G --num-executors 5 --executor-cores 2 /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/spark-test-jar-with-dependencies.jar >> /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/sparkstreaming_datachange.log
 * To run in production, change the IPs/domains of Kafka, ZooKeeper and the other components in the code to the production ones, change spark-submit to spark-submit2 when submitting, and append & to the command so the program runs in the background and the current window can be closed.
 *
 * How to stop the job:
 * ps -ef | grep SparkStreaming04_KafkaSource, then kill the returned process ID.
 *
 */
object SparkStreaming04_KafkaSource {
def main(args: Array[String]): Unit = {
//Word count with SparkStreaming
//Spark configuration object
//Note: a master set in code takes precedence over spark-submit's --master, so remove/adjust setMaster when submitting to YARN
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkStreaming04_KafkaSource")
//Streaming context (real-time analysis environment object)
//Batch interval: data is collected once per the specified period
val streamingContext = new StreamingContext(sparkConf,Seconds(5))
// Consume data from Kafka (note: this requires the Kafka 0.8 integration package)
//家里机器
// val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
// streamingContext,
// "sparkproject1:2181",
// "testgroup", //group
// Map("testsparkstreaming" -> 3) //topics
// )
// 210 test machine
val kafkaDStream: ReceiverInputDStream[(String,String)] = KafkaUtils.createStream(
streamingContext,
"**.**.**.10:2181",
"testgroup", //group
Map("test" -> 3) //topics
)
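// A hedged sketch: if the commented-out spark-streaming-kafka-0-10 dependency in the pom is used
// instead of the 0-8 one, the receiver-based createStream above is replaced by the direct-stream API.
// The broker address below is an assumption; the 0.10 integration talks to Kafka brokers, not ZooKeeper.
// import org.apache.kafka.common.serialization.StringDeserializer
// import org.apache.spark.streaming.kafka010.KafkaUtils
// import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
// import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
// val kafkaParams = Map[String, Object](
//   "bootstrap.servers" -> "**.**.**.10:9092",
//   "key.deserializer" -> classOf[StringDeserializer],
//   "value.deserializer" -> classOf[StringDeserializer],
//   "group.id" -> "testgroup",
//   "auto.offset.reset" -> "latest",
//   "enable.auto.commit" -> (false: java.lang.Boolean)
// )
// val directDStream = KafkaUtils.createDirectStream[String, String](
//   streamingContext, PreferConsistent, Subscribe[String, String](Array("test"), kafkaParams))
// directDStream.map(record => (record.key, record.value)) // each element is a ConsumerRecord[String, String]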
//Home machine:
//bin/kafka-topics.sh --zookeeper sparkproject1:2181 --list //list Kafka topics
//Create a topic: bin/kafka-topics.sh --zookeeper sparkproject1:2181 --create --topic testsparkstreaming --partitions 3 --replication-factor 2 //3*2 = 6 replicas in total; define partitions first, then replicas
//Produce test data: bin/kafka-console-producer.sh --broker-list sparkproject1:9092 --topic testsparkstreaming
// 210 test machine
//Flatten the collected data; note that a Kafka message is really a (key, value) pair
val wordDStream : DStream[String] = kafkaDStream.flatMap(t => t._2.split(" "))
//Map to (word, 1) pairs so the data is easier to aggregate
val mapDStream : DStream[(String,Int)] = wordDStream.map((_,1))
//Aggregate the restructured data by key
val wordToSumDStream : DStream[(String,Int)] = mapDStream.reduceByKey(_+_)
//Print the result
wordToSumDStream.print()
//Note: repartition returns a new DStream, so calling it without using the result has no effect
//e.g. val repartitioned = wordToSumDStream.repartition(1)
//Save the DStream to files
// wordToSumDStream.saveAsTextFiles("file:///D:\\workspace\\spark-test\\output\\sparkstreamingResult1") //Note: if the sparkstreamingResult1 folder was not created manually, the results are written to the output directory
// wordToSumDStream.saveAsTextFiles("file:///D:/workspace/spark-test/output/sparkstreamingResult/sparkstreaming.txt")
// wordToSumDStream.saveAsTextFiles("file:///output/sparkstreamingResult/sparkstreaming.txt") // points to the root of drive D
wordToSumDStream.foreachRDD(
rdd => {
val arr : Array[(String, Int)] = rdd.collect()
if(arr!=null && arr.length>0){
println("key:"+ arr(0)._1+" value:" +arr(0)._2)
//Call the HTTP interface
val result = requestHTTP(arr(0)._1)
println("=======>HTTP接口调用结果:" + result)
}
}
)
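// A hedged alternative sketch: rdd.collect() above pulls every record back to the driver even though
// only the first element is used. rdd.take(1) fetches just a single record, and foreachPartition would
// keep the HTTP calls on the executors if every record needed to be sent.
// wordToSumDStream.foreachRDD(
//   rdd => {
//     rdd.take(1).foreach { case (word, count) =>
//       println("key:" + word + " value:" + count)
//       println("=======>HTTP interface call result:" + requestHTTP(word))
//     }
//   }
// )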
//The collector must not be stopped here
//streamingContext.stop
//Start the receiver
streamingContext.start()
//The driver waits for the receiver to finish
streamingContext.awaitTermination()
//
}
/**
* Call the HTTP interface
* @param jobName
* @return
*/
def requestHTTP(jobName: String) = {
var data =""
var jobName1="bbb"
// This effectively gives you a browser; HTTP: (Get | Post | Put | Delete)
val httpClient = new HttpClient()
// Assemble the parameters
val params = Map[String, String](
"jobName" -> URLEncoder.encode(jobName, "UTF-8"),
"jobName1" -> URLEncoder.encode(jobName1, "UTF-8")
).map(kv => kv._1 + "=" + kv._2).mkString("&")
val getMethod = new GetMethod("http://10.21.4.197:7772/src/main/test/sparkHTTP?" + params) //For the interface implementation, see the "HTTP interface" example later in this document
getMethod.addRequestHeader("Content-Type", "application/json;charset=UTF-8")
// Send the GET request
val status = httpClient.executeMethod(getMethod)
if (status == 200) {
val responseBodyAsString = getMethod.getResponseBodyAsString
val jsonParser = new JSONParser()
val jsonObj: JSONObject = jsonParser.parse(responseBodyAsString).asInstanceOf[JSONObject]
data = jsonObj.get("data").toString
}
// Release the connection (do this whether or not the call returned 200)
getMethod.releaseConnection()
data
}
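// Usage note (the job name below is a hypothetical value): requestHTTP("my_job") issues
// GET http://10.21.4.197:7772/src/main/test/sparkHTTP?jobName=my_job&jobName1=bbb
// and returns the "data" field of the JSON response, or an empty string on a non-200 status.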
}
//Not needed here because it is already defined in SparkStreaming03_MyReceiver
//Declaring a receiver:
//1) Extend Receiver
//class MyReceiver(host:String,port:Int) extends Receiver[String](StorageLevel.MEMORY_ONLY){
// var socket: java.net.Socket = null
//
// def receive(): Unit = {
// socket = new java.net.Socket(host,port)
// val reader = new BufferedReader(new InputStreamReader(socket.getInputStream,"UTF-8"))
// // Note: unlike Java, a Scala assignment returns Unit, so while((line = reader.readLine()) != null) would not work; read before and at the end of the loop instead
// var line : String = reader.readLine()
// while(line != null){
// //Store the collected data inside the receiver for conversion
// if("END".equals(line)){ //agreed end marker
// return
// }else{ //normal data
// this.store(line)
// }
// line = reader.readLine()
// }
// }
//
// override def onStart(): Unit ={
// //Start a thread
// new Thread(
// new Runnable {
// override def run(): Unit = {
// receive()
// }
// }
//
// ).start()
// }
//
// override def onStop(): Unit = {
// if(socket != null){
// socket.close()
// socket = null
// }
// }
//}
HTTP interface:
package com.huayong.bi.web.controller;
import com.alibaba.fastjson.JSONObject;
import org.springframework.web.bind.annotation.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
/**
* Test
*/
@RestController
@RequestMapping("/src/main/test")
public class TestController {
/**
* Called by Spark over HTTP
* @param request
* @param response
*/
@CrossOrigin
@RequestMapping(value = "/sparkHTTP", method={RequestMethod.GET})
public String sparkHTTP(HttpServletRequest request, HttpServletResponse response) {
JSONObject jo = new JSONObject(); // initialized up front so an exception cannot leave it null at the return statement
try {
String jobName = request.getParameter("jobName");
String jobName1 = request.getParameter("jobName1");
System.out.println(jobName + "===" + jobName1);
jo = new JSONObject();
jo.put("code", 200);
jo.put("msg", "");
jo.put("data", "成功");
} catch (Exception e) {
e.printStackTrace();
}
return jo.toString();
}
}
If the SparkStreaming job only needs to run during a certain time window, run a script on a schedule to start (and later stop) it. The script below submits the job when it is not running and kills it when it is, so schedule it at both the start and the end of the window. Example:
0 08 * * * sh /hadoop/SparkStreaming04_KafkaSource_task.sh
#!/bin/bash
pid=`jps -lm | grep SparkStreaming04_KafkaSource | awk -F " " '{print $1}'`
if test -n "$pid"; then
jps -lm | grep SparkStreaming04_KafkaSource | awk '{print "kill -9 " $1}' | sh
echo "killed pvuv streaming:SparkStreaming04_KafkaSource pid : " $pid
else
echo "~~~~~~~ spark submit ~~~~~~~"
nohup spark-submit --master yarn-client --conf spark.driver.memory=2g --class com.tzb.sparkstreaming.SparkStreaming04_KafkaSource --executor-memory 8G --num-executors 5 --executor-cores 2 /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/spark-test-jar-with-dependencies.jar >> /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/sparkstreaming_datachange.log 2>&1 &
fi
Execution walkthrough for reference:
[root@pf-bigdata1 bi]# jps -lm | grep SparkStreaming04_KafkaSource
25445 org.apache.spark.deploy.SparkSubmit --master yarn-client --conf spark.driver.memory=3g --class bi.streaming.SparkStreaming04_KafkaSource --executor-memory 3G --num-executors 5 --executor-cores 2 /var/lib/hadoop-hdfs/spride_sqoop_beijing/bi_table/test/spark-test-jar-with-dependencies.jar
[root@pf-bigdata1 bi]# jps -lm | grep SparkStreaming04_KafkaSource | awk -F " " '{print $1}'
25445
[root@pf-bigdata1 bi]#
Other submit parameters for reference:
nohup spark2-submit \
--class bi.streaming.SparkStreaming04_KafkaSource \
--master yarn-client \
--executor-memory 3G \
--driver-memory 3g \
--num-executors 5 \
--executor-cores 2 \
/hadoop/p-i-1.0.2-SNAPSHOT.jar > /hadoop/spark_out_2019.`date +\%Y\%m\%d\%H\%M\%S`.log 2>&1 &