Maven dependencies
<properties>
    <spark.version>2.2.2</spark.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
</dependencies>
1. Dynamic blacklist filtering
Ad-click billing is an essential feature of any e-commerce platform. To guard against fraudulent ad clicks, click events must be filtered against a blacklist. The blacklist condition can be a user ID, an IP address, and so on. Using Spark Streaming's stream-processing model, the blacklist can be applied in real time: left-outer-join the incoming records against the blacklist data and discard every record that finds a match.
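Before the full streaming program, here is a minimal standalone sketch of just the join step (the object name and sample values are made up for the demo): after a left outer join, every record whose key appears in the blacklist carries a Some(...) on the right side, so keeping only the None rows drops the blacklisted traffic.
import org.apache.spark.{SparkConf, SparkContext}

object LeftOuterJoinDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LeftOuterJoinDemo").setMaster("local[*]"))
    val events = sc.parallelize(Seq(("1.1.1.1", "click-A"), ("2.2.2.2", "click-B")))
    val blacklist = sc.parallelize(Seq(("1.1.1.1", 1)))
    events.leftOuterJoin(blacklist)                  //(ip, (info, Option[Int]))
      .filter { case (_, (_, opt)) => opt.isEmpty }  //None means the ip is not blacklisted
      .map { case (ip, (info, _)) => (ip, info) }
      .foreach(println)                              //only (2.2.2.2,click-B) survives
    sc.stop()
  }
}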
Sample data
27.19.74.143##2016-05-30 17:38:20##GET /static/image/common/hot_2.gif HTTP/1.1##200##682
27.19.74.143##2016-05-30 17:38:20##GET /static/image/filetype/common.gif HTTP/1.1##200##90
110.52.250.126##2016-05-30 17:38:20##GET /source/plugin/wsh_wx/img/wsh_zk.css HTTP/1.1##200##1482
110.52.250.126##2016-05-30 17:38:20##GET /data/cache/style_1_forum_index.css?y7a HTTP/1.1##200##2331
110.52.250.126##2016-05-30 17:38:20##GET /source/plugin/wsh_wx/img/wx_jqr.gif HTTP/1.1##200##1770
27.19.74.143##2016-05-30 17:38:20##GET /static/image/common/recommend_1.gif HTTP/1.1##200##1030
110.52.250.126##2016-05-30 17:38:20##GET /static/image/common/logo.png HTTP/1.1##200##4542
27.19.74.143##2016-05-30 17:38:20##GET /data/attachment/common/c8/common_2_verify_icon.png HTTP/1.1##200##582
110.52.250.126##2016-05-30 17:38:20##GET /static/js/logging.js?y7a HTTP/1.1##200##603
8.35.201.144##2016-05-30 17:38:20##GET /uc_server/avatar.php?uid=29331&size=middle HTTP/1.1##301##-
27.19.74.143##2016-05-30 17:38:20##GET /data/cache/common_smilies_var.js?y7a HTTP/1.1##200##3184
27.19.74.143##2016-05-30 17:38:20##GET /static/image/common/pn.png HTTP/1.1##200##592
BlacklistFilter.scala
package programcase

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * @Author Daniel
 * @Description Dynamic blacklist filtering
 * Data format: 110.52.250.126##2016-05-30 17:38:20##GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1##200##1292
 * IPs to filter out: 27.19.74.143 110.52.250.126
 **/
object BlacklistFilter {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("BlacklistFilter")
      .setMaster("local[*]")
    val batchInterval = Seconds(2)
    val ssc = new StreamingContext(conf, batchInterval)
    //IPs to put on the blacklist; the value can be anything
    val blacklist: RDD[(String, Int)] = ssc.sparkContext.parallelize(List(
      ("27.19.74.143", 1),
      ("110.52.250.126", 1)
    ))
    val lines = ssc.socketTextStream("hadoop01", 9999)
    //split each line into (ip, rest of the record)
    val ip2Info: DStream[(String, String)] = lines.map(line => {
      val index = line.indexOf("##")
      val ip = line.substring(0, index)
      val info = line.substring(index + 2)
      (ip, info)
    })
    //join against the blacklist
    val filterDS = ip2Info.transform(rdd => {
      val filteredRDD = rdd.leftOuterJoin(blacklist)
        //filter keeps the records we want: the ones whose join value is None
        .filter { case (ip, (info, option)) => option.isEmpty }
      filteredRDD.map { case (ip, (info, option)) => (ip, info) }
    })
    filterDS.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
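One caveat: the blacklist here is a fixed RDD built once with parallelize, so it never actually changes while the job runs. Because transform is evaluated once per batch, a truly dynamic blacklist can be obtained by rebuilding the blacklist RDD inside the transform closure, for example by re-reading it from HDFS, a database, or another external store on each interval.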
Send data
You can see that only one record is kept.
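Concretely, the sample lines above can be pasted into a netcat session on hadoop01, which is the socket that socketTextStream("hadoop01", 9999) reads from:
nc -lk 9999
The only IP in the sample that is not on the blacklist is 8.35.201.144, so the printed batch should contain just:
(8.35.201.144,2016-05-30 17:38:20##GET /uc_server/avatar.php?uid=29331&size=middle HTTP/1.1##301##-)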
2. Online ETL case integrating Kafka
Read records in the following format from a designated Kafka topic and clean them, keeping only the useful fields: userid, the time of the user action (as a timestamp), the user's IP address, the IP address the user visited, the client port, the server port, and the exact URL the user requested. Finally, write the cleaned result back to another designated Kafka topic.
- Data format
<<<!>>>3111<<<!>>> ---> record id
<<<!>>>238<<<!>>> ---> userid
<<<!>>>20181111132902<<<!>>> ---> time of the user action
<<<!>>>58.223.1.112<<<!>>> ---> user's IP address
<<<!>>>202.102.92.18<<<!>>> ---> IP address the user visited
<<<!>>>59947<<<!>>> ---> client port
<<<!>>>80<<<!>>> ---> server port
<<<!>>>www.sumecjob.com<<<!>>> --> server domain name
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>><<<!>>>
<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>> --> exact URL the user requested
<<<!>>>2556928066<<<!>>>
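After cleaning, each record is flattened into the form userId^timestamp^clientIp:clientPort^serverIp:serverPort^url (see processRDD in StreamingETL.scala below). For the sample record above that would be roughly the following, where the exact epoch value depends on the JVM's timezone:
238^<epoch-millis>^58.223.1.112:59947^202.102.92.18:80^http://www.sumecjob.com/Social.aspx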
- Create the topics
kafka-topics.sh --create \
--topic etl_original \
--zookeeper hadoop01:2181/kafka \
--partitions 3 --replication-factor 1
kafka-topics.sh --create \
--topic etl_standard \
--zookeeper hadoop01:2181/kafka \
--partitions 3 --replication-factor 1
ServerLogIndex.java
package blog.util;

/**
 * @Author Daniel
 * @Description Field index constants
 **/
public interface ServerLogIndex {
    //Indexes of the fields we need within the split original record
    //(index 0 is the record id, which still carries the leading marker)
    int INDEX_USER_ID = 1;
    int INDEX_TIME = 2;
    int INDEX_CLIENT_IP = 3;
    int INDEX_SERVER_IP = 4;
    int INDEX_CLIENT_PORT = 5;
    int INDEX_SERVER_PORT = 6;
    int INDEX_URL = 13;
}
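Why these indexes: the record is split on the literal separator <<<!>>>,<<<!>>> (none of those characters are regex metacharacters), which leaves the leading marker attached to field 0, so the userid lands at index 1, the time at index 2, and so on. A quick standalone check:
object SplitDemo {
  def main(args: Array[String]): Unit = {
    val line = "<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>"
    //prints (<<<!>>>3111,0), (238,1), (20181111132902<<<!>>>,2)
    line.split("<<<!>>>,<<<!>>>").zipWithIndex.foreach(println)
  }
}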
DateUtil.java
package blog.util;

import org.apache.commons.lang3.time.FastDateFormat;

import java.text.ParseException;

/**
 * @Author Daniel
 * @Description Converts a formatted date string into an epoch timestamp
 **/
public class DateUtil {
    //SimpleDateFormat is not thread-safe and fails under concurrent use;
    //FastDateFormat avoids that problem
    private static FastDateFormat df = FastDateFormat.getInstance("yyyyMMddHHmmss");

    public static Long parseTime(String fmtTime) {
        try {
            return df.parse(fmtTime).getTime();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        //fall back to the current time if the field cannot be parsed
        return System.currentTimeMillis();
    }
}
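StreamingETL.scala below depends on blog.kafka.KafkaManager, a small helper (from an earlier post, not reproduced here) that creates the Kafka direct stream and keeps consumer offsets in ZooKeeper via Curator. So that the example is complete end to end, here is a minimal sketch of what such a helper can look like; the ZooKeeper path layout /<group>/<topic>/<partition> and all method bodies are assumptions, not the original implementation.
KafkaManager.scala (sketch)
package blog.kafka

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFramework
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

import scala.collection.JavaConversions._

object KafkaManager {
  //Create a direct stream, resuming from the offsets stored in ZooKeeper when present
  def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String],
                curator: CuratorFramework): InputDStream[(String, String)] = {
    val fromOffsets = readOffsets(topics, kafkaParams("group.id"), curator)
    if (fromOffsets.isEmpty) {
      //no saved offsets yet: start according to auto.offset.reset
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, kafkaParams, topics)
    } else {
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())
      KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
        ssc, kafkaParams, fromOffsets, messageHandler)
    }
  }

  //Offsets are kept under /<group>/<topic>/<partition> inside the client's namespace
  def readOffsets(topics: Set[String], group: String,
                  curator: CuratorFramework): Map[TopicAndPartition, Long] = {
    val offsets = collection.mutable.Map[TopicAndPartition, Long]()
    for (topic <- topics) {
      val path = s"/$group/$topic"
      if (curator.checkExists().forPath(path) != null) {
        for (partition <- curator.getChildren.forPath(path)) {
          val offset = new String(curator.getData.forPath(s"$path/$partition")).toLong
          offsets.put(TopicAndPartition(topic, partition.toInt), offset)
        }
      }
    }
    offsets.toMap
  }

  //Persist the end offset of every consumed range back to ZooKeeper
  def storeOffsets(offsetRanges: Array[OffsetRange], group: String,
                   curator: CuratorFramework): Unit = {
    for (range <- offsetRanges) {
      val path = s"/$group/${range.topic}/${range.partition}"
      if (curator.checkExists().forPath(path) == null) {
        curator.create().creatingParentsIfNeeded().forPath(path)
      }
      curator.setData().forPath(path, range.untilOffset.toString.getBytes())
    }
  }
}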
StreamingETL.scala
package programcase

import java.util.Properties

import blog.kafka.KafkaManager
import blog.util.{DateUtil, ServerLogIndex}
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * @Author Daniel
 * @Description ETL
 * Consume raw data from the original topic, process it online with streaming, and land the result in the standard topic.
 * Data format: <<<!>>>3111<<<!>>> ---> record id
 * <<<!>>>238<<<!>>> ---> userid
 * <<<!>>>20181111132902<<<!>>> ---> time of the user action
 * <<<!>>>58.223.1.112<<<!>>> ---> user's IP address
 * <<<!>>>202.102.92.18<<<!>>> ---> IP address the user visited
 * <<<!>>>59947<<<!>>> ---> client port
 * <<<!>>>80<<<!>>> ---> server port
 * <<<!>>>www.sumecjob.com<<<!>>> --> server domain name
 * <<<!>>><<<!>>>
 * <<<!>>><<<!>>>
 * <<<!>>><<<!>>>
 * <<<!>>><<<!>>>
 * <<<!>>><<<!>>>
 * <<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>> --> exact URL the user requested
 * <<<!>>>2556928066<<<!>>>
 **/
object StreamingETL {
  def main(args: Array[String]): Unit = {
    if (args == null || args.length < 4) {
      println(
        """
          |Usage: <brokerList> <groupId> <original> <standard>
        """.stripMargin)
      System.exit(-1)
    }
    val Array(brokerList, groupId, original, standard) = args
    //the original argument is a ","-separated list of topics
    val topics = original.split(",").toSet
    val conf = new SparkConf()
      .setAppName("_01RealTimeETLOps")
      .setMaster("local[*]")
    val batchInterval = Seconds(2)
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokerList,
      "group.id" -> groupId,
      "auto.offset.reset" -> "smallest"
    )
    val ssc = new StreamingContext(conf, batchInterval)
    //use the helper to create the message stream from Kafka
    val messages = KafkaManager.createMsg(ssc, kafkaParams, topics, curator)
    messages.foreachRDD((rdd, bTime) => {
      if (!rdd.isEmpty()) {
        println(s"Time: $bTime")
        processRDD(rdd, standard)
        //update the offsets
        KafkaManager.storeOffsets(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, kafkaParams("group.id"), curator)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }

  //core business logic
  def processRDD(rdd: RDD[(String, String)], destTopic: String): Unit = {
    val cleanedRDD: RDD[String] = rdd.map { case (key, value) =>
      val fields = value.split("<<<!>>>,<<<!>>>")
      if (fields == null || fields.length < 15) {
        ""
      } else {
        val userId = fields(ServerLogIndex.INDEX_USER_ID)
        //convert the formatted time into a timestamp
        val timestamp = DateUtil.parseTime(fields(ServerLogIndex.INDEX_TIME))
        val clientIp = fields(ServerLogIndex.INDEX_CLIENT_IP)
        val clientPort = fields(ServerLogIndex.INDEX_CLIENT_PORT)
        val serverIp = fields(ServerLogIndex.INDEX_SERVER_IP)
        val serverPort = fields(ServerLogIndex.INDEX_SERVER_PORT)
        val url = fields(ServerLogIndex.INDEX_URL)
        s"${userId}^${timestamp}^${clientIp}:${clientPort}^${serverIp}:${serverPort}^${url}"
      }
    }.filter(!_.isEmpty)
    //send the cleaned data back to Kafka
    produceInfo(cleanedRDD, destTopic)
  }

  //write records to Kafka
  def produceInfo(cleanedRDD: RDD[String], destTopic: String): Unit = {
    //create one producer per partition instead of per record for efficiency
    cleanedRDD.foreachPartition(partition => {
      if (!partition.isEmpty) {
        val conf = new Properties()
        conf.put("bootstrap.servers", "hadoop01:9092,hadoop02:9092,hadoop03:9092")
        conf.put("acks", "1") //1 means only the leader has to acknowledge
        conf.put("retries", "0")
        conf.put("batch.size", "16384")
        conf.put("linger.ms", "5000")
        conf.put("buffer.memory", "33554432")
        conf.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        conf.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        val producer = new KafkaProducer[String, String](conf)
        partition.foreach { line =>
          val record = new ProducerRecord[String, String](destTopic, line)
          producer.send(record)
        }
        producer.close()
      }
    })
  }

  //build a Curator client
  val curator = {
    val client = CuratorFrameworkFactory.builder()
      .connectString("hadoop01:2181,hadoop02:2181,hadoop03:2181")
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .namespace("kafka/consumers/offsets")
      .build()
    client.start()
    client
  }
}
Program arguments
hadoop01:9092,hadoop02:9092,hadoop03:9092 ETL etl_original etl_standard
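With the master hard-coded to local[*], the job can be run straight from the IDE with the arguments above. To launch it with spark-submit instead, something like the following works (the jar name is illustrative):
spark-submit \
--class programcase.StreamingETL \
--master local[*] \
streaming-etl.jar \
hadoop01:9092,hadoop02:9092,hadoop03:9092 ETL etl_original etl_standard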
Start a producer and a consumer
kafka-console-producer.sh --topic etl_original --broker-list hadoop01:9092,hadoop02:9092,hadoop03:9092
kafka-console-consumer.sh --topic etl_standard --bootstrap-server hadoop01:9092,hadoop02:9092,hadoop03:9092
Send data
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>59948<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928065<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>59947<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928066<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60183<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928067<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60184<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928068<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60200<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928069<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60205<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928070<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60227<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928071<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60228<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928072<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60253<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928073<<<!>>>
<<<!>>>3111<<<!>>>,<<<!>>>238<<<!>>>,<<<!>>>20181111132902<<<!>>>,<<<!>>>58.223.1.112<<<!>>>,<<<!>>>202.102.92.18<<<!>>>,<<<!>>>60258<<<!>>>,<<<!>>>80<<<!>>>,<<<!>>>www.sumecjob.com<<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>><<<!>>>,<<<!>>>http://www.sumecjob.com/Social.aspx<<<!>>>,<<<!>>>2556928074<<<!>>>
The results are as follows
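Each input record should come out of etl_standard in the flattened form; for the first line sent above that is roughly the following, where the exact epoch value depends on the JVM's timezone:
238^<epoch-millis>^58.223.1.112:59948^202.102.92.18:80^http://www.sumecjob.com/Social.aspx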