大数据之电商分析系统(二)
本文承接上一篇(电商分析系统一)
一:项目需求具体实现5-10
- 页面转换率统计
计算页面单跳转化率, 什么是页面单跳转换率, 比如一个用户在一次 Session过程中访问的页面路径 3,5,7,9,10,21,那么页面 3 跳到页面 5 叫一次单跳,7-9 也叫一次单跳,那么单跳转化率就是要统计页面点击的概率,比如: 计算 3-5 的单跳转化率,先获取符合条件的 Session 对于页面 3 的访问次数(PV)为 A,然后获取符合条件的 Session 中访问了页面 3 又紧接着访问了页面 5 的次数为 B,那么 B/A 就是 3-5 的页面单跳转化率,我们记为 C;那么页面 5-7 的转化率怎么求呢?先需要求出符合条件的 Session 中访问页面 5 又紧接着访问了页面 7 的次数为 D,那么 D/B即为 5-7 的单跳转化率。
产品经理,可以根据这个指标,去尝试分析, 整个网站,产品, 各个页面的表现怎么样,是不是需要去优化产品的布局;吸引用户最终可以进入最后的支付页面。数据分析师,可以此数据做更深一步的计算和分析。企业管理层, 可以看到整个公司的网站, 各个页面的之间的跳转的表现如何,可以适当调整公司的经营战略或策略。
需要根据查询对象中设置的 Session 过滤条件,先将对应的 Session 过滤出来,然后根据查询对象中设置的页面路径, 计算页面单跳转化率, 比如查询的页面路径为:3、5、7、8,那么就要计算 3-5、5-7、7-8 的页面单跳转化率。需要注意的一点是, 页面的访问是有先后的。
数据源解析:
用户访问数据表: UserVisitAction
PageOneStepConverRate 页面切片转化率
package com.ityouxin.page
import java.util.UUID
import com.ityouxin.commons.conf.ConfigurationManager
import com.ityouxin.commons.constant.Constants
import com.ityouxin.commons.model.UserVisitAction
import com.ityouxin.commons.utils.{DateUtils, NumberUtils, ParamUtils}
import com.ityouxin.session.PageSplitConvertRate
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Dataset, SaveMode, SparkSession}
import net.sf.json.JSONObject
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
/* Requirement 5: compute the conversion rate of every page "single-jump" split. */
object PageOneStepConverRate {

  /**
   * Job driver: loads the user actions for the configured date range, builds
   * per-session page-jump splits ("3_5", "5_7", ...), counts the PV of every
   * split, derives each split's conversion rate (split PV / previous split PV,
   * seeded with the PV of the flow's first page) and persists the result.
   */
  def main(ags: Array[String]): Unit = {
    // Spark configuration (local mode for development).
    val conf = new SparkConf().setMaster("local[*]").setAppName("SessionAnalyzer")
    // SparkSession with Hive support so user_visit_action can be queried.
    val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
    val sparkContext = spark.sparkContext
    // Task parameters (date range, target page flow) are stored as a JSON string.
    val jsonStr = ConfigurationManager.config.getString("task.params.json")
    val taskParm = JSONObject.fromObject(jsonStr)
    // User actions inside the configured date range.
    val userVisitActionRDD: RDD[UserVisitAction] = getActionRDDByDateRange(spark, taskParm)
    // Key every action by its session id.
    val sessionidActionRDD: RDD[(String, UserVisitAction)] =
      userVisitActionRDD.map(uva => (uva.session_id, uva))
    // Cached because it feeds the groupByKey below.
    sessionidActionRDD.persist(StorageLevel.MEMORY_ONLY)
    // All actions of each session.
    val sessionidActionsRDD: RDD[(String, Iterable[UserVisitAction])] = sessionidActionRDD.groupByKey()
    // Per-session page splits that match the target flow, emitted as (flag, 1).
    val pageSplitRDD: RDD[(String, Int)] = generateAndMatchPageSplit(sparkContext, sessionidActionsRDD, taskParm)
    // PV per split flag, e.g. "3_5" -> 42.
    // (The original debug collect()/println of the whole RDD on the driver was removed.)
    val pageSplitPvMap: collection.Map[String, Long] = pageSplitRDD.countByKey()
    // PV of the first page of the flow: denominator of the first split's rate.
    val startPagePv: Long = getStartPagePV(taskParm, sessionidActionsRDD)
    // Conversion rate of every split of the flow, in flow order.
    val convertRateMap: ListBuffer[(String, Double)] =
      computePageSplitConvertRate(taskParm, pageSplitPvMap, startPagePv)
    // Persist the result under a fresh task id.
    val taskUUID: String = UUID.randomUUID().toString
    persitConvertRate(spark, taskUUID, convertRateMap)
  }

  /**
   * Persists the computed split conversion rates to MySQL
   * (table page_split_convert_rate), one row per task run.
   *
   * @param spark          active session used to build the DataFrame
   * @param taskUUID       id of this statistics run
   * @param convertRateMap ordered (splitFlag, rate) pairs, e.g. ("3_5", 0.92)
   */
  def persitConvertRate(spark: SparkSession, taskUUID: String, convertRateMap: ListBuffer[(String, Double)]): Unit = {
    // Serialized form: "5_6=0.95|2_3=0.92|..." — one pipe-separated string column.
    val convertRate: String = convertRateMap.map { case (flag, rate) => flag + "=" + rate }.mkString("|")
    // Wrap into the result case class and write through the JDBC data source.
    val pageSplitConvertRateRDD: RDD[PageSplitConvertRate] =
      spark.sparkContext.makeRDD(Array(PageSplitConvertRate(taskUUID, convertRate)))
    import spark.implicits._
    pageSplitConvertRateRDD.toDF().write
      .format("jdbc")
      .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable", "page_split_convert_rate")
      .mode(SaveMode.Append)
      .save()
  }

  /**
   * Computes, in flow order, the conversion rate of each split:
   * rate("a_b") = PV("a_b") / PV(previous split), with the PV of the flow's
   * first page as the initial denominator.
   *
   * @param taskParm       task parameters holding the target page flow "1,2,3,..."
   * @param pageSplitPvMap PV per split flag as counted from the sessions
   * @param startPagePv    PV of the flow's first page
   * @return ordered (splitFlag, rate) pairs
   */
  def computePageSplitConvertRate(taskParm: JSONObject,
                                  pageSplitPvMap: collection.Map[String, Long],
                                  startPagePv: Long): ListBuffer[(String, Double)] = {
    // "1,2,3,..." -> split flags "1_2","2_3",...
    val targetPageFlow: String = ParamUtils.getParam(taskParm, Constants.PARAM_TARGET_PAGE_FLOW)
    val targetPages: List[String] = targetPageFlow.split(",").toList
    val targetPagePairs: List[String] =
      targetPages.zip(targetPages.drop(1)).map { case (a, b) => a + "_" + b }
    val rates = new ListBuffer[(String, Double)]
    // Denominator for the next split; seeded with the first page's PV.
    var lastPageSplitPv: Double = startPagePv.toDouble
    for (targetPage <- targetPagePairs) {
      // BUG FIX: a split absent from the map (no session performed that jump)
      // previously threw NoSuchElementException; treat it as 0 visits instead.
      val targetPageSplitPv: Double = pageSplitPvMap.getOrElse(targetPage, 0L).toDouble
      val convertRate: Double = NumberUtils.formatDouble(targetPageSplitPv / lastPageSplitPv, 2)
      rates.append((targetPage, convertRate))
      // The next split's rate is relative to this split's PV.
      lastPageSplitPv = targetPageSplitPv
    }
    rates
  }

  /**
   * Counts how many actions hit the first page of the configured flow (its PV);
   * this is the denominator of the first split's conversion rate.
   */
  def getStartPagePV(taskParm: JSONObject,
                     sessionidActionsRDD: RDD[(String, Iterable[UserVisitAction])]): Long = {
    val targetPageFlow: String = ParamUtils.getParam(taskParm, Constants.PARAM_TARGET_PAGE_FLOW)
    // First element of "1,2,3,..." is the flow's entry page.
    val startPageId: Long = targetPageFlow.split(",")(0).toLong
    // Filter before mapping so non-matching actions are dropped early.
    val startPageRDD: RDD[Long] = sessionidActionsRDD.flatMap {
      case (sid, uvas) =>
        uvas.filter(_.page_id == startPageId).map(_.page_id)
    }
    startPageRDD.count()
  }

  /**
   * Builds, for every session, the page splits actually visited (in action_time
   * order) and keeps only the splits belonging to the target flow, emitting
   * (splitFlag, 1) so the caller can countByKey.
   */
  def generateAndMatchPageSplit(sparkContext: SparkContext,
                                sessionidActionsRDD: RDD[(String, Iterable[UserVisitAction])],
                                taskParm: JSONObject) = {
    // Target flow "1,2,3,..." -> wanted split flags "1_2","2_3",...
    val targetPageFlow = ParamUtils.getParam(taskParm, Constants.PARAM_TARGET_PAGE_FLOW)
    val targetPages: List[String] = targetPageFlow.split(",").toList
    val targetPagesPairs: List[String] =
      targetPages.zip(targetPages.drop(1)).map { case (a, b) => a + "_" + b }
    // Broadcast the small lookup list instead of shipping it with every task.
    val targetPagePairsBroadcast: Broadcast[List[String]] = sparkContext.broadcast(targetPagesPairs)
    sessionidActionsRDD.flatMap {
      case (sessionid, uvas) =>
        // BUG FIX: sortWith returns a NEW list; the original discarded the
        // result and built the page sequence from the UNSORTED actions.
        val sortedUVASList: List[UserVisitAction] = uvas.toList.sortWith(
          (uva1, uva2) =>
            DateUtils.parseTime(uva1.action_time).getTime < DateUtils.parseTime(uva2.action_time).getTime
        )
        // Pages in visit order. BUG FIX: the original guarded with
        // `if (page_id != null)` and no else branch, producing List[AnyVal]
        // polluted with Unit values; page_id is a Long and cannot be null.
        val sortedPages: List[Long] = sortedUVASList.map(_.page_id)
        // Consecutive page pairs of this session, e.g. 3,5,7 -> "3_5","5_7".
        val sessionPagePairs: List[String] =
          sortedPages.zip(sortedPages.drop(1)).map { case (a, b) => a + "_" + b }
        // Keep only splits that belong to the target flow.
        sessionPagePairs
          .filter(targetPagePairsBroadcast.value.contains(_))
          .map((_, 1))
    }
  }

  /**
   * Loads user_visit_action rows whose date lies inside [start_date, end_date]
   * taken from the task parameters.
   * NOTE(review): the SQL is built by string concatenation; the dates come from
   * trusted task configuration, but parameter binding would be safer.
   */
  def getActionRDDByDateRange(spark: SparkSession, taskParm: JSONObject) = {
    val startDate: String = ParamUtils.getParam(taskParm, Constants.PARAM_START_DATE)
    val endDate = ParamUtils.getParam(taskParm, Constants.PARAM_END_DATE)
    // Needed for .as[UserVisitAction].
    import spark.implicits._
    // String date bounds must be single-quoted inside the query.
    val SQLQuery = "select * from user_visit_action where date >= '" + startDate + "' and date <= '" + endDate + "'"
    val ds: Dataset[UserVisitAction] = spark.sql(SQLQuery).as[UserVisitAction]
    ds.rdd
  }
}
- 各区域Top3商品统计
根据用户指定的日期查询条件范围,统计各个区域下的最热门【点击】的 top3商品,区域信息、各个城市的信息在项目中用固定值进行配置,因为不怎么变动。
区域等级表
A 华北华东
B 华南华中
C 西北西南
D 东北其他
AreaTop3ProductApp 类
package com.ityouxin.product
import java.util.UUID
import com.ityouxin.commons.conf.ConfigurationManager
import com.ityouxin.commons.constant.Constants
import com.ityouxin.commons.utils.ParamUtils
import net.sf.json.JSONObject
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
// Requirement 6: per-area top-3 hot products, implemented mainly with Spark SQL
// over a chain of temporary views.
object AreaTop3ProductApp {
// Pipeline: click actions + static city info -> tmp_click_product_basic
// -> tmp_area_product_click_count -> tmp_area_fullprod_click_count -> top3 -> MySQL.
def main(args: Array[String]): Unit = {
// Spark configuration for local execution.
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("AreaTop3ProductApp")
// SparkSession with Hive support (user_visit_action / product_info live in Hive).
val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
// Underlying SparkContext.
val sc: SparkContext = spark.sparkContext
// Task parameters are stored as a JSON string in the configuration.
val jsonStr = ConfigurationManager.config.getString("task.params.json")
// Parse into a JSONObject so individual params can be extracted.
val taskParm: JSONObject = JSONObject.fromObject(jsonStr)
// Query window: start date.
val startDate: String = ParamUtils.getParam(taskParm,Constants.PARAM_START_DATE)
// Query window: end date.
val endDate = ParamUtils.getParam(taskParm,Constants.PARAM_END_DATE)
// City click actions inside the window: RDD[(city_id, Row(city_id, click_product_id))].
val cityClickActionRDD: RDD[(Long, Row)] = getCityClickActionRDD(spark,startDate,endDate)
// Static city metadata: RDD[(city_id, Row(city_id, city_name, area))].
val cityInfoRDD: RDD[(Long, Row)] = getCityInfoRDD(spark)
// Temp view tmp_click_product_basic with columns
// "city_id" "city_name" "area" "product_id".
generateTempClickProductBasicTable(spark,cityClickActionRDD,cityInfoRDD)
// Temp view tmp_area_product_click_count with columns
// "area" "product_id" "click_count" "city_infos".
generateTempTemAreaProductClickCountTable(spark)
// Join in product_info to build tmp_area_fullprod_click_count.
generateTempAreaFullProDuctClickCountTable(spark)
// Top-3 products per area.
val areaTop3ProductDF: DataFrame = getAreaTop3ProductInfo(spark)
// Write the result to MySQL, tagged with a fresh task id.
val taskUUID = UUID.randomUUID().toString
import spark.implicits._
val areaTop3RDD: RDD[AreaTop3Product] = areaTop3ProductDF.rdd.map(
row =>
AreaTop3Product(taskUUID,
row.getAs[String]("area"),
row.getAs[String]("area_level"),
row.getAs[Long]("product_id"),
row.getAs[String]("city_infos"),
row.getAs[Long]("click_count"),
row.getAs[String]("product_name"),
row.getAs[String]("product_status")
)
)
areaTop3RDD.toDF().write
.format("jdbc")
.option("url",ConfigurationManager.config.getString(Constants.JDBC_URL))
.option("user",ConfigurationManager.config.getString(Constants.JDBC_USER))
.option("password",ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
.option("dbtable","area_top3_product")
.mode(SaveMode.Append)
.save()
spark.close()
}
// Selects the top-3 products per area (row_number over click_count desc) and
// derives the area level bucket: 华北/华东 -> A, 华南/华中 -> B, 西北/西南 -> C, else D.
def getAreaTop3ProductInfo(spark: SparkSession) = {
val sql ="select area," +
" case " +
" when area='华北' or area='华东' THEN 'A'" +
" when area='华南' or area='华中' THEN 'B'" +
" when area='西北' or area='西南' THEN 'C'" +
" else 'D'" +
" end area_level" +
" ,product_id ,click_count ,city_infos ,product_name,product_status from " +
"(select area ,product_id ,click_count ,city_infos ,product_name,product_status ," +
" row_number() Over ( partition by area order by click_count desc ) rn " +
" from tmp_area_fullprod_click_count ) t where t.rn<=3"
spark.sql(sql)
}
// Joins tmp_area_product_click_count with product_info; product_status is decoded
// from the JSON extend_info column: '0' -> 'Self', otherwise 'Third Party'.
def generateTempAreaFullProDuctClickCountTable(spark: SparkSession): Unit = {
val sql ="select t.area , t.product_id,t.click_count,t.city_infos,p.product_name," +
"if(get_json_object(p.extend_info,\"$.product_status\")='0','Self','Third Party') product_status" +
" from tmp_area_product_click_count t join product_info p on t.product_id = p.product_id"
val df: DataFrame = spark.sql(sql)
df.show()
df.createOrReplaceTempView("tmp_area_fullprod_click_count")
}
// Aggregates clicks per (area, product_id); city_infos concatenates the distinct
// contributing "city_id:city_name" pairs with commas.
def generateTempTemAreaProductClickCountTable(spark: SparkSession): Unit = {
val sql ="select t.area,t.product_id,count(*) click_count ," +
"concat_ws(\",\",collect_set(concat_ws(\":\",t.city_id,t.city_name))) city_infos" +
" from tmp_click_product_basic t group by t.area,t.product_id"
val df: DataFrame = spark.sql(sql)
df.show()
df.createOrReplaceTempView("tmp_area_product_click_count")
}
// Joins click actions with city info on city_id and registers the temp view
// tmp_click_product_basic ("city_id" "city_name" "area" "product_id").
def generateTempClickProductBasicTable(spark: SparkSession,
cityClickActionRDD: RDD[(Long, Row)],
cityInfoRDD: RDD[(Long, Row)]) = {
// Inner join: only clicks whose city_id has metadata survive.
val joinRDD: RDD[(Long, (Row, Row))] = cityClickActionRDD.join(cityInfoRDD)
val mappedRDD: RDD[(Long, String, String, Long)] = joinRDD.map {
case (cityid, (action, cityInfo)) =>
// Positional access: action = (city_id, click_product_id),
// cityInfo = (city_id, city_name, area).
val productId: Long = action.getLong(1)
val cityName: String = cityInfo.getString(1)
val area: String = cityInfo.getString(2)
(cityid, cityName, area, productId)
}
import spark.implicits._
val df: DataFrame = mappedRDD.toDF("city_id","city_name","area","product_id")
df.show()
// Register for the downstream SQL steps.
df.createOrReplaceTempView("tmp_click_product_basic")
}
// Builds the hard-coded city/area lookup as RDD[(city_id, Row)].
// Region data is configured in code because it rarely changes.
def getCityInfoRDD(spark: SparkSession) = {
val cityInfo = Array(
(0L, "北京", "华北"),
(1L, "上海", "华东"),
(2L, "南京", "华东"),
(3L, "广州", "华南"),
(4L, "三亚", "华南"),
(5L, "武汉", "华中"),
(6L, "长沙", "华中"),
(7L, "西安", "西北"),
(8L, "成都", "西南"),
(9L, "哈尔滨", "东北"))
import spark.implicits._
val cityInfoRDD: RDD[(Long, String, String)] = spark.sparkContext.makeRDD(cityInfo)
val cityInfoDF: DataFrame = cityInfoRDD.toDF("city_id","city_name","area")
// Key each Row by its city_id for the join.
cityInfoDF.rdd.map(item => (item.getAs[Long]("city_id"),item))
}
// Loads click actions (non-null, non -1 click_product_id) inside the date range,
// keyed by city_id.
// NOTE(review): dates are concatenated into the SQL; they come from trusted
// task configuration, but parameter binding would be safer.
def getCityClickActionRDD(spark: SparkSession, startDate: String, endDate: String) = {
val clickActionRDDDF: DataFrame = spark.sql("select city_id,click_product_id from user_visit_action " +
"where click_product_id is not null and click_product_id !=-1 " +
"and date >= '" + startDate + "' and date <= '" + endDate + "'")
clickActionRDDDF.rdd.map(
item =>{
(item.getAs[Long]("city_id"),item)
}
)
}
}
AreaTop3Product的封装样例类
package com.ityouxin.product
/**
 * One row of the area_top3_product result table: a top-3 clicked product of one
 * area, produced by one statistics task run (identified by taskId).
 */
case class AreaTop3Product (
taskId:String,
area:String,
areaLevel:String,    // A/B/C/D bucket derived from the area name
productid:Long,
cityInfos:String,    // comma-separated "city_id:city_name" pairs that contributed clicks
clickCount:Long,
productName:String,
productStatus:String)  // 'Self' or 'Third Party', decoded from product extend_info
HiveDB 获取每个表中的数据
package com.ityouxin.product
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
/** Smoke-test entry point: prints the schema and sample rows of the Hive tables used by the project. */
object HiveDB {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("HiveDB")
    // SparkSession with Hive support so the project tables are visible.
    val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
    // Inspect table structures and a few sample rows.
    spark.sql("show tables").show()
    spark.sql("desc user_visit_action").show()
    // BUG FIX: the original statement read "slect * from user_visit_action",
    // which fails to parse at runtime.
    spark.sql("select * from user_visit_action").show(5,false)
    spark.sql("desc product_info").show(5,false)
    spark.sql("desc user_info").show()
    spark.sql("select * from user_info").show(5,false)
  }
}
- 广告黑名单实时统计
实现实时的动态黑名单机制:将每天对某个广告点击超过 100 次的用户拉黑。
数据源解析
Kafka数据: timestamp province city userid adid
数据结构:
((0L, “北京”, “华北”), (1L, “上海”, “华东”), (2L, “南京”, “华东”), (3L, “广州”, “华南”), (4L,“三亚”, “华南”), (5L, “武汉”, “华中”), (6L, “长沙”, “华中”), (7L, “西安”, “西北”), (8L, "成都 ", “西南”), (9L, “哈尔滨”, “东北”))
- 广告点击实时统计
每天各省各城市各广告的点击流量实时统计。
数据源解析:
Kafka数据: timestamp province city userid adid
- 各省热门广告实时统计
统计每天各省 top3 热门广告
数据源解析:
数据来源于需求八 updateStateByKey 得到的Dstream
DStream[(dateKey_province_city_adid, count)]
- 最近一个小时广告点击实时统计
统计各广告最近 1 小时内的点击量趋势:各广告最近 1 小时内各分钟的点击量
数据源解析:
Kafka数据源 timestamp province city userid adid
需求7-10具体实现
AdClickRealTimeStat 类
package com.ityouxin.advertise
import java.util.Date
import com.ityouxin.commons.conf.ConfigurationManager
import com.ityouxin.commons.utils.DateUtils
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import scala.collection.mutable.ArrayBuffer
/**
 * Real-time advertising statistics job (requirements 7-10):
 * blacklist filtering and dynamic blacklist generation, daily click counts per
 * province/city/ad, per-province daily top-3 ads, and the last-hour per-minute
 * click trend. Input records are Kafka log lines of the form
 * "timestamp province city userid adid".
 */
object AdClickRealTimeStat {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("AdClickRealTimeStat").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
    val sc: SparkContext = spark.sparkContext
    // 5-second micro-batches.
    val ssc = new StreamingContext(sc, Seconds(5))
    // updateStateByKey below requires a checkpoint directory.
    ssc.checkpoint("./streaming_checkpoint")

    // Kafka connection settings from configuration.
    val broker_list = ConfigurationManager.config.getString("kafka.broker.list")
    val topics: String = ConfigurationManager.config.getString("kafka.topics")
    val kafkaParams = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> broker_list,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.GROUP_ID_CONFIG -> "adverter",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest",
      // Offsets are not auto-committed.
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean)
    )

    // Direct stream over the configured topic; each record value is one log line.
    val adRealTimeDS: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(topics), kafkaParams)
    )
    val adRealTimeLogValueDS: DStream[String] = adRealTimeDS.map(item => item.value())
    adRealTimeLogValueDS.print()

    // Requirement 7a: drop clicks from already-blacklisted users.
    val filteredAdRealTimeLogDS: DStream[(Long, String)] = filterByBlackList(spark, adRealTimeLogValueDS)
    filteredAdRealTimeLogDS.print()
    // Requirement 7b: grow the blacklist from today's per-user/ad click counts.
    generateDynamicBlackList(filteredAdRealTimeLogDS)
    // Requirement 8: cumulative daily click counts per province/city/ad.
    val aggregatedDS: DStream[(String, Long)] = calculateRealTimeStat(filteredAdRealTimeLogDS)
    // Requirement 9: per-province daily top-3 ads, derived from requirement 8.
    calculateRealTimeProvinceTop3Ad(spark, aggregatedDS)
    // Requirement 10: per-minute click counts inside a sliding one-hour window.
    calculateAdClickCountByWindow(adRealTimeLogValueDS)

    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Requirement 10: per-minute click count of each ad over a sliding one-hour
   * window (slide interval 10s), persisted through AdClickTrendDAO.
   */
  def calculateAdClickCountByWindow(adRealTimeLogValueDS: DStream[String]) = {
    // ("yyyyMMddHHmm_adid", 1) per click.
    val pairDStream: DStream[(String, Long)] = adRealTimeLogValueDS.map {
      case log =>
        val fields: Array[String] = log.split(" ")
        // Truncate the click timestamp to minute precision (yyyyMMddHHmm).
        val timeMinute: String = DateUtils.formatTimeMinute(new Date(fields(0).toLong))
        val adid: Long = fields(4).toLong
        (timeMinute + "_" + adid, 1L)
    }
    // Sum counts over the last 60 minutes, recomputed every 10 seconds.
    val aggrDS: DStream[(String, Long)] = pairDStream.reduceByKeyAndWindow(
      (a: Long, b: Long) => a + b,
      Minutes(60L),
      Seconds(10L)
    )
    // Re-shape each windowed count into an AdClickTrend row and persist per partition.
    aggrDS.foreachRDD {
      rdd =>
        rdd.foreachPartition {
          items =>
            val adClickTrends = ArrayBuffer[AdClickTrend]()
            // Each item is ("yyyyMMddHHmm_adid", count).
            for (item <- items) {
              val keySplited: Array[String] = item._1.split("_")
              val dateMinute: String = keySplited(0)
              val adid = keySplited(1).toLong
              val clickCount = item._2
              // Split yyyyMMddHHmm into a formatted date plus hour and minute parts.
              val date = DateUtils.formatDate(DateUtils.parseDateKey(dateMinute.substring(0, 8)))
              val hour = dateMinute.substring(8, 10)
              val minute = dateMinute.substring(10)
              adClickTrends += AdClickTrend(date, hour, minute, adid, clickCount)
            }
            AdClickTrendDAO.updateBatch(adClickTrends.toArray)
        }
    }
  }

  /**
   * Requirement 9: per-province daily top-3 ads, computed from the cumulative
   * (date_province_city_adid -> count) stream and persisted through
   * AdProvinceTop3DAO.
   */
  def calculateRealTimeProvinceTop3Ad(spark: SparkSession,
                                      aggregatedDS: DStream[(String, Long)]) = {
    val top3DStream: DStream[Row] = aggregatedDS.transform {
      rdd =>
        // Re-key to province granularity: date_province_adid.
        val mappedRDD: RDD[(String, Long)] = rdd.map {
          case (key, count) =>
            // key = date_province_city_adid; the city at index 2 is dropped.
            val keySplited: Array[String] = key.split("_")
            val date: String = keySplited(0)
            val province: String = keySplited(1)
            val adid: Long = keySplited(3).toLong
            val provkey = date + "_" + province + "_" + adid
            (provkey, count)
        }
        // Sum over cities: daily clicks per (province, ad).
        val dailyAdClickCountByProvinceRDD: RDD[(String, Long)] = mappedRDD.reduceByKey(_ + _)
        val rowRDD: RDD[(String, String, Long, Long)] = dailyAdClickCountByProvinceRDD.map {
          case (provkey, count) =>
            val parts: Array[String] = provkey.split("_")
            val province: String = parts(1)
            val adid = parts(2).toLong
            // Re-format yyyyMMdd into the display date format.
            val dateFormat = DateUtils.formatDate(DateUtils.parseDateKey(parts(0)))
            (dateFormat, province, adid, count)
        }
        import spark.implicits._
        val df: DataFrame = rowRDD.toDF("date", "province", "ad_id", "click_count")
        df.createOrReplaceTempView("tmp_daily_ad_click_count_by_prov")
        // Top-3 ads per province via row_number over click_count desc.
        val provinceTop3AdDF: DataFrame = spark.sql("select date,province,ad_id,click_count from " +
          "(select date,province,ad_id,click_count," +
          "row_number() over( partition by province order by click_count desc ) rn " +
          "from tmp_daily_ad_click_count_by_prov ) t where t.rn <=3")
        provinceTop3AdDF.rdd
    }
    // Persist each batch's top-3 rows.
    top3DStream.foreachRDD {
      rdd =>
        rdd.foreachPartition {
          items =>
            val adProvinceTop3s = ArrayBuffer[AdProvinceTop3]()
            // Row layout: date, province, ad_id, click_count.
            for (item <- items) {
              val date: String = item.getString(0)
              val province: String = item.getString(1)
              val adid: Long = item.getLong(2)
              val clickCount: Long = item.getLong(3)
              adProvinceTop3s += AdProvinceTop3(date, province, adid, clickCount)
            }
            AdProvinceTop3DAO.updateBatch(adProvinceTop3s.toArray)
        }
    }
  }

  /**
   * Requirement 8: cumulative daily click counts per province/city/ad, kept
   * with updateStateByKey and persisted through AdStatDAO.
   *
   * @return the cumulative (date_province_city_adid, count) stream, reused by
   *         the per-province top-3 computation
   */
  def calculateRealTimeStat(filteredAdRealTimeLogDS: DStream[(Long, String)]): DStream[(String, Long)] = {
    // ("yyyyMMdd_province_city_adid", 1) per click.
    val mappedDS: DStream[(String, Long)] = filteredAdRealTimeLogDS.map {
      case (userid, log) =>
        val fields: Array[String] = log.split(" ")
        val date: String = DateUtils.formatDateKey(new Date(fields(0).toLong))
        val province: String = fields(1)
        val city: String = fields(2)
        val adid: Long = fields(4).toLong
        (date + "_" + province + "_" + city + "_" + adid, 1L)
    }
    // Running total per key across batches (requires checkpointing).
    val aggregatedDS: DStream[(String, Long)] = mappedDS.updateStateByKey[Long] {
      (values: Seq[Long], old: Option[Long]) => {
        Some(values.sum + old.getOrElse(0L))
      }
    }
    // Persist the running totals per partition.
    aggregatedDS.foreachRDD {
      rdd =>
        rdd.foreachPartition {
          items =>
            val adStats = ArrayBuffer[AdStat]()
            // Key layout: date_province_city_adid.
            for (item <- items) {
              val keySplited: Array[String] = item._1.split("_")
              val date = keySplited(0)
              val province = keySplited(1)
              val city = keySplited(2)
              val adid = keySplited(3).toLong
              adStats += AdStat(date, province, city, adid, item._2)
            }
            AdStatDAO.updateBatch(adStats.toArray)
        }
    }
    aggregatedDS
  }

  /**
   * Requirement 7b: accumulates today's per-user/per-ad click counts in MySQL
   * and inserts every user whose count for any ad reaches 100 into the
   * blacklist table.
   */
  def generateDynamicBlackList(filteredAdRealTimeLogDS: DStream[(Long, String)]) = {
    // ("yyyyMMdd_userid_adid", 1) per click.
    val dailyUserAdClickDstream: DStream[(String, Long)] = filteredAdRealTimeLogDS.map {
      // log = "timestamp province city userid adid"
      case (userid, log) =>
        val fields: Array[String] = log.split(" ")
        val dateKey: String = DateUtils.formatDateKey(new Date(fields(0).toLong))
        val adid: String = fields(4)
        // BUG FIX: the original concatenated the key with itself
        // (dateKey + "_" + userid + "_" + key); the last component must be adid.
        val key: String = dateKey + "_" + userid + "_" + adid
        (key, 1L)
    }
    // Per-batch click total per (day, user, ad).
    val dailyUserADClickCountDS: DStream[(String, Long)] = dailyUserAdClickDstream.reduceByKey(_ + _)
    // Accumulate each batch's totals into MySQL.
    dailyUserADClickCountDS.foreachRDD {
      rdd =>
        rdd.foreachPartition { items: Iterator[(String, Long)] =>
          val adUserClickCounts: ArrayBuffer[AdUserClickCount] = ArrayBuffer[AdUserClickCount]()
          // Each item is ("yyyyMMdd_userid_adid", count).
          for (item <- items) {
            val keySplited: Array[String] = item._1.split("_")
            // yyyyMMdd -> formatted yyyy-MM-dd.
            val date = DateUtils.formatDate(DateUtils.parseDateKey(keySplited(0)))
            val userid: Long = keySplited(1).toLong
            val adid: Long = keySplited(2).toLong
            adUserClickCounts += AdUserClickCount(date, userid, adid, item._2)
          }
          AdUserClickCountDAO.updateBatch(adUserClickCounts.toArray)
        }
    }
    // Keep only keys whose stored (cumulative) count has reached 100.
    val blackListDS: DStream[(String, Long)] = dailyUserADClickCountDS.filter {
      case (key, count) =>
        val keySplited: Array[String] = key.split("_")
        val date = DateUtils.formatDate(DateUtils.parseDateKey(keySplited(0)))
        val userid: Long = keySplited(1).toLong
        val adid: Long = keySplited(2).toLong
        // Look up the cumulative count for this (day, user, ad) in MySQL.
        AdUserClickCountDAO.findClickCountByMultiKey(date, userid, adid) >= 100
    }
    // A user may exceed the limit on several ads: deduplicate by userid
    // before inserting into the blacklist.
    val blackListUserDS: DStream[Long] = blackListDS.map(item => item._1.split("_")(1).toLong)
    val distinctBlackUserDS: DStream[Long] = blackListUserDS.transform(rdd =>
      rdd.distinct()
    )
    distinctBlackUserDS.foreachRDD {
      rdd =>
        rdd.foreachPartition {
          items =>
            val adBlackLists: ArrayBuffer[AdBlacklist] = ArrayBuffer[AdBlacklist]()
            for (userid <- items) {
              adBlackLists += AdBlacklist(userid)
            }
            AdBlacklistDAO.insertBatch(adBlackLists.toArray)
        }
    }
  }

  /**
   * Requirement 7a: keys each log line by userid and removes lines whose user
   * is in the MySQL blacklist (re-read at every batch so new entries apply).
   */
  def filterByBlackList(spark: SparkSession,
                        adRealTimeLogValueDS: DStream[String]): DStream[(Long, String)] = {
    adRealTimeLogValueDS.transform {
      rdd =>
        // Current blacklist, refreshed per batch.
        val blacklists: Array[AdBlacklist] = AdBlacklistDAO.findAll()
        val blacklistRDD: RDD[(Long, Boolean)] = spark.sparkContext.makeRDD(
          blacklists.map(item => (item.userid, true))
        )
        // log = "timestamp province city userid adid" -> (userid, log).
        val mappedRDD: RDD[(Long, String)] = rdd.map((log: String) => {
          val userid: Long = log.split(" ")(3).toLong
          (userid, log)
        })
        // Left outer join: blacklisted users carry Some(true) on the right.
        val joinedRDD: RDD[(Long, (String, Option[Boolean]))] = mappedRDD.leftOuterJoin(blacklistRDD)
        joinedRDD
          .filter { case (userid, (log, black)) => !black.getOrElse(false) }
          .map { case (userid, (log, black)) => (userid, log) }
    }
  }
}
DataModel 类 数据模型
package com.ityouxin.advertise
/**
 * One blacklisted user (table ad_blacklist).
 */
case class AdBlacklist(userid:Long)
/**
 * Click count of one user on one ad for one day
 * (table ad_user_click_count; date is formatted yyyy-MM-dd).
 */
case class AdUserClickCount(date:String,
userid:Long,
adid:Long,
clickCount:Long)
/**
 * Cumulative daily click count per province/city/ad (table ad_stat).
 */
case class AdStat(date:String,
province:String,
city:String,
adid:Long,
clickCount:Long)
/**
 * One of the daily top-3 ads of a province (table ad_province_top3).
 */
case class AdProvinceTop3(date:String,
province:String,
adid:Long,
clickCount:Long)
/**
 * Per-minute click count of an ad inside the sliding one-hour window,
 * persisted via AdClickTrendDAO.
 */
case class AdClickTrend(date:String,
hour:String,
minute:String,
adid:Long,
clickCount:Long)
JDBCHelper 入库操作
package com.ityouxin.advertise
import java.sql.ResultSet
import com.ityouxin.commons.pool.{CreateMySqlPool, QueryCallback}
import scala.collection.mutable.ArrayBuffer
/**
* 用户黑名单DAO类
*/
/**
 * DAO for the ad_blacklist table (users banned from ad-click statistics).
 */
object AdBlacklistDAO {

  /**
   * Inserts a batch of blacklisted users.
   *
   * @param adBlacklists users to persist
   */
  def insertBatch(adBlacklists: Array[AdBlacklist]): Unit = {
    val sql = "INSERT INTO ad_blacklist VALUES(?)"
    // One single-column parameter row per user.
    val rows: Array[Array[Any]] = adBlacklists.map(entry => Array[Any](entry.userid))
    // Borrow a pooled client, run the batch, then hand the client back.
    val pool = CreateMySqlPool()
    val conn = pool.borrowObject()
    conn.executeBatch(sql, rows)
    pool.returnObject(conn)
  }

  /**
   * Loads the complete blacklist.
   *
   * @return every blacklisted user currently stored
   */
  def findAll(): Array[AdBlacklist] = {
    val sql = "SELECT * FROM ad_blacklist"
    val result = new ArrayBuffer[AdBlacklist]()
    val pool = CreateMySqlPool()
    val conn = pool.borrowObject()
    // The callback collects each row's user id into the buffer.
    conn.executeQuery(sql, null, new QueryCallback {
      override def process(rs: ResultSet): Unit = {
        while (rs.next()) {
          // Column 1 is the user id; read as INT and widened to Long.
          result += AdBlacklist(rs.getInt(1).toLong)
        }
      }
    })
    pool.returnObject(conn)
    result.toArray
  }
}
/**
* 用户广告点击量DAO实现类
*
*/
/**
 * DAO for per-user, per-ad daily click counts (table ad_user_click_count).
 */
object AdUserClickCountDAO {

  /**
   * Upserts a batch of click-count records: rows already present in the table
   * have their clickCount incremented, unseen rows are inserted.
   */
  def updateBatch(adUserClickCounts: Array[AdUserClickCount]): Unit = {
    val pool = CreateMySqlPool()
    val conn = pool.borrowObject()

    // Probe the table per record to split the batch into inserts and updates.
    val toInsert = ArrayBuffer[AdUserClickCount]()
    val toUpdate = ArrayBuffer[AdUserClickCount]()
    val selectSQL = "SELECT count(*) FROM ad_user_click_count WHERE date=? AND userid=? AND adid=? "
    adUserClickCounts.foreach { record =>
      val probe: Array[Any] = Array(record.date, record.userid, record.adid)
      conn.executeQuery(selectSQL, probe, new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          // A positive count means the key already exists and must be updated.
          if (rs.next() && rs.getInt(1) > 0) toUpdate += record
          else toInsert += record
        }
      })
    }

    // Batch INSERT of the unseen keys.
    val insertSQL = "INSERT INTO ad_user_click_count VALUES(?,?,?,?)"
    val insertRows: Array[Array[Any]] =
      toInsert.map(r => Array[Any](r.date, r.userid, r.adid, r.clickCount)).toArray
    conn.executeBatch(insertSQL, insertRows)

    // Batch UPDATE: clickCount is ACCUMULATED (clickCount + ?), not overwritten.
    val updateSQL = "UPDATE ad_user_click_count SET clickCount=clickCount + ? WHERE date=? AND userid=? AND adid=?"
    val updateRows: Array[Array[Any]] =
      toUpdate.map(r => Array[Any](r.clickCount, r.date, r.userid, r.adid)).toArray
    conn.executeBatch(updateSQL, updateRows)

    pool.returnObject(conn)
  }

  /**
   * Returns the stored click count for one (date, userid, adid) key.
   *
   * @param date   日期 (formatted date string)
   * @param userid 用户id
   * @param adid   广告id
   * @return the recorded count, or 0 when the key is absent
   */
  def findClickCountByMultiKey(date: String, userid: Long, adid: Long): Int = {
    val pool = CreateMySqlPool()
    val conn = pool.borrowObject()
    val sql = "SELECT clickCount FROM ad_user_click_count " +
      "WHERE date=? " +
      "AND userid=? " +
      "AND adid=?"
    var clickCount = 0
    // The callback copies the single result column into clickCount.
    conn.executeQuery(sql, Array[Any](date, userid, adid), new QueryCallback {
      override def process(rs: ResultSet): Unit = {
        if (rs.next()) {
          clickCount = rs.getInt(1)
        }
      }
    })
    pool.returnObject(conn)
    clickCount
  }
}
/**
* 广告实时统计DAO实现类
*
*
*
*/
/**
 * DAO for the ad_stat table: daily click totals per (date, province, city, ad).
 */
object AdStatDAO {

  /**
   * Upserts a batch of statistics rows: keys already present in the table are
   * overwritten with the new clickCount (the stream supplies running totals),
   * unknown keys are inserted.
   */
  def updateBatch(adStats: Array[AdStat]): Unit = {
    val pool = CreateMySqlPool()
    val conn = pool.borrowObject()

    // Probe the table per record to split the batch into inserts and updates.
    val toInsert = ArrayBuffer[AdStat]()
    val toUpdate = ArrayBuffer[AdStat]()
    val selectSQL = "SELECT count(*) " +
      "FROM ad_stat " +
      "WHERE date=? " +
      "AND province=? " +
      "AND city=? " +
      "AND adid=?"
    adStats.foreach { stat =>
      val probe = Array[Any](stat.date, stat.province, stat.city, stat.adid)
      conn.executeQuery(selectSQL, probe, new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          // A positive count means the key already exists and must be updated.
          if (rs.next() && rs.getInt(1) > 0) toUpdate += stat
          else toInsert += stat
        }
      })
    }

    // Batch INSERT of the unseen keys.
    val insertSQL = "INSERT INTO ad_stat VALUES(?,?,?,?,?)"
    val insertRows: Array[Array[Any]] =
      toInsert.map(s => Array[Any](s.date, s.province, s.city, s.adid, s.clickCount)).toArray
    conn.executeBatch(insertSQL, insertRows)

    // Batch UPDATE: clickCount is REPLACED, not accumulated.
    val updateSQL = "UPDATE ad_stat SET clickCount=? " +
      "WHERE date=? " +
      "AND province=? " +
      "AND city=? " +
      "AND adid=?"
    val updateRows: Array[Array[Any]] =
      toUpdate.map(s => Array[Any](s.clickCount, s.date, s.province, s.city, s.adid)).toArray
    conn.executeBatch(updateSQL, updateRows)

    pool.returnObject(conn)
  }
}
/**
 * DAO for the per-province top-3 hottest ads (table ad_province_top3).
 */
object AdProvinceTop3DAO {
  /**
   * Replaces the stored top-3 records for every (date, province) key present
   * in the input: first deletes all existing rows for those keys, then
   * inserts every input record.
   *
   * @param adProvinceTop3s the new top-3 records to persist
   */
  def updateBatch(adProvinceTop3s: Array[AdProvinceTop3]): Unit = {
    // Borrow a pooled MySQL client from the singleton pool
    val mySqlPool = CreateMySqlPool()
    val client = mySqlPool.borrowObject()

    // Collect the distinct (date, province) keys touched by this batch.
    // Deduplicating on tuples instead of a "date_province" string fixes
    // mis-parsing when a province name itself contains an underscore
    // (split("_") would have truncated it), and replaces the O(n^2)
    // ArrayBuffer.contains scan. `distinct` keeps first-occurrence order,
    // matching the original dedup loop.
    val dateProvinces = adProvinceTop3s.map(t => (t.date, t.province)).distinct

    // Delete all existing rows for the affected keys before re-inserting
    val deleteSQL = "DELETE FROM ad_province_top3 WHERE date=? AND province=?"
    val deleteParamsList = dateProvinces.map { case (date, province) => Array[Any](date, province) }
    client.executeBatch(deleteSQL, deleteParamsList)

    // Batch-insert every record of the incoming batch
    val insertSQL = "INSERT INTO ad_province_top3 VALUES(?,?,?,?)"
    val insertParamsList = adProvinceTop3s.map(t => Array[Any](t.date, t.province, t.adid, t.clickCount))
    client.executeBatch(insertSQL, insertParamsList)

    // Return the client to the pool for reuse
    mySqlPool.returnObject(client)
  }
}
/**
 * DAO for per-minute ad click trends (table ad_click_trend).
 */
object AdClickTrendDAO {
  /**
   * Upserts a batch of click-trend records: rows whose
   * (date, hour, minute, adid) key already exists are overwritten with the
   * new clickCount, the rest are inserted.
   *
   * @param adClickTrends the trend records to persist
   */
  def updateBatch(adClickTrends: Array[AdClickTrend]) {
    // Borrow a pooled MySQL client from the singleton pool
    val pool = CreateMySqlPool()
    val conn = pool.borrowObject()

    // Probe the table once per record to decide insert vs. update
    val toUpdate = ArrayBuffer[AdClickTrend]()
    val toInsert = ArrayBuffer[AdClickTrend]()
    val existsSQL = "SELECT count(*) FROM ad_click_trend WHERE date=? AND hour=? AND minute=? AND adid=?"
    adClickTrends.foreach { trend =>
      val key = Array[Any](trend.date, trend.hour, trend.minute, trend.adid)
      conn.executeQuery(existsSQL, key, new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          if (rs.next() && rs.getInt(1) > 0) {
            toUpdate += trend
          } else {
            toInsert += trend
          }
        }
      })
    }

    // Batch-update the rows that already exist; this UPDATE overwrites the
    // stored clickCount (no accumulation)
    val updateSQL = "UPDATE ad_click_trend SET clickCount=? WHERE date=? AND hour=? AND minute=? AND adid=?"
    val updateParams = toUpdate.map(t => Array[Any](t.clickCount, t.date, t.hour, t.minute, t.adid))
    conn.executeBatch(updateSQL, updateParams.toArray)

    // Batch-insert the rows that are new
    val insertSQL = "INSERT INTO ad_click_trend VALUES(?,?,?,?,?)"
    val insertParams = toInsert.map(t => Array[Any](t.date, t.hour, t.minute, t.adid, t.clickCount))
    conn.executeBatch(insertSQL, insertParams.toArray)

    // Return the client to the pool for reuse
    pool.returnObject(conn)
  }
}