1 Module Creation and Data Preparation
2 Hot-Page View-Count Statistics Based on Server Logs
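The job below splits each log line on spaces and uses fields 0 (IP), 1 (user id), 3 (timestamp), 5 (HTTP method) and 6 (URL). As a quick sanity check of that layout, here is a minimal standalone sketch; the sample line is hypothetical, and real entries in apache.log may differ in detail:

import java.text.SimpleDateFormat

object ParseCheck extends App {
  // Hypothetical sample line; assumed layout: ip userId - timestamp zone method url
  val line = "83.149.9.216 - - 17/05/2015:10:05:03 +0000 GET /presentations/kibana-search.png"
  val fields = line.split(" ")
  val ts = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss").parse(fields(3)).getTime
  println(s"ip=${fields(0)} eventTime=$ts method=${fields(5)} url=${fields(6)}")
}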
package com.atguigu.networkflow_analysis

import java.sql.Timestamp
import java.text.SimpleDateFormat

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

// One parsed line of the Apache access log
case class ApacheLogEvent(ip: String, userId: String, eventTime: Long, method: String, url: String)

// View count of one URL in one window
case class PageViewCount(url: String, windowEnd: Long, count: Long)

object NetworkTopNPage {
  def main(args: Array[String]): Unit = {
    // Create the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream: DataStream[String] = env.readTextFile("C:\\Users\\DELL\\IdeaProjects\\UserBehaviorAnalysis\\NetworkFlowAnalysis\\src\\main\\resources\\apache.log")

    val dataStream: DataStream[ApacheLogEvent] = inputStream
      .map(data => {
        val dataArray = data.split(" ")
        // Parse the log timestamp (e.g. 17/05/2015:10:05:03) into epoch milliseconds
        val simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
        val timestamp = simpleDateFormat.parse(dataArray(3)).getTime
        ApacheLogEvent(dataArray(0), dataArray(1), timestamp, dataArray(5), dataArray(6))
      })
      // Event-time watermarks, tolerating up to 60 seconds of out-of-order data
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[ApacheLogEvent](Time.seconds(60)) {
        override def extractTimestamp(element: ApacheLogEvent): Long = element.eventTime
      })

    // Count views per URL in a 10-minute window sliding every 5 seconds
    val aggStream = dataStream
      .keyBy(_.url)
      .timeWindow(Time.minutes(10), Time.seconds(5))
      .aggregate(new PageCountAgg(), new PageCountWindowResult())

    // Regroup the per-URL counts by window end and rank them
    val resultStream = aggStream
      .keyBy(_.windowEnd)
      .process(new TopNHotPage(3))

    resultStream.print()
    env.execute("top n page job")
  }
}

// Incremental counter: adds 1 per event, merges partial counts
class PageCountAgg() extends AggregateFunction[ApacheLogEvent, Long, Long] {
  override def add(in: ApacheLogEvent, accumulator: Long): Long = accumulator + 1
  override def createAccumulator(): Long = 0L
  override def getResult(accumulator: Long): Long = accumulator
  override def merge(a: Long, b: Long): Long = a + b
}

// Attaches the window end and the key (URL) to each count
class PageCountWindowResult() extends WindowFunction[Long, PageViewCount, String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[PageViewCount]): Unit = {
    out.collect(PageViewCount(key, window.getEnd, input.head))
  }
}

class TopNHotPage(n: Int) extends KeyedProcessFunction[Long, PageViewCount, String] {
  // Buffers all PageViewCount records of the current window until the timer fires
  lazy val pageCountListState: ListState[PageViewCount] =
    getRuntimeContext.getListState(new ListStateDescriptor[PageViewCount]("pagecount-list", classOf[PageViewCount]))

  override def processElement(value: PageViewCount, context: KeyedProcessFunction[Long, PageViewCount, String]#Context, collector: Collector[String]): Unit = {
    pageCountListState.add(value)
    // Fires once the watermark passes windowEnd, i.e. all counts for this window are in
    context.timerService().registerEventTimeTimer(value.windowEnd + 1)
  }

  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, PageViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // Collect the view counts of all URLs received for this window
    val allPageCountList: ListBuffer[PageViewCount] = ListBuffer()
    val iter = pageCountListState.get().iterator()
    while (iter.hasNext) {
      allPageCountList += iter.next()
    }

    // Clear the state early to free space
    pageCountListState.clear()

    // Sort by view count, descending, and keep the top n
    val sortedPageCountList = allPageCountList.sortWith(_.count > _.count).take(n)

    // Format the ranking as a string for monitoring output
    val result: StringBuilder = new StringBuilder
    result.append("Time: ").append(new Timestamp(timestamp - 1)).append("\n")

    // Walk the sorted list and emit the top-n entries
    for (i <- sortedPageCountList.indices) {
      val currentItemCount = sortedPageCountList(i)
      result.append("Top").append(i + 1).append(":")
        .append(" url=").append(currentItemCount.url)
        .append(" views=").append(currentItemCount.count)
        .append("\n")
    }
    result.append("====================================\n\n")

    // Throttle the output rate
    Thread.sleep(1000)
    out.collect(result.toString())
  }
}
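BoundedOutOfOrdernessTimestampExtractor is deprecated in newer Flink releases. On Flink 1.11+ the same 60-second bounded-out-of-orderness assignment can be written with WatermarkStrategy; a minimal sketch under that version assumption, where parsedStream is a placeholder name for the mapped ApacheLogEvent stream above:

import java.time.Duration
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}

// Equivalent watermark assignment on Flink 1.11+ (parsedStream is hypothetical)
val withWatermarks = parsedStream.assignTimestampsAndWatermarks(
  WatermarkStrategy
    .forBoundedOutOfOrderness[ApacheLogEvent](Duration.ofSeconds(60))
    .withTimestampAssigner(new SerializableTimestampAssigner[ApacheLogEvent] {
      override def extractTimestamp(element: ApacheLogEvent, recordTimestamp: Long): Long =
        element.eventTime
    })
)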
Version with late-data handling
package com.atguigu.networkflow_analysis

import java.sql.Timestamp
import java.text.SimpleDateFormat

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

// NOTE: this listing re-declares ApacheLogEvent, PageViewCount, PageCountAgg,
// PageCountWindowResult and TopNHotPage from the previous version; in one
// project keep only a single copy of each (or use separate packages).
case class ApacheLogEvent(ip: String, userId: String, eventTime: Long, method: String, url: String)
case class PageViewCount(url: String, windowEnd: Long, count: Long)

object NetworkTopNPageLateness {
  def main(args: Array[String]): Unit = {
    // Create the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // val inputStream: DataStream[String] = env.readTextFile("C:\\Users\\DELL\\IdeaProjects\\UserBehaviorAnalysis\\NetworkFlowAnalysis\\src\\main\\resources\\apache.log")
    val inputStream = env.socketTextStream("localhost", 777)

    val dataStream: DataStream[ApacheLogEvent] = inputStream
      .map(data => {
        val dataArray = data.split(" ")
        val simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
        val timestamp = simpleDateFormat.parse(dataArray(3)).getTime
        ApacheLogEvent(dataArray(0), dataArray(1), timestamp, dataArray(5), dataArray(6))
      })
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[ApacheLogEvent](Time.seconds(60)) {
        override def extractTimestamp(element: ApacheLogEvent): Long = element.eventTime
      })

    // Side output for records that arrive even later than the allowed lateness
    val lateOutputTag = new OutputTag[ApacheLogEvent]("late data")

    val aggStream = dataStream
      .keyBy(_.url)
      .timeWindow(Time.minutes(10), Time.seconds(5))
      .allowedLateness(Time.minutes(1))    // keep window state one extra minute and re-fire on late data
      .sideOutputLateData(lateOutputTag)   // anything later still ends up in the side output
      .aggregate(new PageCountAgg(), new PageCountWindowResult())

    val lateDataStream = aggStream.getSideOutput(lateOutputTag)

    val resultStream = aggStream
      .keyBy(_.windowEnd)
      .process(new TopNHotPage(3))

    dataStream.print("data")
    aggStream.print("agg")
    lateDataStream.print("late")
    resultStream.print("result")

    env.execute("top n page job")
  }
}

class PageCountAgg() extends AggregateFunction[ApacheLogEvent, Long, Long] {
  override def add(in: ApacheLogEvent, accumulator: Long): Long = accumulator + 1
  override def createAccumulator(): Long = 0L
  override def getResult(accumulator: Long): Long = accumulator
  override def merge(a: Long, b: Long): Long = a + b
}

class PageCountWindowResult() extends WindowFunction[Long, PageViewCount, String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[PageViewCount]): Unit = {
    out.collect(PageViewCount(key, window.getEnd, input.head))
  }
}

class TopNHotPage(n: Int) extends KeyedProcessFunction[Long, PageViewCount, String] {
  // MapState instead of ListState: a late firing updates the existing count of
  // a URL in place rather than appending a duplicate entry
  lazy val pageCountMapState: MapState[String, Long] =
    getRuntimeContext.getMapState(new MapStateDescriptor[String, Long]("pagecount-map", classOf[String], classOf[Long]))

  override def processElement(value: PageViewCount, context: KeyedProcessFunction[Long, PageViewCount, String]#Context, collector: Collector[String]): Unit = {
    pageCountMapState.put(value.url, value.count)
    // Ranking timer: fires when the watermark passes windowEnd
    context.timerService().registerEventTimeTimer(value.windowEnd + 1)
    // Cleanup timer: fires after the one-minute allowed lateness has expired
    context.timerService().registerEventTimeTimer(value.windowEnd + 60 * 1000L)
  }

  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, PageViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // Cleanup timer: the window can no longer receive updates, drop its state
    if (timestamp == ctx.getCurrentKey + 60 * 1000L) {
      pageCountMapState.clear()
      return
    }

    val allPageCountList: ListBuffer[(String, Long)] = ListBuffer()
    val iter = pageCountMapState.entries().iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      allPageCountList += ((entry.getKey, entry.getValue))
    }

    // Sort by view count, descending, and keep the top n
    val sortedPageCountList = allPageCountList.sortWith(_._2 > _._2).take(n)

    // Format the ranking as a string for monitoring output
    val result: StringBuilder = new StringBuilder
    result.append("Time: ").append(new Timestamp(timestamp - 1)).append("\n")

    // Walk the sorted list and emit the top-n entries
    for (i <- sortedPageCountList.indices) {
      val currentItemCount = sortedPageCountList(i)
      result.append("Top").append(i + 1).append(":")
        .append(" url=").append(currentItemCount._1)
        .append(" views=").append(currentItemCount._2)
        .append("\n")
    }
    result.append("====================================\n\n")

    // Throttle the output rate
    Thread.sleep(1000)
    out.collect(result.toString())
  }
}
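To exercise the lateness handling by hand, start a local socket server before launching the job, e.g. nc -lk 777 (assuming netcat is installed; the port must match the one passed to socketTextStream), then paste log lines into it. Lines with increasing timestamps advance the watermark; a subsequently pasted line with an older timestamp first shows up as an updated result on the agg stream (thanks to allowedLateness) and, once it is too late even for the one-minute allowed lateness, on the late side-output stream instead.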
3 Network Traffic Statistics Based on Tracking-Log Data
package com.atguigu.networkflow_analysis

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

// Case classes for the user-behavior records and the per-window PV count
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)
case class PvCount(windowEnd: Long, count: Long)

object PageView {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream: DataStream[String] = env.readTextFile("C:\\Users\\DELL\\IdeaProjects\\UserBehaviorAnalysis\\NetworkFlowAnalysis\\src\\main\\resources\\UserBehavior.csv")

    // Convert each line into the case class and use its timestamp to define the watermark
    val dataStream: DataStream[UserBehavior] = inputStream
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).toLong, dataArray(1).toLong, dataArray(2).toInt, dataArray(3), dataArray(4).toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L) // timestamps are in seconds and already ordered

    val pvStream: DataStream[PvCount] = dataStream
      .filter(_.behavior == "pv")
      .map(data => ("pv", 1L))     // map every event onto the single key "pv"
      .keyBy(_._1)
      .timeWindow(Time.hours(1))   // one-hour tumbling windows
      .aggregate(new PvCountAgg(), new PvCountResult())

    pvStream.print()
    env.execute("pv job")
  }
}

// Incremental counter for ("pv", 1L) records
class PvCountAgg() extends AggregateFunction[(String, Long), Long, Long] {
  override def add(in: (String, Long), acc: Long): Long = acc + 1
  override def createAccumulator(): Long = 0L
  override def getResult(acc: Long): Long = acc
  override def merge(acc: Long, acc1: Long): Long = acc + acc1
}

// Attaches the window end to the final count
class PvCountResult() extends WindowFunction[Long, PvCount, String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[PvCount]): Unit = {
    out.collect(PvCount(window.getEnd, input.head))
  }
}
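UserBehavior.csv holds comma-separated records of userId, itemId, categoryId, behavior and timestamp, with the timestamp in seconds (hence the * 1000L when assigning event time). A minimal parse check with a hypothetical sample record, assuming the UserBehavior case class above is in scope:

object CsvParseCheck extends App {
  // Hypothetical sample record: userId,itemId,categoryId,behavior,timestamp(seconds)
  val line = "543462,1715,1464116,pv,1511658000"
  val f = line.split(",")
  println(UserBehavior(f(0).toLong, f(1).toLong, f(2).toInt, f(3), f(4).toLong))
}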
Performance-optimized version
package com.atguigu.networkflow_analysis

import org.apache.flink.api.common.functions.{AggregateFunction, MapFunction}
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.util.Random

// NOTE: UserBehavior, PvCount, PvCountAgg and PvCountResult are re-declared from
// the previous listing; in one project keep only a single copy of each.
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)
case class PvCount(windowEnd: Long, count: Long)

object PageViewOp {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(4)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream: DataStream[String] = env.readTextFile("C:\\Users\\DELL\\IdeaProjects\\UserBehaviorAnalysis\\NetworkFlowAnalysis\\src\\main\\resources\\UserBehavior.csv")

    // Convert each line into the case class and use its timestamp to define the watermark
    val dataStream: DataStream[UserBehavior] = inputStream
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).toLong, dataArray(1).toLong, dataArray(2).toInt, dataArray(3), dataArray(4).toLong)
      })
      .assignAscendingTimestamps(_.timestamp * 1000L)

    val pvStream: DataStream[PvCount] = dataStream
      .filter(_.behavior == "pv")
      .map(new MyMapper()) // randomize the key so data is spread across parallel tasks and skew is avoided
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .aggregate(new PvCountAgg(), new PvCountResult())

    // Second stage: sum the partial counts of each window back into one total
    val pvTotalStream: DataStream[PvCount] = pvStream
      .keyBy(_.windowEnd)
      .process(new TotalPvCountResult())

    pvTotalStream.print()
    env.execute("pv job")
  }
}

class PvCountAgg() extends AggregateFunction[(String, Long), Long, Long] {
  override def add(in: (String, Long), acc: Long): Long = acc + 1
  override def createAccumulator(): Long = 0L
  override def getResult(acc: Long): Long = acc
  override def merge(acc: Long, acc1: Long): Long = acc + acc1
}

class PvCountResult() extends WindowFunction[Long, PvCount, String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[PvCount]): Unit = {
    out.collect(PvCount(window.getEnd, input.head))
  }
}

// Replaces the constant "pv" key with a random string, spreading records over many keys
class MyMapper() extends MapFunction[UserBehavior, (String, Long)] {
  override def map(value: UserBehavior): (String, Long) = (Random.nextString(10), 1L)
}

class TotalPvCountResult() extends KeyedProcessFunction[Long, PvCount, PvCount] {
  // Running total of the partial counts for the current window end
  lazy val totalCountState: ValueState[Long] =
    getRuntimeContext.getState(new ValueStateDescriptor[Long]("total-count", classOf[Long]))

  override def processElement(value: PvCount, context: KeyedProcessFunction[Long, PvCount, PvCount]#Context, collector: Collector[PvCount]): Unit = {
    totalCountState.update(totalCountState.value() + value.count)
    // Emit the total once the watermark passes the window end
    context.timerService().registerEventTimeTimer(value.windowEnd + 1)
  }

  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, PvCount, PvCount]#OnTimerContext, out: Collector[PvCount]): Unit = {
    out.collect(PvCount(ctx.getCurrentKey, totalCountState.value()))
    totalCountState.clear()
  }
}
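Random.nextString(10) draws keys from an effectively unbounded space, so every window accumulates state for many one-off keys. A common variant (an assumption here, not part of the source) bounds the key space to the job parallelism, which spreads the load just as well while keeping per-window state small; a sketch assuming the same imports as the listing above:

// Hypothetical variant of MyMapper: spread records over exactly 4 keys,
// matching env.setParallelism(4), instead of unbounded random strings.
class BoundedKeyMapper() extends MapFunction[UserBehavior, (String, Long)] {
  private lazy val random = new Random()
  override def map(value: UserBehavior): (String, Long) =
    (random.nextInt(4).toString, 1L)
}

Swapping new MyMapper() for new BoundedKeyMapper() in the pipeline is the only change needed; the downstream keyBy(_.windowEnd) aggregation is unaffected.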