Flink 代码解析日志
public class HotPages {
/**
 * Builds and runs the "hot pages" job: parse the Apache access log, count page
 * views per URL in sliding event-time windows, then print the top 3 URLs of
 * every window.
 *
 * @param args optional; {@code args[0]} overrides the default log file path
 * @throws Exception if the job cannot be built or executed
 */
public static void main(String[] args) throws Exception {
    // Approach: window + aggregate -> collect per-window results -> sort and print.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Parallelism
    env.setParallelism(1);
    // Time semantics
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    // Read the raw log lines. The original hard-coded path stays as the default so
    // running without arguments behaves exactly as before.
    // Alternative: resolve the file from the classpath, e.g.
    //   URL resource = HotPages.class.getResource("/apache.log");
    //   env.readTextFile(resource.getPath());
    String inputPath = args.length > 0
            ? args[0]
            : "D:\\Tool\\Idea2020\\UserBehaviorAnalysis\\NetWorkFlowAnalysis\\src\\main\\resources\\apache.log";
    DataStreamSource<String> inputStream = env.readTextFile(inputPath);

    // Convert every space-separated log line into an ApacheLogEvent.
    DataStream<ApacheLogEvent> dataStream = inputStream.map(new MapFunction<String, ApacheLogEvent>() {
        // Created once per function instance instead of once per record.
        private final SimpleDateFormat timestampFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

        @Override
        public ApacheLogEvent map(String s) throws Exception {
            String[] splits = s.split(" ");
            // Field 3 is the request time; fields 5 and 6 are the method and URL.
            Long timestamp = timestampFormat.parse(splits[3]).getTime();
            return new ApacheLogEvent(splits[0], splits[1], timestamp, splits[5], splits[6]);
        }
    })
            // Event time: assign timestamps and watermarks, tolerating events that
            // arrive up to one minute out of order.
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.minutes(1)) {
                @Override
                public long extractTimestamp(ApacheLogEvent apacheLogEvent) {
                    return apacheLogEvent.getTimestamp();
                }
            });

    // Filter, key by URL, window and aggregate.
    SingleOutputStreamOperator<PageViewCount> windowAggStream = dataStream
            .filter(new FilterFunction<ApacheLogEvent>() {
                // Keep GET requests only.
                @Override
                public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                    return "GET".equals(apacheLogEvent.getMethod());
                }
            })
            .filter(new FilterFunction<ApacheLogEvent>() {
                // Drop static resources (URLs ending in .css/.js/.png/.ico).
                // The pattern is compiled once per function instance instead of once per record.
                private final Pattern pagePattern = Pattern.compile("^((?!\\.(css|js|png|ico)$).)*$");

                @Override
                public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                    return pagePattern.matcher(apacheLogEvent.getUrl()).matches();
                }
            })
            // Group by URL
            .keyBy(ApacheLogEvent::getUrl)
            // 10-minute windows sliding every 4 seconds
            .timeWindow(Time.minutes(10), Time.seconds(4))
            .aggregate(new PageCountAgg(), new PageCountResult());

    // Gather all counts that belong to the same window, sort them and emit the top 3.
    SingleOutputStreamOperator<String> resultStream = windowAggStream.keyBy(PageViewCount::getWindowEnd)
            .process(new TopNHotPages(3));
    resultStream.print();
    env.execute("hot pages job");
}
/**
 * Incremental pre-aggregation: counts the events added to one window.
 * The accumulator is the running count, which is also the emitted result.
 */
public static class PageCountAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {
    @Override
    public Long createAccumulator() {
        // Counting starts at zero.
        return 0L;
    }

    @Override
    public Long add(ApacheLogEvent event, Long count) {
        // Each event contributes one page view; its content is not inspected.
        return count + 1L;
    }

    @Override
    public Long getResult(Long count) {
        return count;
    }

    @Override
    public Long merge(Long left, Long right) {
        // Merging two partial counts is plain addition.
        return Long.sum(left, right);
    }
}
/**
 * Window function that wraps the count of one (URL, window) pair into a
 * PageViewCount carrying the key, the window end timestamp and the count.
 */
public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow> {
    @Override
    public void apply(String url, TimeWindow window, Iterable<Long> counts, Collector<PageViewCount> out) throws Exception {
        long windowEnd = window.getEnd();
        // Only the first element is read; main() pairs this function with
        // PageCountAgg via aggregate(...), which supplies the aggregated count.
        Long total = counts.iterator().next();
        out.collect(new PageViewCount(url, windowEnd, total));
    }
}
/**
 * Keyed by window end: buffers every PageViewCount of one window and, when the
 * event-time timer registered just after the window end fires, emits a formatted
 * report of the {@code topSize} URLs with the highest counts.
 */
public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount, String> {
    // Number of entries to include in each report.
    private final Integer topSize;

    public TopNHotPages(Integer topSize) {
        this.topSize = topSize;
    }

    // Keyed state holding all PageViewCount records received for the current key.
    ListState<PageViewCount> pageViewCountListState;

    @Override
    public void open(Configuration parameters) throws Exception {
        pageViewCountListState = getRuntimeContext()
                .getListState(new ListStateDescriptor<PageViewCount>("pagecountlist", PageViewCount.class));
    }

    @Override
    public void processElement(PageViewCount pageViewCount, Context context, Collector<String> collector) throws Exception {
        pageViewCountListState.add(pageViewCount);
        // Fire 1 ms after the window end so the report is produced once the
        // watermark has moved past this window.
        context.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + 1);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        ArrayList<PageViewCount> pageViewCounts = Lists.newArrayList(pageViewCountListState.get().iterator());
        // The buffered records are no longer needed once copied out. Without this
        // the state of every past window end would be retained forever.
        pageViewCountListState.clear();

        // Highest count first; equal counts compare as 0 to honour the Comparator contract.
        pageViewCounts.sort((left, right) -> Long.compare(right.getCount(), left.getCount()));

        // Build the report text.
        StringBuilder resultBuilder = new StringBuilder();
        resultBuilder.append("======================================");
        // The timer was registered at windowEnd + 1, so subtract 1 to print the window end.
        resultBuilder.append("窗口结束时间:").append(new Timestamp(timestamp - 1)).append("\n");
        // Emit at most topSize entries.
        int limit = Math.min(topSize, pageViewCounts.size());
        for (int i = 0; i < limit; i++) {
            PageViewCount currentItemViewCount = pageViewCounts.get(i);
            resultBuilder
                    .append("No")
                    .append(i + 1)
                    .append(":")
                    .append(" URL = ")
                    .append(currentItemViewCount.getUrl())
                    .append(" 浏览量 = ")
                    .append(currentItemViewCount.getCount())
                    .append("\n");
        }
        resultBuilder.append("======================================\n\n");
        // Slows the output down so it can be followed on the console.
        // NOTE(review): this blocks the operator thread; remove outside of demos.
        Thread.sleep(1000L);
        out.collect(resultBuilder.toString());
    }
}
}
result:
======================================窗口结束时间:2015-05-17 10:08:52.0
No1: URL = /blog/tags/puppet?flav=rss20 浏览量 = 3
No2: URL = /blog/tags/firefox?flav=rss20 浏览量 = 2
No3: URL = / 浏览量 = 2
======================================
======================================窗口结束时间:2015-05-17 10:08:56.0
No1: URL = /blog/tags/puppet?flav=rss20 浏览量 = 3
No2: URL = /blog/tags/firefox?flav=rss20 浏览量 = 2
No3: URL = / 浏览量 = 2
======================================
======================================窗口结束时间:2015-05-17 10:09:00.0
No1: URL = /blog/tags/puppet?flav=rss20 浏览量 = 3
No2: URL = /blog/tags/firefox?flav=rss20 浏览量 = 2
No3: URL = / 浏览量 = 2
======================================
Pom
<!-- Version management: every Flink/Scala/Kafka version is defined once here -->
<properties>
    <flink.version>1.10.1</flink.version>
    <scala.binary.version>2.12</scala.binary.version>
    <kafka.version>2.2.0</kafka.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- Kafka dependency -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_${scala.binary.version}</artifactId>
        <version>${kafka.version}</version>
    </dependency>
    <!-- Official Flink connector for Kafka -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- Table API (versions taken from the properties above, like the other Flink artifacts) -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
日志数据文件 apache.log 放于 src/main/resources 目录下
链接:https://pan.baidu.com/s/1RNFXsBdjNM4_lYL8L7il0A
提取码:czz1
优化后代码
public class HotPages {
/**
 * Builds and runs the "hot pages" job with late-data handling: parse the Apache
 * access log, count page views per URL in sliding event-time windows that accept
 * late events for one more minute, route later events to a side output, and
 * print the top 3 URLs of every window.
 *
 * @param args optional; {@code args[0]} overrides the default log file path
 * @throws Exception if the job cannot be built or executed
 */
public static void main(String[] args) throws Exception {
    // Approach: window + aggregate -> collect per-window results -> sort and print.
    // Note: the System.out.println "stage" messages below are printed while the
    // job graph is being built, i.e. before env.execute() processes any data.
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Parallelism
    env.setParallelism(1);
    // Time semantics
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    // Read the raw log lines. The original hard-coded path stays as the default so
    // running without arguments behaves exactly as before.
    // Alternative: resolve the file from the classpath, e.g.
    //   URL resource = HotPages.class.getResource("/apache.log");
    //   env.readTextFile(resource.getPath());
    String inputPath = args.length > 0
            ? args[0]
            : "D:\\Tool\\Idea2020\\UserBehaviorAnalysis\\NetWorkFlowAnalysis\\src\\main\\resources\\apache.log";
    DataStreamSource<String> inputStream = env.readTextFile(inputPath);
    System.out.println("一号位输出完毕");

    // Convert every space-separated log line into an ApacheLogEvent.
    DataStream<ApacheLogEvent> dataStream = inputStream.map(new MapFunction<String, ApacheLogEvent>() {
        // Created once per function instance instead of once per record.
        private final SimpleDateFormat timestampFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss");

        @Override
        public ApacheLogEvent map(String s) throws Exception {
            String[] splits = s.split(" ");
            // Field 3 is the request time; fields 5 and 6 are the method and URL.
            Long timestamp = timestampFormat.parse(splits[3]).getTime();
            return new ApacheLogEvent(splits[0], splits[1], timestamp, splits[5], splits[6]);
        }
    })
            // Event time: assign timestamps and watermarks, tolerating events that
            // arrive up to one second out of order.
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ApacheLogEvent>(Time.seconds(1)) {
                @Override
                public long extractTimestamp(ApacheLogEvent apacheLogEvent) {
                    return apacheLogEvent.getTimestamp();
                }
            });
    System.out.println("二号位输出完毕");

    // Side-output tag for events that arrive after the allowed lateness; its
    // element type must match the element type of the windowed stream.
    OutputTag<ApacheLogEvent> lateTag = new OutputTag<ApacheLogEvent>("late") {};

    // Filter, key by URL, window and aggregate.
    SingleOutputStreamOperator<PageViewCount> windowAggStream = dataStream
            .filter(new FilterFunction<ApacheLogEvent>() {
                // Keep GET requests only.
                @Override
                public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                    return "GET".equals(apacheLogEvent.getMethod());
                }
            })
            .filter(new FilterFunction<ApacheLogEvent>() {
                // Drop static resources (URLs ending in .css/.js/.png/.ico).
                // The pattern is compiled once per function instance instead of once per record.
                private final Pattern pagePattern = Pattern.compile("^((?!\\.(css|js|png|ico)$).)*$");

                @Override
                public boolean filter(ApacheLogEvent apacheLogEvent) throws Exception {
                    return pagePattern.matcher(apacheLogEvent.getUrl()).matches();
                }
            })
            // Group by URL
            .keyBy(ApacheLogEvent::getUrl)
            // 10-minute windows sliding every 4 seconds
            .timeWindow(Time.minutes(10), Time.seconds(4))
            // Keep windows open for one more minute of event time for late events...
            .allowedLateness(Time.minutes(1))
            // ...and send anything later than that to the side output.
            .sideOutputLateData(lateTag)
            .aggregate(new PageCountAgg(), new PageCountResult());
    windowAggStream.print("agg");
    windowAggStream.getSideOutput(lateTag).print();
    System.out.println("三号位输出完毕");

    // Gather all counts that belong to the same window, sort them and emit the top 3.
    SingleOutputStreamOperator<String> resultStream = windowAggStream.keyBy(PageViewCount::getWindowEnd)
            .process(new TopNHotPages(3));
    System.out.println("四号位输出完毕");
    resultStream.print();
    env.execute("hot pages job");
}
/**
 * Pre-aggregation step of the window: keeps a running count of the events
 * added to it and returns that count as the result.
 */
public static class PageCountAgg implements AggregateFunction<ApacheLogEvent, Long, Long> {
    @Override
    public Long createAccumulator() {
        // A fresh accumulator has seen no events yet.
        return 0L;
    }

    @Override
    public Long add(ApacheLogEvent event, Long runningCount) {
        // One more event -> one more page view; the event itself is not inspected.
        return runningCount + 1L;
    }

    @Override
    public Long getResult(Long runningCount) {
        return runningCount;
    }

    @Override
    public Long merge(Long first, Long second) {
        // Two partial counts combine by addition.
        return Long.sum(first, second);
    }
}
/**
 * Window function that turns the count of one (URL, window) pair into a
 * PageViewCount made of the key, the window end timestamp and the count.
 */
public static class PageCountResult implements WindowFunction<Long, PageViewCount, String, TimeWindow> {
    @Override
    public void apply(String url, TimeWindow window, Iterable<Long> counts, Collector<PageViewCount> out) throws Exception {
        long windowEnd = window.getEnd();
        // Only the first element is read; main() pairs this function with
        // PageCountAgg via aggregate(...), which supplies the aggregated count.
        Long total = counts.iterator().next();
        PageViewCount result = new PageViewCount(url, windowEnd, total);
        out.collect(result);
    }
}
/**
 * Keyed by window end: keeps the latest count per URL for one window in MapState
 * and, whenever the report timer (window end + 1 ms) fires, emits a formatted
 * report of the {@code topSize} URLs with the highest counts. A second timer,
 * one minute after the window end, clears the state for that window.
 */
public static class TopNHotPages extends KeyedProcessFunction<Long, PageViewCount, String> {
    // Delay after the window end at which the state is dropped; equals the
    // allowedLateness(Time.minutes(1)) configured on the window in main().
    private static final long CLEANUP_DELAY_MS = 60 * 1000L;

    // Number of entries to include in each report.
    private final Integer topSize;

    public TopNHotPages(Integer topSize) {
        this.topSize = topSize;
    }

    // Keyed state: URL -> most recent count for the current window end.
    // A map is used so that an updated result for the same URL replaces the
    // earlier one instead of being stored next to it.
    MapState<String, Long> pageViewCountMapState;

    @Override
    public void open(Configuration parameters) throws Exception {
        pageViewCountMapState = getRuntimeContext()
                .getMapState(new MapStateDescriptor<String, Long>("page-count-map", String.class, Long.class));
    }

    @Override
    public void processElement(PageViewCount pageViewCount, Context context, Collector<String> collector) throws Exception {
        pageViewCountMapState.put(pageViewCount.getUrl(), pageViewCount.getCount());
        // Report timer: fires 1 ms after the window end.
        context.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + 1);
        // Cleanup timer: fires one minute after the window end to clear the state.
        context.timerService().registerEventTimeTimer(pageViewCount.getWindowEnd() + CLEANUP_DELAY_MS);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        // Cleanup timer: compare against the current key (the window end).
        // ctx.timestamp() is the firing timer's own timestamp, so comparing
        // against it could never match and the state was never cleared.
        if (timestamp == ctx.getCurrentKey() + CLEANUP_DELAY_MS) {
            pageViewCountMapState.clear();
            return;
        }

        ArrayList<Map.Entry<String, Long>> pageViewCounts =
                Lists.newArrayList(pageViewCountMapState.entries().iterator());
        // Highest count first; equal counts compare as 0 to honour the Comparator contract.
        pageViewCounts.sort((left, right) -> Long.compare(right.getValue(), left.getValue()));

        // Build the report text.
        StringBuilder resultBuilder = new StringBuilder();
        resultBuilder.append("======================================");
        // The report timer was registered at windowEnd + 1, so subtract 1 to print the window end.
        resultBuilder.append("窗口结束时间:").append(new Timestamp(timestamp - 1)).append("\n");
        // Emit at most topSize entries.
        int limit = Math.min(topSize, pageViewCounts.size());
        for (int i = 0; i < limit; i++) {
            Map.Entry<String, Long> currentItemViewCount = pageViewCounts.get(i);
            resultBuilder
                    .append("No")
                    .append(i + 1)
                    .append(":")
                    .append(" URL = ")
                    .append(currentItemViewCount.getKey())
                    .append(" 浏览量 = ")
                    .append(currentItemViewCount.getValue())
                    .append("\n");
        }
        resultBuilder.append("======================================\n\n");
        // Slows the output down so it can be followed on the console.
        // NOTE(review): this blocks the operator thread; remove outside of demos.
        Thread.sleep(1000L);
        out.collect(resultBuilder.toString());
    }
}
}