Solving hotspot problems in Flink and Flink SQL deduplication
1. Solving the hotspot problem with Flink SQL
To implement deduplication in SQL, you would typically write:
SELECT day, COUNT(DISTINCT user_id) FROM T GROUP BY day   -- sql1
or:
SELECT day, COUNT(*) FROM (SELECT DISTINCT user_id, day FROM T) a GROUP BY day   -- sql2
Neither version addresses the computation hotspot: when a single day has a very large number of user_id values, all of that day's records land on the one task that owns the key, and that task becomes the job's performance bottleneck. Flink SQL can split the distinct aggregation into two levels automatically once the optimizer option used in the program below is enabled.
package com.yyds.flink_distinct;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.ExplainDetail;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;

/**
 * Hotspot problem during deduplication, solved with Flink SQL.
 */
public class _06_DistinctHotpotFlinkSql {
    public static void main(String[] args) {
        // Create the stream and table execution environments
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        StreamTableEnvironment tenv = StreamTableEnvironment.create(env);

        // Enable the optimization that solves the hotspot problem: split the
        // distinct aggregation into a two-level (partial/final) aggregation
        tenv.getConfig().getConfiguration().setString("table.optimizer.distinct-agg.split.enabled", "true");
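        // The split spreads user_ids over 1024 buckets by default; the bucket
        // count is itself configurable (assumption: via the option below, whose
        // default is 1024):
        // tenv.getConfig().getConfiguration().setString("table.optimizer.distinct-agg.split.bucket-num", "1024");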
        SingleOutputStreamOperator<_06_User> ss1 = env.socketTextStream("hadoop01", 9999)
                .map(new MapFunction<String, _06_User>() {
                    @Override
                    public _06_User map(String line) throws Exception {
                        String[] arr = line.split(",");
                        return new _06_User(arr[0], arr[1]);
                    }
                });
        tenv.createTemporaryView("T", ss1);

        String executeSql = "SELECT `day`, COUNT(DISTINCT user_id) as cnt FROM T GROUP BY `day`";
        /**
         * With the split enabled, the optimizer rewrites the query into:
         *
         *   SELECT day, SUM(cnt)
         *   FROM (
         *       SELECT day, COUNT(DISTINCT user_id) as cnt
         *       FROM T
         *       GROUP BY day, MOD(HASH_CODE(user_id), 1024)
         *   )
         *   GROUP BY day
         *
         * MOD(HASH_CODE(user_id), 1024) takes user_id's hash code modulo 1024,
         * i.e. it distributes the user_ids across 1024 buckets. The inner query
         * deduplicates per (day, bucket) and produces a partial count (cnt); the
         * outer query only has to SUM those partial counts, which is safe because
         * the bucketing guarantees that identical user_ids always land in the
         * same bucket.
         */
        String explainSql = tenv.explainSql(executeSql, ExplainDetail.CHANGELOG_MODE);
        System.out.println(explainSql);
        tenv.executeSql(executeSql).print();
    }
}
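The `_06_User` bean is not included in the listing above. A minimal sketch, assuming the two comma-separated input fields are user_id and day (Lombok, in the same style as the `_07_*` beans further below):

package com.yyds.flink_distinct;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;
/**
 * Input record for the SQL example (assumed sketch, not part of the original
 * listing); the field names must match the columns referenced in the query.
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
@ToString
public class _06_User {
    private String user_id;
    private String day;
}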
2. Solving the hotspot problem with the Flink DataStream API
Hotspot problem during deduplication, solved by hand in code.
Compute ad-slot visitor counts in real time; each traffic record carries id (the ad slot ID), devId (the visiting device ID) and time (the visit timestamp).
Implementation idea:
• First key by id, devId bucket number and hour-level time; one ProcessFunction computes the deduplicated count per bucket (same MapState approach as before).
• Then key by id and hour-level time; a second ProcessFunction sums the per-bucket counts.
One subtlety: for the same id and time, the records arriving at the second function come from different upstream tasks, and every upstream task keeps re-emitting its full running count downstream. Accumulating these values directly would double-count, so the first ProcessFunction has to implement something like SQL's retract mechanism: before emitting an updated count it first re-emits the previous count flagged as a retraction, as the trace below illustrates.
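For example, for one (id, hour) key (hypothetical values), the message flow into the second ProcessFunction looks like this:

(true,  (1, 3600000, 1))   -> add 1, running total = 1
(false, (1, 3600000, 1))   -> retraction: subtract 1, running total = 0
(true,  (1, 3600000, 2))   -> add 2, running total = 2

Without the retraction message the downstream sum would be 1 + 2 = 3 even though the bucket only contains 2 distinct devIds.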
Main program:
package com.yyds.flink_distinct;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

/**
 * Hotspot problem during deduplication, solved by hand in the DataStream API.
 *
 * Compute ad-slot visitor counts in real time; each traffic record carries
 * id (ad slot ID), devId (visiting device ID) and time (visit timestamp).
 *
 * Implementation idea:
 * - First key by (id, devId bucket number, hour); one ProcessFunction computes
 *   the deduplicated count per bucket (same MapState approach as before).
 * - Then key by (id, hour); a second ProcessFunction sums the per-bucket counts.
 *
 * Caveat: for the same id and time, the second function receives records from
 * different upstream tasks, and each upstream task keeps re-emitting its full
 * running count. Accumulating directly would double-count, so the first
 * ProcessFunction implements a retract mechanism like the one in SQL: before
 * emitting an updated count it re-emits the previous count flagged as retracted.
 */
public class _07_DistinctHotpot {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Parallelism 1 is only for local testing; the bucketing pays off once
        // the job actually runs with parallelism > 1
        env.setParallelism(1);
        /**
         * Sample input:
         * 1,001,1000
         * 1,002,1000
         * 1,003,1000
         * 1,004,1000
         */
        // Read the raw data and convert it to a JavaBean
        SingleOutputStreamOperator<_07_AdData> ss1 = env.socketTextStream("hadoop01", 9999)
                .map(new MapFunction<String, _07_AdData>() {
                    @Override
                    public _07_AdData map(String line) throws Exception {
                        String[] arr = line.split(",");
                        return new _07_AdData(Integer.parseInt(arr[0]), arr[1], Long.parseLong(arr[2]));
                    }
                });

        // Step 1: key by (id, devId bucket number, hour); one ProcessFunction
        // computes the deduplicated count per bucket (MapState approach)
        KeyedStream<_07_AdData, _07_AdKey1> keyedStream1 = ss1.keyBy(new KeySelector<_07_AdData, _07_AdKey1>() {
            @Override
            public _07_AdKey1 getKey(_07_AdData data) throws Exception {
                // End timestamp of the hour window this record falls into
                long endTime = TimeWindow.getWindowStartWithOffset(data.getTime(), 0, Time.hours(1).toMilliseconds())
                        + Time.hours(1).toMilliseconds();
                // floorMod keeps the bucket code in [0, 3) even when hashCode() is negative
                return new _07_AdKey1(data.getId(), endTime, Math.floorMod(data.getDevId().hashCode(), 3));
            }
        });
        SingleOutputStreamOperator<Tuple2<Boolean, Tuple3<Integer, Long, Long>>> processStream1 =
                keyedStream1.process(new _07_DistinctProcessFunction01());

        // Step 2: key by (id, hour) and sum the per-bucket counts
        KeyedStream<Tuple2<Boolean, Tuple3<Integer, Long, Long>>, _07_AdKey2> keyedStream2 =
                processStream1.keyBy(new KeySelector<Tuple2<Boolean, Tuple3<Integer, Long, Long>>, _07_AdKey2>() {
                    @Override
                    public _07_AdKey2 getKey(Tuple2<Boolean, Tuple3<Integer, Long, Long>> tp2) throws Exception {
                        return new _07_AdKey2(tp2.f1.f0, tp2.f1.f1);
                    }
                });
        keyedStream2.process(new _07_DistinctProcessFunction02());

        env.execute("_07_DistinctHotpot");
    }
}
Custom functions:
package com.yyds.flink_distinct;

import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

public class _07_DistinctProcessFunction01 extends KeyedProcessFunction<_07_AdKey1, _07_AdData, Tuple2<Boolean, Tuple3<Integer, Long, Long>>> {
    // MapState marking the devIds already seen for this (id, bucket, hour) key
    MapState<String, Integer> deviceIdState;
    // ValueState holding the running deduplicated count for this key
    ValueState<Long> countState;

    @Override
    public void open(Configuration parameters) throws Exception {
        MapStateDescriptor<String, Integer> deviceIdStateDescriptor =
                new MapStateDescriptor<>("deviceIdState", String.class, Integer.class);
        deviceIdState = getRuntimeContext().getMapState(deviceIdStateDescriptor);
        ValueStateDescriptor<Long> countStateDescriptor = new ValueStateDescriptor<>("countState", Long.class);
        countState = getRuntimeContext().getState(countStateDescriptor);
    }

    @Override
    public void processElement(_07_AdData adData, Context context,
                               Collector<Tuple2<Boolean, Tuple3<Integer, Long, Long>>> collector) throws Exception {
        String devId = adData.getDevId();
        Integer i = deviceIdState.get(devId);
        if (i == null) {
            i = 0;
        }
        int id = context.getCurrentKey().getId();
        long time = context.getCurrentKey().getTime();
        long code = context.getCurrentKey().getBucketCode();
        Long c = countState.value();
        if (c == null) {
            c = 0L;
        }
        if (i == 1) {
            // devId already seen in this bucket: nothing to emit
        } else {
            // First time this devId shows up: remember it in state
            deviceIdState.put(devId, 1);
            // and increase the deduplicated count by 1
            Long count = c + 1;
            countState.update(count);
            System.out.println("id = " + id + ",time = " + time + ",count = " + count + ",code = " + code);
            if (count > 1) {
                // A previous count was already emitted downstream, so it must be retracted first
                System.out.println("======== retract ======");
                collector.collect(Tuple2.of(false, Tuple3.of(id, time, c)));
                collector.collect(Tuple2.of(true, Tuple3.of(id, time, count)));
            } else {
                collector.collect(Tuple2.of(true, Tuple3.of(id, time, count)));
            }
        }
    }
}
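Note that neither state above is ever cleared, so the per-hour dedup state grows without bound. A minimal sketch of one way to cap it, assuming a state TTL is acceptable for the use case (the 25-hour retention below is an arbitrary choice), is to configure the descriptors in open() before acquiring the states:

import org.apache.flink.api.common.state.StateTtlConfig;
import org.apache.flink.api.common.time.Time;

// Expire state entries 25 hours after the last write (assumption: counts
// for an hour are no longer updated after that)
StateTtlConfig ttlConfig = StateTtlConfig.newBuilder(Time.hours(25))
        .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
        .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
        .build();
deviceIdStateDescriptor.enableTimeToLive(ttlConfig);
countStateDescriptor.enableTimeToLive(ttlConfig);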
package com.yyds.flink_distinct;

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

/**
 * The key point: when a record flagged false (a retraction) arrives, its count
 * must be subtracted from the current total.
 */
public class _07_DistinctProcessFunction02 extends KeyedProcessFunction<_07_AdKey2, Tuple2<Boolean, Tuple3<Integer, Long, Long>>, Void> {
    // ValueState holding the summed count per (id, hour)
    ValueState<Long> countState;

    @Override
    public void open(Configuration parameters) throws Exception {
        ValueStateDescriptor<Long> countStateDescriptor = new ValueStateDescriptor<>("countState", Long.class);
        countState = getRuntimeContext().getState(countStateDescriptor);
    }

    @Override
    public void processElement(Tuple2<Boolean, Tuple3<Integer, Long, Long>> tp2, Context context,
                               Collector<Void> collector) throws Exception {
        Long count = countState.value();
        if (count == null) {
            count = 0L;
        }
        Boolean bool = tp2.f0;
        System.out.println(bool);
        if (bool) {
            // Accumulate message: add the bucket's count to the total
            countState.update(count + tp2.f1.f2);
            System.out.println(context.getCurrentKey() + ":" + countState.value());
        } else {
            // Retract message: subtract the retracted count from the total
            countState.update(count - tp2.f1.f2);
            System.out.println(context.getCurrentKey() + ":" + countState.value());
        }
    }
}
JavaBeans:
package com.yyds.flink_distinct;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * Raw input record
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
@ToString
public class _07_AdData {
    private int id;
    private String devId;
    private Long time;
}

package com.yyds.flink_distinct;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * Key type for the first keyBy
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
@ToString
public class _07_AdKey1 {
    private int id;
    private Long time;
    private int bucketCode; // bucket number the devId was hashed into
}

package com.yyds.flink_distinct;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * Key type for the second keyBy
 */
@AllArgsConstructor
@NoArgsConstructor
@Data
@ToString
public class _07_AdKey2 {
    private int id;
    private Long time;
}
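As a sanity check, feed the sample input above (four records for ad slot 1 with distinct devIds, all with time = 1000) into the socket. Every record falls into the hour window ending at timestamp 3600000, so the second ProcessFunction keys them all to (id=1, time=3600000) and its running total converges to 4. Assuming a hypothetical spread of the four devIds over the three buckets, e.g. buckets 1, 2, 0, 1 in arrival order, the flow is:

1,001,1000 -> bucket 1, count 1 -> (true,  (1, 3600000, 1))   total = 1
1,002,1000 -> bucket 2, count 1 -> (true,  (1, 3600000, 1))   total = 2
1,003,1000 -> bucket 0, count 1 -> (true,  (1, 3600000, 1))   total = 3
1,004,1000 -> bucket 1, count 2 -> (false, (1, 3600000, 1))   total = 2
                                   (true,  (1, 3600000, 2))   total = 4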