某些情况下,我们开窗统计某些数量时,需要根据主键进行去重操作,这时我们可以利用Set集合来去重。
但是如果窗口中set里面的数据过多,则会占用大量的内存。
于是在这种场景下,我们可以选择将数据保存到redis,使用一个布隆过滤器,高效又能降低内存使用。
布隆过滤器的基本原理就是将主键进行hash计算,将计算的值在redis里保存的位图的相关位置置1,如果再来数据可以检测相关位置是否是1,如果是1说明已经有该数据。
我们实战一个电商案例:统计每小时内的独立访客数(UV,即去重后的访问用户数)。
1、目录结构:
2、pojo包装类代码
package Bean;
/**
 * POJO describing a single user-behavior record parsed from UserBehavior.csv.
 * Flink requires the public no-arg constructor plus getters/setters so the
 * class is treated as a POJO type.
 */
public class UserBehavior {

    private Long userId;        // id of the user performing the action
    private Long itemId;        // id of the item acted upon
    private Integer categoryId; // id of the item's category
    private String behavior;    // action type, e.g. "pv" (page view)
    private Long timestamp;     // event time in seconds since the epoch

    /** No-arg constructor required by Flink's POJO serializer. */
    public UserBehavior() {
    }

    /** Convenience constructor initializing every field. */
    public UserBehavior(Long userId, Long itemId, Integer categoryId, String behavior, Long timestamp) {
        this.userId = userId;
        this.itemId = itemId;
        this.categoryId = categoryId;
        this.behavior = behavior;
        this.timestamp = timestamp;
    }

    public Long getUserId() {
        return userId;
    }

    public void setUserId(Long userId) {
        this.userId = userId;
    }

    public Long getItemId() {
        return itemId;
    }

    public void setItemId(Long itemId) {
        this.itemId = itemId;
    }

    public Integer getCategoryId() {
        return categoryId;
    }

    public void setCategoryId(Integer categoryId) {
        this.categoryId = categoryId;
    }

    public String getBehavior() {
        return behavior;
    }

    public void setBehavior(String behavior) {
        this.behavior = behavior;
    }

    public Long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        // Built with StringBuilder; output format is identical to the
        // classic string-concatenation form.
        return new StringBuilder("UserBehavior{")
                .append("userId=").append(userId)
                .append(", itemId=").append(itemId)
                .append(", categoryId=").append(categoryId)
                .append(", behavior='").append(behavior).append('\'')
                .append(", timestamp=").append(timestamp)
                .append('}')
                .toString();
    }
}
3、flink程序 利用set进行数据去重:
package Project;
import Bean.PageViewCount;
import Bean.UserBehavior;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.net.URL;
import java.util.HashSet;
/**
 * Counts unique visitors (UV) per 1-hour event-time window by buffering all
 * records of a window and deduplicating userIds in a HashSet.
 * Works for small windows; for large cardinalities see the bloom-filter
 * variant (UvWithBloomFilter) which avoids holding the window in memory.
 */
public class UniqueVisitor {

    public static void main(String[] args) throws Exception {
        // Set up the streaming environment with event-time semantics.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Single parallelism keeps the demo output ordered and deterministic.
        env.setParallelism(1);

        URL resource = UniqueVisitor.class.getResource("/UserBehavior.csv");
        DataStream<String> inputStream = env.readTextFile(resource.getPath());

        // Parse CSV lines into POJOs and assign ascending event-time timestamps.
        DataStream<UserBehavior> dataStream = inputStream
                .map(new MapFunction<String, UserBehavior>() {
                    @Override
                    public UserBehavior map(String s) throws Exception {
                        String[] fields = s.split(",");
                        // parseLong/parseInt avoid the deprecated boxing
                        // constructors (new Long(...) / new Integer(...)).
                        return new UserBehavior(
                                Long.parseLong(fields[0]),
                                Long.parseLong(fields[1]),
                                Integer.parseInt(fields[2]),
                                fields[3],
                                Long.parseLong(fields[4]));
                    }
                })
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<UserBehavior>() {
                    @Override
                    public long extractAscendingTimestamp(UserBehavior element) {
                        // Source timestamps are in seconds; Flink expects milliseconds.
                        return element.getTimestamp() * 1000L;
                    }
                });

        // Keep only page views, open a 1-hour tumbling window over the whole
        // stream, and deduplicate userIds with a Set.
        DataStream<PageViewCount> uvStream = dataStream
                .filter(data -> "pv".equals(data.getBehavior()))
                .timeWindowAll(Time.hours(1))
                .apply(new UvCountResult());

        uvStream.print("uv result");
        env.execute("uv count job");
    }

    /**
     * All-window function that gathers every userId of the window into a
     * HashSet (automatic deduplication) and emits the set size as the UV
     * count, keyed by the window's end timestamp.
     */
    public static class UvCountResult
            implements AllWindowFunction<UserBehavior, PageViewCount, TimeWindow> {
        @Override
        public void apply(TimeWindow timeWindow, Iterable<UserBehavior> iterable,
                          Collector<PageViewCount> collector) throws Exception {
            HashSet<Long> uidSet = new HashSet<>();
            for (UserBehavior ub : iterable) {
                uidSet.add(ub.getUserId());
            }
            collector.collect(new PageViewCount("uv", timeWindow.getEnd(), (long) uidSet.size()));
        }
    }
}
4、flink程序利用自定义的布隆过滤器进行去重
package Project;
import Bean.PageViewCount;
import Bean.UserBehavior;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.kafka.common.protocol.types.Field;
import redis.clients.jedis.Jedis;
import java.net.URL;
public class UvWithBloomFilter {
public static void main(String[] args) throws Exception{
//创建环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//设置时间语义
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
//设置并行度
env.setParallelism(1);
URL resource = UniqueVisitor.class.getResource("/UserBehavior.csv");
DataStream<String> inputStream = env.readTextFile(resource.getPath());
//转换为POJO,分配时间戳和watermark
DataStream<UserBehavior> dataStream = inputStream
.map(new MapFunction<String, UserBehavior>() {
public UserBehavior map(String s) throws Exception {
String[] fields = s.split(",");
return new UserBehavior(new Long(fields[0]), new Long(fields[1]), new Integer(fields[2]), fields[3], new Long(fields[4]));
}
}).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<UserBehavior>() {
@Override
public long extractAscendingTimestamp(UserBehavior element) {
return element.getTimestamp()*1000L;
}
});
//开窗统计uv值 set去重
DataStream<PageViewCount> uvStream = dataStream.filter(data -> "pv".equals(data.getBehavior()))
.timeWindowAll(Time.hours(1))
//使用布隆过滤器 就不能把数据都放在窗口 否则没有意义,应当没条数据都进行一次判断
//自定义一个窗口触发器
.trigger(new MyTrigger())
.process(new UvCountResultWithBloomFilter());
uvStream.print("uv result");
env.execute("uv count with bloom filter job");
}
public static class MyTrigger extends Trigger<UserBehavior, TimeWindow>{
@Override
public TriggerResult onElement(UserBehavior userBehavior, long l, TimeWindow timeWindow, TriggerContext triggerContext) throws Exception {
//每条数据来到 直接触发窗口计算,并且直接清空窗口 第一个参数是是否进行计算 后边参数是是否丢掉窗口
// CONTINUE(false, false),
// FIRE_AND_PURGE(true, true),
// FIRE(true, false),
// PURGE(false, true);
return TriggerResult.FIRE_AND_PURGE;
}
@Override
//处理时间
public TriggerResult onProcessingTime(long l, TimeWindow timeWindow, TriggerContext triggerContext) throws Exception {
return TriggerResult.CONTINUE;
}
@Override
//事件时间
public TriggerResult onEventTime(long l, TimeWindow timeWindow, TriggerContext triggerContext) throws Exception {
return TriggerResult.CONTINUE;
}
@Override
public void clear(TimeWindow timeWindow, TriggerContext triggerContext) throws Exception {
}
}
//自定义一个布隆过滤器
public static class MyBloomFilter{
//定义位图的大小 一般需要定义为2的整次幂
private Integer cap;
public MyBloomFilter(Integer cap) {
this.cap = cap;
}
//实现hash函数
public Long hachcode(String value, Integer seed){
Long result = 0L;
for(int i=0;i<value.length();i++){
result = result * seed+value.charAt(i);
}
//位运算 截取部分
return result & (cap-1);
}
}
//实现自定义的处理函数
public static class UvCountResultWithBloomFilter extends ProcessAllWindowFunction<UserBehavior, PageViewCount, TimeWindow>{
//定义redis连接
Jedis jedis;
//定义布隆过滤器
MyBloomFilter myBloomFilter;
@Override
public void open(Configuration parameters) throws Exception {
jedis = new Jedis("192.168.6.23", 6379);
jedis.select(14);
myBloomFilter = new MyBloomFilter(1<<29);//2的29次幂 用64MB大小的位图
}
@Override
public void process(Context context, Iterable<UserBehavior> iterable, Collector<PageViewCount> collector) throws Exception {
//将位图和窗口的count值存入redis(为了防止每条数据处理完 窗口状态都被清空)用windowEnd当key
Long windowEnd = context.window().getEnd();
String bitmapkey = windowEnd.toString();
//把count值存成一张hash表
String countHashName = "uv_count";
String countKey = windowEnd.toString();
//取当前的userId
Long userId = iterable.iterator().next().getUserId();
//计算位图中的偏移量
Long offset = myBloomFilter.hachcode(userId.toString(), 61);
//判断是否存在 redis getbit
Boolean isExist = jedis.getbit(bitmapkey, offset);
if (!isExist){
//如果不存在 对应位图位置置1
jedis.setbit(bitmapkey, offset, true);
//更新redis中保存的count
//初始count值
Long uvCount = 0L;
String uvCountString = jedis.hget(countHashName, countKey);
if(uvCountString!=null && !uvCountString.equals(""))
uvCount = Long.valueOf(uvCountString);
jedis.hset(countHashName,countKey,String.valueOf(uvCount+1));
//输出
collector.collect(new PageViewCount("uv", windowEnd, uvCount+1));
}
}
@Override
public void close() throws Exception {
jedis.close();
}
}
}