1. Capturing a window's late data

  The overall flow is to tag the late records, call sideOutputLateData(lateDataTag) on the corresponding windowed stream, and then read the late data back from that side output so it can be included in further computation. The full code is below.

WindowLateDataDemo


package cn._51doit.flink.day10;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.OutputTag;

public class WindowLateDataDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("feng05", 8888);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Input format: timestamp,word; extract the timestamp as event time
        SingleOutputStreamOperator<String> assignTimestampsAndWatermarks = lines.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<String>(Time.seconds(2)) {
            @Override
            public long extractTimestamp(String element) {
                String stampTime = element.split(",")[0];
                return Long.parseLong(stampTime);
            }
        });
        // Split each line into (word, 1)
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = assignTimestampsAndWatermarks.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String line) throws Exception {
                String word = line.split(",")[1];
                return Tuple2.of(word, 1);
            }
        });
        // Group by key
        KeyedStream<Tuple2<String, Integer>, Tuple> keyed = wordAndOne.keyBy(0);
        OutputTag<Tuple2<String, Integer>> lateDataTag = new OutputTag<Tuple2<String, Integer>>("late-data"){};
        // Assign windows and compute
        WindowedStream<Tuple2<String, Integer>, Tuple, TimeWindow> window = keyed.window(TumblingEventTimeWindows.of(Time.seconds(5)));
        SingleOutputStreamOperator<Tuple2<String, Integer>> summed = window.sideOutputLateData(lateDataTag).sum(1);
        // Get the side output stream holding the late data
        DataStream<Tuple2<String, Integer>> lateDataStream = summed.getSideOutput(lateDataTag);
        summed.print("on-time data: ");
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = summed.union(lateDataStream).keyBy(0).sum(1);
        result.print();
        env.execute();
    }
}

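To try it out, start a socket server on the source host (for example nc -lk 8888 on feng05) and type lines of the form timestamp,word. With the 2-second out-of-orderness bound and 5-second tumbling windows above, an illustrative session (sample input, not from the original run) behaves like this:

1000,spark      -> goes into window [0, 5000)
4998,flink      -> also goes into window [0, 5000)
7000,spark      -> watermark becomes 5000, window [0, 5000) fires: (spark,1) and (flink,1)
2000,hadoop     -> behind the watermark and its window has already fired: routed to the late-data side output

The late (hadoop,1) record is then unioned back in and re-aggregated by the final keyBy(0).sum(1).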

 2. Dual-stream join

 A note up front:

  Join, CoGroup and CoFlatMap can all turn two data streams into a single stream. Join and CoGroup pair records according to a specified condition; the difference is that Join only emits the pairs that matched, while CoGroup produces output whether or not a match was found. CoFlatMap performs no matching at all; it simply receives the input of the two streams separately.
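For comparison, here is a minimal CoFlatMap sketch (it assumes two existing streams, stream1 of String and stream2 of Integer; imports: org.apache.flink.streaming.api.datastream.ConnectedStreams, org.apache.flink.streaming.api.functions.co.CoFlatMapFunction, org.apache.flink.util.Collector). There is no matching involved, the function just consumes both inputs independently:

// Connect two streams of different types and process them with one CoFlatMapFunction.
ConnectedStreams<String, Integer> connected = stream1.connect(stream2);
DataStream<String> merged = connected.flatMap(new CoFlatMapFunction<String, Integer, String>() {
    @Override
    public void flatMap1(String value, Collector<String> out) {
        out.collect("left: " + value);   // called for every element of stream1
    }
    @Override
    public void flatMap2(Integer value, Collector<String> out) {
        out.collect("right: " + value);  // called for every element of stream2
    }
});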

 

  Put simply, two data streams are joined much like a join in MySQL, and it likewise covers inner join, left join, right join, full join and so on. Concrete examples follow.

  • inner join

  In this example, the two data streams are obtained first and each is assigned watermarks, then the join is performed to get a JoinedStreams instance. Calling its where and equalTo methods (note that you have to supply your own key selectors) matches the records whose keys are equal; after that a window is assigned and the data inside it is processed.

Note: only records whose keys match and that fall into the same window can be joined.

Key point:

  A window operator has many parallel partitions, and a window only fires once the time (watermark) in every partition satisfies the trigger condition; to keep things simple, the tests below set the parallelism to 1.

 StreamDataSourceA


package cn._51doit.flink.day10;

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

public class StreamDataSourceA extends RichParallelSourceFunction<Tuple3<String, String, Long>> {
    private volatile boolean running = true;

    @Override
    public void run(SourceContext<Tuple3<String, String, Long>> ctx) throws InterruptedException {
        // Pre-prepared test data
        Tuple3[] elements = new Tuple3[]{
                Tuple3.of("a", "1", 1000000050000L), //[50000 - 60000)
                Tuple3.of("a", "2", 1000000054000L), //[50000 - 60000)
                Tuple3.of("a", "3", 1000000079900L), //[70000 - 80000)
                Tuple3.of("a", "4", 1000000115000L), //[110000 - 120000)  // 115000 - 5001 = 109999 >= 109999
                Tuple3.of("b", "5", 1000000100000L), //[100000 - 110000)
                Tuple3.of("b", "6", 1000000108000L)  //[100000 - 110000)
        };
        int count = 0;
        while (running && count < elements.length) {
            // Emit the record
            ctx.collect(new Tuple3<>((String) elements[count].f0, (String) elements[count].f1, (Long) elements[count].f2));
            count++;
            Thread.sleep(1000);
        }
        //Thread.sleep(100000000);
    }
    @Override
    public void cancel() {
        running = false;
    }
}


 StreamDataSourceB


package cn._51doit.flink.day10;

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

public class StreamDataSourceB extends RichParallelSourceFunction<Tuple3<String, String, Long>> {
    private volatile boolean running = true;
    @Override
    public void run(SourceContext<Tuple3<String, String, Long>> ctx) throws Exception {
        // Pre-prepared test data: (key, city, event time)
        Tuple3[] elements = new Tuple3[]{
                Tuple3.of("a", "hangzhou", 1000000059000L), //[50000, 60000)
                Tuple3.of("b", "beijing",  1000000105000L), //[100000, 110000)
        };
        int count = 0;
        while (running && count < elements.length) {
            // Emit the record
            ctx.collect(new Tuple3<>((String) elements[count].f0, (String) elements[count].f1, (long) elements[count].f2));
            count++;
            Thread.sleep(1000);
        }
        //Thread.sleep(100000000);
    }
    @Override
    public void cancel() {
        running = false;
    }
}


 FlinkTumblingWindowsInnerJoinDemo


package cn._51doit.flink.day10;

import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.JoinedStreams;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class FlinkTumblingWindowsInnerJoinDemo {
    public static void main(String[] args) throws Exception {
        int windowSize = 10;
        long delay = 5001L;
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the time characteristic
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Set the parallelism to 1 on the environment, so every DataStream below runs with parallelism 1
        env.setParallelism(1);
        // Set up the sources
        // First stream (the left stream)
        DataStream<Tuple3<String, String, Long>> leftSource = env.addSource(new StreamDataSourceA()).name("Demo SourceA");
        // Second stream (the right stream)
        DataStream<Tuple3<String, String, Long>> rightSource = env.addSource(new StreamDataSourceB()).name("Demo SourceB");
        // Assign watermarks to leftSource
        // ("a", "1", 1000)
        SingleOutputStreamOperator<Tuple3<String, String, Long>> leftStream = leftSource.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, String, Long>>(Time.milliseconds(delay)) {
            @Override
            public long extractTimestamp(Tuple3<String, String, Long> element) {
                return element.f2;
            }
        });
        // Assign watermarks to rightSource
        // ("a", "hangzhou", 6000)
        SingleOutputStreamOperator<Tuple3<String, String, Long>> rightStream = rightSource.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, String, Long>>(Time.milliseconds(delay)) {
            @Override
            public long extractTimestamp(Tuple3<String, String, Long> element) {
                return element.f2;
            }
        });

        // The join
        JoinedStreams<Tuple3<String, String, Long>, Tuple3<String, String, Long>> joinedStream = leftStream.join(rightStream);
        joinedStream
                .where(new LeftSelectKey())
                .equalTo(new RightSelectKey())
                .window(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
                .apply(new JoinFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>>() {
                    // Invoked for records whose keys are equal and that are in the same window
                    @Override
                    public Tuple5<String, String, String, Long, Long> join(Tuple3<String, String, Long> first, Tuple3<String, String, Long> second) throws Exception {
                        // a, 1, "hangzhou", 1000001000, 1000006000
                        return new Tuple5<>(first.f0, first.f1, second.f1, first.f2, second.f2);
                    }
                }).print();
        env.execute();
    }

    public static class LeftSelectKey implements KeySelector<Tuple3<String, String, Long>,String>{
        @Override
        public String getKey(Tuple3<String, String, Long> w) throws Exception {
            return w.f0;
        }
    }
    public static class RightSelectKey implements KeySelector<Tuple3<String, String, Long>, String> {
        @Override
        public String getKey(Tuple3<String, String, Long> w) {
            return w.f0;
        }
    }

}


Run result

Only the two a records in the [50000, 60000) window come out joined with (a, hangzhou); nothing is emitted for b.

 

 Why wasn't b joined?

Both streams do contain b records for the [100000, 110000) window: (b, 5, 100000) and (b, 6, 108000) on the left, (b, beijing, 105000) on the right. But the right source finishes early (its end of input pushes its watermark to the maximum), so the operator's watermark follows the left stream. When (a, 4, 115000) arrives, the watermark becomes 115000 - 5001 = 109999, which fires the [100000, 110000) window while the left-side b records are still on their way. When they do arrive they are already behind the watermark and are dropped as late data, so nothing is left to join with (b, beijing, 105000).

 

  •  left join

FlinkTumblingWindowsLeftJoinDemo


package cn._51doit.flink.day10;

import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

public class FlinkTumblingWindowsLeftJoinDemo {
    public static void main(String[] args) throws Exception {
        int windowSize = 10;
        long delay = 5000L;
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.setParallelism(1);

        // Set up the sources
        DataStream<Tuple3<String, String, Long>> leftSource = env.addSource(new StreamDataSourceA()).name("Demo SourceA");
        DataStream<Tuple3<String, String, Long>> rightSource = env.addSource(new StreamDataSourceB()).name("Demo SourceB");
        // Assign watermarks
        DataStream<Tuple3<String, String, Long>> leftStream = leftSource.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, String, Long>>(Time.milliseconds(delay)) {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> element) {
                        return element.f2;
                    }
                }
        );

        DataStream<Tuple3<String, String, Long>> rightStream = rightSource.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, String, Long>>(Time.milliseconds(delay)) {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> element) {
                        return element.f2;
                    }
                }
        );
        int parallelism1 = leftStream.getParallelism();
        int parallelism2 = rightStream.getParallelism();
        // The left join (implemented with coGroup)
        leftStream.coGroup(rightStream)
                .where(new LeftSelectKey())
                .equalTo(new RightSelectKey())
                .window(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
                .apply(new LeftJoin())
                .print();

        env.execute("TimeWindowDemo");
    }

    public static class LeftJoin implements CoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>> {
        // coGroup hands over the left-stream and right-stream records that share the same key and fall into the same window
        @Override
        public void coGroup(
                Iterable<Tuple3<String, String, Long>> leftElements, Iterable<Tuple3<String, String, Long>> rightElements, Collector<Tuple5<String, String, String, Long, Long>> out) {
            // leftElements holds the left-stream records
            for (Tuple3<String, String, Long> leftElem : leftElements) {
                boolean hadElements = false;
                // If the left side matched the right side, rightElements is not empty
                for (Tuple3<String, String, Long> rightElem : rightElements) {
                    // Emit the matched pair
                    out.collect(new Tuple5<>(leftElem.f0, leftElem.f1, rightElem.f1, leftElem.f2, rightElem.f2));
                    hadElements = true;
                }
                if (!hadElements) {
                    // No match: fill the right side with placeholder values
                    out.collect(new Tuple5<>(leftElem.f0, leftElem.f1, "null", leftElem.f2, -1L));
                }
            }
        }
    }

    public static class LeftSelectKey implements KeySelector<Tuple3<String, String, Long>, String> {
        @Override
        public String getKey(Tuple3<String, String, Long> w) {
            return w.f0;
        }
    }

    public static class RightSelectKey implements KeySelector<Tuple3<String, String, Long>, String> {
        @Override
        public String getKey(Tuple3<String, String, Long> w) {
            return w.f0;
        }
    }

}


The approach is the same as with join; the only difference is that the coGroup operator is used here.
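A full outer join can be written with the same coGroup pattern: also emit the right-side records that found no left match. A hypothetical FullJoin function (reusing the Tuple3/Tuple5 types and imports of the demo above) could look like this; plugging it into apply(...) in place of LeftJoin gives the full join.

public static class FullJoin implements CoGroupFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Tuple5<String, String, String, Long, Long>> {
    @Override
    public void coGroup(Iterable<Tuple3<String, String, Long>> leftElements,
                        Iterable<Tuple3<String, String, Long>> rightElements,
                        Collector<Tuple5<String, String, String, Long, Long>> out) {
        boolean leftEmpty = true;
        for (Tuple3<String, String, Long> left : leftElements) {
            leftEmpty = false;
            boolean matched = false;
            for (Tuple3<String, String, Long> right : rightElements) {
                out.collect(new Tuple5<>(left.f0, left.f1, right.f1, left.f2, right.f2));
                matched = true;
            }
            if (!matched) {
                // left record without a right match
                out.collect(new Tuple5<>(left.f0, left.f1, "null", left.f2, -1L));
            }
        }
        if (leftEmpty) {
            // right records without any left match
            for (Tuple3<String, String, Long> right : rightElements) {
                out.collect(new Tuple5<>(right.f0, "null", right.f1, -1L, right.f2));
            }
        }
    }
}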

 

3. Order join case

Virtually every system has order tables
    Order tables: an order main (header) table and an order detail table

    Order main table:
        order id, order status, order total amount, order time, user ID

    Order detail table:
        order main table ID, product ID, product category ID, product unit price, product quantity

Compute the transaction amount of a given category
    the order status is "completed" (from the main table)
    the product category ID (from the detail table)
    the product amount, i.e. unit price * quantity (from the detail table)
    (a sketch of this aggregation follows the OrderJoin code in 3.1)

Placing an order on JD or Taobao, for example, produces
    o1000, paid, 600, 2020-06-29 19:42:00, feng
    p1, 100, 1, food
    p1, 200, 1, food
    p1, 300, 1, food

3.1 Implementing the join of the order data

A utility class that reads data from Kafka topics


package cn._51doit.flink.day10;

import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

public class FlinkUtilsV2 {

    private static StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();


    public static <T> DataStream<T> createKafkaDataStream(ParameterTool parameters, Class<? extends DeserializationSchema<T>> clazz) throws Exception {
        String topics = parameters.getRequired("kafka.topics");
        String groupId = parameters.getRequired("group.id");
        return createKafkaDataStream(parameters, topics, groupId, clazz);
    }



    public static <T> DataStream<T> createKafkaDataStream(ParameterTool parameters, String topics, Class<? extends DeserializationSchema<T>> clazz) throws Exception {

        String groupId = parameters.getRequired("group.id");
        return createKafkaDataStream(parameters, topics, groupId, clazz);
    }


    public static <T> DataStream<T> createKafkaDataStream(ParameterTool parameters, String topics, String groupId, Class<? extends DeserializationSchema<T>> clazz) throws Exception {

        // Register the ParameterTool parameters as global job parameters
        env.getConfig().setGlobalJobParameters(parameters);

        // Enable checkpointing
        env.enableCheckpointing(parameters.getLong("checkpoint.interval", 10000L), CheckpointingMode.EXACTLY_ONCE);

        // Restart strategy
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(parameters.getInt("restart.times", 10), Time.seconds(5)));

        // Configure the state backend
        String path = parameters.get("state.backend.path");
        if(path != null) {
            // Ideally the state backend is configured globally in flink-conf.yaml
            env.setStateBackend(new FsStateBackend(path));
        }

        // Retain checkpoints when the job is cancelled
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        env.getCheckpointConfig().setMaxConcurrentCheckpoints(3);

        //String topics = parameters.getRequired("kafka.topics");

        List<String> topicList = Arrays.asList(topics.split(","));

        Properties properties = parameters.getProperties();

        properties.setProperty("group.id", groupId);

        // Create the FlinkKafkaConsumer
        FlinkKafkaConsumer<T> kafkaConsumer = new FlinkKafkaConsumer<T>(
                topicList,
                clazz.newInstance(),
                properties
        );

        return env.addSource(kafkaConsumer);
    }

    public static StreamExecutionEnvironment getEnv() {
        return env;
    }

}

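FlinkUtilsV2 expects its settings in a properties file whose path is passed as args[0] (see OrderJoin below). A sample file covering the keys read above might look like this; the broker addresses and paths are placeholders:

# Kafka consumer settings (everything here is also passed through to FlinkKafkaConsumer)
bootstrap.servers=node1:9092,node2:9092,node3:9092
auto.offset.reset=earliest
kafka.topics=ordermain,orderdetail
group.id=g1
# checkpointing, restart strategy and state backend used in createKafkaDataStream
checkpoint.interval=30000
restart.times=10
state.backend.path=hdfs://node1:9000/flink/checkpoints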

OrderDetail


package cn._51doit.flink.day10;

import java.util.Date;

public class OrderDetail {

    private Long id;
    private Long order_id;
    private int category_id;
    private String categoryName;
    private Long sku;
    private Double money;
    private int amount;
    private Date create_time;
    private Date update_time;

    // The database operation type: INSERT or UPDATE
    private String type;

    public Long getId() {
        return id;
    }

    public void setId(Long id) {
        this.id = id;
    }

    public Long getOrder_id() {
        return order_id;
    }

    public void setOrder_id(Long order_id) {
        this.order_id = order_id;
    }

    public int getCategory_id() {
        return category_id;
    }

    public void setCategory_id(int category_id) {
        this.category_id = category_id;
    }

    public Long getSku() {
        return sku;
    }

    public void setSku(Long sku) {
        this.sku = sku;
    }

    public Double getMoney() {
        return money;
    }

    public void setMoney(Double money) {
        this.money = money;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public Date getCreate_time() {
        return create_time;
    }

    public void setCreate_time(Date create_time) {
        this.create_time = create_time;
    }

    public Date getUpdate_time() {
        return update_time;
    }

    public void setUpdate_time(Date update_time) {
        this.update_time = update_time;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getCategoryName() {
        return categoryName;
    }

    public void setCategoryName(String categoryName) {
        this.categoryName = categoryName;
    }

    @Override
    public String toString() {
        return "OrderDetail{" +
                "id=" + id +
                ", order_id=" + order_id +
                ", category_id=" + category_id +
                ", categoryName='" + categoryName + '\'' +
                ", sku=" + sku +
                ", money=" + money +
                ", amount=" + amount +
                ", create_time=" + create_time +
                ", update_time=" + update_time +
                ", type='" + type + '\'' +
                '}';
    }
}


OrderMain


package cn._51doit.flink.day10;

import java.util.Date;

public class OrderMain {

    private Long oid;
    private Date create_time;
    private Double total_money;
    private int status;
    private Date update_time;
    private String province;
    private String city;
    // The database operation type: INSERT or UPDATE
    private String type;

    public Long getOid() {
        return oid;
    }

    public void setOid(Long oid) {
        this.oid = oid;
    }

    public Date getCreate_time() {
        return create_time;
    }

    public void setCreate_time(Date create_time) {
        this.create_time = create_time;
    }

    public Double getTotal_money() {
        return total_money;
    }

    public void setTotal_money(Double total_money) {
        this.total_money = total_money;
    }

    public int getStatus() {
        return status;
    }

    public void setStatus(int status) {
        this.status = status;
    }

    public Date getUpdate_time() {
        return update_time;
    }

    public void setUpdate_time(Date update_time) {
        this.update_time = update_time;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    @Override
    public String toString() {
        return "OrderMain{" +
                "oid=" + oid +
                ", create_time=" + create_time +
                ", total_money=" + total_money +
                ", status=" + status +
                ", update_time=" + update_time +
                ", province='" + province + '\'' +
                ", city='" + city + '\'' +
                ", type='" + type + '\'' +
                '}';
    }
}


Business code

OrderJoin


package cn._51doit.flink.day10;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

public class OrderJoin {


    public static void main(String[] args) throws Exception {

        ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);

        // Use EventTime as the time characteristic
        FlinkUtilsV2.getEnv().setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStream<String> orderMainLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "ordermain", "g1", SimpleStringSchema.class);

        DataStream<String> orderDetailLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "orderdetail", "g1", SimpleStringSchema.class);

        // Parse the order main records
        SingleOutputStreamOperator<OrderMain> orderMainDataStream = orderMainLinesDataStream.process(new ProcessFunction<String, OrderMain>() {

            @Override
            public void processElement(String line, Context ctx, Collector<OrderMain> out) throws Exception {
                //flatMap+filter
                try {
                    JSONObject jsonObject = JSON.parseObject(line);
                    String type = jsonObject.getString("type");
                    if (type.equals("INSERT") || type.equals("UPDATE")) {
                        JSONArray jsonArray = jsonObject.getJSONArray("data");
                        for (int i = 0; i < jsonArray.size(); i++) {
                            OrderMain orderMain = jsonArray.getObject(i, OrderMain.class);
                            orderMain.setType(type); // set the operation type
                            out.collect(orderMain);
                        }
                    }
                } catch (Exception e) {
                    //e.printStackTrace();
                    // record the malformed data
                }
            }
        });

        // Parse the order detail records
        SingleOutputStreamOperator<OrderDetail> orderDetailDataStream = orderDetailLinesDataStream.process(new ProcessFunction<String, OrderDetail>() {

            @Override
            public void processElement(String line, Context ctx, Collector<OrderDetail> out) throws Exception {
                //flatMap+filter
                try {
                    JSONObject jsonObject = JSON.parseObject(line);
                    String type = jsonObject.getString("type");
                    if (type.equals("INSERT") || type.equals("UPDATE")) {
                        JSONArray jsonArray = jsonObject.getJSONArray("data");
                        for (int i = 0; i < jsonArray.size(); i++) {
                            OrderDetail orderDetail = jsonArray.getObject(i, OrderDetail.class);
                            orderDetail.setType(type); // set the operation type
                            out.collect(orderDetail);
                        }
                    }
                } catch (Exception e) {
                    //e.printStackTrace();
                    // record the malformed data
                }
            }
        });

        int delaySeconds = 2;

        // Extract the event time and generate watermarks
        SingleOutputStreamOperator<OrderMain> orderMainStreamWithWaterMark = orderMainDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderMain>(Time.seconds(delaySeconds)) {
            @Override
            public long extractTimestamp(OrderMain element) {
                return element.getCreate_time().getTime();
            }
        });

        SingleOutputStreamOperator<OrderDetail> orderDetailStreamWithWaterMark = orderDetailDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderDetail>(Time.seconds(delaySeconds)) {
            @Override
            public long extractTimestamp(OrderDetail element) {
                return element.getCreate_time().getTime();
            }
        });

        // Left outer join, with the order detail table as the left table
        DataStream<Tuple2<OrderDetail, OrderMain>> joined = orderDetailStreamWithWaterMark.coGroup(orderMainStreamWithWaterMark)
                .where(new KeySelector<OrderDetail, Long>() {
                    @Override
                    public Long getKey(OrderDetail value) throws Exception {
                        return value.getOrder_id();
                    }
                })
                .equalTo(new KeySelector<OrderMain, Long>() {
                    @Override
                    public Long getKey(OrderMain value) throws Exception {
                        return value.getOid();
                    }
                })
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .apply(new CoGroupFunction<OrderDetail, OrderMain, Tuple2<OrderDetail, OrderMain>>() {

                    @Override
                    public void coGroup(Iterable<OrderDetail> first, Iterable<OrderMain> second, Collector<Tuple2<OrderDetail, OrderMain>> out) throws Exception {

                        for (OrderDetail orderDetail : first) {
                            boolean isJoined = false;
                            for (OrderMain orderMain : second) {
                                out.collect(Tuple2.of(orderDetail, orderMain));
                                isJoined = true;
                            }
                            if (!isJoined) {
                                out.collect(Tuple2.of(orderDetail, null));
                            }
                        }
                    }
                });

        joined.print();

        FlinkUtilsV2.getEnv().execute();
    }
}

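With the joined Tuple2<OrderDetail, OrderMain> stream in hand, the per-category transaction amount from section 3 could be computed roughly as follows. This is a sketch that would sit right after the coGroup in OrderJoin; the status value 2 standing for a completed order is an assumption, and FilterFunction/MapFunction need to be added to the imports.

        // Sketch: transaction amount (unit price * quantity) per category for completed orders
        SingleOutputStreamOperator<Tuple2<Integer, Double>> categoryMoney = joined
                .filter(new FilterFunction<Tuple2<OrderDetail, OrderMain>>() {
                    @Override
                    public boolean filter(Tuple2<OrderDetail, OrderMain> tp) {
                        // keep only detail rows that joined an OrderMain with "completed" status (2 is assumed)
                        return tp.f1 != null && tp.f1.getStatus() == 2;
                    }
                })
                .map(new MapFunction<Tuple2<OrderDetail, OrderMain>, Tuple2<Integer, Double>>() {
                    @Override
                    public Tuple2<Integer, Double> map(Tuple2<OrderDetail, OrderMain> tp) {
                        // amount of one detail row = unit price * quantity, keyed by category id
                        return Tuple2.of(tp.f0.getCategory_id(), tp.f0.getMoney() * tp.f0.getAmount());
                    }
                })
                .keyBy(0)
                .sum(1);
        categoryMoney.print();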

 

3.2 Joining the order data while also handling late data

The order detail stream is first windowed on its own (same window length as the coGroup) with sideOutputLateData, so detail records that arrive too late for the window join are captured in a side output. Those late records, and any on-time detail records whose OrderMain did not appear in the same window, are completed by querying the order main table in the database; the on-time and late results are unioned at the end.

 OrderJoinAdv


package cn._51doit.flink.day10;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import java.sql.Connection;

public class OrderJoinAdv {


    public static void main(String[] args) throws Exception {

        ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);

        // Use EventTime as the time characteristic
        FlinkUtilsV2.getEnv().setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStream<String> orderMainLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "ordermain", "g1", SimpleStringSchema.class);

        DataStream<String> orderDetailLinesDataStream = FlinkUtilsV2.createKafkaDataStream(parameters, "orderdetail", "g1", SimpleStringSchema.class);

        // Parse the order main records
        SingleOutputStreamOperator<OrderMain> orderMainDataStream = orderMainLinesDataStream.process(new ProcessFunction<String, OrderMain>() {

            @Override
            public void processElement(String line, Context ctx, Collector<OrderMain> out) throws Exception {
                //flatMap+filter
                try {
                    JSONObject jsonObject = JSON.parseObject(line);
                    String type = jsonObject.getString("type");
                    if (type.equals("INSERT") || type.equals("UPDATE")) {
                        JSONArray jsonArray = jsonObject.getJSONArray("data");
                        for (int i = 0; i < jsonArray.size(); i++) {
                            OrderMain orderMain = jsonArray.getObject(i, OrderMain.class);
                            orderMain.setType(type); // set the operation type
                            out.collect(orderMain);
                        }
                    }
                } catch (Exception e) {
                    //e.printStackTrace();
                    // record the malformed data
                }
            }
        });

        // Parse the order detail records
        SingleOutputStreamOperator<OrderDetail> orderDetailDataStream = orderDetailLinesDataStream.process(new ProcessFunction<String, OrderDetail>() {

            @Override
            public void processElement(String line, Context ctx, Collector<OrderDetail> out) throws Exception {
                //flatMap+filter
                try {
                    JSONObject jsonObject = JSON.parseObject(line);
                    String type = jsonObject.getString("type");
                    if (type.equals("INSERT") || type.equals("UPDATE")) {
                        JSONArray jsonArray = jsonObject.getJSONArray("data");
                        for (int i = 0; i < jsonArray.size(); i++) {
                            OrderDetail orderDetail = jsonArray.getObject(i, OrderDetail.class);
                            orderDetail.setType(type); // set the operation type
                            out.collect(orderDetail);
                        }
                    }
                } catch (Exception e) {
                    //e.printStackTrace();
                    // record the malformed data
                }
            }
        });

        int delaySeconds = 2;
        int windowSize = 5;

        // Extract the event time and generate watermarks
        SingleOutputStreamOperator<OrderMain> orderMainStreamWithWaterMark = orderMainDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderMain>(Time.seconds(delaySeconds)) {
            @Override
            public long extractTimestamp(OrderMain element) {
                return element.getCreate_time().getTime();
            }
        });

        SingleOutputStreamOperator<OrderDetail> orderDetailStreamWithWaterMark = orderDetailDataStream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<OrderDetail>(Time.seconds(delaySeconds)) {
            @Override
            public long extractTimestamp(OrderDetail element) {
                return element.getCreate_time().getTime();
            }
        });

        // Define the OutputTag for the late-data side output
        OutputTag<OrderDetail> lateTag = new OutputTag<OrderDetail>("late-data") {};

        // Window the left (detail) table separately, with the same window length as the coGroup below
        SingleOutputStreamOperator<OrderDetail> orderDetailWithWindow = orderDetailStreamWithWaterMark.windowAll(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
                .sideOutputLateData(lateTag) // tag the late records
                .apply(new AllWindowFunction<OrderDetail, OrderDetail, TimeWindow>() {
                    @Override
                    public void apply(TimeWindow window, Iterable<OrderDetail> values, Collector<OrderDetail> out) throws Exception {
                        for (OrderDetail value : values) {
                            out.collect(value); // forward the on-time records so the coGroup below receives them
                        }
                    }
                });

        // Get the late records
        DataStream<OrderDetail> lateOrderDetailStream = orderDetailWithWindow.getSideOutput(lateTag);

        // Because the orderDetail stream does not have many late records, async I/O is unnecessary; a plain RichMapFunction is enough
        SingleOutputStreamOperator<Tuple2<OrderDetail, OrderMain>> lateOrderDetailAndOrderMain = lateOrderDetailStream.map(new RichMapFunction<OrderDetail, Tuple2<OrderDetail, OrderMain>>() {

            @Override
            public void open(Configuration parameters) throws Exception {
                // Open the database connection
            }

            @Override
            public Tuple2<OrderDetail, OrderMain> map(OrderDetail value) throws Exception {
                // Look up the matching OrderMain for value.getOrder_id() in the database and
                // return Tuple2.of(value, orderMain); left as a stub here
                return null;
            }

            @Override
            public void close() throws Exception {
                // Close the database connection
            }
        });


        // Left outer join, with the order detail table as the left table
        DataStream<Tuple2<OrderDetail, OrderMain>> joined = orderDetailWithWindow.coGroup(orderMainStreamWithWaterMark)
                .where(new KeySelector<OrderDetail, Long>() {
                    @Override
                    public Long getKey(OrderDetail value) throws Exception {
                        return value.getOrder_id();
                    }
                })
                .equalTo(new KeySelector<OrderMain, Long>() {
                    @Override
                    public Long getKey(OrderMain value) throws Exception {
                        return value.getOid();
                    }
                })
                .window(TumblingEventTimeWindows.of(Time.seconds(windowSize)))
                .apply(new CoGroupFunction<OrderDetail, OrderMain, Tuple2<OrderDetail, OrderMain>>() {

                    @Override
                    public void coGroup(Iterable<OrderDetail> first, Iterable<OrderMain> second, Collector<Tuple2<OrderDetail, OrderMain>> out) throws Exception {

                        for (OrderDetail orderDetail : first) {
                            boolean isJoined = false;
                            for (OrderMain orderMain : second) {
                                out.collect(Tuple2.of(orderDetail, orderMain));
                                isJoined = true;
                            }
                            if (!isJoined) {
                                out.collect(Tuple2.of(orderDetail, null));
                            }
                        }
                    }
                });

        // After the join, some records may not have found a matching OrderMain
        SingleOutputStreamOperator<Tuple2<OrderDetail, OrderMain>> punctualOrderDetailAndOrderMain = joined.map(new RichMapFunction<Tuple2<OrderDetail, OrderMain>, Tuple2<OrderDetail, OrderMain>>() {

            private transient Connection connection;

            @Override
            public void open(Configuration parameters) throws Exception {
                // Open the database connection
            }

            @Override
            public Tuple2<OrderDetail, OrderMain> map(Tuple2<OrderDetail, OrderMain> tp) throws Exception {
                // For records that did not join the order main table, look the OrderMain up in the database
                if (tp.f1 == null) {
                    OrderMain orderMain = queryOrderMainFromMySQL(tp.f0.getOrder_id(), connection);
                    tp.f1 = orderMain;
                }
                return tp;
            }

            @Override
            public void close() throws Exception {
                // Close the database connection
            }
        });

        // Union the on-time and late results
        DataStream<Tuple2<OrderDetail, OrderMain>> allOrderStream = punctualOrderDetailAndOrderMain.union(lateOrderDetailAndOrderMain);

        // Depending on the scenario, write to Kafka, HBase, ES or ClickHouse
        allOrderStream.print();

        FlinkUtilsV2.getEnv().execute();
    }

    private static OrderMain queryOrderMainFromMySQL(Long order_id, Connection connection) {
        return null;
    }
}
