5 Flink Streaming API

Environment
package env;

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author wangkai
 */
public class Env {
    public static void main(String[] args) {
        /**
         * Streaming execution environment.
         * Creates an execution environment that represents the context of the current program.
         * If the program is invoked standalone, this method returns a local execution environment;
         * if the program is submitted to a cluster through the command-line client, it returns that
         * cluster's execution environment. In other words, getExecutionEnvironment decides which
         * environment to return based on how the job is run, and is the most common way to create one.
         */
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();

        /**
         * Batch execution environment.
         */
        ExecutionEnvironment batchEnv = ExecutionEnvironment.getExecutionEnvironment();

        /**
         * Local execution environment.
         */
        LocalStreamEnvironment localEnvironment = StreamExecutionEnvironment.createLocalEnvironment();

        /**
         * Remote execution environment.
         */
        StreamExecutionEnvironment remoteEnvironment = StreamExecutionEnvironment.createRemoteEnvironment("xxx", 1234, "xxx.jar");

    }
}
Source
package source;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.Arrays;
import java.util.Properties;

/**
 * @author wangkai 
 */
public class Source {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        /**
         * 1. Read from a collection
         */
        DataStreamSource<String> source = env.fromCollection(Arrays.asList("zs", "ls", "wmz"));

        /**
         * 2. Read from a file
         */
        DataStreamSource<String> source1 = env.readTextFile("path");

        /**
         * 3. Read from Kafka
         */
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers","xxx");
        properties.setProperty("group.id","xxx");

        DataStreamSource<String> source2 = env.addSource(new FlinkKafkaConsumer011<String>("topic", new SimpleStringSchema(), properties));
        
        /**
         * 4. Custom source
         */
        DataStreamSource<String> source3 = env.addSource(new MySourceFunction());

    }

    public static class MySourceFunction implements SourceFunction<String> {

        private volatile boolean isRunning = true;

        @Override
        public void run(SourceContext<String> ctx) throws Exception {
            while (isRunning) {
                // Emit an element, then sleep briefly so the loop does not spin.
                ctx.collect("custom source element");
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            isRunning = false;
        }
    }
}
Transform
  • Transformation operators
package transform;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author wangkai 
 */
public class TransForm {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> source = env.socketTextStream("", 7777);


        source.map(new MapFunction<String, String>() {
            /**
             * map operator: takes one element and produces one element.
             */
            public String map(String value) throws Exception {
                Object parse = JSONObject.parse(value);
                return parse.toString();
            }
        });


        SingleOutputStreamOperator<Tuple2<String, Integer>> flatMapSource = source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            /**
             * flatMap operator: takes one element and produces zero, one, or more elements.
             */
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] s = value.split(" ");
                for (int i = 0; i < s.length; i++) {
                    out.collect(new Tuple2<String, Integer>(s[i], 1));
                }

            }
        });

        source.filter(new FilterFunction<String>() {
            /**
             * filter operator: evaluates a boolean function for each element and keeps those for which it returns true.
             */
            public boolean filter(String value) throws Exception {
                // Compare string contents with equals(), not ==.
                return "filter".equals(value);
            }
        });


        // keyBy: DataStream -> KeyedStream; logically partitions the stream by the given key (here the tuple's first field).
        KeyedStream<Tuple2<String, Integer>, String> keyedStream = flatMapSource.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });

    }

}
  • Rolling aggregation operators
  • sum
  • max
  • min
  • minBy
  • maxBy
package transform;


import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author wangkai
 */
public class RollingAggregation {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        /**
         * Rolling aggregation operators: sum, max, min, maxBy, minBy
         *
         * Sample input (sensor id, timestamp, reading):
         * sensor1 123456789 35
         * sensor2 234567890 36
         * sensor3 456962456 24
         * sensor1 123456789 20
         */

        DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");

        SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
            public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
                String[] s = value.split(" ");
                out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
            }
        });

        /**
         * sum
         * */

        SingleOutputStreamOperator<Tuple3<String, String, Long>> sum = stream
                .keyBy(0)
                .sum(2);

        sum.print("sum");

        /**
         * max
         * */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> max = stream
                .keyBy(0)
                .max(2);
        max.print("max");

        /**
         * min
         * */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> min = stream
                .keyBy(0)
                .min(2);
        min.print("min");

        /**
         * maxBy
         * */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> maxBy = stream
                .keyBy(0)
                .maxBy(2);
        maxBy.print("maxby");


        /**
         * minBy
         * */
        SingleOutputStreamOperator<Tuple3<String, String, Long>> minBy = stream
                .keyBy(0)
                .minBy(2);
        minBy.print("minby");


        env.execute("rolling aggregate");
        
        /**
         * max> (sensor1,123456789,35)
         * min> (sensor1,123456789,35)
         * maxby> (sensor1,123456789,35)
         * minby> (sensor1,123456789,35)
         * sum> (sensor1,123456789,35)
         * maxby> (sensor2,234567890,36)
         * min> (sensor2,234567890,36)
         * max> (sensor2,234567890,36)
         * maxby> (sensor3,456962456,24)
         * sum> (sensor2,234567890,36)
         * min> (sensor3,456962456,24)
         * minby> (sensor2,234567890,36)
         * sum> (sensor3,456962456,24)
         * min> (sensor1,123456789,20)
         * maxby> (sensor1,123456789,35)
         * max> (sensor3,456962456,24)
         * sum> (sensor1,123456789,55)
         * max> (sensor1,123456789,35)
         * minby> (sensor3,456962456,24)
         * minby> (sensor1,123456789,20)
         * */
    }


}
  • Reduce
package transform;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @author wangkai
 */
public class Reduce {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        /**
         * reduce: KeyedStream -> DataStream. A rolling aggregation on a keyed stream that combines
         * the current element with the previously aggregated value and produces a new value. The
         * resulting stream contains the result of every aggregation step, not only the final result.
         *
         * Sample input:
         * sensor1 123456789 35
         * sensor2 234567890 36
         * sensor3 456962456 24
         * sensor1 123456789 20
         */
        DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");

        SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
            public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
                String[] s = value.split(" ");
                out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
            }
        });


        SingleOutputStreamOperator<Tuple3<String, String, Long>> reduce = stream
                .keyBy(0)
                .reduce(new ReduceFunction<Tuple3<String, String, Long>>() {
                    public Tuple3<String, String, Long> reduce(Tuple3<String, String, Long> value1, Tuple3<String, String, Long> value2) throws Exception {
                        return new Tuple3<String, String, Long>(value1.f0, value1.f1, Math.max(value1.f2, value2.f2));
                    }
                });

        
        reduce.print("reduce");

        env.execute("reduce");
        
        
        /**
         * reduce> (sensor1,123456789,35)
         * reduce> (sensor2,234567890,36)
         * reduce> (sensor3,456962456,24)
         * reduce> (sensor1,123456789,35)
         * */
    }
}
  • Split and select
package transform;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Collections;

/**
 * @author 
 */
public class SplitAndSelect {
    public static void main(String[] args) throws Exception{
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        /**
         * split: DataStream -> SplitStream. Splits one DataStream into two or more DataStreams according to some criteria.
         * select: SplitStream -> DataStream. Selects one or more DataStreams from a SplitStream.
         *
         * Sample input:
         * sensor1 123456789 35
         * sensor2 234567890 36
         * sensor3 456962456 24
         * sensor1 123456789 20
         */
        DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");

        SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
            public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
                String[] s = value.split(" ");
                out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
            }
        });


        SplitStream<Tuple3<String, String, Long>> split = stream.split(new OutputSelector<Tuple3<String, String, Long>>() {
            public Iterable<String> select(Tuple3<String, String, Long> value) {
                return value.f2 > 30 ? Collections.singletonList("high") : Collections.singletonList("low");
            }
        });

        split.print("split");

        DataStream<Tuple3<String, String, Long>> high = split.select("high");
        DataStream<Tuple3<String, String, Long>> low = split.select("low");

        high.print("high");
        low.print("low");


        env.execute("split and select");
        
        
        /**
         * 
         * split> (sensor1,123456789,35)
         * high> (sensor1,123456789,35)
         * split> (sensor2,234567890,36)
         * high> (sensor2,234567890,36)
         * split> (sensor3,456962456,24)
         * low> (sensor3,456962456,24)
         * split> (sensor1,123456789,20)
         * low> (sensor1,123456789,20)
         * 
         * */

    }
}
  • Connect and CoMap
        // Continues the split/select example above.
        DataStream<Tuple3<String, String, Long>> high = split.select("high");
        DataStream<Tuple3<String, String, Long>> low = split.select("low");

        high.print("high");
        low.print("low");

        // connect: DataStream, DataStream -> ConnectedStreams. Wraps two streams (their element types
        // may differ) so they can be processed together while keeping their types and state separate.
        ConnectedStreams<Tuple3<String, String, Long>, Tuple3<String, String, Long>> connect = high.connect(low);

        // CoFlatMap: flatMap1 handles elements of the first stream, flatMap2 handles elements of the second.
        connect.flatMap(new CoFlatMapFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Object>() {
            public void flatMap1(Tuple3<String, String, Long> value, Collector<Object> out) throws Exception {
                out.collect(value);
            }

            public void flatMap2(Tuple3<String, String, Long> value, Collector<Object> out) throws Exception {
                out.collect(value);
            }
        });
  • union
        // union: DataStream* -> DataStream. Merges two or more streams of the same type into a single stream.
        DataStream<Tuple3<String, String, Long>> union = high.union(low);

  Differences between Connect and Union:
   1. Union requires the two streams to have the same type; Connect allows different types, which can then be unified in the subsequent CoMap (as sketched below).
   2. Connect can only operate on two streams, while Union can operate on more than two.
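A minimal sketch of point 1 (the stream names, element values, and the common output type String are made up for illustration; env is a StreamExecutionEnvironment as in the earlier examples):

        DataStream<Integer> intStream = env.fromElements(1, 2, 3);
        DataStream<String> strStream = env.fromElements("a", "b");

        // The two input types differ; the CoMapFunction maps both sides to a common output type (String).
        DataStream<String> unified = intStream.connect(strStream)
                .map(new CoMapFunction<Integer, String, String>() {
                    public String map1(Integer value) {
                        return "int: " + value;
                    }

                    public String map2(String value) {
                        return "str: " + value;
                    }
                });

        unified.print("unified");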
Supported data types

A Flink streaming application processes streams of events represented as data objects, so internally Flink must be able to handle these objects: they need to be serialized and deserialized in order to be sent over the network, or to be read from state backends, checkpoints, and savepoints. To do this efficiently, Flink needs to know exactly which data types the application processes. Flink uses the concept of type information to represent data types, and generates a dedicated serializer, deserializer, and comparator for each type.

Flink also has a type extraction system that analyzes the input and return types of functions to obtain the type information, and with it the serializers and deserializers, automatically. In some cases, however, such as lambda functions or generic types, the type information must be provided explicitly for the application to work correctly or to perform well.
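A minimal sketch of supplying type information explicitly for a lambda (the class name, input values, and job name are made up; the Types helper comes from org.apache.flink.api.common.typeinfo):

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TypeHintExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> words = env.fromElements("zs", "ls", "zs");

        DataStream<Tuple2<String, Integer>> counts = words
                .map(word -> Tuple2.of(word, 1))
                // The lambda erases Tuple2's generic parameters, so the type is given explicitly.
                .returns(Types.TUPLE(Types.STRING, Types.INT));

        counts.print();
        env.execute("type hint example");
    }
}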

Custom UDF functions
  • Function classes
  • Anonymous functions
  • Rich functions
    A "rich function" is an interface for function classes provided by the DataStream API; every Flink function class has a Rich version. It differs from a regular function in that it can access the context of the runtime environment and has lifecycle methods, so it can implement more complex functionality. Typical lifecycle methods are: open(), the initialization method of a rich function, called before an operator such as map or filter is invoked; close(), the last method called in the lifecycle, used for cleanup work; and getRuntimeContext(), which provides information about the function's RuntimeContext, such as the parallelism of the function, the name of the task, and the state. A minimal sketch follows below.
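A minimal sketch of a rich function, assuming the sensor tuples from the earlier examples (the class name and printed messages are made up):

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;

public class MyRichMapper extends RichMapFunction<Tuple3<String, String, Long>, String> {

    @Override
    public void open(Configuration parameters) throws Exception {
        // Initialization work, e.g. opening a connection; runs once before the first map() call.
        System.out.println("open, subtask " + getRuntimeContext().getIndexOfThisSubtask());
    }

    @Override
    public String map(Tuple3<String, String, Long> value) throws Exception {
        // getRuntimeContext() also exposes the task name, parallelism, and state handles.
        return getRuntimeContext().getTaskName() + " -> " + value.f0;
    }

    @Override
    public void close() throws Exception {
        // Cleanup work, e.g. closing the connection.
        System.out.println("close");
    }
}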
Sink
  • Custom sink function
package sink;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

/**
 * A custom sink that writes each incoming record to a database over JDBC.
 */
public class MyJdbcSink extends RichSinkFunction<String> {
    Connection conn = null;
    PreparedStatement insertStmt = null;


    @Override
    public void open(Configuration parameters) throws Exception {
        // Called once when the sink is initialized: establish the JDBC connection and prepare the statement.
        conn = DriverManager.getConnection("url", "username", "password");
        insertStmt = conn.prepareStatement("insert into xxx (a) values (?)");
    }

    /**
     * Called for every record: bind the value and execute the prepared statement.
     */
    @Override
    public void invoke(String value, Context context) throws Exception {
        insertStmt.setString(1, value);
        insertStmt.execute();
    }

    @Override
    public void close() throws Exception {
        insertStmt.close();
        conn.close();
    }
}
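A hedged usage sketch (assuming a DataStream<String> named stream and the environment env from the earlier examples):

        stream.addSink(new MyJdbcSink());
        env.execute("jdbc sink");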