5 Flink Streaming API
Environment
package env;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.streaming.api.environment.LocalStreamEnvironment;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* @author wangkai
*/
public class Env {
public static void main(String[] args) {
/**
 * Streaming execution environment.
 * Creates an execution environment that represents the context in which the current
 * program runs. If the program is invoked standalone, this method returns a local
 * execution environment; if the program is submitted to a cluster from the command-line
 * client, it returns that cluster's execution environment. In other words,
 * getExecutionEnvironment decides which environment to return based on how the job is
 * run; it is the most common way to create an execution environment.
 */
StreamExecutionEnvironment streamEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
/**
 * Batch execution environment
 */
ExecutionEnvironment batchEnvironment = ExecutionEnvironment.getExecutionEnvironment();
/**
 * Create a local execution environment
 */
LocalStreamEnvironment localEnvironment = StreamExecutionEnvironment.createLocalEnvironment();
/**
 * Create a remote execution environment
 */
StreamExecutionEnvironment remoteEnvironment = StreamExecutionEnvironment.createRemoteEnvironment("xxx", 1234, "xxx.jar");
}
}
Source
package source;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;
import java.util.Arrays;
import java.util.Properties;
/**
* @author wangkai
*/
public class Source {
public static void main(String[] args) {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
/**
 * 1. Read from a collection
 */
DataStreamSource<String> source = env.fromCollection(Arrays.asList("zs", "ls", "wmz"));
/**
 * 2. Read from a file
 */
DataStreamSource<String> source1 = env.readTextFile("path");
/**
 * 3. Read from Kafka
 */
Properties properties = new Properties();
properties.setProperty("bootstrap.servers","xxx");
properties.setProperty("group.id","xxx");
DataStreamSource<String> source2 = env.addSource(new FlinkKafkaConsumer011<String>("topic", new SimpleStringSchema(), properties));
/**
 * 4. Custom source
 */
DataStreamSource<String> source3 = env.addSource(new MySourceFunction());
}
public static class MySourceFunction implements SourceFunction<String> {
// volatile so that a cancel() call from another thread is visible to the run() loop
private volatile boolean isRunning = true;
@Override
public void run(SourceContext<String> ctx) throws Exception {
while (isRunning) {
// emit an element into the stream instead of only printing
ctx.collect("custom source element");
Thread.sleep(1000);
}
}
@Override
public void cancel() {
isRunning = false;
}
}
}
Transform
- Transformation operators
package transform;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* @author wangkai
*/
public class TransForm {
public static void main(String[] args) {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource<String> source = env.socketTextStream("", 7777);
source.map(new MapFunction<String, String>() {
/**
 * map operator: takes one element and produces exactly one element
 */
@Override
public String map(String value) throws Exception {
Object parse = JSONObject.parse(value);
return parse.toString();
}
});
SingleOutputStreamOperator<Tuple2<String, Integer>> flatMapSource = source.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
/**
 * flatMap operator: takes one element and produces zero, one, or more elements
 */
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
String[] s = value.split(" ");
for (int i = 0; i < s.length; i++) {
out.collect(new Tuple2<String, Integer>(s[i], 1));
}
}
});
source.filter(new FilterFunction<String>() {
/**
 * filter operator: evaluates a boolean predicate for each element and keeps those
 * for which it returns true
 */
@Override
public boolean filter(String value) throws Exception {
// compare strings with equals(), not ==
return "filter".equals(value);
}
});
KeyedStream<Tuple2<String, Integer>, String> keyedStream = flatMapSource.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
/**
 * keyBy: logically partitions the stream by the key extracted here (the first tuple field)
 */
@Override
public String getKey(Tuple2<String, Integer> value) throws Exception {
return value.f0;
}
});
}
}
- Rolling aggregation operators
- sum
- max
- min
- minBy
- maxBy
package transform;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* @author wangkai
*/
public class RollingAggregation {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
/**
 * Rolling aggregation operators: sum, max, min, maxBy, minBy
 */
/**
*
* sensor1 123456789 35
* sensor2 234567890 36
* sensor3 456962456 24
* sensor1 123456789 20
*
*
* */
DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
String[] s = value.split(" ");
out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
}
});
/**
* sum
* */
SingleOutputStreamOperator<Tuple3<String, String, Long>> sum = stream
.keyBy(0)
.sum(2);
sum.print("sum");
/**
* max
* */
SingleOutputStreamOperator<Tuple3<String, String, Long>> max = stream
.keyBy(0)
.max(2);
max.print("max");
/**
* min
* */
SingleOutputStreamOperator<Tuple3<String, String, Long>> min = stream
.keyBy(0)
.min(2);
min.print("min");
/**
* maxBy
* */
SingleOutputStreamOperator<Tuple3<String, String, Long>> maxBy = stream
.keyBy(0)
.maxBy(2);
maxBy.print("maxby");
/**
* minBy
* */
SingleOutputStreamOperator<Tuple3<String, String, Long>> minBy = stream
.keyBy(0)
.minBy(2);
minBy.print("minby");
env.execute("rolling aggregate");
/**
* max> (sensor1,123456789,35)
* min> (sensor1,123456789,35)
* maxby> (sensor1,123456789,35)
* minby> (sensor1,123456789,35)
* sum> (sensor1,123456789,35)
* maxby> (sensor2,234567890,36)
* min> (sensor2,234567890,36)
* max> (sensor2,234567890,36)
* maxby> (sensor3,456962456,24)
* sum> (sensor2,234567890,36)
* min> (sensor3,456962456,24)
* minby> (sensor2,234567890,36)
* sum> (sensor3,456962456,24)
* min> (sensor1,123456789,20)
* maxby> (sensor1,123456789,35)
* max> (sensor3,456962456,24)
* sum> (sensor1,123456789,55)
* max> (sensor1,123456789,35)
* minby> (sensor3,456962456,24)
* minby> (sensor1,123456789,20)
* */
}
}
- Reduce
package transform;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* @author wangkai
*/
public class Reduce {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
/**
*
* KeyedStream → DataStream: an aggregation on a keyed stream that combines the current
* element with the previously aggregated result to produce a new value. The returned
* stream contains the result of every aggregation step, not only the final result.
*
* sensor1 123456789 35
* sensor2 234567890 36
* sensor3 456962456 24
* sensor1 123456789 20
* */
DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
String[] s = value.split(" ");
out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
}
});
SingleOutputStreamOperator<Tuple3<String, String, Long>> reduce = stream
.keyBy(0)
.reduce(new ReduceFunction<Tuple3<String, String, Long>>() {
public Tuple3<String, String, Long> reduce(Tuple3<String, String, Long> value1, Tuple3<String, String, Long> value2) throws Exception {
return new Tuple3<String, String, Long>(value1.f0, value1.f1, Math.max(value1.f2, value2.f2));
}
});
reduce.print("reduce");
env.execute("reduce");
/**
* reduce> (sensor1,123456789,35)
* reduce> (sensor2,234567890,36)
* reduce> (sensor3,456962456,24)
* reduce> (sensor1,123456789,35)
* */
}
}
- Split and select
package transform;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.Collections;
/**
* @author
*/
public class SplitAndSelect {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
/**
* split: DataStream → SplitStream: splits one DataStream into two or more DataStreams according to some criterion.
* select: SplitStream → DataStream: retrieves one or more DataStreams from a SplitStream.
*
* sensor1 123456789 35
* sensor2 234567890 36
* sensor3 456962456 24
* sensor1 123456789 20
* */
DataStreamSource<String> source = env.readTextFile("D:\\git\\csdn-flink\\csdn-flink-1\\src\\main\\resources\\sensor");
SingleOutputStreamOperator<Tuple3<String, String, Long>> stream = source.flatMap(new FlatMapFunction<String, Tuple3<String, String, Long>>() {
public void flatMap(String value, Collector<Tuple3<String, String, Long>> out) throws Exception {
String[] s = value.split(" ");
out.collect(new Tuple3<String, String, Long>(s[0], s[1], Long.parseLong(s[2])));
}
});
SplitStream<Tuple3<String, String, Long>> split = stream.split(new OutputSelector<Tuple3<String, String, Long>>() {
public Iterable<String> select(Tuple3<String, String, Long> value) {
return value.f2 > 30 ? Collections.singletonList("high") : Collections.singletonList("low");
}
});
split.print("split");
DataStream<Tuple3<String, String, Long>> high = split.select("high");
DataStream<Tuple3<String, String, Long>> low = split.select("low");
high.print("high");
low.print("low");
env.execute("split and select");
/**
*
* split> (sensor1,123456789,35)
* high> (sensor1,123456789,35)
* split> (sensor2,234567890,36)
* high> (sensor2,234567890,36)
* split> (sensor3,456962456,24)
* low> (sensor3,456962456,24)
* split> (sensor1,123456789,20)
* low> (sensor1,123456789,20)
*
* */
}
}
- Connect and CoMap
DataStream<Tuple3<String, String, Long>> high = split.select("high");
DataStream<Tuple3<String, String, Long>> low = split.select("low");
high.print("high");
low.print("low");
ConnectedStreams<Tuple3<String, String, Long>, Tuple3<String, String, Long>> connect = high.connect(low);
connect.flatMap(new CoFlatMapFunction<Tuple3<String, String, Long>, Tuple3<String, String, Long>, Object>() {
public void flatMap1(Tuple3<String, String, Long> value, Collector<Object> out) throws Exception {
out.collect(value);
}
public void flatMap2(Tuple3<String, String, Long> value, Collector<Object> out) throws Exception {
out.collect(value);
}
});
- Union
DataStream<Tuple3<String, String, Long>> union = high.union(low);
Differences between Connect and Union (see the sketch below):
1. Union requires the streams to have the same type, while Connect accepts streams of different types, which can then be adjusted to a common type in the subsequent CoMap/CoFlatMap.
2. Connect can only operate on two streams, whereas Union can operate on more than two.
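A minimal sketch of the difference (the class name ConnectVsUnion, the example streams, and the job name are illustrative, not from the original code):
package transform;
import org.apache.flink.streaming.api.datastream.ConnectedStreams;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;
public class ConnectVsUnion {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<String> s1 = env.fromElements("a", "b");
DataStream<String> s2 = env.fromElements("c");
DataStream<String> s3 = env.fromElements("d");
DataStream<Integer> nums = env.fromElements(1, 2, 3);
// union: all streams must have the same element type, and more than two can be combined
DataStream<String> unioned = s1.union(s2, s3);
unioned.print("union");
// connect: exactly two streams, possibly of different types; the CoMapFunction
// then brings both inputs to a common output type
ConnectedStreams<String, Integer> connected = s1.connect(nums);
DataStream<String> unified = connected.map(new CoMapFunction<String, Integer, String>() {
@Override
public String map1(String value) {
return value;
}
@Override
public String map2(Integer value) {
return String.valueOf(value);
}
});
unified.print("connect");
env.execute("connect vs union");
}
}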
Supported data types
A Flink streaming application processes streams of events represented as data objects, so Flink must be able to handle these objects internally: they need to be serialized and deserialized in order to be sent over the network, or to be read from state backends, checkpoints, and savepoints. To do this efficiently, Flink needs to know exactly which data types the application works with. Flink uses the concept of type information to describe data types and generates specific serializers, deserializers, and comparators for each type. Flink also has a type extraction system that analyzes the input and return types of functions to obtain this type information automatically, and with it the serializers and deserializers. In some cases, however, such as lambda functions or generic types, the type information has to be provided explicitly for the application to work correctly or to improve its performance.
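For example, when flatMap is written as a lambda, the generic output type is lost to erasure and has to be supplied explicitly with returns(); a minimal sketch (the class name ExplicitTypeInfo and the sample data are illustrative):
package types;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class ExplicitTypeInfo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStream<String> lines = env.fromElements("hello world", "hello flink");
SingleOutputStreamOperator<Tuple2<String, Integer>> counts = lines
.flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
for (String word : line.split(" ")) {
out.collect(new Tuple2<>(word, 1));
}
})
// the lambda's generic return type is erased, so the type information must be given explicitly
.returns(Types.TUPLE(Types.STRING, Types.INT));
counts.print();
env.execute("explicit type information");
}
}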
Custom UDF functions
- Function classes
- Anonymous functions
- Rich functions
A "rich function" is a function interface provided by the DataStream API; every Flink function class has a Rich version. It differs from a regular function in that it can access the context of the runtime environment and has lifecycle methods, which makes more complex functionality possible. Typical lifecycle methods are:
- open(): the initialization method of a rich function, called before an operator such as map or filter is invoked.
- close(): the last method called in the lifecycle, used for cleanup work.
- getRuntimeContext(): provides information from the function's RuntimeContext, such as the parallelism of the function, the task name, and the state.
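A minimal sketch of a rich function (the class name MyRichMapper and the printed messages are illustrative):
package udf;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
public class MyRichMapper extends RichMapFunction<String, String> {
@Override
public void open(Configuration parameters) throws Exception {
// initialization work, e.g. opening connections; runs once per parallel instance
System.out.println("open, subtask " + getRuntimeContext().getIndexOfThisSubtask());
}
@Override
public String map(String value) throws Exception {
// the runtime context is also available inside the processing method
return getRuntimeContext().getTaskName() + ": " + value;
}
@Override
public void close() throws Exception {
// cleanup work, e.g. closing connections
System.out.println("close");
}
}
It is passed to an operator like any other function class, e.g. stream.map(new MyRichMapper()); plain function classes and anonymous functions are used in the same way.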
Sink
- Custom sink function
package sink;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
public class MyJdbcSink extends RichSinkFunction<String> {
Connection conn = null;
PreparedStatement insertStmt = null;
@Override
public void open(Configuration parameters) throws Exception {
conn = DriverManager.getConnection("url", "username", "password");
insertStmt = conn.prepareStatement("insert into xxx (a) values (?)");
}
/**
 * For every record, bind the value and execute the SQL statement
 */
@Override
public void invoke(String value, Context context) throws Exception {
// bind the incoming record rather than a hard-coded constant
insertStmt.setString(1, value);
insertStmt.execute();
}
@Override
public void close() throws Exception {
insertStmt.close();
conn.close();
}
}
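A minimal usage sketch for the sink above (the class name MyJdbcSinkJob and the sample data are illustrative):
package sink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class MyJdbcSinkJob {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// any DataStream<String> can feed the sink; the elements here are placeholders
DataStream<String> stream = env.fromElements("a", "b", "c");
stream.addSink(new MyJdbcSink());
env.execute("jdbc sink");
}
}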