文章目录
- Flink 系列文章
- 1、Map
- 2、FlatMap[DataStream->DataStream]
- 3、Filter
- 4、KeyBy
- 5、Reduce
- 6、Aggregations
- 7、first、distinct、join、outjoin、cross
- 8、Window
- 9、WindowAll
- 10、Union
- 11、Window join
- 12、Split
- 13、Select
- 14、Project
本文将常用的operator作为单独的示例,使用内部匿名类和lambda各自实现了,有些不常见的或在新版本将替换掉的则叙述不多。
本文作为Flink(五)source、transformations、sink的详细示例(二)-source和transformation示例的补充。
本文分为14个部分,每个operator作为一个部分,具体参考目录,不再赘述。
本文所有依赖均为前面文章中所包含的内容,没有新增依赖。User 的bean也是前文中的内容,不再赘述。
1、Map
[DataStream->DataStream]
这是最简单的转换之一,其中输入是一个数据流,输出的也是一个数据流。
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.Partitioner;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.source_transformation_sink.bean.User;
/**
* @author alanchan
*
*/
public class TransformationMapDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// env
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// source
// transformation
mapPartitionFunction8(env);
// sink
// execute
env.execute();
}
// 构造一个list,然后将list中数字乘以2输出,内部匿名类实现
public static void mapFunction1(StreamExecutionEnvironment env) throws Exception {
List<Integer> data = new ArrayList<Integer>();
for (int i = 1; i <= 10; i++) {
data.add(i);
}
DataStreamSource<Integer> source = env.fromCollection(data);
SingleOutputStreamOperator<Integer> sink = source.map(new MapFunction<Integer, Integer>() {
@Override
public Integer map(Integer inValue) throws Exception {
return inValue * 2;
}
});
sink.print();
}
// 构造一个list,然后将list中数字乘以2输出,lambda实现
public static void mapFunction2(StreamExecutionEnvironment env) throws Exception {
List<Integer> data = new ArrayList<Integer>();
for (int i = 1; i <= 10; i++) {
data.add(i);
}
DataStreamSource<Integer> source = env.fromCollection(data);
SingleOutputStreamOperator<Integer> sink = source.map(i -> 2 * i);
sink.print();
}
// 构造User数据源
public static DataStreamSource<User> source(StreamExecutionEnvironment env) {
DataStreamSource<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 12, 1000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 28, 1500),
new User(5, "alan1", "5", "5@5.com", 15, 500),
new User(4, "alan2", "4", "4@4.com", 30, 400)));
return source;
}
// lambda实现用户对象的balance×2和age+5功能
public static SingleOutputStreamOperator<User> mapFunction3(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
SingleOutputStreamOperator<User> sink = source.map((MapFunction<User, User>) user -> {
User user2 = user;
user2.setAge(user.getAge() + 5);
user2.setBalance(user.getBalance() * 2);
return user2;
});
sink.print();
return sink;
}
// lambda实现balance*2和age+5后,balance》=2000和age》=20的数据过滤出来
public static SingleOutputStreamOperator<User> mapFunction4(StreamExecutionEnvironment env) throws Exception {
SingleOutputStreamOperator<User> sink = mapFunction3(env).filter(user -> user.getBalance() >= 2000 && user.getAge() >= 20);
sink.print();
return sink;
}
// lambda实现balance*2和age+5后,balance》=2000和age》=20的数据过滤出来并通过flatmap收集
public static SingleOutputStreamOperator<User> mapFunction5(StreamExecutionEnvironment env) throws Exception {
SingleOutputStreamOperator<User> sink = mapFunction4(env).flatMap((FlatMapFunction<User, User>) (user, out) -> {
if (user.getBalance() >= 3000) {
out.collect(user);
}
}).returns(User.class);
sink.print();
return sink;
}
// 数据分区示例
public static void mapPartitionFunction6(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
DataStream<User> userTemp = source.map(user -> {
User user2 = user;
user2.setAge(user.getAge() + 5);
return user2;
}).returns(User.class);
// public <K> DataStream<T> partitionCustom(Partitioner<K> partitioner, KeySelector<T, K> keySelector) {
// return setConnectionType(new CustomPartitionerWrapper<>(clean(partitioner),
// clean(keySelector)));
// }
DataStream<User> sink = userTemp.partitionCustom(new Partitioner<Integer>() {
public int partition(Integer key, int numPartitions) {
System.out.println("分区数:" + numPartitions);
if (key < 20)
numPartitions = 0;
else if (key >= 20 && key < 30)
numPartitions = 1;
else if (key >= 0)
numPartitions = 2;
System.out.println("分区数2:" + numPartitions);
return numPartitions;
}
}, new KeySelector<User, Integer>() {
@Override
public Integer getKey(User value) throws Exception {
return value.getAge();
}
});
sink.map((MapFunction<User, User>) user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.toString());
return user;
}).returns(User.class);
// System.out.println("并行数:" + sink.getParallelism());
// 输出结果,3个区,按照年龄分的
// 当前线程ID:138,user:User(id=3, name=alan1, pwd=3, email=3@3.com, age=33, balance=1500.0)
// 当前线程ID:136,user:User(id=1, name=alan1, pwd=1, email=1@1.com, age=17, balance=1000.0)
// 当前线程ID:138,user:User(id=4, name=alan2, pwd=4, email=4@4.com, age=35, balance=400.0)
// 当前线程ID:140,user:User(id=2, name=alan2, pwd=2, email=2@2.com, age=24, balance=200.0)
// 当前线程ID:140,user:User(id=5, name=alan1, pwd=5, email=5@5.com, age=20, balance=500.0)
sink.print();
}
// lambda数据分区示例
public static void mapPartitionFunction7(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
DataStream<User> userTemp = source.map(user -> {
User user2 = user;
user2.setAge(user.getAge() + 5);
return user2;
}).returns(User.class);
DataStream<User> sink = userTemp.partitionCustom((key, numPartitions) -> {
if (key < 20)
numPartitions = 0;
else if (key >= 20 && key < 30)
numPartitions = 1;
else if (key >= 0)
numPartitions = 2;
return numPartitions;
}, user -> user.getAge());
sink.print();
}
//按照用户id的奇数和偶数进行分区,如果id=1是单独分区
public static void mapPartitionFunction8(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
DataStream<User> sink = source.partitionCustom(new CusPartitioner(), user -> user.getId());
// 示例分区过程,输出结果如下
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 当前线程ID:90,user:User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 当前线程ID:89,user:User(id=3, name=alan1, pwd=3, email=3@3.com, age=28, balance=1500.0)
// 2> User(id=3, name=alan1, pwd=3, email=3@3.com, age=28, balance=1500.0)
// 当前线程ID:88,user:User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 当前线程ID:89,user:User(id=5, name=alan1, pwd=5, email=5@5.com, age=15, balance=500.0)
// 1> User(id=4, name=alan2, pwd=4, email=4@4.com, age=30, balance=400.0)
// 3> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 当前线程ID:88,user:User(id=4, name=alan2, pwd=4, email=4@4.com, age=30, balance=400.0)
// 2> User(id=5, name=alan1, pwd=5, email=5@5.com, age=15, balance=500.0)
sink.map((MapFunction<User, User>) user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.toString());
return user;
}).returns(User.class);
sink.print();
}
public static class CusPartitioner implements Partitioner<Integer> {
@Override
public int partition(Integer key, int numPartitions) {
if (key == 1)
numPartitions = 2;
else if (key % 2 == 0) {
numPartitions = 0;
} else {
numPartitions = 1;
}
return numPartitions;
}
}
}
2、FlatMap[DataStream->DataStream]
FlatMap 采用一条记录并输出零个,一个或多个记录。
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/**
* @author alanchan
*
*/
public class TransformationFlatMapDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
flatMapFunction3(env);
env.execute();
}
// 构造User数据源
public static DataStreamSource<String> source(StreamExecutionEnvironment env) {
List<String> info = new ArrayList<>();
info.add("i am alanchan");
info.add("i like hadoop");
info.add("i like flink");
info.add("and you ?");
DataStreamSource<String> dataSource = env.fromCollection(info);
return dataSource;
}
// 将句子以空格进行分割-内部匿名类实现
public static void flatMapFunction1(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<String> source = source(env);
SingleOutputStreamOperator<String> sink = source.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String value, Collector<String> out) throws Exception {
String[] splits = value.split(" ");
for (String split : splits) {
out.collect(split);
}
}
});
sink.print();
}
// lambda实现
public static void flatMapFunction2(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<String> source = source(env);
SingleOutputStreamOperator<String> sink = source.flatMap((FlatMapFunction<String, String>) (input, out) -> {
String[] splits = input.split(" ");
for (String split : splits) {
out.collect(split);
}
}).returns(String.class);
sink.print();
}
// lambda实现
public static void flatMapFunction3(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<String> source = source(env);
SingleOutputStreamOperator<String> sink = source.flatMap((String input, Collector<String> out) -> Arrays.stream(input.split(" ")).forEach(out::collect))
.returns(String.class);
sink.print();
}
}
3、Filter
[DataStream->DataStream]
Filter 函数根据条件判断出结果。
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.source_transformation_sink.bean.User;
/**
* @author alanchan
*
*/
public class TransformationFilterDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
filterFunction3(env);
env.execute();
}
// 构造User数据源
public static DataStreamSource<Integer> sourceList(StreamExecutionEnvironment env) {
List<Integer> data = new ArrayList<Integer>();
for (int i = 1; i <= 10; i++) {
data.add(i);
}
DataStreamSource<Integer> source = env.fromCollection(data);
return source;
}
// 构造User数据源
public static DataStreamSource<User> sourceUser(StreamExecutionEnvironment env) {
DataStreamSource<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 12, 1000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 28, 1500),
new User(5, "alan1", "5", "5@5.com", 15, 500),
new User(4, "alan2", "4", "4@4.com", 30, 400)));
return source;
}
// 过滤出大于5的数字,内部匿名类
public static void filterFunction1(StreamExecutionEnvironment env) throws Exception {
DataStream<Integer> source = sourceList(env);
SingleOutputStreamOperator<Integer> sink = source.map(new MapFunction<Integer, Integer>() {
public Integer map(Integer value) throws Exception {
return value + 1;
}
}).filter(new FilterFunction<Integer>() {
@Override
public boolean filter(Integer value) throws Exception {
return value > 5;
}
});
sink.print();
}
// lambda实现
public static void filterFunction2(StreamExecutionEnvironment env) throws Exception {
DataStream<Integer> source = sourceList(env);
SingleOutputStreamOperator<Integer> sink = source.map(i -> i + 1).filter(value -> value > 5);
sink.print();
}
// 查询user id大于3的记录
public static void filterFunction3(StreamExecutionEnvironment env) throws Exception {
DataStream<User> source = sourceUser(env);
SingleOutputStreamOperator<User> sink = source.filter(user -> user.getId() > 3);
sink.print();
}
}
4、KeyBy
[DataStream->KeyedStream]
KeyBy 在逻辑上是基于 key 对流进行分区。在内部,它使用 hash 函数对流进行分区。它返回 KeyedDataStream 数据流。将同一Key的数据放到同一个分区。
分区结果和KeyBy下游算子的并行度强相关。如下游算子只有一个并行度,不管怎么分,都会分到一起。
对于POJO类型,KeyBy可以通过keyBy(fieldName)指定字段进行分区。
对于Tuple类型,KeyBy可以通过keyBy(fieldPosition)指定字段进行分区。
对于一般类型,如上,KeyBy可以通过keyBy(new KeySelector {…})指定字段进行分区。
import java.util.Arrays;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.source_transformation_sink.bean.User;
/**
* @author alanchan
*
*/
public class TransformationKeyByDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// env.setParallelism(4);// 设置数据分区数量
keyByFunction6(env);
env.execute();
}
// 构造User数据源
public static DataStreamSource<User> source(StreamExecutionEnvironment env) {
DataStreamSource<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 12, 1000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 28, 1500),
new User(5, "alan1", "5", "5@5.com", 15, 500),
new User(4, "alan2", "4", "4@4.com", 30, 400)));
return source;
}
// 按照name进行keyby 对于POJO类型,KeyBy可以通过keyBy(fieldName)指定字段进行分区
public static void keyByFunction1(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> sink = source.keyBy(new KeySelector<User, String>() {
@Override
public String getKey(User value) throws Exception {
return value.getName();
}
});
sink.map(user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.toString());
return user;
});
sink.print();
}
// lambda 对于POJO类型,KeyBy可以通过keyBy(fieldName)指定字段进行分区
public static void keyByFunction2(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> sink = source.keyBy(user -> user.getName());
// 演示keyby后的数据输出
sink.map(user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.toString());
return user;
});
sink.print();
}
// 对于Tuple类型,KeyBy可以通过keyBy(fieldPosition)指定字段进行分区。lambda
public static void keyByFunction3(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
SingleOutputStreamOperator<Tuple2<String, User>> userTemp = source.map((MapFunction<User, Tuple2<String, User>>) user -> {
return new Tuple2<String, User>(user.getName(), user);
}).returns(Types.TUPLE(Types.STRING, Types.POJO(User.class)));
KeyedStream<Tuple2<String, User>, Tuple> sink = userTemp.keyBy(0);
// 演示keyby后的数据输出
sink.map(user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.f1.toString());
return user.f1;
});
sink.print();
}
// 对于Tuple类型,KeyBy可以通过keyBy(fieldPosition)指定字段进行分区。
public static void keyByFunction4(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
SingleOutputStreamOperator<Tuple2<String, User>> userTemp = source.map(new MapFunction<User, Tuple2<String, User>>() {
@Override
public Tuple2<String, User> map(User value) throws Exception {
return new Tuple2<String, User>(value.getName(), value);
}
});
KeyedStream<Tuple2<String, User>, String> sink = userTemp.keyBy(new KeySelector<Tuple2<String, User>, String>() {
@Override
public String getKey(Tuple2<String, User> value) throws Exception {
return value.f0;
}
});
// 演示keyby后的数据输出
sink.map(user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.f1.toString());
return user.f1;
});
// sink.map(new MapFunction<Tuple2<String, User>, String>() {
//
// @Override
// public String map(Tuple2<String, User> value) throws Exception {
// System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + value.f1.toString());
// return null;
// }});
sink.print();
}
// 对于一般类型,如上,KeyBy可以通过keyBy(new KeySelector {...})指定字段进行分区。
// 按照name的前4位进行keyby
public static void keyByFunction5(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> sink = source.keyBy(new KeySelector<User, String>() {
@Override
public String getKey(User value) throws Exception {
// String temp = value.getName().substring(0, 4);
return value.getName().substring(0, 4);
}
});
sink.map(user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.toString());
return user;
});
sink.print();
}
// 对于一般类型,如上,KeyBy可以通过keyBy(new KeySelector {...})指定字段进行分区。 lambda
// 按照name的前4位进行keyby
public static void keyByFunction6(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> sink = source.keyBy(user -> user.getName().substring(0, 4));
sink.map(user -> {
System.out.println("当前线程ID:" + Thread.currentThread().getId() + ",user:" + user.toString());
return user;
});
sink.print();
}
}
5、Reduce
[KeyedStream->DataStream]
Reduce 返回单个的结果值,并且 reduce 操作每处理一个元素总是创建一个新值。常用的方法有 average, sum, min, max, count,使用 reduce 方法都可实现。基于ReduceFunction进行滚动聚合,并向下游算子输出每次滚动聚合后的结果。
注意: Reduce会输出每一次滚动聚合的结果。
import java.util.Arrays;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.source_transformation_sink.bean.User;
/**
* @author alanchan
*
*/
public class TransformationReduceDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// env.setParallelism(4);// 设置数据分区数量
reduceFunction2(env);
env.execute();
}
// 构造User数据源
public static DataStreamSource<User> source(StreamExecutionEnvironment env) {
DataStreamSource<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 12, 1000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 28, 1500),
new User(5, "alan1", "5", "5@5.com", 15, 500),
new User(4, "alan2", "4", "4@4.com", 30, 400)));
return source;
}
// 按照name进行balance进行sum
public static void reduceFunction1(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> keyedStream = source.keyBy(user -> user.getName());
SingleOutputStreamOperator<User> sink = keyedStream.reduce(new ReduceFunction<User>() {
@Override
public User reduce(User value1, User value2) throws Exception {
double balance = value1.getBalance() + value2.getBalance();
return new User(value1.getId(), value1.getName(), "", "", 0, balance);
}
});
//
sink.print();
}
// 按照name进行balance进行sum lambda
public static void reduceFunction2(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> userKeyBy = source.keyBy(user -> user.getName());
SingleOutputStreamOperator<User> sink = userKeyBy.reduce((user1, user2) -> {
User user = user1;
user.setBalance(user1.getBalance() + user2.getBalance());
return user;
});
sink.print();
}
}
6、Aggregations
[KeyedStream->DataStream]
DataStream API 支持各种聚合,例如 min,max,sum 等。 这些函数可以应用于 KeyedStream 以获得 Aggregations 聚合。
Aggregate 对KeyedStream按指定字段滚动聚合并输出每一次滚动聚合后的结果。默认的聚合函数有:sum、min、minBy、max、maxBy。
注意:
max(field)与maxBy(field)的区别: maxBy返回field最大的那条数据;而max则是将最大的field的值赋值给第一条数据并返回第一条数据。同理,min与minBy。
Aggregate聚合算子会滚动输出每一次聚合后的结果
max 和 maxBy 之间的区别在于 max 返回流中的最大值,但 maxBy 返回具有最大值的键, min 和 minBy 同理。
max以第一个比较对象的比较列值进行替换,maxBy是以整个比较对象进行替换。具体见示例。
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.source_transformation_sink.bean.User;
/**
* @author alanchan
*
*/
public class TransformationAggregationsDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
aggregationsFunction2(env);
env.execute();
}
// 构造User数据源
public static DataStreamSource<User> source(StreamExecutionEnvironment env) {
DataStreamSource<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 12, 1000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 28, 1500),
new User(5, "alan1", "5", "5@5.com", 15, 500),
new User(4, "alan2", "4", "4@4.com", 30, 400)));
return source;
}
//分组统计sum、max、min、maxby、minby
public static void aggregationsFunction(StreamExecutionEnvironment env) throws Exception {
DataStreamSource<User> source = source(env);
KeyedStream<User, String> userTemp= source.keyBy(user->user.getName());
DataStream sink = null;
//1、根据name进行分区统计balance之和 alan1----2500/alan2----600
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=2500.0)
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=600.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=3000.0)
sink = userTemp.sum("balance");
//2、根据name进行分区统计balance的max alan1----1500/alan2----400
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1500.0)
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=400.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1500.0)
sink = userTemp.max("balance");//1@1.com-3000 -- 2@2.com-300
//3、根据name进行分区统计balance的min alan1----500/alan2---200
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=500.0)
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
sink = userTemp.min("balance");
//4、根据name进行分区统计balance的maxBy alan2----400/alan1----1500
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 1> User(id=4, name=alan2, pwd=4, email=4@4.com, age=30, balance=400.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 16> User(id=3, name=alan1, pwd=3, email=3@3.com, age=28, balance=1500.0)
// 16> User(id=3, name=alan1, pwd=3, email=3@3.com, age=28, balance=1500.0)
sink = userTemp.maxBy("balance");
//5、根据name进行分区统计balance的minBy alan2----200/alan1----500
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 1> User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 16> User(id=1, name=alan1, pwd=1, email=1@1.com, age=12, balance=1000.0)
// 16> User(id=5, name=alan1, pwd=5, email=5@5.com, age=15, balance=500.0)
sink = userTemp.minBy("balance");
sink.print();
}
public static void aggregationsFunction2(StreamExecutionEnvironment env) throws Exception {
List list = new ArrayList<Tuple3<Integer, Integer, Integer>>();
list.add(new Tuple3<>(0,3,6));
list.add(new Tuple3<>(0,2,5));
list.add(new Tuple3<>(0,1,6));
list.add(new Tuple3<>(0,4,3));
list.add(new Tuple3<>(1,1,9));
list.add(new Tuple3<>(1,2,8));
list.add(new Tuple3<>(1,3,10));
list.add(new Tuple3<>(1,2,9));
list.add(new Tuple3<>(1,5,7));
DataStreamSource<Tuple3<Integer, Integer, Integer>> source = env.fromCollection(list);
KeyedStream<Tuple3<Integer, Integer, Integer>, Integer> tTemp= source.keyBy(t->t.f0);
DataStream<Tuple3<Integer, Integer, Integer>> sink =null;
//按照分区,以第一个Tuple3的元素为基础进行第三列值比较,如果第三列值小于第一个tuple3的第三列值,则进行第三列值替换,其他的不变
// 12> (0,3,6)
// 11> (1,1,9)
// 11> (1,1,8)
// 12> (0,3,5)
// 11> (1,1,8)
// 12> (0,3,5)
// 11> (1,1,8)
// 12> (0,3,3)
// 11> (1,1,7)
sink = tTemp.min(2);
// 按照数据分区,以第一个tuple3的元素为基础进行第三列值比较,如果第三列值小于第一个tuple3的第三列值,则进行整个tuple3的替换
// 12> (0,3,6)
// 11> (1,1,9)
// 12> (0,2,5)
// 11> (1,2,8)
// 12> (0,2,5)
// 11> (1,2,8)
// 12> (0,4,3)
// 11> (1,2,8)
// 11> (1,5,7)
sink = tTemp.minBy(2);
sink.print();
}
}
7、first、distinct、join、outjoin、cross
具体事例详见例子及结果
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import org.source_transformation_sink.bean.User;
/**
* @author chenw
*
*/
public class TransformationOthersDemo {
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
joinFunction(env);
env.execute();
}
public static void unionFunction(StreamExecutionEnvironment env) throws Exception {
List<String> info1 = new ArrayList<>();
info1.add("team A");
info1.add("team B");
List<String> info2 = new ArrayList<>();
info2.add("team C");
info2.add("team D");
List<String> info3 = new ArrayList<>();
info3.add("team E");
info3.add("team F");
List<String> info4 = new ArrayList<>();
info4.add("team G");
info4.add("team H");
DataStream<String> source1 = env.fromCollection(info1);
DataStream<String> source2 = env.fromCollection(info2);
DataStream<String> source3 = env.fromCollection(info3);
DataStream<String> source4 = env.fromCollection(info4);
source1.union(source2).union(source3).union(source4).print();
// team A
// team C
// team E
// team G
// team B
// team D
// team F
// team H
}
public static void crossFunction(ExecutionEnvironment env) throws Exception {
// cross,求两个集合的笛卡尔积,得到的结果数为:集合1的条数 乘以 集合2的条数
List<String> info1 = new ArrayList<>();
info1.add("team A");
info1.add("team B");
List<Tuple2<String,Integer>> info2 = new ArrayList<>();
info2.add(new Tuple2("W",3));
info2.add(new Tuple2("D",1));
info2.add(new Tuple2("L",0));
DataSource<String> data1 = env.fromCollection(info1);
DataSource<Tuple2<String,Integer>> data2 = env.fromCollection(info2);
data1.cross(data2).print();
// (team A,(W,3))
// (team A,(D,1))
// (team A,(L,0))
// (team B,(W,3))
// (team B,(D,1))
// (team B,(L,0))
}
public static void outerJoinFunction(ExecutionEnvironment env) throws Exception {
// Outjoin,跟sql语句中的left join,right join,full join意思一样
// leftOuterJoin,跟join一样,但是左边集合的没有关联上的结果也会取出来,没关联上的右边为null
// rightOuterJoin,跟join一样,但是右边集合的没有关联上的结果也会取出来,没关联上的左边为null
// fullOuterJoin,跟join一样,但是两个集合没有关联上的结果也会取出来,没关联上的一边为null
List<Tuple2<Integer,String>> info1 = new ArrayList<>();
info1.add(new Tuple2<>(1,"shenzhen"));
info1.add(new Tuple2<>(2,"guangzhou"));
info1.add(new Tuple2<>(3,"shanghai"));
info1.add(new Tuple2<>(4,"chengdu"));
List<Tuple2<Integer,String>> info2 = new ArrayList<>();
info2.add(new Tuple2<>(1,"深圳"));
info2.add(new Tuple2<>(2,"广州"));
info2.add(new Tuple2<>(3,"上海"));
info2.add(new Tuple2<>(5,"杭州"));
DataSource<Tuple2<Integer,String>> data1 = env.fromCollection(info1);
DataSource<Tuple2<Integer,String>> data2 = env.fromCollection(info2);
//left join
// eft join:7> (1,shenzhen,深圳)
// left join:2> (3,shanghai,上海)
// left join:8> (4,chengdu,未知)
// left join:16> (2,guangzhou,广州)
data1.leftOuterJoin(data2).where(0).equalTo(0).with(new JoinFunction<Tuple2<Integer,String>,Tuple2<Integer,String>,Tuple3<Integer,String,String>>() {
@Override
public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second)
throws Exception {
Tuple3<Integer,String,String> tuple = new Tuple3();
if (second == null) {
tuple.setField(first.f0, 0);
tuple.setField(first.f1, 1);
tuple.setField("未知", 2);
} else {
//另外一种赋值方式,和直接用构造函数赋值相同
tuple.setField(first.f0, 0);
tuple.setField(first.f1, 1);
tuple.setField(second.f1, 2);
}
return tuple;
}}).print("left join");
//right join
// right join:2> (3,shanghai,上海)
// right join:7> (1,shenzhen,深圳)
// right join:15> (5,--,杭州)
// right join:16> (2,guangzhou,广州)
data1.rightOuterJoin(data2).where(0).equalTo(0).with(new JoinFunction<Tuple2<Integer,String>,Tuple2<Integer,String>,Tuple3<Integer,String,String>>() {
@Override
public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second)
throws Exception {
Tuple3<Integer,String,String> tuple = new Tuple3();
if (first == null) {
tuple.setField(second.f0, 0);
tuple.setField("--", 1);
tuple.setField(second.f1, 2);
} else {
//另外一种赋值方式,和直接用构造函数赋值相同
tuple.setField(first.f0, 0);
tuple.setField(first.f1, 1);
tuple.setField(second.f1, 2);
}
return tuple;
}}).print("right join");
//fullOuterJoin
// fullOuterJoin:2> (3,shanghai,上海)
// fullOuterJoin:8> (4,chengdu,--)
// fullOuterJoin:15> (5,--,杭州)
// fullOuterJoin:16> (2,guangzhou,广州)
// fullOuterJoin:7> (1,shenzhen,深圳)
data1.fullOuterJoin(data2).where(0).equalTo(0).with(new JoinFunction<Tuple2<Integer,String>,Tuple2<Integer,String>,Tuple3<Integer,String,String>>() {
@Override
public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second)
throws Exception {
Tuple3<Integer,String,String> tuple = new Tuple3();
if (second == null) {
tuple.setField(first.f0, 0);
tuple.setField(first.f1, 1);
tuple.setField("--", 2);
} else if (first == null) {
tuple.setField(second.f0, 0);
tuple.setField("--", 1);
tuple.setField(second.f1, 2);
} else {
//另外一种赋值方式,和直接用构造函数赋值相同
tuple.setField(first.f0, 0);
tuple.setField(first.f1, 1);
tuple.setField(second.f1, 2);
}
return tuple;
}}).print("fullOuterJoin");
}
public static void joinFunction(ExecutionEnvironment env) throws Exception {
List<Tuple2<Integer,String>> info1 = new ArrayList<>();
info1.add(new Tuple2<>(1,"shenzhen"));
info1.add(new Tuple2<>(2,"guangzhou"));
info1.add(new Tuple2<>(3,"shanghai"));
info1.add(new Tuple2<>(4,"chengdu"));
List<Tuple2<Integer,String>> info2 = new ArrayList<>();
info2.add(new Tuple2<>(1,"深圳"));
info2.add(new Tuple2<>(2,"广州"));
info2.add(new Tuple2<>(3,"上海"));
info2.add(new Tuple2<>(5,"杭州"));
DataSource<Tuple2<Integer,String>> data1 = env.fromCollection(info1);
DataSource<Tuple2<Integer,String>> data2 = env.fromCollection(info2);
//
// join:2> ((3,shanghai),(3,上海))
// join:16> ((2,guangzhou),(2,广州))
// join:7> ((1,shenzhen),(1,深圳))
data1.join(data2).where(0).equalTo(0).print("join");
// join2:2> (3,上海,shanghai)
// join2:7> (1,深圳,shenzhen)
// join2:16> (2,广州,guangzhou)
DataSet<Tuple3<Integer,String,String>> data3 =
data1.join(data2).where(0).equalTo(0).with(new JoinFunction<Tuple2<Integer,String>,Tuple2<Integer,String>,Tuple3<Integer,String,String>>() {
@Override
public Tuple3<Integer, String, String> join(Tuple2<Integer, String> first, Tuple2<Integer, String> second)
throws Exception {
return new Tuple3<Integer,String,String>(first.f0,second.f1,first.f1);
}});
data3.print("join2");
}
public static void firstFunction(ExecutionEnvironment env) throws Exception {
List<Tuple2<Integer, String>> info = new ArrayList<>();
info.add(new Tuple2(1, "Hadoop"));
info.add(new Tuple2(1, "Spark"));
info.add(new Tuple2(1, "Flink"));
info.add(new Tuple2(2, "Scala"));
info.add(new Tuple2(2, "Java"));
info.add(new Tuple2(2, "Python"));
info.add(new Tuple2(3, "Linux"));
info.add(new Tuple2(3, "Window"));
info.add(new Tuple2(3, "MacOS"));
DataSet<Tuple2<Integer, String>> dataSet = env.fromCollection(info);
// 前几个
// dataSet.first(4).print();
// (1,Hadoop)
// (1,Spark)
// (1,Flink)
// (2,Scala)
// 按照tuple2的第一个元素进行分组,查出每组的前2个
// dataSet.groupBy(0).first(2).print();
// (3,Linux)
// (3,Window)
// (1,Hadoop)
// (1,Spark)
// (2,Scala)
// (2,Java)
// 按照tpule2的第一个元素进行分组,并按照倒序排列,查出每组的前2个
dataSet.groupBy(0).sortGroup(1, Order.DESCENDING).first(2).print();
// (3,Window)
// (3,MacOS)
// (1,Spark)
// (1,Hadoop)
// (2,Scala)
// (2,Python)
}
public static void distinctFunction(ExecutionEnvironment env) throws Exception {
List list = new ArrayList<Tuple3<Integer, Integer, Integer>>();
list.add(new Tuple3<>(0, 3, 6));
list.add(new Tuple3<>(0, 2, 5));
list.add(new Tuple3<>(0, 3, 6));
list.add(new Tuple3<>(1, 1, 9));
list.add(new Tuple3<>(1, 2, 8));
list.add(new Tuple3<>(1, 2, 8));
list.add(new Tuple3<>(1, 3, 9));
DataSet<Tuple3<Integer, Integer, Integer>> source = env.fromCollection(list);
// 去除tuple3中元素完全一样的
source.distinct().print();
// (1,3,9)
// (0,3,6)
// (1,1,9)
// (1,2,8)
// (0,2,5)
// 去除tuple3中第一个元素一样的,只保留第一个
// source.distinct(0).print();
// (1,1,9)
// (0,3,6)
// 去除tuple3中第一个和第三个相同的元素,只保留第一个
// source.distinct(0,2).print();
// (0,3,6)
// (1,1,9)
// (1,2,8)
// (0,2,5)
}
public static void distinctFunction2(ExecutionEnvironment env) throws Exception {
DataSet<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 18, 3000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 18, 1000),
new User(5, "alan1", "5", "5@5.com", 28, 1500),
new User(4, "alan2", "4", "4@4.com", 20, 300)));
// source.distinct("name").print();
// User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// User(id=1, name=alan1, pwd=1, email=1@1.com, age=18, balance=3000.0)
source.distinct("name", "age").print();
// User(id=1, name=alan1, pwd=1, email=1@1.com, age=18, balance=3000.0)
// User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// User(id=5, name=alan1, pwd=5, email=5@5.com, age=28, balance=1500.0)
// User(id=4, name=alan2, pwd=4, email=4@4.com, age=20, balance=300.0)
}
public static void distinctFunction3(ExecutionEnvironment env) throws Exception {
DataSet<User> source = env.fromCollection(Arrays.asList(
new User(1, "alan1", "1", "1@1.com", 18, -1000),
new User(2, "alan2", "2", "2@2.com", 19, 200),
new User(3, "alan1", "3", "3@3.com", 18, -1000),
new User(5, "alan1", "5", "5@5.com", 28, 1500),
new User(4, "alan2", "4", "4@4.com", 20, -300)));
//针对balance增加绝对值去重
source.distinct(new KeySelector<User, Double>() {
@Override
public Double getKey(User value) throws Exception {
return Math.abs(value.getBalance());
}
}).print();
// User(id=5, name=alan1, pwd=5, email=5@5.com, age=28, balance=1500.0)
// User(id=2, name=alan2, pwd=2, email=2@2.com, age=19, balance=200.0)
// User(id=1, name=alan1, pwd=1, email=1@1.com, age=18, balance=-1000.0)
// User(id=4, name=alan2, pwd=4, email=4@4.com, age=20, balance=-300.0)
}
public static void distinctFunction4(ExecutionEnvironment env) throws Exception {
List<String> info = new ArrayList<>();
info.add("Hadoop,Spark");
info.add("Spark,Flink");
info.add("Hadoop,Flink");
info.add("Hadoop,Flink");
DataSet<String> source = env.fromCollection(info);
source.flatMap(new FlatMapFunction<String,String>(){
@Override
public void flatMap(String value, Collector<String> out) throws Exception {
System.err.print("come in ");
for (String token : value.split(",")) {
out.collect(token);
}
}});
source.distinct().print();
}
}
8、Window
Window 函数允许按时间或其他条件对现有 KeyedStream 进行分组。 以下是以 10 秒的时间窗口聚合:
inputStream.keyBy(0).window(Time.seconds(10));
Flink 定义数据片段以便(可能)处理无限数据流。 这些切片称为窗口。 此切片有助于通过应用转换处理数据块。 要对流进行窗口化,需要分配一个可以进行分发的键和一个描述要对窗口化流执行哪些转换的函数。要将流切片到窗口,可以使用 Flink 自带的窗口分配器。 我们有选项,如 tumbling windows, sliding windows, global 和 session windows。
具体参考系列文章Flink(七)Flink四大基石之Time和WaterMaker详解与详细示例(watermaker基本使用、kafka作为数据源的watermaker使用示例以及超出最大允许延迟数据的接收实现)
9、WindowAll
windowAll 函数允许对常规数据流进行分组。 通常,这是非并行数据转换,因为它在非分区数据流上运行。
与常规数据流功能类似,也有窗口数据流功能。 唯一的区别是它们处理窗口数据流。 所以窗口缩小就像 Reduce 函数一样,Window fold 就像 Fold 函数一样,并且还有聚合。
10、Union
Union 函数将两个或多个数据流结合在一起。 这样就可以并行地组合数据流。 如果我们将一个流与自身组合,那么它会输出每个记录两次。
public static void unionFunction(ExecutionEnvironment env) throws Exception {
//Produces the union of two DataSets, which have to be of the same type. A union of more than two DataSets can be implemented with multiple union calls
List<String> info1 = new ArrayList<>();
info1.add("team A");
info1.add("team B");
List<String> info2 = new ArrayList<>();
info2.add("team C");
info2.add("team D");
List<String> info3 = new ArrayList<>();
info3.add("team E");
info3.add("team F");
List<String> info4 = new ArrayList<>();
info4.add("team G");
info4.add("team H");
DataSet<String> source1 = env.fromCollection(info1);
DataSet<String> source2 = env.fromCollection(info2);
DataSet<String> source3 = env.fromCollection(info3);
DataSet<String> source4 = env.fromCollection(info4);
source1.union(source2).union(source3).union(source4).print();
// team A
// team C
// team E
// team G
// team B
// team D
// team F
// team H
}
11、Window join
可以通过一些 key 将同一个 window 的两个数据流 join 起来。
#在 5 秒的窗口中连接两个流,其中第一个流的第一个属性的连接条件等于另一个流的第二个属性
inputStream.join(inputStream1)
.where(0).equalTo(1)
.window(Time.seconds(5))
.apply (new JoinFunction () {...});
12、Split
此功能根据条件将流拆分为两个或多个流。 当获得混合流并且可能希望单独处理每个数据流时,可以使用此方法。新版本使用OutputTag替代。
SplitStream<Integer> split = inputStream.split(new OutputSelector<Integer>() {
@Override
public Iterable<String> select(Integer value) {
List<String> output = new ArrayList<String>();
if (value % 2 == 0) {
output.add("even");
}
else {
output.add("odd");
}
return output;
}
});
13、Select
此功能允许您从拆分流中选择特定流。新版本使用OutputTag替代。
SplitStream<Integer> split;
DataStream<Integer> even = split.select("even");
DataStream<Integer> odd = split.select("odd");
DataStream<Integer> all = split.select("even","odd");
14、Project
Project 函数允许从事件流中选择属性子集,并仅将所选元素发送到下一个处理流。
DataStream<Tuple4<Integer, Double, String, String>> in = // [...]
DataStream<Tuple2<String, String>> out = in.project(3,2);
上述函数从给定记录中选择属性号 2 和 3。 以下是示例输入和输出记录:
(1,10.0,A,B)=> (B,A)
(2,20.0,C,D)=> (D,C)
以上,将常用的operator作为单独的示例,使用内部匿名类和lambda各自实现了,有些不常见的或在新版本将替换掉的则叙述不多。