Contents
I. Simple Example and Program Template
1. A Simple Example
2. Program Structure
II. Aggregation Queries
1. Group Aggregation
2. Window Aggregation
3. Over Aggregation
III. Top N
1. Regular Top N
2. Window Top N
IV. User-Defined Functions (UDF)
1. Overall Invocation Flow
(1) Registering a Function
(2) Calling a Function with the Table API
(3) Calling a Function in SQL
2. Scalar Functions
3. Table Functions
4. Aggregate Functions
5. Table Aggregate Functions
I. Simple Example and Program Template
1. A Simple Example
public static void main(String[] args) throws Exception {
    // Get the stream execution environment
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // 1. Read the data source
    SingleOutputStreamOperator<Event> eventStream = env
            .fromElements(
                    new Event("Alice", "./home", 1000L),
                    new Event("Bob", "./cart", 1000L),
                    new Event("Alice", "./prod?id=1", 5 * 1000L),
                    new Event("Cary", "./home", 60 * 1000L),
                    new Event("Bob", "./prod?id=3", 90 * 1000L),
                    new Event("Alice", "./prod?id=7", 105 * 1000L)
            );

    // 2. Create the table environment
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 3. Convert the DataStream into a Table
    Table eventTable = tableEnv.fromDataStream(eventStream);

    // 4. Extract data by executing SQL (`user` is a reserved keyword in Flink SQL, so it is escaped with backticks)
    Table resultTable1 = tableEnv.sqlQuery("select url, `user` from " + eventTable);

    // 5. Transform directly on the Table with the Table API
    Table resultTable2 = eventTable.select($("user"), $("url"))
            .where($("user").isEqual("Alice"));

    // 6. Convert the tables back into data streams and print them
    tableEnv.toDataStream(resultTable1).print("result1");
    tableEnv.toDataStream(resultTable2).print("result2");

    // Execute the program
    env.execute();
}
2. Program Structure
public static void main(String[] args) {
    // Create the table environment
    TableEnvironment tableEnv = ...;

    // Create an input table, connected to an external system for reading data
    tableEnv.executeSql("CREATE TEMPORARY TABLE inputTable ... WITH ( 'connector' = ... )");

    // Register a table connected to an external system, used for output
    tableEnv.executeSql("CREATE TEMPORARY TABLE outputTable ... WITH ( 'connector' = ... )");

    // Query/transform the table with SQL, producing a new table
    Table table1 = tableEnv.sqlQuery("SELECT ... FROM inputTable... ");

    // Query/transform the table with the Table API, producing a new table
    Table table2 = tableEnv.from("inputTable").select(...);

    // Write the result into the output table
    TableResult tableResult = table1.executeInsert("outputTable");
}
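As a concrete illustration only, the skeleton above can be filled in roughly as follows. This is a minimal sketch, assuming a CSV file source and the print connector as the sink; the connector choices, path, and column names are not part of the original template.

// Minimal sketch of the template above (assumed connectors: filesystem/CSV in, print out)
TableEnvironment tableEnv = TableEnvironment.create(
        EnvironmentSettings.newInstance().inStreamingMode().build());
tableEnv.executeSql("CREATE TEMPORARY TABLE inputTable ( " +
        " `user` STRING, url STRING " +
        ") WITH ( " +
        " 'connector' = 'filesystem', " +
        " 'path' = 'input/clicks.csv', " +  // hypothetical path
        " 'format' = 'csv' )");
tableEnv.executeSql("CREATE TEMPORARY TABLE outputTable ( " +
        " `user` STRING, url STRING " +
        ") WITH ( 'connector' = 'print' )");
Table table1 = tableEnv.sqlQuery("SELECT `user`, url FROM inputTable");
table1.executeInsert("outputTable");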
II. Aggregation Queries
1. Group Aggregation
// 1. Group aggregation
Table aggTable = tableEnv.sqlQuery("SELECT user_name, COUNT(1) FROM clickTable GROUP BY user_name");

// 2. Group window aggregation (older syntax; superseded by the windowing TVFs below)
Table groupWindowResultTable = tableEnv.sqlQuery("SELECT " +
        "user_name, " +
        "COUNT(1) AS cnt, " +
        "TUMBLE_END(et, INTERVAL '10' SECOND) as endT " +
        "FROM clickTable " +
        "GROUP BY " +                          // group by window and user name
        "  user_name, " +
        "  TUMBLE(et, INTERVAL '10' SECOND)"   // define a 10-second tumbling window
);
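The snippets in this section assume a registered table clickTable with an event-time attribute et. A minimal sketch of a matching DDL, adapted from the one used in the Top N section below (with the user column renamed to user_name to fit these queries):

// Sketch: the clickTable these queries assume (adapted from the Top N section's DDL)
tableEnv.executeSql("CREATE TABLE clickTable (" +
        " user_name STRING, " +
        " url STRING, " +
        " ts BIGINT, " +
        " et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), " +
        " WATERMARK FOR et AS et - INTERVAL '1' SECOND " +
        ") WITH (" +
        " 'connector' = 'filesystem', " +
        " 'path' = 'input/clicks.csv', " +
        " 'format' = 'csv' )");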
2. Window Aggregation
In Flink's Table API and SQL, window computation is implemented through "window aggregation". Like group aggregation, window aggregation calls aggregate functions such as SUM(), MAX(), MIN(), and COUNT(), and specifies the grouping fields with a GROUP BY clause; the difference is that the window information must be defined as part of the grouping key. Starting from version 1.13, Flink uses "windowing table-valued functions" (windowing TVFs): the window call itself returns a table, so it appears in the FROM clause, while the window's added columns window_start and window_end appear after GROUP BY.
// 3. Window aggregation
// 3.1 Tumbling window: 10-second windows
Table tumbleWindowResultTable = tableEnv.sqlQuery("SELECT user_name, COUNT(url) AS cnt, " +
        " window_end AS endT " +
        "FROM TABLE( " +
        "  TUMBLE( TABLE clickTable, DESCRIPTOR(et), INTERVAL '10' SECOND)" +
        ") " +
        "GROUP BY user_name, window_start, window_end "
);

// 3.2 Sliding (hop) window: 10-second windows sliding every 5 seconds
Table hopWindowResultTable = tableEnv.sqlQuery("SELECT user_name, COUNT(url) AS cnt, " +
        " window_end AS endT " +
        "FROM TABLE( " +
        "  HOP( TABLE clickTable, DESCRIPTOR(et), INTERVAL '5' SECOND, INTERVAL '10' SECOND)" +
        ") " +
        "GROUP BY user_name, window_start, window_end "
);

// 3.3 Cumulate window: fires every 5 seconds, accumulating up to a 10-second window
Table cumulateWindowResultTable = tableEnv.sqlQuery("SELECT user_name, COUNT(url) AS cnt, " +
        " window_end AS endT " +
        "FROM TABLE( " +
        "  CUMULATE( TABLE clickTable, DESCRIPTOR(et), INTERVAL '5' SECOND, INTERVAL '10' SECOND)" +
        ") " +
        "GROUP BY user_name, window_start, window_end "
);
3. Over Aggregation
// 4. Over aggregation: average of ts over the previous 3 rows plus the current row, per user
Table overWindowResultTable = tableEnv.sqlQuery("SELECT user_name, " +
        " avg(ts) OVER (" +
        "   PARTITION BY user_name " +
        "   ORDER BY et " +
        "   ROWS BETWEEN 3 PRECEDING AND CURRENT ROW" +
        ") AS avg_ts " +
        "FROM clickTable");
SELECT `user`, ts,
  COUNT(url) OVER (
    PARTITION BY `user`
    ORDER BY ts
    RANGE BETWEEN INTERVAL '1' HOUR PRECEDING AND CURRENT ROW
  ) AS cnt
FROM EventTable
Here ts serves as the event-time attribute. For each row of EventTable, the query aggregates over all rows from the preceding hour, counting each user's url visits and naming the result cnt. Each output row then carries the original user and ts columns plus the derived cnt.
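For comparison, a sketch of the same over window written in the Table API. This assumes a Table object eventTable whose ts column is a time attribute; Over and lit come from org.apache.flink.table.api.

// Sketch: the equivalent over window in the Table API (assumes ts is a time attribute)
Table overResult = eventTable
        .window(Over.partitionBy($("user"))
                .orderBy($("ts"))
                .preceding(lit(1).hours())   // 1-hour RANGE preceding
                .as("w"))
        .select($("user"), $("ts"),
                $("url").count().over($("w")).as("cnt"));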
III. Top N
1. Regular Top N
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 1. Define the time attribute directly in the CREATE TABLE DDL
    String createDDL = "CREATE TABLE clickTable (" +
            " `user` STRING, " +
            " url STRING, " +
            " ts BIGINT, " +
            " et AS TO_TIMESTAMP( FROM_UNIXTIME(ts / 1000) ), " +
            " WATERMARK FOR et AS et - INTERVAL '1' SECOND " +
            ") WITH (" +
            " 'connector' = 'filesystem', " +
            " 'path' = 'input/clicks.csv', " +
            " 'format' = 'csv' " +
            ")";
    tableEnv.executeSql(createDDL);

    // Regular Top N: select the 2 users with the highest view counts overall
    Table topNResultTable = tableEnv.sqlQuery("SELECT `user`, cnt, row_num " +
            "FROM (" +
            "   SELECT *, ROW_NUMBER() OVER (" +
            "      ORDER BY cnt DESC" +
            "   ) AS row_num " +
            "   FROM (SELECT `user`, COUNT(url) AS cnt FROM clickTable GROUP BY `user`)" +
            ") WHERE row_num <= 2");

    tableEnv.toChangelogStream(topNResultTable).print("top 2: ");
    env.execute();
}
2. Window Top N
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // Read the data source, assign timestamps and generate watermarks
    SingleOutputStreamOperator<Event> eventStream = env
            .fromElements(
                    new Event("Alice", "./home", 1000L),
                    new Event("Bob", "./cart", 1000L),
                    new Event("Alice", "./prod?id=1", 25 * 60 * 1000L),
                    new Event("Alice", "./prod?id=4", 55 * 60 * 1000L),
                    new Event("Bob", "./prod?id=5", 3600 * 1000L + 60 * 1000L),
                    new Event("Cary", "./home", 3600 * 1000L + 30 * 60 * 1000L),
                    new Event("Cary", "./prod?id=7", 3600 * 1000L + 59 * 60 * 1000L)
            )
            .assignTimestampsAndWatermarks(
                    WatermarkStrategy.<Event>forMonotonousTimestamps()
                            .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                                @Override
                                public long extractTimestamp(Event element, long recordTimestamp) {
                                    return element.timestamp;
                                }
                            })
            );

    // Create the table environment
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // Convert the stream into a table and specify the time attribute:
    // designate timestamp as event time and rename it ts
    Table eventTable = tableEnv.fromDataStream(
            eventStream,
            $("user"),
            $("url"),
            $("timestamp").rowtime().as("ts")
    );

    // Register the table as EventTable so it can be referenced in SQL
    tableEnv.createTemporaryView("EventTable", eventTable);

    // Define a subquery performing window aggregation, yielding window info, user and visit count
    String subQuery =
            "SELECT window_start, window_end, `user`, COUNT(url) as cnt " +
            "FROM TABLE ( " +
            "  TUMBLE( TABLE EventTable, DESCRIPTOR(ts), INTERVAL '1' HOUR )) " +
            "GROUP BY window_start, window_end, `user` ";

    // Define the outer Top N query
    String topNQuery =
            "SELECT * " +
            "FROM (" +
            "   SELECT *, " +
            "   ROW_NUMBER() OVER ( " +
            "      PARTITION BY window_start, window_end " +
            "      ORDER BY cnt desc " +
            "   ) AS row_num " +
            "   FROM (" + subQuery + ")) " +
            "WHERE row_num <= 2";

    // Execute the SQL and get the result table
    Table result = tableEnv.sqlQuery(topNQuery);
    tableEnv.toDataStream(result).print();
    env.execute();
}
IV. User-Defined Functions (UDF)
1. Overall Invocation Flow
(1) Registering a Function
To register a function, call the table environment's createTemporarySystemFunction() method, passing in the name to register and the Class object of the UDF class:

// Register the function
tableEnv.createTemporarySystemFunction("MyFunction", MyFunction.class);

(2) Calling a Function with the Table API
In the Table API, a custom function is invoked via the call() method:

tableEnv.from("MyTable").select(call("MyFunction", $("myField")));

(3) Calling a Function in SQL
tableEnv.sqlQuery("SELECT MyFunction(myField) FROM MyTable");
2. Scalar Functions
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 1. Custom data source, converted from a stream
    SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
            .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                    .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                        @Override
                        public long extractTimestamp(Event element, long recordTimestamp) {
                            return element.timestamp;
                        }
                    })
            );

    // 2. Convert the stream into a table
    Table eventTable = tableEnv.fromDataStream(eventStream,
            $("user"),
            $("url"),
            $("timestamp").rowtime().as("ts"));
    tableEnv.createTemporaryView("EventTable", eventTable);

    // 3. Register the custom scalar function
    tableEnv.createTemporarySystemFunction("MyHash", MyHash.class);

    // 4. Call the UDF in a query
    Table resultTable = tableEnv.sqlQuery("select `user`, MyHash(`user`) from EventTable");

    // 5. Output to the console
    tableEnv.executeSql("create table output (" +
            "uname STRING, " +
            "myhash INT ) " +
            "WITH (" +
            "'connector' = 'print')");
    resultTable.executeInsert("output");
}

// A custom ScalarFunction: eval() is invoked once per row and returns a single value
public static class MyHash extends ScalarFunction {
    public int eval(String str) {
        return str.hashCode();
    }
}
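When type inference from eval()'s signature is not sufficient, argument and result types can be pinned down with the @DataTypeHint annotation from org.apache.flink.table.annotation. A hedged sketch of such a variant (MyHashWithHint is an illustrative name, not from the original):

// Sketch: the same hash logic with explicit type hints
public static class MyHashWithHint extends ScalarFunction {
    public @DataTypeHint("INT") Integer eval(@DataTypeHint("STRING") String str) {
        return str == null ? null : str.hashCode(); // null-safe, unlike the primitive-int version
    }
}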
3. Table Functions
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 1. Custom data source, converted from a stream
    SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
            .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                    .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                        @Override
                        public long extractTimestamp(Event element, long recordTimestamp) {
                            return element.timestamp;
                        }
                    })
            );

    // 2. Convert the stream into a table
    Table eventTable = tableEnv.fromDataStream(eventStream,
            $("user"),
            $("url"),
            $("timestamp").rowtime().as("ts"));
    tableEnv.createTemporaryView("EventTable", eventTable);

    // 3. Register the custom table function
    tableEnv.createTemporarySystemFunction("MySplit", MySplit.class);

    // 4. Call the UDF in a query: lateral-join each input row with the function's output rows
    Table resultTable = tableEnv.sqlQuery("select `user`, url, word, length " +
            "from EventTable, LATERAL TABLE( MySplit(url) ) AS T(word, length)");

    // 5. Output to the console
    tableEnv.executeSql("create table output (" +
            "uname STRING, " +
            "url STRING, " +
            "word STRING, " +
            "length INT) " +
            "WITH (" +
            "'connector' = 'print')");
    resultTable.executeInsert("output");
}

// A custom TableFunction. Note the type parameter: each output row is a two-field tuple
public static class MySplit extends TableFunction<Tuple2<String, Integer>> {
    public void eval(String str) {
        String[] fields = str.split("\\?"); // escape the question mark (and the backslash itself, in the Java string)
        for (String field : fields) {
            collect(Tuple2.of(field, field.length()));
        }
    }
}
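The comma join above behaves like a cross join, so input rows for which MySplit emits nothing disappear from the result. A sketch of the LEFT JOIN variant, which keeps such rows with NULL-filled columns (the literal ON TRUE condition is required by Flink SQL):

// Sketch: LEFT JOIN LATERAL keeps input rows even when MySplit emits no output
Table leftJoinResult = tableEnv.sqlQuery("select `user`, url, word, length " +
        "from EventTable " +
        "LEFT JOIN LATERAL TABLE( MySplit(url) ) AS T(word, length) ON TRUE");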
4. Aggregate Functions
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 1. Custom data source, converted from a stream
    SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
            .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                    .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                        @Override
                        public long extractTimestamp(Event element, long recordTimestamp) {
                            return element.timestamp;
                        }
                    })
            );

    // 2. Convert the stream into a table
    Table eventTable = tableEnv.fromDataStream(eventStream,
            $("user"),
            $("url"),
            $("timestamp").as("ts"),
            $("rt").rowtime());
    tableEnv.createTemporaryView("EventTable", eventTable);

    // 3. Register the custom aggregate function
    tableEnv.createTemporarySystemFunction("WeightedAverage", WeightedAverage.class);

    // 4. Call the UDF in a query; here the weight is simply set to 1
    Table resultTable = tableEnv.sqlQuery("select `user`, " +
            " WeightedAverage(ts, 1) as weighted_avg " +
            "from EventTable " +
            "group by `user`");

    // 5. Output to the console
    tableEnv.executeSql("create table output (" +
            "uname STRING, " +
            "weighted_avg BIGINT) " +
            "WITH (" +
            "'connector' = 'print')");
    resultTable.executeInsert("output");
}

// A separate accumulator type
public static class WeightedAvgAccumulator {
    public long sum = 0;  // weighted sum
    public int count = 0; // total weight (equals the number of rows when every weight is 1)
}

// A custom AggregateFunction computing a weighted average
public static class WeightedAverage extends AggregateFunction<Long, WeightedAvgAccumulator> {
    @Override
    public Long getValue(WeightedAvgAccumulator accumulator) {
        if (accumulator.count == 0)
            return null; // guard against division by zero
        else
            return accumulator.sum / accumulator.count;
    }

    @Override
    public WeightedAvgAccumulator createAccumulator() {
        return new WeightedAvgAccumulator();
    }

    // The accumulate method, analogous to an add operation
    public void accumulate(WeightedAvgAccumulator accumulator, Long iValue, Integer iWeight) {
        accumulator.sum += iValue * iWeight; // the value is counted iWeight times
        accumulator.count += iWeight;
    }
}
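On an update stream (for example, when this aggregate consumes the output of another non-windowed aggregation) or in bounded OVER windows, Flink additionally requires a retract() method that undoes an accumulated row. A sketch of what it would look like for this accumulator:

// Sketch: retract method, the inverse of accumulate (needed on retraction/update inputs)
public void retract(WeightedAvgAccumulator accumulator, Long iValue, Integer iWeight) {
    accumulator.sum -= iValue * iWeight;
    accumulator.count -= iWeight;
}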
5. Table Aggregate Functions
public static void main(String[] args) throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);

    // 1. Custom data source, converted from a stream
    SingleOutputStreamOperator<Event> eventStream = env.addSource(new ClickSource())
            .assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forBoundedOutOfOrderness(Duration.ZERO)
                    .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                        @Override
                        public long extractTimestamp(Event element, long recordTimestamp) {
                            return element.timestamp;
                        }
                    })
            );

    // 2. Convert the stream into a table
    Table eventTable = tableEnv.fromDataStream(eventStream,
            $("user"),
            $("url"),
            $("timestamp").as("ts"),
            $("rt").rowtime());
    tableEnv.createTemporaryView("EventTable", eventTable);

    // 3. Tumbling-window aggregation: each user's view count in each window
    Table windowAggTable = tableEnv.sqlQuery("select `user`, count(url) as cnt, " +
            "window_end " +
            "from TABLE(" +
            "  TUMBLE( TABLE EventTable, DESCRIPTOR(rt), INTERVAL '10' SECOND )" +
            ")" +
            "group by `user`," +
            "  window_start," +
            "  window_end");
    tableEnv.createTemporaryView("AggTable", windowAggTable);

    // 4. Register the table aggregate function
    tableEnv.createTemporarySystemFunction("Top2", Top2.class);

    // 5. Call the function in the Table API (table aggregate functions cannot be called in plain SQL)
    Table resultTable = tableEnv.from("AggTable")
            .groupBy($("window_end"))
            .flatAggregate(call("Top2", $("cnt")).as("value", "rank"))
            .select($("window_end"), $("value"), $("rank"));

    // 6. Output to the console
    tableEnv.toChangelogStream(resultTable).print();
    env.execute();
}
// Accumulator type: holds the largest and second-largest values seen so far
public static class Top2Accumulator {
    public Long first;
    public Long second;
}

// A custom table aggregate function: finds the top two values in a group,
// emitting (value, rank) tuples
public static class Top2 extends TableAggregateFunction<Tuple2<Long, Integer>, Top2Accumulator> {
    @Override
    public Top2Accumulator createAccumulator() {
        Top2Accumulator acc = new Top2Accumulator();
        acc.first = Long.MIN_VALUE;  // initialize to the minimum value to simplify comparisons
        acc.second = Long.MIN_VALUE;
        return acc;
    }

    // Called once per input row; decides whether to update the accumulator
    public void accumulate(Top2Accumulator acc, Long value) {
        if (value > acc.first) {
            acc.second = acc.first;
            acc.first = value;
        } else if (value > acc.second) {
            acc.second = value;
        }
    }

    // Emit (value, rank) tuples: up to two output rows
    public void emitValue(Top2Accumulator acc, Collector<Tuple2<Long, Integer>> out) {
        if (acc.first != Long.MIN_VALUE) {
            out.collect(Tuple2.of(acc.first, 1));
        }
        if (acc.second != Long.MIN_VALUE) {
            out.collect(Tuple2.of(acc.second, 2));
        }
    }
}
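emitValue() re-emits the full top-2 list on every update. For unbounded aggregations, Flink alternatively supports emitUpdateWithRetract(), which emits only changed rows through a RetractableCollector (an inner interface of TableAggregateFunction). A rough sketch of the idea, under the assumption that the accumulator also remembers what was last emitted; Top2Incr and its accumulator are illustrative names, not from the original:

// Sketch: incremental emission; retracts a stale row before emitting its replacement
public static class Top2IncrAccumulator {
    public Long first = Long.MIN_VALUE;
    public Long second = Long.MIN_VALUE;
    public Long oldFirst = Long.MIN_VALUE;  // rank-1 value emitted last time
    public Long oldSecond = Long.MIN_VALUE; // rank-2 value emitted last time
}

public static class Top2Incr extends TableAggregateFunction<Tuple2<Long, Integer>, Top2IncrAccumulator> {
    @Override
    public Top2IncrAccumulator createAccumulator() {
        return new Top2IncrAccumulator();
    }

    public void accumulate(Top2IncrAccumulator acc, Long value) {
        if (value > acc.first) {
            acc.second = acc.first;
            acc.first = value;
        } else if (value > acc.second) {
            acc.second = value;
        }
    }

    public void emitUpdateWithRetract(Top2IncrAccumulator acc, RetractableCollector<Tuple2<Long, Integer>> out) {
        if (!acc.first.equals(acc.oldFirst)) {
            if (acc.oldFirst != Long.MIN_VALUE) {
                out.retract(Tuple2.of(acc.oldFirst, 1)); // withdraw the stale rank-1 row
            }
            out.collect(Tuple2.of(acc.first, 1));
            acc.oldFirst = acc.first;
        }
        if (!acc.second.equals(acc.oldSecond)) {
            if (acc.oldSecond != Long.MIN_VALUE) {
                out.retract(Tuple2.of(acc.oldSecond, 2)); // withdraw the stale rank-2 row
            }
            out.collect(Tuple2.of(acc.second, 2));
            acc.oldSecond = acc.second;
        }
    }
}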