Scalar Function(标量函数)
标量函数:一进一出。
package day07;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;
/**
* @desc: 需求:实现一个sum求和函数,函数名叫:mySum
*/
public class Demo01_ScalarFunction {
    public static void main(String[] args) throws Exception {
        // 1. Build the streaming execution environment and its Table API bridge.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default", "1");

        // 2. Create the source table (reads CSV rows "num1,num2" from a socket).
        /*
         * | num1 | num2 |
         * |  10  |  50  |
         * |  20  |  60  |
         * |  30  |  70  |
         * |  40  |  80  |
         */
        t_env.executeSql("create table source(" +
                "num1 int," +
                "num2 int" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' = 'csv'" +
                ")");

        // 3. Create the sink table (prints each result row to stdout).
        /*
         * | num |
         * | 60  |
         * | 80  |
         * | 100 |
         * | 120 |
         */
        t_env.executeSql("create table sink(" +
                "num int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        // 4. Register the scalar function and run the INSERT pipeline.
        //    executeSql("insert ...") submits its own Flink job; await() blocks
        //    until that job terminates. A trailing env.execute() would throw
        //    "No operators defined in streaming topology" because no DataStream
        //    transformations were added to `env`, so it must NOT be called here.
        t_env.createTemporaryFunction("mySum", MyScalarFunction.class);
        t_env.executeSql("insert into sink select mySum(num1,num2) from source").await();
    }

    /**
     * Scalar function (one row in, one value out), extends {@link ScalarFunction}.
     * Computes the sum of its two arguments.
     */
    public static class MyScalarFunction extends ScalarFunction {
        /**
         * Required eval method — invoked by Flink for each input row.
         *
         * @param a first operand (may be SQL NULL)
         * @param b second operand (may be SQL NULL)
         * @return a + b, or null if either operand is null (SQL null propagation)
         */
        public Integer eval(Integer a, Integer b) {
            // Propagate SQL NULL instead of throwing a NullPointerException.
            if (a == null || b == null) {
                return null;
            }
            return a + b;
        }
    }
}
运行结果如下:
Table Function(表值函数)
表值函数:一进多出。
package day07;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
/**
* @desc: 需求:实现一个类似flatMap方法,函数名叫myFlatMap
*/
public class Demo02_TableFunction {
    public static void main(String[] args) throws Exception {
        // 1. Build the streaming execution environment and its Table API bridge.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default", "1");

        // 2. Create the source table (reads a single int column from a socket).
        /*
         * | num |
         * |  3  | -> 0,1,2
         * |  2  | -> 0,1
         * |  4  | -> 0,1,2,3
         */
        t_env.executeSql("create table source(" +
                "num int" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' ='csv'" +
                ") ");

        // 3. Create the sink table (prints each exploded row to stdout).
        /*
         * | num |
         * |  0  |
         * |  1  |
         * |  2  |
         * |  0  |
         * |  1  |
         * |  0  |
         * |  1  |
         * |  2  |
         * |  3  |
         */
        t_env.executeSql("create table sink(" +
                "num int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        // 4. Register the table function and run the INSERT pipeline.
        //    The LATERAL TABLE join expands each input row into 0..num-1 rows.
        //    executeSql("insert ...") submits its own Flink job; await() blocks
        //    until it terminates. A trailing env.execute() would throw
        //    "No operators defined in streaming topology" since no DataStream
        //    operators were registered on `env`, so it must NOT be called here.
        t_env.createTemporaryFunction("myFlatMap", MyTableFunction.class);
        t_env.executeSql("insert into sink select t from source " +
                "left join lateral table(myFlatMap(num)) as tmp(t) on true").await();
    }

    /**
     * Table function (one row in, many rows out), extends {@link TableFunction}.
     * Behaves like flatMap: emits the integers 0 .. num-1 for each input value.
     */
    public static class MyTableFunction extends TableFunction<Integer> {
        /**
         * Required eval method — emits one row per collect() call.
         *
         * @param num upper bound (exclusive); null or non-positive emits nothing
         */
        public void eval(Integer num) {
            // Guard against SQL NULL to avoid a NullPointerException on unboxing.
            if (num == null) {
                return;
            }
            for (int i = 0; i < num; i++) {
                collect(i);
            }
        }
    }
}
运行结果如下:
Aggregate Function(聚合函数)
聚合函数:多进一出。
package day07;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.AggregateFunction;
/**
* @desc: 需求:实现一个类似于count的函数,函数名叫myCount。
*/
public class Demo03_AggregateFunction {
    public static void main(String[] args) throws Exception {
        // 1. Build the streaming execution environment and its Table API bridge.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default", "1");

        // 2. Create the source table (reads one word per line from a socket).
        /*
         * | word   |
         * | hello  |
         * | hive   |
         * | hadoop |
         * | spark  |
         */
        t_env.executeSql("create table source(" +
                "word varchar" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' = 'csv'" +
                ")");

        // 3. Create the sink table (prints word/count pairs to stdout).
        /*
         * | word   | counts |
         * | hello  |   1    |
         * | hive   |   1    |
         * | hadoop |   1    |
         * | spark  |   1    |
         */
        t_env.executeSql("create table sink(" +
                "word varchar," +
                "counts int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        // 4. Register the aggregate function and run the INSERT pipeline.
        //    executeSql("insert ...") submits its own Flink job; await() blocks
        //    until it terminates. A trailing env.execute() would throw
        //    "No operators defined in streaming topology" because no DataStream
        //    operators were added to `env`, so it must NOT be called here.
        t_env.createTemporaryFunction("myCount", MyAggregateFunction.class);
        t_env.executeSql("insert into sink select word,myCount(1) from source group by word").await();
    }

    /**
     * Aggregate function (many rows in, one value out), a count-like aggregate.
     * Type parameters: result type {@code Integer}, accumulator type
     * {@code MyAccumulator} (a mutable running total).
     */
    public static class MyAggregateFunction extends AggregateFunction<Integer, MyAccumulator> {
        /**
         * Returns the final aggregated result.
         *
         * @param accumulator the accumulator holding the running total
         * @return the accumulated count
         */
        @Override
        public Integer getValue(MyAccumulator accumulator) {
            return accumulator.counts;
        }

        /**
         * Creates a fresh accumulator (counts starts at 0).
         */
        @Override
        public MyAccumulator createAccumulator() {
            return new MyAccumulator();
        }

        /**
         * Accumulates one input value into the running total.
         * SQL aggregates skip NULL inputs, so a null argument is ignored.
         *
         * @param accumulator the accumulator to update
         * @param num         the value to add (here always the literal 1)
         */
        public void accumulate(MyAccumulator accumulator, Integer num) {
            if (num != null) {
                accumulator.counts += num;
            }
        }
    }

    /**
     * Accumulator: a single mutable running total.
     */
    public static class MyAccumulator {
        // Running total; Flink requires accumulator fields to be public and mutable.
        public int counts;
    }
}
运行结果如下:
Table Aggregate Function(表值聚合函数)
表值聚合函数:多进多出。
package day07;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Expressions;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableAggregateFunction;
import org.apache.flink.util.Collector;
import scala.Int;
/**
* @desc: 需求:求top2,单词的top2.
* 分析
* #第一条数据
* hadoop,1 => hadoop,1
*
* #第二条数据
* hadoop,1 => hadoop,2
* hadoop,2 hadoop,1
*
* #第三条数据
* hadoop,1
* hadoop,2 => hadoop,3
* hadoop,3 hadoop,2
*
* #第四条数据
* hadoop,1
* hadoop,2 => hadoop,4
* hadoop,3 hadoop,3
* hadoop,4
*/
public class Demo04_TableAggregateFunction {
    public static void main(String[] args) throws Exception {
        // 1. Build the streaming execution environment and its Table API bridge.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default", "1");

        // 2. Create the source table (reads CSV rows "word,counts" from a socket).
        /*
         * | word   | counts |
         * | hadoop |   1    |
         * | hadoop |   2    |
         * | hadoop |   3    |
         * | hadoop |   4    |
         */
        t_env.executeSql("create table source(" +
                "word varchar," +
                "counts int" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' = 'csv'" +
                ")");

        // 3. Create the sink table (prints word/top-count pairs to stdout).
        t_env.executeSql("create table sink(" +
                "word varchar," +
                "counts int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        // 4. Data processing. Table aggregate functions (many in, many out)
        //    require the Table API flatAggregate call — there is no SQL syntax.
        //
        //    from:          load the source table
        //    groupBy:       group rows by word
        //    flatAggregate: apply top2 to the counts column, alias result as "counts"
        //    select:        project the word and the top-2 counts
        //    executeInsert: submit an INSERT job into the sink table
        //
        //    executeInsert(...) submits its own Flink job; await() blocks until
        //    it terminates. A trailing env.execute() would throw "No operators
        //    defined in streaming topology" because no DataStream operators were
        //    added to `env`, so it must NOT be called here.
        t_env.createTemporaryFunction("top2", MyTableAggregateFunction.class);
        t_env.from("source").groupBy(Expressions.$("word"))
                .flatAggregate(Expressions.call("top2", Expressions.$("counts")).as("counts"))
                .select(Expressions.$("word"), Expressions.$("counts"))
                .executeInsert("sink")
                .await();
    }

    /**
     * Table aggregate function (many rows in, many rows out) that emits the
     * two largest values seen per group.
     * Type parameters: result type {@code Integer}, accumulator type
     * {@code MyAccumulator} (holds the current top-2 values).
     */
    public static class MyTableAggregateFunction extends TableAggregateFunction<Integer, MyAccumulator> {
        /**
         * Creates a fresh accumulator with both slots at Integer.MIN_VALUE
         * (the "empty" sentinel).
         */
        @Override
        public MyAccumulator createAccumulator() {
            return new MyAccumulator();
        }

        /**
         * Folds one input value into the running top-2.
         * SQL aggregates skip NULL inputs, so a null argument is ignored.
         *
         * @param accumulator the running top-2 state
         * @param num         the new value
         */
        public void accumulate(MyAccumulator accumulator, Integer num) {
            if (num == null) {
                return;
            }
            if (num > accumulator.first) {
                // New maximum: previous maximum becomes the runner-up.
                accumulator.second = accumulator.first;
                accumulator.first = num;
            } else if (num > accumulator.second) {
                // e.g. first=10, second=5, num=8 -> second becomes 8.
                accumulator.second = num;
            }
        }

        /**
         * Emits the current top-2 values. Slots still holding the
         * Integer.MIN_VALUE sentinel have never been filled and are skipped.
         *
         * @param accumulator the running top-2 state
         * @param out         collector used to emit result rows
         */
        public void emitValue(MyAccumulator accumulator, Collector<Integer> out) {
            if (accumulator.first != Integer.MIN_VALUE) {
                out.collect(accumulator.first);
            }
            if (accumulator.second != Integer.MIN_VALUE) {
                out.collect(accumulator.second);
            }
        }
    }

    /**
     * Accumulator for the top-2 aggregate: two slots, initialized to
     * Integer.MIN_VALUE as an "empty" sentinel.
     */
    public static class MyAccumulator {
        // Largest value seen so far (Integer.MIN_VALUE = not yet set).
        public int first = Integer.MIN_VALUE;
        // Second-largest value seen so far (Integer.MIN_VALUE = not yet set).
        public int second = Integer.MIN_VALUE;
    }
}
运行结果如下: