Scalar Function

A scalar function is one-in, one-out: each input row produces exactly one output value.

package day07;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;

/**
 * @desc: Requirement: implement a sum function named mySum.
 */
public class Demo01_ScalarFunction {
    public static void main(String[] args) throws Exception {
        //1. Build the streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default","1");

        //2. Create the source table
        /**
         *      |   num1    |   num2    |
         *      |   10      |    50     |
         *      |   20      |    60     |
         *      |   30      |    70     |
         *      |   40      |    80     |
         */
        t_env.executeSql("create table source(" +
                "num1 int," +
                "num2 int" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' = 'csv'" +
                ")");

        //3. Create the sink table
        /**
         *      |    num     |
         *      |     60     |
         *      |     80     |
         *      |    100     |
         *      |    120     |
         */
        t_env.executeSql("create table sink(" +
                "num int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        //4. Data processing: register the UDF, then call it in SQL
        t_env.createTemporaryFunction("mySum", MyScalarFunction.class);
        t_env.executeSql("insert into sink select mySum(num1,num2) from source").await();

        //5. executeSql(...).await() above already submits the streaming job and blocks on it,
        //   so env.execute() is not needed (this pipeline defines no DataStream operators).
    }

    /**
     * Implementation class; extends ScalarFunction.
     */
    public static class MyScalarFunction extends ScalarFunction {
        //A scalar UDF must define a public eval method (Flink finds it via reflection)
        //and calls it once per input row.
        public Integer eval(Integer a, Integer b) {
            return a + b;
        }
    }
}
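
A note on running these demos: the socket connector used in the source tables is not one of Flink's built-in connectors (it comes from the course's example code and must be on the classpath). It reads raw text from a server on node1, which you can start with nc -lk 9999 and then type CSV lines such as 10,50.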

The run output is as follows:

(screenshot: console output of the print sink)
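
With the sample inputs sketched in the source-table comment (10,50 / 20,60 / 30,70 / 40,80), the print sink should emit something like the following (print prefixes each row with its changelog kind, +I meaning insert):

+I[60]
+I[80]
+I[100]
+I[120]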

Table Function

A table function is one-in, many-out: each input row can produce zero or more output rows.
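
In SQL, a table function is applied via a lateral join. The demo below uses LEFT JOIN LATERAL TABLE(...) ON TRUE, which keeps every input row even when the function emits nothing (the output columns are padded with NULL); a plain CROSS JOIN LATERAL would instead drop rows for which the function produces no output.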

package day07;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;

/**
 * @desc: Requirement: implement a flatMap-like function named myFlatMap.
 */
public class Demo02_TableFunction {
    public static void main(String[] args) throws Exception {
        //1. Build the streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default","1");

        //2. Create the source table
        /**
         *      |    num    |
         *      |     3     | -> 0,1,2
         *      |     2     | -> 0,1
         *      |     4     | -> 0,1,2,3
         */
        t_env.executeSql("create table source(" +
                "num int" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' ='csv'" +
                ") ");

        //3. Create the sink table
        /**
         *      |    num    |
         *      |     0     |
         *      |     1     |
         *      |     2     |
         *      |     0     |
         *      |     1     |
         *      |     0     |
         *      |     1     |
         *      |     2     |
         *      |     3     |
         */
        t_env.executeSql("create table sink(" +
                "num int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        //4. Data processing: register the UDF, then apply it with a lateral join
        t_env.createTemporaryFunction("myFlatMap", MyTableFunction.class);
        t_env.executeSql("insert into sink select t from source left join lateral table(myFlatMap(num)) as tmp(t) on true").await();

        //5. As before, executeSql(...).await() submits and blocks on the job itself;
        //   env.execute() is not needed.

    }

    /**
     * Custom class; extends TableFunction.
     */
    public static class MyTableFunction extends TableFunction<Integer> {
        /**
         * The eval method a TableFunction must define (found via reflection);
         * call collect(...) once per output row.
         * @param num the input value
         */
        public void eval(Integer num) {
            for (int i = 0; i < num; i++) {
                collect(i);
            }
        }
    }
}
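
This demo emits plain integers, so Flink infers the output type from TableFunction<Integer>. When a table function needs to emit structured rows, the output type can be declared with annotations. A minimal sketch (the SplitFunction name and its ROW fields are illustrative, not part of the demo above):

import org.apache.flink.table.annotation.DataTypeHint;
import org.apache.flink.table.annotation.FunctionHint;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

//Hypothetical example: split a space-separated string into (token, length) rows.
@FunctionHint(output = @DataTypeHint("ROW<word STRING, length INT>"))
public class SplitFunction extends TableFunction<Row> {
    public void eval(String str) {
        for (String s : str.split(" ")) {
            collect(Row.of(s, s.length())); //one output row per token
        }
    }
}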

The run output is as follows:

(screenshot: console output of the print sink)
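
With the sample inputs sketched in the source-table comment (3, 2, 4), the print sink should emit something like:

+I[0]
+I[1]
+I[2]
+I[0]
+I[1]
+I[0]
+I[1]
+I[2]
+I[3]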

Aggregate Function

An aggregate function is many-in, one-out: it aggregates multiple input rows into one result value.

package day07;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.AggregateFunction;

/**
 * @desc: Requirement: implement a count-like function named myCount.
 */
public class Demo03_AggregateFunction {
    public static void main(String[] args) throws Exception {
        //1. Build the streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default","1");

        //2. Create the source table
        /**
         *  |   word    |
         *  |   hello   |
         *  |   hive    |
         *  |  hadoop   |
         *  |  spark    |
         */
        t_env.executeSql("create table source(" +
                "word varchar" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' = 'csv'" +
                ")");

        //3. Create the sink table
        /**
         *  |   word    |   counts  |
         *  |   hello   |     1     |
         *  |   hive    |     1     |
         *  |   hadoop  |     1     |
         *  |   spark   |     1     |
         */
        t_env.executeSql("create table sink(" +
                "word varchar," +
                "counts int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        //4. Data processing: register the UDF, then call it in a grouped aggregation
        t_env.createTemporaryFunction("myCount", MyAggregateFunction.class);
        t_env.executeSql("insert into sink select word,myCount(1) from source group by word").await();

        //5. As before, executeSql(...).await() submits and blocks on the job itself;
        //   env.execute() is not needed.

    }

    /**
     * T: the final aggregated result type.
     * ACC: the accumulator type (essentially just an object holding intermediate state).
     */
    public static class MyAggregateFunction extends AggregateFunction<Integer,MyAccumulator> {
        /**
         * Returns the final aggregated result.
         * @param accumulator the accumulator
         * @return the final result
         */
        @Override
        public Integer getValue(MyAccumulator accumulator) {
            return accumulator.counts;
        }

        /**
         * Creates the accumulator.
         * @return a new accumulator instance
         */
        @Override
        public MyAccumulator createAccumulator() {
            return new MyAccumulator();
        }

        /**
         * Folds an input value into the intermediate result.
         * Note: accumulate is not declared on AggregateFunction itself; Flink finds it via reflection.
         * @param accumulator the accumulator
         * @param num the input value
         */
        public void accumulate(MyAccumulator accumulator, Integer num) {
            accumulator.counts += num;
        }
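
        /**
         * Not part of the original demo: in some execution modes (e.g. bounded/batch
         * jobs or session-window aggregations) Flink additionally requires a merge
         * method that folds other accumulators into this one. A minimal sketch:
         */
        public void merge(MyAccumulator accumulator, Iterable<MyAccumulator> others) {
            for (MyAccumulator other : others) {
                accumulator.counts += other.counts;
            }
        }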
    }

    public static class MyAccumulator {
        //the field holding the running count
        public int counts;
    }
}

The run output is as follows:

(screenshot: console output of the print sink)
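
With the four distinct words from the source-table comment, each group produces a single insert:

+I[hello, 1]
+I[hive, 1]
+I[hadoop, 1]
+I[spark, 1]

If a word arrived a second time, print would additionally show the update as a retraction pair, e.g. -U[hello, 1] followed by +U[hello, 2].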

Table Aggregate Function

A table aggregate function is many-in, many-out: it aggregates multiple input rows and can emit multiple output rows.
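
Unlike the three function types above, a table aggregate function cannot be called from plain SQL; Flink exposes it only through the Table API's flatAggregate method, which the demo below uses.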

package day07;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Expressions;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableAggregateFunction;
import org.apache.flink.util.Collector;

/**
 * @desc: Requirement: compute the top-2 counts per word.
 * Analysis:
 * # 1st record
 * hadoop,1   =>  hadoop,1
 *
 * # 2nd record
 * hadoop,1   =>  hadoop,2
 * hadoop,2       hadoop,1
 *
 * # 3rd record
 * hadoop,1
 * hadoop,2   =>  hadoop,3
 * hadoop,3       hadoop,2
 *
 * # 4th record
 * hadoop,1
 * hadoop,2   =>  hadoop,4
 * hadoop,3       hadoop,3
 * hadoop,4
 */
public class Demo04_TableAggregateFunction {
    public static void main(String[] args) throws Exception {
        //1. Build the streaming execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment t_env = StreamTableEnvironment.create(env);
        t_env.getConfig().set("parallelism.default","1");

        //2. Create the source table
        /**
         *  |   word    |   counts  |
         *  |   hadoop  |      1    |
         *  |   hadoop  |      2    |
         *  |   hadoop  |      3    |
         *  |   hadoop  |      4    |
         */
        t_env.executeSql("create table source(" +
                "word varchar," +
                "counts int" +
                ") with (" +
                "'connector' = 'socket'," +
                "'hostname' = 'node1'," +
                "'port' = '9999'," +
                "'format' = 'csv'" +
                ")");

        //3. Create the sink table
        t_env.executeSql("create table sink(" +
                "word varchar," +
                "counts int" +
                ") with (" +
                "'connector' = 'print'" +
                ")");

        //4. Data processing
        //A many-in, many-out aggregation requires the Table API's flatAggregate method.
        /**
         * from: read the source table
         * groupBy: group by word
         * top2: the custom top-2 function, applied to the counts column; its result column is aliased counts
         * select: select the word column and the aliased result column counts
         * executeInsert: insert the result into the sink table
         */
        t_env.createTemporaryFunction("top2",MyTableAggregateFunction.class);
        t_env.from("source").groupBy(Expressions.$("word"))
                .flatAggregate(Expressions.call("top2",Expressions.$("counts")).as("counts"))
                .select(Expressions.$("word"),Expressions.$("counts"))
                .executeInsert("sink")
                .await();

        //5. executeInsert(...).await() above submits and blocks on the job itself;
        //   env.execute() is not needed.

    }

    /**
     * T: the final result type.
     * ACC: the accumulator type holding the intermediate aggregation state.
     */
    public static class MyTableAggregateFunction extends TableAggregateFunction<Integer, MyAccumulator> {
        /**
         * Creates the accumulator.
         * @return a new accumulator instance
         */
        @Override
        public MyAccumulator createAccumulator() {
            return new MyAccumulator();
        }

        /**
         * Accumulation logic.
         * @param accumulator the accumulator
         * @param num the incoming value
         */
        public void accumulate(MyAccumulator accumulator, Integer num) {
            //If num is greater than the current maximum (first), it must be kept:
            //it becomes the new first, and the old first is demoted to second.
            if (num > accumulator.first) {
                accumulator.second = accumulator.first;
                accumulator.first = num;
            } else if (num > accumulator.second) {
                //e.g. first=10, second=5, num=8: num only displaces second.
                accumulator.second = num;
            }
        }

        /**
         * Emits the final results (read from the accumulator).
         * Note: like accumulate, emitValue is found via reflection rather than overridden.
         * @param accumulator the accumulator
         * @param out the collector used to emit results
         */
        public void emitValue(MyAccumulator accumulator, Collector<Integer> out) {
            //If first is no longer Integer.MIN_VALUE, the accumulator has seen data, so emit it.
            if (accumulator.first != Integer.MIN_VALUE) {
                out.collect(accumulator.first);
            }
            //Likewise for second.
            if (accumulator.second != Integer.MIN_VALUE) {
                out.collect(accumulator.second);
            }
        }
    }

    /**
     * The custom accumulator class.
     */
    public static class MyAccumulator {
        /**
         * The accumulator is simply the working state of the aggregation.
         * Since we need a top-2, we keep two fields for the two largest values seen so far.
         */
        public int first = Integer.MIN_VALUE;
        public int second = Integer.MIN_VALUE;
    }
}
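
A note on emission: emitValue recomputes and re-emits the whole top-2 for every input row, so the planner retracts everything previously emitted before inserting the new values. TableAggregateFunction also supports an incremental variant, emitUpdateWithRetract (taking a TableAggregateFunction.RetractableCollector), which emits only what changed. A rough sketch, assuming the accumulator were extended with two extra fields (oldFirst/oldSecond, not present in the demo above) remembering the last emitted values:

public void emitUpdateWithRetract(MyAccumulator acc, RetractableCollector<Integer> out) {
    if (acc.first != acc.oldFirst) { //the top value changed
        if (acc.oldFirst != Integer.MIN_VALUE) {
            out.retract(acc.oldFirst); //withdraw the stale value downstream
        }
        out.collect(acc.first);
        acc.oldFirst = acc.first;
    }
    if (acc.second != acc.oldSecond) { //the runner-up changed
        if (acc.oldSecond != Integer.MIN_VALUE) {
            out.retract(acc.oldSecond);
        }
        out.collect(acc.second);
        acc.oldSecond = acc.second;
    }
}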

The run output is as follows:

(screenshot: console output of the print sink)
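
Because emitValue re-emits the full top-2 on every record, the result is a retracting stream; for the four hadoop records the print sink shows a changelog roughly like the following (the exact retraction pattern may vary):

+I[hadoop, 1]
-D[hadoop, 1]
+I[hadoop, 2]
+I[hadoop, 1]
-D[hadoop, 2]
-D[hadoop, 1]
+I[hadoop, 3]
+I[hadoop, 2]
-D[hadoop, 3]
-D[hadoop, 2]
+I[hadoop, 4]
+I[hadoop, 3]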