Most data-processing programs follow the same three steps: 1) load data from a source, 2) transform it, 3) emit the result.
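
A minimal sketch of that skeleton (assuming the same local socket source, linux01:9999, that the examples below use):

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Base_API_Skeleton {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();
        // 1) load data from a source
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);
        // 2) transform it
        SingleOutputStreamOperator<String> upper = ds.map(String::toUpperCase);
        // 3) output the result
        upper.print();
        see.execute("pipeline skeleton");
    }
}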

1 Mapping operators

1.1 The map operator

map(new MapFunction)

MapFunction: (x) -> y   [one record in, one record out]

/**
 * @Date: 22.11.8
 * @Description:
 */
public class Base_API_MapFunction {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8888);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);

        // Load the socket text stream
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);

        // Use the map transformation to process the data, one record at a time

        /**
         * Example 1
         */
        SingleOutputStreamOperator<String> sos = ds.map(new MapFunction<String, String>() {
            // Called once for every record
            @Override
            public String map(String line) throws Exception {
                // Convert each record to upper case
                return line.toUpperCase();
            }
        });
        /**
         * Example 2
         */
        SingleOutputStreamOperator<Tuple2<String, String>> sos2 = ds.map(new MapFunction<String, Tuple2<String, String>>() {
            @Override
            public Tuple2<String, String> map(String line) throws Exception {
                // Wrap the incoming record into a 2-tuple
                String[] split = line.split("\\s+");
                Tuple2<String, String> tp2 = Tuple2.of(split[0], split[1]);
                return tp2;
            }
        });

        /**
         * Example 3
         * MapFunction has a single abstract method, so a lambda expression can be used instead:
         *   public interface MapFunction<T, O> extends Function, Serializable {
         *       O map(T value) throws Exception;
         *   }
         */
        SingleOutputStreamOperator<Tuple2<String, String>> sos3=  ds.map(line->{
            String[] arr = line.split("\\s+");
            return  Tuple2.of(arr[0] , arr[1]) ;
        }).returns(TypeInformation.of(new TypeHint<Tuple2<String, String>>() {})); // declare the result type
           // or: .returns(new TypeHint<Tuple2<String, String>>() {})              // declare the result type
        sos3.print("after map: ");
        see.execute() ;
        
    }
}

When a lambda expression is passed to map, you must then call returns() to declare the output type. Because of Java type erasure, Flink cannot infer the lambda's generic return type on its own, and the job would otherwise fail with an exception.
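
Besides the TypeHint forms shown above, the same information can also be supplied through the Types helper (org.apache.flink.api.common.typeinfo.Types, the form used in the flatMap example below). A minimal sketch:

        SingleOutputStreamOperator<Tuple2<String, String>> sos4 = ds.map(line -> {
            String[] arr = line.split("\\s+");
            return Tuple2.of(arr[0], arr[1]);
        }).returns(Types.TUPLE(Types.STRING, Types.STRING)); // equivalent declaration of Tuple2<String, String>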

1.2 flatMap (flat mapping)

flatMap(new FlatMapFunction)

FlatMapFunction: x -> x1, x2, x3, x4   [one record becomes multiple records, flattened]

/**
 * @Date: 22.11.8
 * @Description:
 */
public class Base_API_FlatMapFunction {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8888);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);
        // Load the socket text stream
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);

        /**
         * Example 1
         * Flatten each incoming line
         * and emit (word, 1) pairs
         */
        SingleOutputStreamOperator<Tuple2<String, Integer>> soo = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = line.split("\\s+");
                for (String word : arr) {
                    Tuple2<String, Integer> tp = Tuple2.of(word, 1);
                    // One input record becomes multiple records; emit each one through the Collector
                    out.collect(tp);
                }
            }
        });

        /**
         * Example 2
         * Process the data with a lambda expression.
         * The result type cannot be inferred automatically, so returns() declares it explicitly.
         */
        SingleOutputStreamOperator<Tuple2<String, Integer>> soo2 = ds.flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
            String[] arr = line.split("\\s+");
            for (String s : arr) {
                out.collect(Tuple2.of(s, 1));
            }
        }).returns(Types.TUPLE(Types.STRING, Types.INT));
        soo2.print("after flatMap: ");
        see.execute("flatMap example");
        
    }
}

1.3 project (projection)

This operator can only be used on Tuple-typed data; project works like "SELECT <columns>" in SQL.

The method exists only in the Java API; the Scala API does not have it.

/**
 * @Date: 22.11.8
 * @Description:
 */
public class _3Base_API_Project {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8888);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);

        /**
         * project (DataStream → DataStream)
         * Only applicable to Tuple-typed data; works like "SELECT <columns>" in SQL.
         * Available only in the Java API; the Scala API does not have this method.
         */
        DataStreamSource<Tuple4<Integer, String, String, Integer>> ds = see.fromElements(
                Tuple4.of(1, "YY", "F", 100),
                Tuple4.of(2, "DY", "F", 99)

        );
        // For every record, keep only the fields at the given positions
        // keep just the id and the name
        SingleOutputStreamOperator<Tuple> res = ds.project(0, 1);
        res.print() ;
        see.execute("project example");
        
    }
}
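
In the example above the projected stream is declared as SingleOutputStreamOperator<Tuple>; if a concrete tuple type is needed downstream, a follow-up map can rebuild it. A minimal sketch continuing from the res stream above (the casts match the projected field positions; Tuple.getField reads a field by index):

        // Sketch: rebuild a typed Tuple2<id, name> from the projected, untyped Tuple
        SingleOutputStreamOperator<Tuple2<Integer, String>> typed = res
                .map((Tuple t) -> Tuple2.of((Integer) t.getField(0), (String) t.getField(1)))
                .returns(Types.TUPLE(Types.INT, Types.STRING));
        typed.print();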

2 Filter operator

  filter (DataStream → DataStream)

filter(new FilterFunction)

FilterFunction :  x -> true/false

/**
 * @Date: 22.11.8
 */
public class _4Base_API_FilterFunction {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8888);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);

        // Create the data source
        DataStreamSource<Integer> ds = see.fromElements(1, 2, 3, 4, 5, 6, 7, 8, 9);
        /**
         *  Keep only the even numbers
         */
        SingleOutputStreamOperator<Integer> res = ds.filter(new FilterFunction<Integer>() {
            @Override
            public boolean filter(Integer value) throws Exception {
                return value % 2 == 0;
            }
        });
        // Use a lambda expression to keep only the odd numbers
        SingleOutputStreamOperator<Integer> res2 = ds.filter(e -> {
            return e % 2 == 1;
        });

        /**
         * Example:
         * keep only the records with a score greater than 60
         */
        DataStreamSource<YY> yyds = see.fromElements(
                new YY(1, "DY", 100),
                new YY(2, "XY", 100),
                new YY(3, "HH", 10),
                new YY(4, "XH", 12)
        );
        SingleOutputStreamOperator<YY> res3 = yyds.filter(new FilterFunction<YY>() {
            @Override
            public boolean filter(YY yy) throws Exception {
                return yy.getScore() > 60;
            }
        });

        res3.print();
        see.execute("filter function");

    }
}

@Data
@NoArgsConstructor
@AllArgsConstructor
@ToString
class YY {
    private int id;
    private String name;
    private double score;
}

3 Grouping operator

 keyBy: group by key (DataStream → KeyedStream)

/**
 * @Date: 22.11.8
 *
 * @Description:
 */
public class _5Base_API_KeyBy {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8888);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);
        // Load the socket text stream
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);

        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = line.split("\\s+");
                for (String word : arr) {
                    Tuple2<String, Integer> tp = Tuple2.of(word, 1);
                    // One input record becomes multiple records; emit each one through the Collector
                    out.collect(tp);
                }
            }
        });

        /**
         * Group the data stream
         *    -- by word
         */

        // Group by word using a positional index (deprecated in newer Flink versions; prefer a KeySelector)
        wordAndOne.keyBy(0);
        // Group by word using a KeySelector   KeyedStream<T, KEY>
        KeyedStream<Tuple2<String, Integer>, String> res = wordAndOne.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });
        // Group by a field of a custom data type
        DataStreamSource<YY2> ds2 = see.fromElements(
                new YY2(1, "DY", "NM_BT", 100),
                new YY2(2, "XY", "NM_BT", 100),
                new YY2(3, "HH", "SD_HZ", 10),
                new YY2(4, "XH", "SD_HZ", 12)
        );

        /**
         * Group the data stream
         *    -- by a bean property
         */
        ds2.keyBy(new KeySelector<YY2, String>() {
            @Override
            public String getKey(YY2 value) throws Exception {
                return value.getCity();
            }
        }) ;
        
        ds2.keyBy(YY2::getCity) ;
        res.print() ;
        see.execute() ;
        
    }
}

@Data
@NoArgsConstructor
@AllArgsConstructor
@ToString
class YY2 {
    private int id;
    private String name;
    private  String  city ;
    private double score;
}

4 Rolling aggregation operators

  1. "Rolling aggregation operators" is a collective name for several aggregation operators: sum, min, minBy, max and maxBy;
  2. Internally they all maintain one aggregated value and update it incrementally with every incoming record;
  3. They can only be called on a KeyedStream (that is, only after keyBy).

4.1 sum

package com.blok;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @Date: 22.11.8
 */
public class _6Base_API_Sum{
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8898);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);
        // Load the socket text stream
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);

        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = ds.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = line.split("\\s+");
                for (String word : arr) {
                    Tuple2<String, Integer> tp = Tuple2.of(word, 1);
                    // One input record becomes multiple records; emit each one through the Collector
                    out.collect(tp);
                }
            }
        });

        /**
         * Group the data stream
         *    -- by word
         */

        // Group by word using a positional index (deprecated in newer Flink versions; prefer a KeySelector)
        wordAndOne.keyBy(0);
        // Group by word using a KeySelector   KeyedStream<T, KEY>
        KeyedStream<Tuple2<String, Integer>, String> res = wordAndOne.keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
            @Override
            public String getKey(Tuple2<String, Integer> value) throws Exception {
                return value.f0;
            }
        });
        //------------------------------------------------------------------------------
        // Rolling aggregation: the result is updated continuously as records arrive
        // Count how many times each word appears
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = res.sum("1");
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum2 = res.sum(1);

        //------------------------------------------------------------------------------

        // Group by a field of a custom data type
        DataStreamSource<YY2> ds2 = see.fromElements(
                new YY2(1, "DY", "NM_BT", 100),
                new YY2(2, "XY", "NM_BT", 100),
                new YY2(3, "HH", "SD_HZ", 10),
                new YY2(4, "XH", "SD_HZ", 12)
        );

        /**
         * Group the data stream
         *    -- by a bean property
         */
        KeyedStream<YY2, String> keyed = ds2.keyBy(new KeySelector<YY2, String>() {
            @Override
            public String getKey(YY2 value) throws Exception {
                return value.getCity();
            }
        });

        //------------------------------------------------------------------------------
        // Rolling aggregation: the result is updated continuously as records arrive
        // Sum the total score of each group, using a bean property
        SingleOutputStreamOperator<YY2> score = keyed.sum("score");
        score.print() ;
        //------------------------------------------------------------------------------
        see.execute() ;

    }
}

4.2 min/minBy/max/maxBy

min and minBy both compute a minimum; the difference between them is:

  1. in min's output, only the minimum-value field is meaningful; all other fields come from the first input record;
  2. minBy's output is the whole record that contains the minimum value;

Under the hood the difference is whether the rolling update replaces a single field or the entire record. max and maxBy differ in exactly the same way, just for maximums.

package com.blok;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * @Date: 22.11.8
 */
public class _7Base_API_MaxMin {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8898);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);
        // Load the socket text stream
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);

        /**
         * public class YY2 {
         *     private int id;
         *     private String name;
         *     private  String  city ;
         *     private double score;
         * }
         */
        SingleOutputStreamOperator<YY2> beans = ds.map(new MapFunction<String, YY2>() {
            @Override
            public YY2 map(String value) throws Exception {
                String[] arr = value.split(",");
                YY2 yy = new YY2(Integer.parseInt(arr[0]), arr[1], arr[2], Double.parseDouble(arr[3]));
                return yy;
            }
        });
        // Group the data by city
        KeyedStream<YY2, String> keyed = beans.keyBy(YY2::getCity);
        //---------------------------------------------------------------
        /**
         * min returns the group's first record, with only the specified field replaced by the running minimum
         * max returns the group's first record, with only the specified field replaced by the running maximum
         *
         * record with the lowest score:  1,yy1,NM_BT,98
         * record with the highest score: 1,yy1,NM_BT,100
         */
        // Get the records with the lowest / highest score
        SingleOutputStreamOperator<YY2> minScoreInfo = keyed.min("score");
        SingleOutputStreamOperator<YY2> maxScoreInfo = keyed.max("score");
        /**
         * Test data
         * 1,yy1,NM_BT,99
         * 2,yy2,NM_BT,100
         * 3,yy3,NM_BT,98
         * 4,yy4,NM_BT,98.5
         * 1,hh1,SD_HZ,99
         * 2,hh2,SD_HZ,100
         * 3,hh3,SD_HZ,98
         * 4,hh4,SD_HZ,98.5
         * 5,hh5,SD_HZ,101
         */
        //---------------------------------------------------------------
        /**
         * maxBy / minBy return the whole record that holds the maximum (minimum) value of the specified field
         *
         * highest score: > YY2(id=2, name=yy2, city=NM_BT, score=100.0)
         * lowest score:  > YY2(id=3, name=yy3, city=NM_BT, score=98.0)
         */
        SingleOutputStreamOperator<YY2> minScoreInfoBy = keyed.minBy("score");
        SingleOutputStreamOperator<YY2> maxScoreInfoBy = keyed.maxBy("score");
        minScoreInfoBy.print("lowest score: ");
        maxScoreInfoBy.print("highest score: ");

        see.execute() ;

    }
}

4.3 reduce

Its rolling aggregation logic is not hard-coded; the user supplies it through a ReduceFunction.

/**
 * @Date: 22.11.8
 * @Author: Hang.Nian.YY
 */
public class _8Base_API_Reduce {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.setInteger("rest.port", 8898);
        StreamExecutionEnvironment see = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        see.setParallelism(1);
        // Load the socket text stream
        DataStreamSource<String> ds = see.socketTextStream("linux01", 9999);

        /**
         * public class YY2 {
         *     private int id;
         *     private String name;
         *     private  String  city ;
         *     private double score;
         * }
         */
        SingleOutputStreamOperator<YY2> beans = ds.map(new MapFunction<String, YY2>() {
            @Override
            public YY2 map(String value) throws Exception {
                String[] arr = value.split(",");
                YY2 yy = new YY2(Integer.parseInt(arr[0]), arr[1], arr[2], Double.parseDouble(arr[3]));
                return yy;
            }
        });

        // Group the beans by city
        KeyedStream<YY2, String> keyed = beans.keyBy(YY2::getCity);
        SingleOutputStreamOperator<YY2> reduced = keyed.reduce(new ReduceFunction<YY2>() {
            @Override
            public YY2 reduce(YY2 value1, YY2 value2) throws Exception {
                // Accumulate the total score per city; only city and score are set on the result
                YY2 yy2 = new YY2();
                yy2.setScore(value1.getScore() + value2.getScore());
                yy2.setCity(value1.getCity());
                return yy2;
            }
        });
        reduced.print("reduced result");
        see.execute() ;

    }
}
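
Because the aggregation logic is entirely user-supplied, reduce can also reproduce the behaviour of the earlier operators. A minimal sketch (placed in the same main method, reusing the keyed stream above) that mimics maxBy("score") by always keeping the record with the higher score:

        SingleOutputStreamOperator<YY2> maxByScore = keyed.reduce(new ReduceFunction<YY2>() {
            @Override
            public YY2 reduce(YY2 previous, YY2 current) throws Exception {
                // keep whichever record holds the higher score
                return current.getScore() > previous.getScore() ? current : previous;
            }
        });
        maxByScore.print("highest score via reduce");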