本文基于Java8启动简单的Flink应用。

新建一个Maven项目,在pom.xml中添加:

<properties>
    <!-- Single place to set the Flink version; both dependencies below reference it. -->
    <flinkVersion>1.10.1</flinkVersion>
</properties>

<dependencies>
    <!-- Batch side: the batch example imports ExecutionEnvironment / DataSet from org.apache.flink.api.java. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flinkVersion}</version>
    </dependency>
    <!-- Streaming side: the streaming example imports StreamExecutionEnvironment / DataStream. -->
    <!-- The _2.12 suffix on the artifactId is the Scala binary version the artifact was built against. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_2.12</artifactId>
        <version>${flinkVersion}</version>
    </dependency>
</dependencies>

1. 批处理

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch word count built on Flink's DataSet API.
 *
 * <p>Tokenizes a fixed set of sentences and prints how many times each word occurs.
 */
public class BatchProcessing {
    /**
     * Builds and runs the batch word-count job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<String> dataSet = env.fromElements("Hello World", "Hello Flink", "Flink Hello");
        dataSet
                // Tokenize each sentence into (word, 1) Tuple2 records
                .flatMap(new MyFlatMapper())
                // Group by the first Tuple2 field (the word)
                .groupBy(0)
                // Sum the second Tuple2 field (the count) within each group
                .sum(1)
                // On a DataSet, print() itself triggers execution, so no env.execute() call follows
                .print();
    }

    /**
     * Splits a sentence on whitespace and emits one {@code (word, 1)} pair per word.
     */
    public static class MyFlatMapper implements FlatMapFunction<String, Tuple2<String, Integer>> {
        /**
         * Emits {@code (word, 1)} for every non-empty whitespace-separated token.
         *
         * @param sentence  the input line to tokenize
         * @param collector receives one tuple per word
         * @throws Exception declared by the {@code FlatMapFunction} contract; not thrown here
         */
        @Override
        public void flatMap(String sentence, Collector<Tuple2<String, Integer>> collector) throws Exception {
            // Split on runs of whitespace rather than a single space so that
            // consecutive spaces or tabs do not produce empty "words".
            for (String word : sentence.split("\\s+")) {
                // Leading whitespace (or an empty sentence) still yields one empty token; skip it.
                if (!word.isEmpty()) {
                    collector.collect(new Tuple2<>(word, 1));
                }
            }
        }
    }
}

输出为:

(Hello,3)
(World,1)
(Flink,2)

可见批处理是将所有数据读取后再执行操作,因此每个单词只输出一次最终的统计结果。

2. 流处理

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Streaming word count built on Flink's DataStream API.
 *
 * <p>Reuses {@code BatchProcessing.MyFlatMapper} to tokenize the same sample sentences,
 * then keys by word and prints the summed count.
 */
public class StreamProcessing {
    /**
     * Creates the stream environment, wires up the word-count pipeline and runs it.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> sentences = streamEnv.fromElements("Hello World", "Hello Flink", "Flink Hello");
        printWordCounts(sentences);
        // The pipeline above is only declared; execute() actually launches the streaming job.
        streamEnv.execute();
    }

    /** Attaches tokenize -> key by word -> sum count -> print to the given sentence stream. */
    private static void printWordCounts(DataStream<String> sentences) {
        sentences
                // Tokenize each sentence into (word, 1) tuples
                .flatMap(new BatchProcessing.MyFlatMapper())
                // Partition the stream by tuple field 0 (the word)
                .keyBy(0)
                // Sum of tuple field 1 (the count) per key
                .sum(1)
                .print();
    }
}

输出为:

8> (Flink,1)
1> (Hello,1)
2> (World,1)
8> (Flink,2)
1> (Hello,2)
1> (Hello,3)

与批处理不同,流处理每读取一条数据便会执行一次操作并输出当前的累计结果,且支持并行:每个输出前面的数字表示产生该输出的并行子任务(subtask)编号,而不是操作系统的线程编号。