Sink
1. Sink Output
1.1 Predefined Sinks
1.1.1 Console- and File-Based Sinks
API
1. ds.print() writes directly to the console
2. ds.printToErr() writes directly to the console via standard error (shown in red)
3. ds.writeAsText("local/HDFS path", WriteMode.OVERWRITE).setParallelism(1)
Note:
You can set the parallelism on the sink before writing to the path:
if parallelism > 1, the path is created as a directory;
if parallelism = 1, the path is created as a single file.
Code demo:
package cn.gec.sink;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* Desc
* 1. ds.print() writes directly to the console
* 2. ds.printToErr() writes directly to the console via standard error (shown in red)
* 3. ds.collect() gathers the distributed data into a local collection
* 4. ds.setParallelism(1).writeAsText("local/HDFS path", WriteMode.OVERWRITE)
*/
public class SinkDemo01 {
public static void main(String[] args) throws Exception {
//1.env
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//2.source
//DataStream<String> ds = env.fromElements("hadoop", "flink");
DataStream<String> ds = env.readTextFile("data/input/words.txt");
//3.transformation
//4.sink
ds.print();
ds.printToErr();
ds.writeAsText("data/output/test", FileSystem.WriteMode.OVERWRITE);
//Note:
//parallelism = 1 -> path is a single file
//parallelism > 1 -> path is a directory
//5.execute
env.execute();
}
}
The data is written out per partition under the given path; you can limit the sink with setParallelism(), and each sink subtask writes one file.
Because writeAsText does not support proper streaming writes (only with the parallelism set to 1 can records be written to a single file in order), the method has been officially marked as deprecated.
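The officially recommended replacement for streaming file output is StreamingFileSink (or, since Flink 1.12, the unified FileSink). A minimal sketch using StreamingFileSink follows; the class name and output path are illustrative only, and checkpointing must be enabled because part files are only finalized on checkpoints:
package cn.gec.sink;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
public class SinkDemo01_StreamingFileSink {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//StreamingFileSink only finalizes its part files on checkpoints, so enable checkpointing
env.enableCheckpointing(5000);
DataStream<String> ds = env.fromElements("hadoop", "flink");
//row-encoded text output; each sink subtask writes its own part files under the path
StreamingFileSink<String> fileSink = StreamingFileSink
.forRowFormat(new Path("data/output/result"), new SimpleStringEncoder<String>("UTF-8"))
.build();
ds.addSink(fileSink);
env.execute();
}
}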
1.2 Custom Sinks
1.2.1 MySQL
Requirement:
- Save data from a Flink collection to MySQL through a custom Sink
Code implementation:
package cn.gec.sink;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
/**
* Desc
* Save data to MySQL using a custom sink
*/
public class SinkDemo02_MySQL {
public static void main(String[] args) throws Exception {
//1.env
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//2.Source
DataStream<Student> studentDS = env.fromElements(new Student(null, "tonyma", 18));
//3.Transformation
//4.Sink
studentDS.addSink(new MySQLSink());
//5.execute
env.execute();
}
@Data
@NoArgsConstructor
@AllArgsConstructor
public static class Student {
private Integer id;
private String name;
private Integer age;
}
public static class MySQLSink extends RichSinkFunction<Student> {
private Connection conn = null;
private PreparedStatement ps = null;
@Override
public void open(Configuration parameters) throws Exception {
//load the JDBC driver and open the connection
//Class.forName("com.mysql.jdbc.Driver");
conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/bigdata?useUnicode=true&characterEncoding=utf8&useSSL=false&serverTimezone=GMT", "root", "root");
String sql = "INSERT INTO `t_student` (`id`, `name`, `age`) VALUES (null, ?, ?)";
ps = conn.prepareStatement(sql);
}
// invoke is called once for every incoming record and executes the SQL over the open connection
@Override
public void invoke(Student value, Context context) throws Exception {
//bind concrete values to the ? placeholders
//note: JDBC parameter indexes start at 1, otherwise "Parameter index out of range (0 < 1)" is thrown
ps.setString(1,value.getName());
ps.setInt(2,value.getAge());
//execute the SQL
ps.executeUpdate();
}
@Override
public void close() throws Exception {
if (ps != null) ps.close();
if (conn != null) conn.close();
}
}
}
open() marks the start of the function's lifecycle and is usually where connections are set up.
invoke() is where the actual per-record work is done.
close() marks the end of the lifecycle and is usually where connections are released.
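A minimal skeleton of this lifecycle, stripped of the MySQL specifics (the class name LoggingSink and the printed messages are made up for illustration):
package cn.gec.sink;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
public class LoggingSink extends RichSinkFunction<String> {
@Override
public void open(Configuration parameters) throws Exception {
//called once per parallel subtask before any record arrives: acquire connections/resources here
System.out.println("open: acquire resources");
}
@Override
public void invoke(String value, Context context) throws Exception {
//called once per record: perform the actual write here
System.out.println("invoke: " + value);
}
@Override
public void close() throws Exception {
//called once when the subtask shuts down: release resources here
System.out.println("close: release resources");
}
}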
1.2.2 Connectors
https://nightlies.apache.org/flink/flink-docs-release-1.13/
JDBC
- Add the dependency to pom.xml
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc_2.11</artifactId>
<version>1.13.6</version>
</dependency>
- Code:
package com.gec.connectors;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.connector.jdbc.JdbcConnectionOptions;
import org.apache.flink.connector.jdbc.JdbcExecutionOptions;
import org.apache.flink.connector.jdbc.JdbcSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* Desc: demonstrates the JdbcSink provided by Flink
*/
public class JDBCDemo {
public static void main(String[] args) throws Exception {
//TODO 0.env
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
//TODO 1.source
DataStream<Student> studentDS = env.fromElements(new Student(null, "tony2", 18));
//TODO 2.transformation
//TODO 3.sink
studentDS.addSink(JdbcSink.sink(
"INSERT INTO `t_student` (`id`, `name`, `age`) VALUES (null, ?, ?)",
(ps, value) -> {//ps is the PreparedStatement built from the SQL template above; value is the incoming Student record from studentDS
ps.setString(1, value.getName());
ps.setInt(2, value.getAge());
},
//for JDBC batch inserts, choose a reasonable batch size
//when reading from an unbounded stream, omitting these options may leave records buffered and the SQL never executed
JdbcExecutionOptions.builder()
// batch size; the default is 5000
.withBatchSize(1000)
// flush interval for automatic submission, in milliseconds; the default is 0, i.e. no time-based flushing
.withBatchIntervalMs(200)
// maximum number of retries; the default is 3
.withMaxRetries(5)
.build(),
new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
.withUrl("jdbc:mysql://localhost:3306/bigdata")
.withUsername("root")
.withPassword("root")
//for MySQL 8 and above use com.mysql.cj.jdbc.Driver
.withDriverName("com.mysql.jdbc.Driver")
.build()));
//TODO 4.execute
env.execute();
}
@Data
@NoArgsConstructor
@AllArgsConstructor
public static class Student {
private Integer id;
private String name;
private Integer age;
}
}
JdbcSink breakdown:
The sink() method takes four parameters; the third one, executionOptions, can be omitted to fall back to the defaults (see the JdbcSink example above).
- sql
String; a SQL statement template in the usual PreparedStatement style, e.g. insert into wordcount (wordcl, countcl) values (?, ?)
- statementBuilder
JdbcStatementBuilder; maps the fields of each stream record to the concrete SQL columns, i.e. fills in the ? placeholders of the PreparedStatement built from the previous parameter
- executionOptions
Execution rules for Flink's JDBC output, mainly the flush triggers: batch-size threshold (default 5000), time threshold (default 0, i.e. time-based flushing disabled), and maximum retries. Do not set the thresholds too low, or the frequent small flushes may put pressure on the database.
- connectionOptions
JdbcConnectionOptions; sets the database connection properties such as URL, driver, username and password
KAFKA
Add the dependency to pom.xml
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_2.11</artifactId>
<version>1.13.6</version>
</dependency>
Parameter settings
Kafka commands
kafka-server-start.sh -daemon /export/servers/kafka/config/server.properties
kafka-topics.sh --zookeeper hadoop111:2181/kafka --list
kafka-topics.sh --zookeeper hadoop111:2181/kafka --create --replication-factor 1 --partitions 1 --topic flinkkafka
bin/kafka-console-producer.sh --broker-list hadoop-001:9092 --topic flinkkafka
bin/kafka-console-consumer.sh --bootstrap-server hadoop-001:9092 --from-beginning --topic flinkkafka
Kafka Consumer
package com.gec.connectors;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.util.Properties;
/**
* Desc: demonstrates the Flink Kafka consumer connector (Source)
*/
public class KafkaConsumerDemo {
public static void main(String[] args) throws Exception {
//TODO 0.env
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC);
//TODO 1.source
//prepare the Kafka connection properties
Properties props = new Properties();
props.setProperty("bootstrap.servers", "hadoop-001:9092");//集群地址
props.setProperty("group.id", "flink");//消费者组id
props.setProperty("auto.offset.reset","latest");//latest有offset记录从记录位置开始消费,没有记录从最新的/最后的消息开始消费(没有设置offset自动保存时,设置后都是从最新的位置开始消费) /earliest有offset记录从记录位置开始消费,没有记录从最早的/最开始的消息开始消费(没有设置offset自动保存时,设置后都是从最新的位置开始消费)
props.setProperty("flink.partition-discovery.interval-millis","5000");//会开启一个后台线程每隔5s检测一下Kafka的分区情况,实现动态分区检测
props.setProperty("enable.auto.commit", "true");//设置offset自动提交(提交到默认主题,后续学习了Checkpoint后随着Checkpoint存储在Checkpoint和默认主题中)
props.setProperty("auto.commit.interval.ms", "2000");//自动提交的时间间隔
//create the FlinkKafkaConsumer (kafkaSource) from the connection properties
FlinkKafkaConsumer<String> kafkaSource = new FlinkKafkaConsumer<String>("flink_kafka", new SimpleStringSchema(), props);
//use the kafkaSource
DataStream<String> kafkaDS = env.addSource(kafkaSource);
//TODO 2.transformation
//TODO 3.sink
kafkaDS.print();
//TODO 4.execute
env.execute();
}
}
Kafka Producer
- Save data from a Flink collection to Kafka using the officially provided Kafka connector sink
package com.gec.connectors;
import com.alibaba.fastjson.JSON;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import java.util.Properties;
/**
* Desc
* Save data to Kafka using the officially provided flink-connector-kafka sink
*/
public class ConnectorsDemo_KafkaProducer {
public static void main(String[] args) throws Exception {
//1.env
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//2.Source
DataStreamSource<Student> studentDS = env.fromElements(new Student(1, "tonyma", 18));
//3.Transformation
//Note: for now we use the simplest string (de)serialization with Kafka, so first convert Student to a String
//either call Student's toString() or convert it to JSON
SingleOutputStreamOperator<String> jsonDS = studentDS.map(new MapFunction<Student, String>() {
@Override
public String map(Student value) throws Exception {
//String str = value.toString();
String jsonStr = JSON.toJSONString(value);
return jsonStr;
}
});
//4.Sink
jsonDS.print();
//create the FlinkKafkaProducer (kafkaSink) from the properties
Properties props = new Properties();
props.setProperty("bootstrap.servers", "node1:9092");
FlinkKafkaProducer<String> kafkaSink = new FlinkKafkaProducer<>("flink_kafka", new SimpleStringSchema(), props);
jsonDS.addSink(kafkaSink);
//5.execute
env.execute();
// /export/server/kafka/bin/kafka-console-consumer.sh --bootstrap-server node1:9092 --topic flink_kafka
}
@Data
@NoArgsConstructor
@AllArgsConstructor
public static class Student {
private Integer id;
private String name;
private Integer age;
}
}