Debezium 格式
对于上游是 Debezium 数据格式的数据,如 FlinkCDC 或 Kafka 中 Debezium 格式数据,可以使用 JsonDebeziumSchemaSerializer 序列化。
// Turn on periodic checkpointing (interval argument: 10000).
env.enableCheckpointing(10000);

// Where to write: Doris FE address, target table and credentials.
DorisOptions dorisOptions = DorisOptions.builder()
        .setFenodes("127.0.0.1:8030")
        .setTableIdentifier("test.student")
        .setUsername("root")
        .setPassword("")
        .build();

// Stream Load properties: rows are shipped as JSON, one document per line.
Properties streamLoadProps = new Properties();
streamLoadProps.setProperty("format", "json");
streamLoadProps.setProperty("read_json_by_line", "true");

// How to write: label prefix, Stream Load properties, and delete support.
DorisExecutionOptions executionOptions = DorisExecutionOptions.builder()
        .setLabelPrefix("label-prefix")
        .setStreamLoadProp(streamLoadProps)
        .setDeletable(true)
        .build();

// Serializer that consumes Debezium-format JSON records.
JsonDebeziumSchemaSerializer debeziumSerializer = JsonDebeziumSchemaSerializer.builder()
        .setDorisOptions(dorisOptions)
        .build();

// Assemble the sink builder; it is finalized with builder.build() when attached to the stream.
DorisSink.Builder<String> builder = DorisSink.builder();
builder.setDorisReadOptions(DorisReadOptions.builder().build())
        .setDorisExecutionOptions(executionOptions)
        .setDorisOptions(dorisOptions)
        .setSerializer(debeziumSerializer);
env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
.sinkTo(builder.build());

完整代码如下:
package org.apache.doris.flink.example;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cdc.connectors.mysql.source.MySqlSource;
import org.apache.flink.cdc.connectors.shaded.org.apache.kafka.connect.json.JsonConverterConfig;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.doris.flink.cfg.DorisExecutionOptions;
import org.apache.doris.flink.cfg.DorisOptions;
import org.apache.doris.flink.cfg.DorisReadOptions;
import org.apache.doris.flink.sink.DorisSink;
import org.apache.doris.flink.sink.writer.serializer.JsonDebeziumSchemaSerializer;
import org.apache.doris.flink.utils.DateToStringConverter;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
public class CDCSchemaChangeExample {
/**
 * Builds and runs a Flink job that reads change events from the MySQL table {@code test.t1}
 * through Flink CDC and writes them to the Doris table {@code test.t1}, with schema-change
 * events included in the stream.
 *
 * <p>All connection settings (hosts, ports, credentials) are hard-coded demo values.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the job cannot be built or its execution fails
 */
public static void main(String[] args) throws Exception {
    // Ask the JSON converter to emit DECIMAL values in "numeric" format.
    Map<String, Object> customConverterConfigs = new HashMap<>();
    customConverterConfigs.put(JsonConverterConfig.DECIMAL_FORMAT_CONFIG, "numeric");
    // First argument is the include-schema flag: false leaves the schema out of each record.
    JsonDebeziumDeserializationSchema schema =
            new JsonDebeziumDeserializationSchema(false, customConverterConfigs);

    MySqlSource<String> mySqlSource =
            MySqlSource.<String>builder()
                    .hostname("127.0.0.1")
                    .port(3306)
                    .databaseList("test") // set captured database
                    .tableList("test.t1") // set captured table
                    .username("root")
                    .password("123456")
                    // NOTE(review): presumably renders date/time columns as strings;
                    // confirm against DateToStringConverter.
                    .debeziumProperties(DateToStringConverter.DEFAULT_PROPS)
                    .deserializer(schema) // converts SourceRecord to JSON String
                    .serverTimeZone("Asia/Shanghai")
                    .includeSchemaChanges(true) // also emit schema change events
                    .build();

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    // enable checkpoint
    env.enableCheckpointing(10000);

    // Stream Load properties: rows are shipped as JSON, one document per line.
    Properties props = new Properties();
    props.setProperty("format", "json");
    props.setProperty("read_json_by_line", "true");

    DorisOptions dorisOptions =
            DorisOptions.builder()
                    .setFenodes("127.0.0.1:8030")
                    .setTableIdentifier("test.t1")
                    .setUsername("root")
                    .setPassword("")
                    .build();

    DorisExecutionOptions.Builder executionBuilder = DorisExecutionOptions.builder();
    executionBuilder
            // A random suffix gives every run of the job a distinct label prefix.
            .setLabelPrefix("label-doris" + UUID.randomUUID())
            .setStreamLoadProp(props)
            // Let delete events from the source be applied to the Doris table.
            .setDeletable(true);

    DorisSink.Builder<String> builder = DorisSink.builder();
    builder.setDorisReadOptions(DorisReadOptions.builder().build())
            .setDorisExecutionOptions(executionBuilder.build())
            .setDorisOptions(dorisOptions)
            .setSerializer(
                    JsonDebeziumSchemaSerializer.builder()
                            .setDorisOptions(dorisOptions)
                            // Opt in to the serializer's new schema-change handling.
                            .setNewSchemaChange(true)
                            .build());

    env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
            .sinkTo(builder.build());

    // The job writes to Doris rather than printing, so the name says so.
    env.execute("MySQL CDC to Doris");
}
}

代码功能概述
上面的这个代码示例实现了如下功能:实时捕获 MySQL 数据库的变更数据(CDC),并将其同步到 Doris 数据仓库中。
核心组件解析
1. 数据源配置(MySQL CDC)
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
.hostname("127.0.0.1") // MySQL 地址
.port(3306) // MySQL 端口
.databaseList("test") // 监听的数据库
.tableList("test.t1") // 监听的表
.username("root") // 用户名
.password("123456") // 密码
.includeSchemaChanges(true) // 包含 schema 变更
.build();

- 使用 Flink CDC 连接器捕获 MySQL 的 binlog
- includeSchemaChanges(true) 表示会捕获表结构变更
2. 数据反序列化配置
Map<String, Object> customConverterConfigs = new HashMap<>();
customConverterConfigs.put(JsonConverterConfig.DECIMAL_FORMAT_CONFIG, "numeric");
JsonDebeziumDeserializationSchema schema =
new JsonDebeziumDeserializationSchema(false, customConverterConfigs);

- 配置 JSON 反序列化器
- 将 decimal 类型格式化为 numeric 格式
3. Doris 目标配置
DorisOptions dorisOptions = DorisOptions.builder()
.setFenodes("127.0.0.1:8030") // Doris FE 节点
.setTableIdentifier("test.t1") // 目标表
.setUsername("root") // Doris 用户名
.setPassword("") // Doris 密码
.build();

4. Doris 执行配置
DorisExecutionOptions.Builder executionBuilder = DorisExecutionOptions.builder();
executionBuilder
.setLabelPrefix("label-doris" + UUID.randomUUID()) // 唯一标签前缀
.setStreamLoadProp(props) // Stream Load 属性
.setDeletable(true); // 支持删除操作

5. 数据序列化器(关键特性)
JsonDebeziumSchemaSerializer.builder()
.setDorisOptions(dorisOptions)
.setNewSchemaChange(true) // 启用新 schema 变更处理
.build()

- setNewSchemaChange(true) 表示支持自动处理表结构变更
数据处理流程
- 数据捕获:从 MySQL 的 binlog 实时捕获数据变更
- 格式转换:将 Debezium 格式数据转换为 JSON
- Schema 变更处理:自动处理表结构变更
- 数据写入:通过 Stream Load 将数据写入 Doris
重要特性
- 实时同步:基于 CDC 的实时数据同步
- Schema 演化:自动处理源表结构变更
- 精确一次语义:通过 checkpoint 保证数据一致性
- 删除操作支持:setDeletable(true) 支持同步删除操作
















