Debezium 格式

如果上游数据是 Debezium 格式(例如来自 Flink CDC,或 Kafka 中的 Debezium 格式数据),可以使用 JsonDebeziumSchemaSerializer 进行序列化。

// Turn on checkpointing with a 10000 ms interval.
env.enableCheckpointing(10000);

// Connection settings for the target Doris table.
DorisOptions dorisOptions = DorisOptions.builder()
        .setFenodes("127.0.0.1:8030")
        .setTableIdentifier("test.student")
        .setUsername("root")
        .setPassword("")
        .build();

// Stream Load properties handed to the execution options below.
Properties props = new Properties();
props.setProperty("format", "json");
props.setProperty("read_json_by_line", "true");

// Execution options: label prefix, Stream Load properties, delete support.
DorisExecutionOptions.Builder executionBuilder = DorisExecutionOptions.builder();
executionBuilder
        .setLabelPrefix("label-prefix")
        .setStreamLoadProp(props)
        .setDeletable(true);

// Assemble the sink; records are serialized by JsonDebeziumSchemaSerializer.
DorisSink.Builder<String> builder = DorisSink.builder();
builder.setDorisReadOptions(DorisReadOptions.builder().build())
        .setDorisExecutionOptions(executionBuilder.build())
        .setDorisOptions(dorisOptions)
        .setSerializer(
                JsonDebeziumSchemaSerializer.builder()
                        .setDorisOptions(dorisOptions)
                        .build());

// Read from the MySQL source (no watermarks) and write into Doris.
env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
        .sinkTo(builder.build());

完整代码如下——

参考:CDCSchemaChangeExample.java

package org.apache.doris.flink.example;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cdc.connectors.mysql.source.MySqlSource;
import org.apache.flink.cdc.connectors.shaded.org.apache.kafka.connect.json.JsonConverterConfig;
import org.apache.flink.cdc.debezium.JsonDebeziumDeserializationSchema;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import org.apache.doris.flink.cfg.DorisExecutionOptions;
import org.apache.doris.flink.cfg.DorisOptions;
import org.apache.doris.flink.cfg.DorisReadOptions;
import org.apache.doris.flink.sink.DorisSink;
import org.apache.doris.flink.sink.writer.serializer.JsonDebeziumSchemaSerializer;
import org.apache.doris.flink.utils.DateToStringConverter;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;

/**
 * Example Flink job that reads change events from MySQL through Flink CDC and writes them to
 * Doris using {@link JsonDebeziumSchemaSerializer}.
 */
public class CDCSchemaChangeExample {

    /**
     * Wires the MySQL CDC source to the Doris sink and runs the job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        MySqlSource<String> mySqlSource = createMySqlSource();

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Checkpoint every 10000 ms.
        env.enableCheckpointing(10000);

        DorisSink.Builder<String> builder = createDorisSinkBuilder();

        env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
                .sinkTo(builder.build());

        env.execute("Print MySQL Snapshot + Binlog");
    }

    /**
     * Builds the MySQL CDC source for table {@code test.t1}, including schema-change events.
     *
     * @return the configured source emitting each record as a JSON string
     */
    private static MySqlSource<String> createMySqlSource() {
        // Custom JSON converter settings: use the "numeric" decimal format.
        Map<String, Object> customConverterConfigs = new HashMap<>();
        customConverterConfigs.put(JsonConverterConfig.DECIMAL_FORMAT_CONFIG, "numeric");
        JsonDebeziumDeserializationSchema schema =
                new JsonDebeziumDeserializationSchema(false, customConverterConfigs);

        return MySqlSource.<String>builder()
                .hostname("127.0.0.1")
                .port(3306)
                .databaseList("test") // set captured database
                .tableList("test.t1") // set captured table
                .username("root")
                .password("123456")
                .debeziumProperties(DateToStringConverter.DEFAULT_PROPS)
                .deserializer(schema) // converts SourceRecord to JSON String
                .serverTimeZone("Asia/Shanghai")
                .includeSchemaChanges(true) // also capture schema-change events
                .build();
    }

    /**
     * Builds the Doris sink builder targeting table {@code test.t1}.
     *
     * @return a sink builder configured with connection, execution and serializer options
     */
    private static DorisSink.Builder<String> createDorisSinkBuilder() {
        DorisOptions dorisOptions =
                DorisOptions.builder()
                        .setFenodes("127.0.0.1:8030")
                        .setTableIdentifier("test.t1")
                        .setUsername("root")
                        .setPassword("")
                        .build();

        // Stream Load properties handed to the execution options.
        Properties props = new Properties();
        props.setProperty("format", "json");
        props.setProperty("read_json_by_line", "true");

        // A random UUID suffix gives every run its own label prefix.
        String labelPrefix = "label-doris" + UUID.randomUUID();
        DorisExecutionOptions.Builder executionBuilder = DorisExecutionOptions.builder();
        executionBuilder
                .setLabelPrefix(labelPrefix)
                .setStreamLoadProp(props)
                .setDeletable(true);

        DorisSink.Builder<String> builder = DorisSink.builder();
        builder.setDorisReadOptions(DorisReadOptions.builder().build())
                .setDorisExecutionOptions(executionBuilder.build())
                .setDorisOptions(dorisOptions)
                .setSerializer(
                        JsonDebeziumSchemaSerializer.builder()
                                .setDorisOptions(dorisOptions)
                                .setNewSchemaChange(true)
                                .build());
        return builder;
    }
}

代码功能概述

上述代码示例实现了以下功能:实时捕获 MySQL 数据库的变更数据(CDC),并将其同步到 Doris 数据仓库中。

核心组件解析

1. 数据源配置(MySQL CDC)

// Abridged excerpt of the MySQL CDC source from the full listing above; the
// debeziumProperties, deserializer and serverTimeZone settings are omitted here.
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
    .hostname("127.0.0.1")          // MySQL host
    .port(3306)                     // MySQL port
    .databaseList("test")           // database to capture
    .tableList("test.t1")           // table to capture
    .username("root")               // user name
    .password("123456")             // password
    .includeSchemaChanges(true)     // include schema changes
    .build();
  • 使用 Flink CDC 连接器捕获 MySQL 的 binlog
  • includeSchemaChanges(true) 表示会捕获表结构变更

2. 数据反序列化配置

// Custom JSON converter settings: select the "numeric" decimal format.
Map<String, Object> customConverterConfigs = new HashMap<>();
customConverterConfigs.put(JsonConverterConfig.DECIMAL_FORMAT_CONFIG, "numeric");
// NOTE(review): the first argument (false) presumably disables embedding the schema
// in every record - confirm against JsonDebeziumDeserializationSchema.
JsonDebeziumDeserializationSchema schema = 
    new JsonDebeziumDeserializationSchema(false, customConverterConfigs);
  • 配置 JSON 反序列化器
  • 将 decimal.format 设置为 numeric,使 DECIMAL 类型的值以数值形式输出,而不是默认的 Base64 编码

3. Doris 目标配置

// Connection settings for the Doris target table.
DorisOptions dorisOptions = DorisOptions.builder()
    .setFenodes("127.0.0.1:8030")   // Doris FE node
    .setTableIdentifier("test.t1")   // target table
    .setUsername("root")             // Doris user name
    .setPassword("")                 // Doris password
    .build();

4. Doris 执行配置

// Execution options for the sink; props is the Properties object holding the
// Stream Load settings, created in the full listing above.
DorisExecutionOptions.Builder executionBuilder = DorisExecutionOptions.builder();
executionBuilder
    .setLabelPrefix("label-doris" + UUID.randomUUID())  // label prefix made unique by a random UUID
    .setStreamLoadProp(props)        // Stream Load properties
    .setDeletable(true);             // enable delete support

5. 数据序列化器(关键特性)

// Serializer expression passed to DorisSink.Builder#setSerializer in the full listing above.
JsonDebeziumSchemaSerializer.builder()
    .setDorisOptions(dorisOptions)
    .setNewSchemaChange(true)        // enable the new schema-change handling
    .build()
  • setNewSchemaChange(true) 表示支持自动处理表结构变更

数据处理流程

  1. 数据捕获:从 MySQL 的 binlog 实时捕获数据变更
  2. 格式转换:将 Debezium 格式数据转换为 JSON
  3. Schema 变更处理:自动处理表结构变更
  4. 数据写入:通过 Stream Load 将数据写入 Doris

重要特性

  • 实时同步:基于 CDC 的实时数据同步
  • Schema 演化:自动处理源表结构变更
  • 精确一次语义:通过 checkpoint 保证数据一致性
  • 删除操作支持:setDeletable(true) 支持同步删除操作