Based on the CDH platform (Hive 2.1.1-CDH6.2.0), a simple example of writing to Hive with Flink SQL.

Hive Streaming

 

import java.time.Duration

import com.sm.common.conf.PropManager
import com.sm.constants.Constants
import com.sm.utils.FlinkUtils
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.environment.{CheckpointConfig, ExecutionCheckpointingOptions}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.SqlDialect
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment
import org.apache.flink.table.catalog.hive.HiveCatalog
import org.apache.log4j.Level
import org.slf4j.LoggerFactory

/**
  * Flink SQL: Kafka to Hive
  *
  * created by LiuJinHe 2020/10/22
  */
object FlinkSqlKafka2Hive {
  private var logger: org.slf4j.Logger = _

  def main(args: Array[String]): Unit = {
    logger = LoggerFactory.getLogger(this.getClass.getSimpleName)
    org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    org.apache.log4j.Logger.getLogger("org.apache").setLevel(Level.INFO)

    // Initialize the stream environment
    // For local testing; requires the flink-runtime-web dependency
    val env = StreamExecutionEnvironment.createLocalEnvironmentWithWebUI()
    //    val env = StreamExecutionEnvironment.getExecutionEnvironment

    //    Read program arguments
    //    val params = ParameterTool.fromArgs(args)
    //    env.getConfig.setGlobalJobParameters(params)

    // Restart on failure with a fixed delay: up to 10 attempts, 3 seconds (3000 ms) apart
    //    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 3000))
    // Parallelism 1 for local testing
    env.setParallelism(1)

    // Use processing time (determined by the system clock)
    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    // Create the StreamTableEnvironment
    val tableEnv: StreamTableEnvironment = StreamTableEnvironment.create(env, FlinkUtils.getSettings)

    // Checkpoint settings
    val tableConfig = tableEnv.getConfig.getConfiguration
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_MODE, CheckpointingMode.EXACTLY_ONCE)
    // Checkpoint interval: trigger a checkpoint every 30 seconds
    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_INTERVAL, Duration.ofSeconds(30))
    // Checkpoint timeout: a checkpoint is aborted if it does not complete within 60 seconds
    //    tableConfig.set(ExecutionCheckpointingOptions.CHECKPOINTING_TIMEOUT, Duration.ofSeconds(60))
    // Minimum pause between checkpoints: at least 30 seconds between two checkpoints
    //    tableConfig.set(ExecutionCheckpointingOptions.MIN_PAUSE_BETWEEN_CHECKPOINTS, Duration.ofSeconds(30))
    // Allow only one checkpoint in flight at a time
    //    tableConfig.set(ExecutionCheckpointingOptions.MAX_CONCURRENT_CHECKPOINTS, Integer.valueOf(1))
    // Whether to retain checkpoints when the job is cancelled manually
    //    tableConfig.set(ExecutionCheckpointingOptions.EXTERNALIZED_CHECKPOINT,
    //      CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    // Minimum and maximum idle state retention time
    //    tableEnv.getConfig.setIdleStateRetentionTime(Time.hours(12), Time.hours(24))

    // Load configuration
    val catalog_name = PropManager.getProp(Constants.CATALOG_NAME)
    val database = PropManager.getProp(Constants.DEFAULT_DATABASE)
    val schemaDataBase = PropManager.getProp(Constants.SCHEMA_DATABASE)

    // Create the Hive catalog
    val hiveCatalog = new HiveCatalog(
      catalog_name,
      database,
      PropManager.getProp(Constants.HIVE_CONF_DIR)
    )

    tableEnv.registerCatalog(catalog_name, hiveCatalog)
    tableEnv.useCatalog(catalog_name)

    // Kafka source table: use the DEFAULT (Flink) SQL dialect
    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    tableEnv.useDatabase(schemaDataBase)

    // If the Kafka source table already exists in the catalog, drop it first
    tableEnv.executeSql("drop table if exists kafka_source_table")
    // kafka source table
    val kafkaSourceDDL =
      """
        |create table kafka_source_table (
        | user_name string,
        | user_id bigint,
        | `time` bigint,
        | ts as to_timestamp(from_unixtime(`time` / 1000, 'yyyy-MM-dd HH:mm:ss')),
        | rt as proctime()
        |) with (
        | 'connector' = 'kafka',
        | 'topic' = 'flink_test',
        | 'properties.bootstrap.servers' = '192.168.100.39:9092',
        | 'properties.group.id' = 'flink-test-group',
        | 'format' = 'json',
        | 'scan.startup.mode' = 'group-offsets'
        |)
      """.stripMargin
    tableEnv.executeSql(kafkaSourceDDL)
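
    // A hypothetical sample message for the schema above (the field values are made up
    // for illustration; the actual topic payload may differ):
    //   {"user_name": "test_user", "user_id": 10001, "time": 1603335000000}
    // `time` is epoch milliseconds, so the computed column `ts` divides it by 1000 before
    // converting it to a timestamp, and `rt` is the processing-time attribute.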

    // Hive sink: switch to the HIVE dialect and the target Hive database
    tableEnv.getConfig.setSqlDialect(SqlDialect.HIVE)
    tableEnv.useDatabase(database)
    // Drop the sink table first; if the Hive table already exists, skip the drop and create
    // tableEnv.executeSql("drop table if exists hive_sink_table")
    // Hive sink table: data is committed to Hive only after each checkpoint completes,
    // producing one file per checkpoint. Note that the partition column `date` is declared
    // only in `partitioned by`, not in the regular column list.
    val hiveSinkDDL =
      """
        |create table hive_sink_table (
        | user_name string,
        | user_id bigint,
        | `time` bigint
        |)
        |partitioned by (`date` string)
        |row format delimited fields terminated by '\t'
        |stored as orc
        |location 'hdfs://BigdataCluster/user/hive/warehouse/test_data.db/test/test_hive_table'
        |tblproperties (
        | 'orc.compress'='SNAPPY',
        | 'partition.time-extractor.timestamp-pattern' = '$date 00:00:00',
        | 'sink.partition-commit.trigger' = 'process-time',
        | 'sink.partition-commit.policy.kind' = 'metastore,success-file'
        |)
      """.stripMargin
    // Optional: delay the partition commit, e.g. 'sink.partition-commit.delay' = '10 s'
    tableEnv.executeSql(hiveSinkDDL)
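
    // How the sink behaves at runtime (a sketch based on the options above; the example
    // partition value is made up): rows with date 2020-10-22 land under
    //   .../test_hive_table/date=2020-10-22/
    // and once the processing-time commit trigger fires, the 'metastore' policy registers the
    // partition in the Hive metastore and the 'success-file' policy writes a _SUCCESS file
    // into the partition directory, making it visible to Hive queries.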

    // Write the Kafka data into Hive
    val insertSql =
      """
        |insert into hive_sink_table
        |select
        | user_name,
        | user_id,
        | `time`,
        | date_format(ts, 'yyyy-MM-dd') as `date`
        |from schema_data.kafka_source_table
      """.stripMargin
    tableEnv.executeSql(insertSql)


    // To inspect the input during testing: (1) print via a table with 'connector' = 'print'
    // (see the commented sketch below), or (2) convert the query result to a DataStream and call print()
    //    val querySql =
    //      """
    //        |select
    //        | user_name,
    //        | user_id,
    //        | date_format(ts, 'yyyy-MM-dd') as `date`,
    //        | date_format(ts, 'HH'),
    //        | date_format(ts, 'mm')
    //        |from schema_data.kafka_source_table
    //      """.stripMargin
    //    val table = tableEnv.sqlQuery(querySql)
    //    table.printSchema()
    //
    //    val resultDStream = tableEnv.toAppendStream[Row](table)
    //    resultDStream.print()
    //    env.execute("flink sql kafka 2 hive")
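
    // Option 1 sketch: print through a table backed by the built-in 'print' connector.
    // This is a hedged example; the table name print_sink_table is made up, and the block
    // is left commented out because the job above already writes to Hive.
    //    tableEnv.getConfig.setSqlDialect(SqlDialect.DEFAULT)
    //    tableEnv.executeSql(
    //      """
    //        |create table print_sink_table (
    //        | user_name string,
    //        | user_id bigint,
    //        | `date` string
    //        |) with (
    //        | 'connector' = 'print'
    //        |)
    //      """.stripMargin)
    //    tableEnv.executeSql(
    //      """
    //        |insert into print_sink_table
    //        |select user_name, user_id, date_format(ts, 'yyyy-MM-dd')
    //        |from schema_data.kafka_source_table
    //      """.stripMargin)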
  }
}
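
FlinkUtils.getSettings and PropManager used above are project-internal helpers that are not shown here. As a rough sketch, getSettings presumably returns EnvironmentSettings for the Blink planner in streaming mode (the Hive connector in Flink 1.11 requires the Blink planner); the real helper may instead read these options from configuration:

import org.apache.flink.table.api.EnvironmentSettings

object FlinkUtils {
  // Hypothetical implementation, for illustration only
  def getSettings: EnvironmentSettings =
    EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()   // required for the Hive catalog/connector in Flink 1.11
      .inStreamingMode()
      .build()
}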

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <artifactId>realtime-flink-platform-test</artifactId>
    <groupId>com.sm.com.sm.job</groupId>
    <version>1.0-SNAPSHOT</version>
    <modelVersion>4.0.0</modelVersion>


    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <scala.version>2.11.12</scala.version>
        <scala.binary.version>2.11</scala.binary.version>
        <flink.version>1.11.2</flink.version>
        <hive.version>2.1.1-cdh6.2.0</hive.version>
        <hadoop.version>3.0.0-cdh6.2.0</hadoop.version>
    </properties>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            <releases>
                <enabled>true</enabled>
            </releases>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>com.sm.common</groupId>
            <artifactId>common</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>

        <!-- Flink Dependency -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-common</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-scala-bridge_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-orc_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-connector-hive-2.2.0_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>

        <!--        Hive Dependency-->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-serde</artifactId>
            <version>${hive.version}</version>
            <!--            <scope>provided</scope>-->
            <exclusions>
                <exclusion>
                    <groupId>org.apache.logging.log4j</groupId>
                    <artifactId>log4j-slf4j-impl</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-common</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-service-rpc</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.parquet</groupId>
                    <artifactId>parquet-hadoop-bundle</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
            <!--            <scope>provided</scope>-->
            <exclusions>
                <exclusion>
                    <groupId>org.apache.logging.log4j</groupId>
                    <artifactId>log4j-slf4j-impl</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hive</groupId>
                    <artifactId>hive-llap-tez</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!--        Hadoop Dependency-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- This plugin compiles the Scala sources into class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.2.0</version>
                <executions>
                    <execution>
                        <goals>
                            <!-- Bind to Maven's compile phase -->
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

 

When submitting the job to the cluster, place the following jar under $FLINK_HOME/lib:

flink-sql-connector-hive-2.2.0_2.11-1.11.2.jar