1 Project layout

(Project directory screenshot omitted; the code below lives in the packages com.atguigu.flink.app, bean, sink, source and utils.)

 

 pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu.flinktutorial</groupId>
    <artifactId>flinktutorial</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>apache.snapshots</id>
            <name>Apache Development Snapshot Repository</name>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.10.0</flink.version>
        <scala.binary.version>2.11</scala.binary.version>
        <scala.version>2.11.12</scala.version>
    </properties>

    <dependencies>
        <!-- Apache Flink dependencies -->
        <!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-hbase_2.11</artifactId>
            <version>1.10.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-common</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-client</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-yarn-common</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-mapreduce-client-core</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.hadoop</groupId>
                    <artifactId>hadoop-auth</artifactId>
                </exclusion>
            </exclusions>
        </dependency>


        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <!-- the shaded client is used mainly to avoid jar conflicts -->
            <artifactId>hbase-shaded-client</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.phoenix</groupId>
            <artifactId>phoenix-core</artifactId>
            <version>5.0.0-HBase-2.0</version>
            <exclusions>
                <exclusion>
                    <artifactId>hbase-client</artifactId>
                    <groupId>org.apache.hbase</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-cli</artifactId>
                    <groupId>commons-cli</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-codec</artifactId>
                    <groupId>commons-codec</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jackson-annotations</artifactId>
                    <groupId>com.fasterxml.jackson.core</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jackson-databind</artifactId>
                    <groupId>com.fasterxml.jackson.core</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jsr305</artifactId>
                    <groupId>com.google.code.findbugs</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>guava</artifactId>
                    <groupId>com.google.guava</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>guice</artifactId>
                    <groupId>com.google.inject</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>guice-servlet</artifactId>
                    <groupId>com.google.inject.extensions</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>2.0.5</version>
            <exclusions>
                <exclusion>
                    <artifactId>commons-codec</artifactId>
                    <groupId>commons-codec</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jackson-databind</artifactId>
                    <groupId>com.fasterxml.jackson.core</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jersey-server</artifactId>
                    <groupId>com.sun.jersey</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.lmax</groupId>
            <artifactId>disruptor</artifactId>
            <version>3.3.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.0.5</version>
            <exclusions>
                <exclusion>
                    <artifactId>commons-codec</artifactId>
                    <groupId>commons-codec</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jackson-databind</artifactId>
                    <groupId>com.fasterxml.jackson.core</groupId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.8.1</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.56</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-jdbc_2.11</artifactId>
            <version>1.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>jsr305</artifactId>
                    <groupId>com.google.code.findbugs</groupId>
                </exclusion>
            </exclusions>
            <!--<scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>commons-cli</artifactId>
                    <groupId>commons-cli</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>commons-io</artifactId>
                    <groupId>commons-io</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jsr305</artifactId>
                    <groupId>com.google.code.findbugs</groupId>
                </exclusion>
            </exclusions>
            <!--<scope>provided</scope>-->
        </dependency>

        <!-- Scala Library, provided by Flink as well. -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <!--<scope>provided</scope>-->
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>1.10.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.bahir</groupId>
            <artifactId>flink-connector-redis_2.11</artifactId>
            <version>1.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-elasticsearch6_2.11</artifactId>
            <version>1.10.0</version>
            <exclusions>
                <exclusion>
                    <artifactId>commons-codec</artifactId>
                    <groupId>commons-codec</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jackson-core</artifactId>
                    <groupId>com.fasterxml.jackson.core</groupId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.44</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-cep-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Add connector dependencies here. They must be in the default scope (compile). -->

        <!-- Example:

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.10_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        -->

        <!-- Add logging framework, to produce console output when running in the IDE. -->
        <!-- These dependencies are excluded from the application JAR by default. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.7</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
            <scope>runtime</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- We use the maven-shade plugin to create a fat jar that contains all necessary dependencies. -->
            <!-- Change the value of <mainClass>...</mainClass> if your program entry point changes. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.1.1</version>
                <executions>
                    <!-- Run shade goal on package phase -->
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <artifactSet>
                                <excludes>
                                    <exclude>org.apache.flink:force-shading</exclude>
                                    <exclude>com.google.code.findbugs:jsr305</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <!-- Do not copy the signatures in the META-INF folder.
                                    Otherwise, this might cause SecurityExceptions when using the JAR. -->
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.atguigu.WordCountFromBatch</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>

            <!-- Java Compiler -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>

            <!-- Scala Compiler -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <args>
                        <arg>-nobootcp</arg>
                    </args>
                </configuration>
            </plugin>

            <!-- Eclipse Scala Integration -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <version>2.8</version>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <projectnatures>
                        <projectnature>org.scala-ide.sdt.core.scalanature</projectnature>
                        <projectnature>org.eclipse.jdt.core.javanature</projectnature>
                    </projectnatures>
                    <buildcommands>
                        <buildcommand>org.scala-ide.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <classpathContainers>
                        <classpathContainer>org.scala-ide.sdt.launching.SCALA_CONTAINER</classpathContainer>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                    </classpathContainers>
                    <excludes>
                        <exclude>org.scala-lang:scala-library</exclude>
                        <exclude>org.scala-lang:scala-compiler</exclude>
                    </excludes>
                    <sourceIncludes>
                        <sourceInclude>**/*.scala</sourceInclude>
                        <sourceInclude>**/*.java</sourceInclude>
                    </sourceIncludes>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>build-helper-maven-plugin</artifactId>
                <version>1.7</version>
                <executions>
                    <!-- Add src/main/scala to eclipse build path -->
                    <execution>
                        <id>add-source</id>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>add-source</goal>
                        </goals>
                        <configuration>
                            <sources>
                                <source>src/main/scala</source>
                            </sources>
                        </configuration>
                    </execution>
                    <!-- Add src/test/scala to eclipse build path -->
                    <execution>
                        <id>add-test-source</id>
                        <phase>generate-test-sources</phase>
                        <goals>
                            <goal>add-test-source</goal>
                        </goals>
                        <configuration>
                            <sources>
                                <source>src/test/scala</source>
                            </sources>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>




</project>

2 Reading from HBase with Flink

HbaseUtil (Scala utility for reading and writing HBase)

package com.atguigu.flink.utils

import java.io.IOException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConverters._


object HbaseUtil {
  def main(args: Array[String]): Unit = {
    val conf = HbaseUtil.getHbaseConf()
    val hbconn = HbaseUtil.getHbaseConn(conf)
    val table:Table = hbconn.getTable(TableName.valueOf("student"))
    // fetch the row whose rowkey is 1001
    HbaseUtil.getSingleRow(table, "1001")
    // full-table scan
    //HbaseUtil.queryAll(table)

    hbconn.close()

  }


  // build the HBase configuration
  def getHbaseConf(): Configuration = {
    val configuration = HBaseConfiguration.create()
    configuration.set("hbase.zookeeper.quorum", "192.168.1.122:2181,192.168.1.133:2181,192.168.1.144:2181")
    configuration
  }


  // create an HBase connection
  def getHbaseConn(conf:Configuration): Connection={
    val conn = ConnectionFactory.createConnection(conf)
    conn
  }

  // create an HBase table
  def createTable(conn: Connection, tableName: String, columnFamilys: Array[String]) = {
    // get the Admin object
    val adminTable: Admin = conn.getAdmin
    // table to operate on
    val tName = TableName.valueOf(tableName)
    // only create the table if it does not exist yet
    if (!adminTable.tableExists(tName)) {
      // build the table descriptor
      val descriptor = new HTableDescriptor(tName)
      // add the column families
      for (columnFamily <- columnFamilys) {
        descriptor.addFamily(new HColumnDescriptor(columnFamily))
      }
      // create the table
      adminTable.createTable(descriptor)
      println("create successful!!")
    } else {
      print("table already exists")
    }
    adminTable.close()
  }

  // drop a table
  def dropTable(conn: Connection, tableName: String) = {
    // get the Admin object
    val adminTable: Admin = conn.getAdmin
    // table to operate on
    val tName = TableName.valueOf(tableName)
    // only drop the table if it exists
    if (adminTable.tableExists(tName)) {
      // disable the table first
      adminTable.disableTable(tName)
      // then delete it
      adminTable.deleteTable(tName)
    }else{
      print("table does not exist")
    }
    adminTable.close()
  }

  // get a table handle
  def getHbaseTable(conn: Connection, tableName: String): Table = {
    // get the Admin object
    val adminTable = conn.getAdmin
    // table to operate on
    val tName = TableName.valueOf(tableName)
    // only return the table if it exists
    if (adminTable.tableExists(tName)) {
         val table = conn.getTable(tName)
         adminTable.close()
         return table
    }else {
      print("table does not exist")
      adminTable.close()
      return null
    }

  }


  // put one cell into the table
  def addRowData(table: Table, rowKey: String, columnFamily: String, quorm: String, value: String): Unit ={
    val rowPut: Put = new Put(Bytes.toBytes(rowKey))
    if (value == null) {
      rowPut.addColumn(columnFamily.getBytes, quorm.getBytes, "".getBytes())
    } else {
      rowPut.addColumn(columnFamily.getBytes, quorm.getBytes, value.getBytes)
    }
    table.put(rowPut)
  }



  // scan the whole table
  def queryAll(table: Table): ResultScanner = {
      // A bounded scan includes the start row but excludes the stop row; to also return the stop
      // row itself, append an invisible smallest byte (\000) to the stop key.
      // val scan = new Scan("10".getBytes(), "10000".getBytes());
      // val scan = new Scan("10".getBytes(), "10000\001".getBytes())
      // val results: ResultScanner = table.getScanner(scan)


      val s = new Scan()
      val results: ResultScanner = table.getScanner(s)
      val iterator = results.iterator()
      while (iterator.hasNext){
        val result = iterator.next()
        val rowKey = Bytes.toString(result.getRow)
        print("rowkey",rowKey)
        val sb: StringBuffer = new StringBuffer()
        // every cell of the row is visited
        for (cell: Cell <- result.listCells().asScala) {
          // qualifier (column name)
          val columnKey = Bytes.toString(cell.getQualifierArray, cell.getQualifierOffset, cell.getQualifierLength)
          // cell value
          val value = Bytes.toString(cell.getValueArray, cell.getValueOffset, cell.getValueLength)
          sb.append(value).append("_")
        }
        println(sb.toString)
      }
    return results
  }

  // fetch a single row
  def getSingleRow(table: Table, rowKey: String): Result = {
    // Get object for the given rowkey
    val get: Get = new Get(Bytes.toBytes(rowKey))

    // all cells of the row identified by rowKey
    val result: Result = table.get(get)

    // to read the value of one specific column from the result:
    //val value = result.getValue("info".getBytes(), "age".getBytes())
    //print("single value", new String(value))

    for (rowKv <- result.rawCells()) {
      // column family
      println("Family:" + new String(rowKv.getFamilyArray, rowKv.getFamilyOffset, rowKv.getFamilyLength, "UTF-8"))
      // qualifier (column name)
      println("Qualifier:" + new String(rowKv.getQualifierArray, rowKv.getQualifierOffset, rowKv.getQualifierLength, "UTF-8"))
      // timestamp
      println("TimeStamp:" + rowKv.getTimestamp)
      // rowkey
      println("rowkey:" + new String(rowKv.getRowArray, rowKv.getRowOffset, rowKv.getRowLength, "UTF-8"))
      // cell value
      println("Value:" + new String(rowKv.getValueArray, rowKv.getValueOffset, rowKv.getValueLength, "UTF-8"))
    }
    return result
  }

}
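As a small companion to the start/stop-row comment inside queryAll above, the sketch below shows what such a bounded scan could look like. The row keys "10" and "10000" and the method name scanRange are only illustrative; the snippet deliberately uses the same Scan(startRow, stopRow) constructor as the commented-out code.

import org.apache.hadoop.hbase.client.{ResultScanner, Scan, Table}

// Bounded scan: the start row is inclusive, the stop row is exclusive.
// Appending the smallest possible byte (\u0000) to the stop key makes the
// stop row "10000" itself show up in the results as well.
def scanRange(table: Table): ResultScanner = {
  val scan = new Scan("10".getBytes(), ("10000" + "\u0000").getBytes())
  table.getScanner(scan)
}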

HbaseSource

package com.atguigu.flink.source

import com.atguigu.flink.bean.SensorReading
import com.atguigu.flink.utils.HbaseUtil
import org.apache.flink.configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

class HbaseSource extends RichSourceFunction[SensorReading]{

  private var conn: Connection = null
  private var table: Table = null
  private var scan: Scan = null

  /**
   * Open the HBase connection and the table handle
   * @param parameters
   */
  override def open(parameters: configuration.Configuration): Unit = {
    val conf = HbaseUtil.getHbaseConf()
    conn = HbaseUtil.getHbaseConn(conf)
    table = conn.getTable(TableName.valueOf("sensor"))

  }

  /**
   * run comes from the Java interface SourceFunction; IDEA's Ctrl + O does not offer it
   * for quick generation, so it has to be overridden by hand.
   * @param sourceContext
   */

  override def run(sourceContext: SourceFunction.SourceContext[SensorReading]): Unit = {
    scan = new Scan()
    val results: ResultScanner = table.getScanner(scan)
    val iterator = results.iterator()
    while (iterator.hasNext) {
      val result = iterator.next()
      // rowkey of the current row
      val rowKey = Bytes.toString(result.getRow)
      // re-read the row by rowkey (the scanned Result already holds the cells; this Get simply re-fetches them)
      val get: Get = new Get(Bytes.toBytes(rowKey))
      val element: Result = table.get(get)
      // read each value by column family and qualifier
      val id: String = new String(element.getValue("info".getBytes(), "id".getBytes()))
      val curTime = new String(element.getValue("info".getBytes(), "timestamp".getBytes())).toLong
      val timepreture = new String(element.getValue("info".getBytes(), "timepreture".getBytes())).toDouble
      // emit the record downstream
      sourceContext.collect(SensorReading(id, curTime, timepreture))

    }

  }

  /**
   * Must be overridden; left empty because the scan in run() terminates on its own
   */
  override def cancel(): Unit = {

  }

  /**
   * Close the table and the HBase connection
   */
  override def close(): Unit = {
    try {
      if (table != null) {
        table.close()
      }
      if (conn != null) {
        conn.close()
      }
    } catch {
      case e:Exception => println(e.getMessage)
    }
  }
}
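
The SensorReading bean referenced above is not listed in the post. Judging from how it is constructed in HbaseSource and read back in HbaseSink, it is presumably a plain case class along these lines (a reconstruction, not the original file; the field names, including the "timepreture" spelling, follow the code above):

package com.atguigu.flink.bean

// Reconstructed from its usage in HbaseSource / HbaseSink
case class SensorReading(id: String, timestamp: Long, timepreture: Double)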

Main program: HbaseSourceSinkApp

package com.atguigu.flink.app

import com.atguigu.flink.bean.SensorReading
import com.atguigu.flink.sink.HbaseSink
import org.apache.flink.streaming.api.scala
import org.apache.flink.streaming.api.scala._
import com.atguigu.flink.source.HbaseSource

object HbaseSourceSinkApp {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // addSource registers HbaseSource as the input of the job
    val stream: scala.DataStream[SensorReading] = env.addSource(new HbaseSource)

    // addSink registers HbaseSink as the output of the job
    stream.addSink(new HbaseSink)

    // print the stream for debugging
    stream.print()

    // launch the job
    env.execute()
  }
}

 

3 Writing to HBase with Flink

HbaseSink

package com.atguigu.flink.sink

import com.atguigu.flink.bean.SensorReading
import com.atguigu.flink.utils.HbaseUtil
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink._
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.util.Bytes

/**
 * @Author: Yang JianQiu
 * @Date: 2019/3/1 1:34
 *
 * Writing to HBase.
 * Approach 1: extend RichSinkFunction and override its methods.
 *
 * Note: Flink hands the sink one record at a time, so we must not flush to HBase for every
 * single record; that puts heavy pressure on HBase and spawns so many threads that the
 * cluster can collapse, so production jobs have to control the flush frequency.
 *
 * Solution: keep a counter initialised in open() and flush, say, every 500 records, or buffer
 * the records in a list and flush once the list reaches a size threshold (a sketch of the
 * list-based variant follows this class).
 */


class HbaseSink extends RichSinkFunction[SensorReading]{
  private var conn: Connection = null
  private var table: Table = null
  private var scan: Scan = null
  var mutator: BufferedMutator = null
  var count = 0
  var rowKey_test = 2000

  /**
   * Open the HBase connection and create the BufferedMutator
   *
   * @param parameters
   */
  override def open(parameters: Configuration): Unit ={
    val conf = HbaseUtil.getHbaseConf()
    conn = HbaseUtil.getHbaseConn(conf)
    val tableName: TableName = TableName.valueOf("psensor")
    val params: BufferedMutatorParams = new BufferedMutatorParams(tableName)
    // 1 MB write buffer: once it fills up, the buffered mutations are flushed to HBase automatically
    params.writeBufferSize(1024 * 1024)
    mutator = conn.getBufferedMutator(params)
    count = 0

  }

  /**
   * Write each incoming record to HBase
   */

  override def invoke(value: SensorReading): Unit = {
    val cf1 = "info"
    var id = value.id
    var curtime = value.timestamp.toString
    var timperature = value.timepreture.toString

    val put: Put = new Put(Bytes.toBytes(rowKey_test.toString))
    put.addColumn(Bytes.toBytes(cf1), Bytes.toBytes("id"), Bytes.toBytes(id))
    put.addColumn(Bytes.toBytes(cf1), Bytes.toBytes("timestamp"), Bytes.toBytes(curtime))
    put.addColumn(Bytes.toBytes(cf1), Bytes.toBytes("timeperature"), Bytes.toBytes(timperature))
    mutator.mutate(put)
    count = count + 1
    rowKey_test = rowKey_test + 1
    // flush once 2 records have been buffered (a tiny threshold, only for the demo)
    if (count >= 2) {
      mutator.flush()
      count = 0
    }
  }

  /**
   * Flush whatever is still buffered and release the HBase resources
   */
  override def close(): Unit = {
    if (mutator != null) {
      mutator.flush()
      mutator.close()
    }
    if (conn != null) conn.close()
  }


}
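
The class comment above suggests either a record counter or a buffered list with a size threshold. The sketch below shows the list-based variant under those assumptions; the class name HbaseBatchSink, the default threshold of 500 and the id + timestamp row key are illustrative choices rather than part of the original project, while the psensor table, the info column family and HbaseUtil are reused from the code above.

package com.atguigu.flink.sink

import com.atguigu.flink.bean.SensorReading
import com.atguigu.flink.utils.HbaseUtil
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Put, Table}
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer

class HbaseBatchSink(batchSize: Int = 500) extends RichSinkFunction[SensorReading] {

  private var conn: Connection = _
  private var table: Table = _
  private val buffer = ListBuffer[Put]()

  override def open(parameters: Configuration): Unit = {
    conn = HbaseUtil.getHbaseConn(HbaseUtil.getHbaseConf())
    table = conn.getTable(TableName.valueOf("psensor"))
  }

  override def invoke(value: SensorReading): Unit = {
    // deterministic row key derived from the record itself, so replays overwrite the same row
    val put = new Put(Bytes.toBytes(value.id + "_" + value.timestamp))
    put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("timepreture"), Bytes.toBytes(value.timepreture.toString))
    buffer += put
    // only talk to HBase once the buffer is full
    if (buffer.size >= batchSize) flushBuffer()
  }

  private def flushBuffer(): Unit = {
    if (buffer.nonEmpty) {
      table.put(buffer.asJava) // Table.put(java.util.List[Put]) writes the whole batch in one call
      buffer.clear()
    }
  }

  override def close(): Unit = {
    flushBuffer() // do not lose the tail of the buffer
    if (table != null) table.close()
    if (conn != null) conn.close()
  }
}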

Main program: HbaseSourceSinkApp

package com.atguigu.flink.app

import com.atguigu.flink.bean.SensorReading
import com.atguigu.flink.sink.HbaseSink
import org.apache.flink.streaming.api.scala
import org.apache.flink.streaming.api.scala._
import com.atguigu.flink.source.HbaseSource

object HbaseSourceSinkApp {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // addSource registers HbaseSource as the input of the job
    val stream: scala.DataStream[SensorReading] = env.addSource(new HbaseSource)

    // addSink registers HbaseSink as the output of the job
    stream.addSink(new HbaseSink)

    // print the stream for debugging
    stream.print()

    // launch the job
    env.execute()
  }
}

Note: end-to-end consistency

  KV stores such as Cassandra, HBase and Redis are often used as sinks to achieve end-to-end Exactly-Once semantics. Note, however, that using a KV store does not automatically give you idempotent writes: idempotent writes require that the key-value pair be computed deterministically. If the key is designed as name + curTimestamp, every replay of the data produces a different key and leaves multiple results behind, so the operation is not idempotent. To get end-to-end Exactly-Once, design the business logic around deterministic computation and a deterministic data model wherever possible.
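
As a minimal illustration of the point above (the Sensor case class and the values here are hypothetical):

object RowKeyExamples {
  case class Sensor(id: String, timestamp: Long)
  // the same record, replayed after a failure
  val sensor = Sensor("sensor_1", 1547718199L)

  // Deterministic key: built only from the event itself, so a replay writes to the same row
  // again and the Put stays idempotent.
  val deterministicKey: String = sensor.id + "_" + sensor.timestamp

  // Non-deterministic key: the wall clock differs on every replay, so each retry creates a new
  // row and the write is no longer idempotent.
  val nonDeterministicKey: String = sensor.id + "_" + System.currentTimeMillis()
}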

 
