Main class:

/**
 * TODO: exactly-once semantics:
 *    Aggregation jobs:      use a transaction to store the aggregated result and the offsets together
 *    Non-aggregation jobs:  at-least-once + idempotent output gives exactly-once
 *    --
 *    at-least-once:      disable auto-commit of offsets + maintain the offsets in Kafka
 *    Idempotent output:  write to a store such as HBase, where re-writing the same rowkey simply overwrites the previous value
 */

object hdfsAuditToLogToHbase {

  val columnIdentifier: Array[String] = Array(
    HBASE_HDFSAUDITLOG_TABLE_COLUMN1,
    HBASE_HDFSAUDITLOG_TABLE_COLUMN2,
    HBASE_HDFSAUDITLOG_TABLE_COLUMN3,
    HBASE_HDFSAUDITLOG_TABLE_COLUMN4
  )


  def main(args: Array[String]): Unit = {

    // 1. Set up the Spark environment
    val sparkConf = new SparkConf().setAppName("hdfs_audit_log_to_hbase")
    // Arg 1: conf: SparkConf - the Spark configuration; arg 2: batchDuration: Duration - the batch interval
    val sc = new StreamingContext(sparkConf, Seconds(60))
    // 2. Read the stream from Kafka
    val kstream: InputDStream[ConsumerRecord[String, String]] = HdfsAudiUtil.getKafkaStream("oth_bigdata-Monitor_hdfs-audit", sc)
    var offsetRange: Array[OffsetRange] = null
    // NOTE: offsets must be committed from within foreachRDD

    // 3. Filter the data, convert the format, and pull the cluster info out of the headers
    // Result type: (cluster, cmd, user, dir, ip, time)
    val lines =
      kstream
        .transform(rdd => {
          // Capture the Kafka offset ranges of this batch (transform runs on the driver,
          // and only the original Kafka RDD implements HasOffsetRanges)
          offsetRange = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
          rdd
        })
        .filter(x => x.value().contains("rpc@@@@@"))
        .map(x => ExtractAudit(x.value().trim(), x.headers()))
        .filter(x => x._2 == "delete" || x._2 == "rename")
    // 4. Write the data out
    lines.foreachRDD(rdd => {
      // Skip empty RDDs, otherwise each partition would still open a database connection for nothing
      if (!rdd.isEmpty()) {
        // One partition executes one batch of writes
        rdd.foreachPartition(partition => {
          // Each partition runs as its own task thread; more partitions means better resource utilisation.
          // The partition count can be tuned with "--conf spark.default.parallelism=20"
          if (!partition.isEmpty) {
            // partition and record live on the same worker node, so conn/statement never need to be serialized.
            // Partitions running on the same worker share the connection (same JVM memory).
            // Open one HBase connection per partition; it never crosses nodes, so no serialization is needed
            val conn = HBaseUtil.getHBaseConn
            if (conn == null) {
              println("conn is null.")  //在Worker节点的Executor中打印
            } else {
              println("conn is not null." + Thread.currentThread().getName())
              // Process each record of the partition
              partition.foreach(record => {
                // All records of a partition are processed on the same thread
//                println("record : " + Thread.currentThread().getName())
                // Write the record to HBase through the helper class
                // record format: (cluster, cmd, user, dir, ip, time)
                HBaseUtil.putToHBase(conn, HBASE_HDFSAUDITLOG_TABLE_NAME
                  , HBaseUtil.makeRowKey(record._4, record._6, record._3)
                  , HBASE_HDFSAUDITLOG_COLUMNFAMILY
                  , columnIdentifier
                  , Array(record._4, record._6, record._3, record._2))
              })

              // Close the HBase connection (this runs at the end of every partition task, so connections
              // are opened and closed frequently, which is relatively expensive)
              HBaseUtil.closeHbaseConn()
            }
          }
        })
        // Closing the HBase connection here would only run on the driver, so it would have no effect
        //        HbaseUtil.closeHbaseConn()
      }

      // Commit the Kafka offsets
      // Offsets can only be committed through the original Kafka DStream, so commit via kstream itself
      kstream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRange)
    })

    sc.start()
    sc.awaitTermination()

  }



  // Parse a record into cmd, ugi, ip and dir -- the method was reworked to read the cluster from the Kafka headers
  def ExtractAudit(record: String, headers: Headers) = {
    try {
      // Split on whitespace (one or more blanks) and drop tokens that do not contain "="
      val elements: Array[String] =
        record.split("\\s+")
        .filter(x => x.contains("="))
      // Extract the date, formatted as e.g. 20220415
      val time: String = record.split("\\s+")(0).replaceAll("-", "")
      var kvs: Map[String, String] = Map()
      // Split each token at the first "=": the left part becomes the key, the right part the value
      for (ele <- elements) {
        val ind = ele.indexOf("=")
        kvs += (ele.substring(0, ind) -> ele.substring(ind + 1))
      }

      // cmd holds the operation name, which may be one of the write operations, e.g. delete
      val cmd = kvs("cmd")

      val user = kvs("ugi")
      val ip = kvs("ip").substring(1)
//      val write_op = List("create", "delete", "mkdirs", "rename", "setAcl", "setOwner", "setPermission", "setReplication", "setTimes"

      val dir = GetParentDir(kvs("src"))

      // Resolve the cluster from the host -- done here by analysing the Kafka headers
      val cluster_host: (String, String) = HdfsAudiUtil.headersAnalysis(headers)
      val cluster: String = cluster_host._1
      val host: String = cluster_host._2  // currently unused
      (cluster, cmd, user, dir, ip, time)
    } catch {
      case ex: Exception =>
        println(ex)
        // "kong" (empty) marks a record that failed to parse; such records are later dropped by the cmd filter
        ("kong", "kong", "kong", "kong", "kong", "kong")
    }

  }
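
  // Illustrative example only -- the exact log layout, the "rpc@@@@@" suffix and the header contents are
  // assumptions about what the upstream producer emits. For a value such as
  //   "2022-04-15 10:23:45,123 INFO FSNamesystem.audit: allowed=true ugi=alice ip=/10.0.0.1 cmd=delete src=/user/alice/tmp/f dst=null perm=null proto=rpc@@@@@..."
  // with headers resolving to cluster "clusterA", ExtractAudit returns
  //   ("clusterA", "delete", "alice", "/user/alice", "10.0.0.1", "20220415")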

  // Returns the index of the count-th occurrence of str in ori at or after start, or -1 if there is none
  @scala.annotation.tailrec
  def findpost(ori: String, str: String, start: Int, count: Int): Int = {
    val index = ori.indexOf(str, start)
    if (index == -1) -1
    else if (count <= 1) index
    else findpost(ori, str, index + 1, count - 1)
  }

  // Truncate a path to a "parent dir" bucket: keep two levels below "warehouse", one level below "/user",
  // one level below "/tmp/logs", and otherwise the first four path components
  def GetParentDir(ori: String): String = {
    val index =
      if (ori.contains("/warehouse/")) findpost(ori, "/", ori.indexOf("/warehouse/") + 1, 3)
      else if (ori.startsWith("/user/")) findpost(ori, "/", 0, 3)
      else if (ori.startsWith("/tmp/logs/")) findpost(ori, "/", 0, 4)
      else findpost(ori, "/", 0, 5)

    if (index != -1) ori.substring(0, index) else ori
  }
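
  // Examples of the truncation rules above:
  //   GetParentDir("/warehouse/db.db/tbl/part-0000")       == "/warehouse/db.db/tbl"
  //   GetParentDir("/user/alice/project/data.csv")         == "/user/alice"
  //   GetParentDir("/tmp/logs/alice/logs/app_1/container") == "/tmp/logs/alice"
  //   GetParentDir("/data/a/b/c/d/e")                      == "/data/a/b/c"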

}
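
HdfsAudiUtil.getKafkaStream (and headersAnalysis) is referenced above but not shown. The following is only a sketch of how getKafkaStream might look, assuming the spark-streaming-kafka-0-10 direct API; the broker list, group id and offset-reset policy are placeholders. The point that matters for the at-least-once setup described in the TODO is "enable.auto.commit" -> false, so offsets are only committed through commitAsync once the batch has been written to HBase.

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object HdfsAudiUtil {

  // Minimal sketch (assumed implementation): a direct Kafka stream with auto-commit disabled
  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers"  -> "broker1:9092,broker2:9092",   // placeholder broker list
      "key.deserializer"   -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id"           -> "hdfs_audit_log_to_hbase",     // placeholder group id
      "auto.offset.reset"  -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)     // at-least-once: commit offsets manually
    )
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Array(topic), kafkaParams)
    )
  }

  // headersAnalysis (cluster, host) depends on which headers the upstream producer sets, so it is not sketched here
}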

Constants class:

object constCode {
  // Layout of the HBase table for HDFS audit logs: dir, time, user, cmd
  val HBASE_HDFSAUDITLOG_TABLE_NAME: String = "XXXX"
  val HBASE_HDFSAUDITLOG_COLUMNFAMILY: String = "data"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN1: String = "dir"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN2: String = "time"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN3: String = "user"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN4: String = "cmd"


}
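
The main class also relies on HBaseUtil, which is not shown. Below is a minimal sketch of getHBaseConn, closeHbaseConn and putToHBase using the standard HBase client API; the ZooKeeper quorum is a placeholder, and makeRowKey (which builds the rowkey) is sketched separately after the MD5 helper below.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object HBaseUtil {

  @volatile private var conn: Connection = _

  // One connection per executor JVM, created lazily; the ZooKeeper quorum is assumed
  def getHBaseConn: Connection = synchronized {
    if (conn == null || conn.isClosed) {
      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3")   // placeholder quorum
      conn = ConnectionFactory.createConnection(conf)
    }
    conn
  }

  def closeHbaseConn(): Unit = synchronized {
    if (conn != null && !conn.isClosed) conn.close()
  }

  // Write one row: values(i) is stored under family:qualifiers(i)
  def putToHBase(conn: Connection, tableName: String, rowKey: String,
                 family: String, qualifiers: Array[String], values: Array[String]): Unit = {
    val table = conn.getTable(TableName.valueOf(tableName))
    try {
      val put = new Put(Bytes.toBytes(rowKey))
      qualifiers.zip(values).foreach { case (q, v) =>
        put.addColumn(Bytes.toBytes(family), Bytes.toBytes(q), Bytes.toBytes(v))
      }
      table.put(put)
    } finally {
      table.close()
    }
  }
}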

Rowkey design -- MD5

/**
 * @author Alain
 * @date 2022/4/15 
 * @description
 */
object MD5 {

  def hashMD5(content: String): String = {
    val md5 = MessageDigest.getInstance("MD5")
    val encoded = md5.digest((content).getBytes)
    encoded.map("%02x".format(_)).mkString
  }

  def main(args: Array[String]): Unit = {
    val param = ("ALL","ALL","ALL","ALL","ALL","ALL","ALL").productIterator.mkString(",")
    println(param)
    val md5 = hashMD5(param)
    println("20220101_1_"+md5)  //20220101_1_b9af90d9b37ac0de6c0098d2dc118a75
  }


}
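
HBaseUtil.makeRowKey is not shown either. The "20220101_1_b9af90d9..." example above suggests rowkeys of the form <date>_<salt>_<md5>. The sketch below is hypothetical and only meant to be consistent with that example: the salt bucket count (10) and the choice of hashing (dir, time, user) are assumptions.

  // Hypothetical sketch of HBaseUtil.makeRowKey -- bucket count and hashed fields are assumptions
  def makeRowKey(dir: String, time: String, user: String): String = {
    val md5 = MD5.hashMD5(Seq(dir, time, user).mkString(","))
    val salt = math.abs(md5.hashCode) % 10   // spread rows across 10 salt buckets
    s"${time}_${salt}_${md5}"
  }

Putting the date first keeps a day's rows adjacent for time-range scans; the salt and hash then spread the rows within that day.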