Main class:
/**
 * TODO: exactly-once semantics:
 * For aggregation-type jobs: use a transaction to store the aggregated result together with the offsets.
 * For non-aggregation jobs: at-least-once delivery + idempotent output gives effectively exactly-once results.
 * --
 * at least once: disable automatic offset commit + maintain the offsets in Kafka
 * idempotent output: write to an idempotent sink such as HBase
 */
object hdfsAuditToLogToHbase {

  val columnIdentifier: Array[String] = Array(
    HBASE_HDFSAUDITLOG_TABLE_COLUMN1,
    HBASE_HDFSAUDITLOG_TABLE_COLUMN2,
    HBASE_HDFSAUDITLOG_TABLE_COLUMN3,
    HBASE_HDFSAUDITLOG_TABLE_COLUMN4
  )

  def main(args: Array[String]) {
    // 1. Configure the Spark environment
    val sparkConf = new SparkConf().setAppName("hdfs_audit_log_to_hbase")
    // Param 1: conf: SparkConf: the Spark configuration. Param 2: batchDuration: Duration: the batch interval (latency)
    val sc = new StreamingContext(sparkConf, Seconds(60))

    // 2. Read the data stream from Kafka
    val kstream: InputDStream[ConsumerRecord[String, String]] = HdfsAudiUtil.getKafkaStream("oth_bigdata-Monitor_hdfs-audit", sc)
    var offsetRange: Array[OffsetRange] = null

    // NOTE: offsets can only be committed from foreachRDD, and only on the original Kafka stream
    // 3. Filter the data, convert its format, and pull the extra fields out of the Kafka headers
    // Result type: (cluster, cmd, user, dir, ip, time)
    val lines =
      kstream
        .transform(rdd => {
          // Capture this batch's Kafka offset ranges (transform runs per RDD, so the cast to HasOffsetRanges is valid)
          offsetRange = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
          rdd
        })
        .filter(x => x.value().contains("rpc@@@@@"))
        .map(x => ExtractAudit(x.value().trim(), x.headers()))
        .filter(x => x._2 == "delete" || x._2 == "rename")
    // 4. Write out the data
    lines.foreachRDD(rdd => {
      // When the RDD is empty there is nothing to do; skipping it avoids pointlessly opening database connections per partition
      if (!rdd.isEmpty()) {
        // Each partition executes one batch of writes
        rdd.foreachPartition(partition => {
          // Each partition is processed by its own task thread; more partitions means better resource utilisation
          // The partition count can be tuned with "--conf spark.default.parallelism=20"
          if (!partition.isEmpty) {
            // partition and record live on the same worker node, so conn and statement never need to be serialised and shipped
            // If several partitions run on one worker they share the connection (same JVM memory)
            // Get an HBase connection: one per partition, never crossing nodes, so no serialisation needed
            val conn = HBaseUtil.getHBaseConn
            if (conn == null) {
              println("conn is null.") // printed inside the executor on the worker node
            } else {
              println("conn is not null. " + Thread.currentThread().getName())
              // Process every record in this partition
              partition.foreach(record => {
                // All records of a partition are handled on the same thread
                // println("record : " + Thread.currentThread().getName())
                // Write the record to HBase through the helper class
                // record layout: (cluster, cmd, user, dir, ip, time)
                HBaseUtil.putToHBase(conn, HBASE_HDFSAUDITLOG_TABLE_NAME
                  , HBaseUtil.makeRowKey(record._4, record._6, record._3)
                  , HBASE_HDFSAUDITLOG_COLUMNFAMILY
                  , columnIdentifier
                  , Array(record._4, record._6, record._3, record._2))
              })
              // Close the HBase connection (this runs at the end of every partition task, so connections are opened and closed frequently, which wastes resources)
              HBaseUtil.closeHbaseConn()
            }
          }
        })
        // Close the HBase connection (this would only run on the driver, so it has no effect here)
        // HbaseUtil.closeHbaseConn()
      }
      // Update the Kafka offsets
      // Offsets can only be committed through the original Kafka DStream, so commit on kstream
      kstream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRange)
    })
    sc.start()
    sc.awaitTermination()
  }
  // Parse one audit line into cmd, ugi, ip and dir; this method was reworked to read the cluster from the Kafka headers
  def ExtractAudit(record: String, headers: Headers) = {
    try {
      // Split on whitespace (one or more spaces) and keep only the tokens containing "="
      val elements: Array[String] =
        record.split("\\s+")
          .filter(x => x.contains("="))
      // Extract the date, formatted like 20220415
      val time: String = record.split("\\s+")(0).replaceAll("-", "")
      var kvs: Map[String, String] = Map()
      // For every "key=value" token, map the part before "=" to the part after it
      for (ele <- elements) {
        val ind = ele.indexOf("=")
        kvs += (ele.substring(0, ind) -> ele.substring(ind + 1))
      }
      // cmd holds the operation name and may be a write operation, e.g. delete
      val cmd = kvs("cmd")
      val user = kvs("ugi")
      val ip = kvs("ip").substring(1)
      // val write_op = List("create", "delete", "mkdirs", "rename", "setAcl", "setOwner", "setPermission", "setReplication", "setTimes")
      val dir = GetParentDir(kvs("src")).toString
      // Resolve the cluster: reworked here to read it from the Kafka headers instead of deriving it from the host
      val cluster_host: (String, String) = HdfsAudiUtil.headersAnalysis(headers)
      val cluster: String = cluster_host._1
      val host: String = cluster_host._2
      (cluster, cmd, user, dir, ip, time)
    } catch {
      case ex: Exception =>
        println(ex)
        ("kong", "kong", "kong", "kong", "kong", "kong")
    }
  }
  // Return the position of the count-th occurrence of str inside ori at or after start; -1 if there is none
  @scala.annotation.tailrec
  def findpost(ori: String, str: String, start: Int, count: Int): Int = {
    val index = ori.indexOf(str, start)
    if (index == -1) return -1
    if (count <= 1) return index
    findpost(ori, str, index + 1, count - 1)
  }
  // Truncate dir to a parent prefix: keep 2 levels after "warehouse", 1 level after "/user",
  // 3 levels in total for "/tmp/logs" paths, and 4 levels in total otherwise
  def GetParentDir(ori: String): String = {
    var index = -1
    if (ori.contains("/warehouse/")) index = findpost(ori, "/", ori.indexOf("/warehouse/") + 1, 3)
    else if (ori.startsWith("/user/")) index = findpost(ori, "/", 0, 3)
    else if (ori.startsWith("/tmp/logs/")) index = findpost(ori, "/", 0, 4)
    else index = findpost(ori, "/", 0, 5)
    if (index != -1) ori.substring(0, index) else ori
  }
}
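
The HdfsAudiUtil.getKafkaStream helper is referenced above but not shown. Below is a minimal sketch of what it could look like, assuming the spark-streaming-kafka-0-10 integration; the broker list and group id are placeholders, and headersAnalysis is omitted. The key point, matching the at-least-once note in the header comment, is that enable.auto.commit is false, so offsets are only committed via commitAsync after the batch has been written to HBase.

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.KafkaUtils

object HdfsAudiUtil {
  // Hypothetical consumer settings; broker list and group id are placeholders
  private val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "broker1:9092,broker2:9092",
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "hdfs_audit_log_to_hbase",
    "auto.offset.reset" -> "latest",
    // at-least-once: disable auto commit, offsets are committed manually via commitAsync
    "enable.auto.commit" -> (false: java.lang.Boolean)
  )

  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] =
    KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](Array(topic), kafkaParams)
    )

  // headersAnalysis(headers): returns (cluster, host); omitted here
}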
Constants class:
object constCode {
  // Layout of the HBase table that the HDFS audit log is written to: dir, time, user, cmd
  val HBASE_HDFSAUDITLOG_TABLE_NAME: String = "XXXX"
  val HBASE_HDFSAUDITLOG_COLUMNFAMILY: String = "data"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN1: String = "dir"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN2: String = "time"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN3: String = "user"
  val HBASE_HDFSAUDITLOG_TABLE_COLUMN4: String = "cmd"
}
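
The HBaseUtil helper called from the main class is not shown either. Below is a rough sketch under assumed method bodies and a placeholder ZooKeeper quorum: getHBaseConn lazily creates and reuses one connection per executor JVM, putToHBase pairs each qualifier with the value at the same index in a single Put, and closeHbaseConn closes the shared connection. makeRowKey is sketched after the MD5 object below.

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object HBaseUtil {
  @volatile private var conn: Connection = _

  // Lazily create (and then reuse) one HBase connection per executor JVM
  def getHBaseConn: Connection = {
    if (conn == null || conn.isClosed) {
      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "zk1,zk2,zk3") // placeholder quorum
      conn = ConnectionFactory.createConnection(conf)
    }
    conn
  }

  // Write one row: each qualifier is paired with the value at the same index
  def putToHBase(conn: Connection, tableName: String, rowKey: String,
                 columnFamily: String, qualifiers: Array[String], values: Array[String]): Unit = {
    val table = conn.getTable(TableName.valueOf(tableName))
    try {
      val put = new Put(Bytes.toBytes(rowKey))
      qualifiers.zip(values).foreach { case (q, v) =>
        put.addColumn(Bytes.toBytes(columnFamily), Bytes.toBytes(q), Bytes.toBytes(v))
      }
      table.put(put)
    } finally {
      table.close()
    }
  }

  def closeHbaseConn(): Unit = {
    if (conn != null && !conn.isClosed) conn.close()
  }
}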
Rowkey design -- MD5:
import java.security.MessageDigest

/**
 * @author Alain
 * @date 2022/4/15
 * @description MD5 hashing helper used when building HBase row keys
 */
object MD5 {
  def hashMD5(content: String): String = {
    val md5 = MessageDigest.getInstance("MD5")
    val encoded = md5.digest(content.getBytes)
    encoded.map("%02x".format(_)).mkString
  }

  def main(args: Array[String]): Unit = {
    val param = ("ALL", "ALL", "ALL", "ALL", "ALL", "ALL", "ALL").productIterator.mkString(",")
    println(param)
    val md5 = hashMD5(param)
    println("20220101_1_" + md5) // 20220101_1_b9af90d9b37ac0de6c0098d2dc118a75
  }
}
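
The real makeRowKey implementation is not included here, so the exact key layout (for example the "_1_" segment in the demo output above) is unknown. The sketch below is only illustrative: it combines the MD5 helper with the HBaseUtil.makeRowKey(dir, time, user) call from the main class, prefixing the date so rows for one day sort together while the MD5 hash spreads hot directories across the key space.

object RowKeyDemo {
  // Hypothetical row key builder matching the call HBaseUtil.makeRowKey(dir, time, user)
  // in the main class: date prefix + MD5 of the remaining fields
  def makeRowKey(dir: String, time: String, user: String): String =
    time + "_" + MD5.hashMD5(s"$dir,$time,$user")

  def main(args: Array[String]): Unit = {
    // e.g. 20220415_<32-character md5>
    println(makeRowKey("/warehouse/db/table", "20220415", "hdfs"))
  }
}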