Spark (Scala): parse the HDFS fsimage and extract small-file information from it
Project requirements:
- Fetch the fsimage, parse it, and run aggregations on it
- Use Spark for parallel processing
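The fsimage itself is a binary file, so it has to be dumped to delimited text before Spark can read it. Assuming a standard Hadoop client is available, the Offline Image Viewer produces the format this job expects (the "^" delimiter matches the split("\\^") call in the code, and the header row it emits is the line containing NSQUOTA that the job filters out), e.g.: hdfs oiv -p Delimited -delimiter "^" -i <fsimage file> -o fsimage_001.csv, after which the output is uploaded to the HDFS path passed as the job's first argument.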
// pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent>
<artifactId>flink-test</artifactId>
<groupId>com.sm</groupId>
<version>0.0.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>com.sm</groupId>
<artifactId>02-flink-scala</artifactId>
<properties>
<flink-scala.version>1.6.2</flink-scala.version>
<flink-streaming-scala.version>1.6.2</flink-streaming-scala.version>
<flink-client.version>1.6.2</flink-client.version>
<scala.version>2.11</scala.version>
<spark-streaming.version>2.4.4</spark-streaming.version>
<logback-classic.version>1.2.3</logback-classic.version>
<scala-logging.version>3.9.2</scala-logging.version>
<spark-sql.version>2.4.4</spark-sql.version>
<spark-hive.version>2.4.4</spark-hive.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_${scala.version}</artifactId>
<version>${flink-scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.version}</artifactId>
<version>${flink-streaming-scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.version}</artifactId>
<version>${flink-client.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version}</artifactId>
<version>${spark-streaming.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark-sql.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.version}</artifactId>
<version>${spark-hive.version}</version>
</dependency>
<!--logger-->
<dependency>
<groupId>com.typesafe.scala-logging</groupId>
<artifactId>scala-logging_${scala.version}</artifactId>
<version>${scala-logging.version}</version>
</dependency>
</dependencies>
</project>
The code is as follows:
// OfflineFSImage.scala
package com.sm
import java.io.File
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import scala.collection.mutable.ListBuffer
/**
 * Parses an offline HDFS fsimage dump (delimited text) with Spark and aggregates,
 * per directory: file count, total size including replicas, small-file count,
 * modification/access time ranges, and whether the path belongs to a Hive table.
 *
 * @date 2020/1/13
 */
object OfflineFSImage {
  private[this] val appName: String = "OfflineFSImage"
  private[this] val warehouseLocation: String = new File("spark-warehouse").getAbsolutePath
  private[this] val hiveTableName: String = "hive表" // placeholder name; holds the metadata (paths) of all Hive tables
  private[this] val hiveTableNameFinalResult: String = "final_result"

  // Output schema: one row per directory with its aggregated file statistics.
  val fileImageSchema = StructType(
    Seq(
      StructField("cluster", StringType, true),
      StructField("path", StringType, true),
      StructField("user", StringType, true),
      StructField("sum", LongType, true),
      StructField("allsum", LongType, true),
      StructField("minmodificationtime", StringType, true),
      StructField("maxmodificationtime", StringType, true),
      StructField("minaccesstime", StringType, true),
      StructField("maxaccesstime", StringType, true),
      StructField("smallfilecount", IntegerType, true),
      StructField("allfilecount", IntegerType, true),
      StructField("ishive", StringType, true)
    )
  )
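  // For reference: the parsed fsimage rows below are addressed by column index.
  // The Delimited-processor layout this code assumes (also listed in the comment
  // inside splitPathByArray) is:
  //   0 Path, 1 Replication, 2 ModificationTime, 3 AccessTime, 4 PreferredBlockSize,
  //   5 BlocksCount, 6 FileSize, 7 NSQUOTA, 8 DSQUOTA, 9 Permission, 10 UserName, 11 GroupName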
  // Main entry point.
  def main(args: Array[String]): Unit = {
    var inputPath: String = ""
    if (args.length < 1) {
      println("No input argument given; using the default input path: hdfs://aaa:9000/user/cj0114/fsimage_001.csv")
      inputPath = "hdfs://aaa:9000/user/cj0114/fsimage_001.csv"
    } else {
      inputPath = args(0)
    }
    val startTime = System.currentTimeMillis()
    val sparkSession = SparkSession
      .builder()
      // .master("spark://bbb:7077")
      .appName(appName)
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate()
    val fsImageRDD: RDD[Array[String]] = sparkSession
      .sparkContext
      .textFile(inputPath)
      .filter(!_.contains("NSQUOTA")) // drop the fsimage header line
      .map(_.split("\\^"))
    // The fsimage dump contains some malformed rows (fewer than 12 columns).
    val errorMsg: RDD[Array[String]] = fsImageRDD.filter(_.length < 12)
    println("errorMsg.size: " + errorMsg.count())
    val rightMsg: RDD[Array[String]] = fsImageRDD.filter(_.length >= 12)
    println("rightMsg.size: " + rightMsg.count())
    // Load the Hive metadata table.
    val hiveTableFrame: DataFrame = sparkSession.sql("select * from " + hiveTableName)
    // Build a directory tree from the subpath column of the Hive table, then broadcast it.
    val subPathList: Array[String] = hiveTableFrame.rdd.map(row => {
      row.getString(4)
    }).collect()
    println("subPathList=" + subPathList.length)
    val node: TreeNode = DirectoryTree.addToTree(subPathList)
    // Broadcast the tree to all executors.
    val broadcastValue = sparkSession.sparkContext.broadcast(node)
    // File rows.
    val fileRDD: RDD[(String, Row)] = rightMsg
      .filter(!_(9).startsWith("d")) // keep files only: a directory's permission string starts with 'd'
      .flatMap(splitPathByArray(_, broadcastValue.value))
      .map(line => (line.getAs[String](1), line))
      .reduceByKey((x, y) => { // x is the running aggregate
        // Row layout: cluster, path, user, sum, total size (replicas included),
        // min/max modification time, min/max access time,
        // small-file count, total file count, is-Hive flag
        Row(x.getString(0),
          x.getString(1),
          (if (x.getString(2).equals(y.getString(2))) x.getString(2) else "multiUser"),
          x.getLong(3) + y.getLong(3),
          x.getLong(4) + y.getLong(4),
          (if (x.getString(5) > y.getString(5)) y.getString(5) else x.getString(5)),
          (if (x.getString(6) > y.getString(6)) x.getString(6) else y.getString(6)),
          (if (x.getString(7) > y.getString(7)) y.getString(7) else x.getString(7)),
          (if (x.getString(8) > y.getString(8)) x.getString(8) else y.getString(8)),
          x.getInt(9) + y.getInt(9),
          x.getInt(10) + y.getInt(10),
          x.getString(11))
      }, 30) // 30 result partitions
    // The data set is too large to collect(): pulling it all to the driver would exhaust driver memory.
    // .collect() // gather the data spread across the workers onto the driver
    // .map(_._2)
    // Directory rows.
    val directRDD: RDD[(String, Row)] = rightMsg
      .filter(_(9).startsWith("d")) // keep directories only: their permission string starts with 'd'
      .map(array => {
        val pathSelf: String = array(0) // Path
        // Check whether the directory belongs to a Hive table (via the broadcast tree).
        val isHiveTable = DirectoryTree.findInTree(pathSelf, broadcastValue.value)
        (pathSelf, Row("cluster",
          array(0),
          array(10),
          0L,
          0L, // a directory row contributes no size of its own
          array(2),
          array(2),
          array(3),
          array(3),
          -1, // small-file marker: -1 means "directory, not applicable"
          0,  // fixed 0, so the total file count is unaffected
          (if (isHiveTable) "yes" else "no"))) // is it a Hive table path
      })
    // If the output directory already exists, delete it:
    // val output: Path = new Path("hdfs://aaa:9000/user/2/offlineFSImageOutput")
    // val output: Path = new Path("hdfs://a/user/1/hdfs_fsimage/offlineFSImageOutput/")
    // val hdfs: FileSystem = org.apache.hadoop.fs.FileSystem.get(new URI("hdfs://aaa:9000"), new Configuration())
    // val hdfs: FileSystem = org.apache.hadoop.fs.FileSystem.get(new URI("hdfs://a"), new Configuration())
    // if (hdfs.exists(output)) {
    //   hdfs.delete(output, true)
    // }
    // fileRDD.saveAsTextFile("hdfs://aaa:9000/user/2/offlineFSImageOutput")
    // finalResult.saveAsTextFile("hdfs://a/user/4/hdfs_fsimage/offlineFSImageOutput/")
    // fileRDD and directRDD overlap (a directory can appear in both), so remove the overlap from directRDD.
    val distinctRDD: RDD[(String, Row)] = directRDD.subtractByKey(fileRDD)
    val finalRdd: RDD[Row] = fileRDD.union(distinctRDD).map(_._2)
    println("finalRdd=" + finalRdd.count())
    // 2020-03-23: write the result into the Hive table.
    val fileRDDDF1: DataFrame = sparkSession.createDataFrame(finalRdd, fileImageSchema)
    fileRDDDF1.createOrReplaceTempView("final_result_tmp")
    sparkSession.sql("truncate table " + hiveTableNameFinalResult)
    sparkSession.sql("insert into table " + hiveTableNameFinalResult + " select * from final_result_tmp")
    sparkSession.close()
    println("Elapsed time: " + (System.currentTimeMillis() - startTime) + " ms")
  }
  /**
   * Splits a file path into its ancestor directories (/a/b/c.txt => ['/a', '/a/b'])
   * and emits, per ancestor, a row carrying the file's size including replicas and
   * whether it counts as a small file.
   *
   * @param fileLine one parsed fsimage row
   * @param node     directory tree of Hive table paths (the broadcast value)
   * @return one aggregation row per ancestor directory
   */
  def splitPathByArray(fileLine: Array[String], node: TreeNode): ListBuffer[Row] = {
    val result = new ListBuffer[Row]
    if (fileLine == null || fileLine.length == 0) {
      println("Current row or path is empty") // TODO proper logging
      return result
    }
    var pathSelf: String = fileLine(0) // Path
    // Does the file belong to a Hive table?
    val isHiveTable = DirectoryTree.findInTree(pathSelf, node)
    var loopControl: Boolean = true
    var index: Int = -1
    var splitResult: String = ""
    while (loopControl) {
      index = pathSelf.lastIndexOf("/")
      if (index == 0 || index == -1) {
        loopControl = false
      } else {
        splitResult = pathSelf.substring(0, index)
        pathSelf = splitResult
        // Input columns: Path, Replication, ModificationTime, AccessTime, PreferredBlockSize,
        // BlocksCount, FileSize, NSQUOTA, DSQUOTA, Permission, UserName, GroupName
        //
        // Output columns: cluster, path, user, sum, total size (replicas included),
        // min/max modification time, min/max access time, small-file count, total file count, is-Hive flag
        result += Row("cluster",
          splitResult,
          fileLine(10),
          fileLine(6).toLong,
          fileLine(1).toInt * fileLine(6).toLong, // total size including replicas = Replication * FileSize
          fileLine(2),
          fileLine(2),
          fileLine(3),
          fileLine(3),
          (if (fileLine(6).toLong <= 1 * 1024 * 1024) 1 else 0), // small file: FileSize <= 1 MiB
          1, // fixed 1, so summing yields the total file count
          (if (isHiveTable) "yes" else "no") // is it a Hive table path
        )
      }
    }
    result
  }
}
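The dedup step deserves a note: a directory path can show up both in fileRDD (as the roll-up key of files beneath it) and in directRDD (as the fsimage's own directory row). subtractByKey keeps only the directory rows whose key never received a file aggregate, so the union then yields each path exactly once. Below is a minimal self-contained sketch of the same pattern on toy data, runnable locally; the object name and paths are made up for illustration:

// SubtractByKeyDemo.scala -- toy illustration, not part of the job
package com.sm

import org.apache.spark.sql.SparkSession

object SubtractByKeyDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SubtractByKeyDemo").getOrCreate()
    val sc = spark.sparkContext

    // Directory keys that received file aggregates:
    val fileAgg = sc.parallelize(Seq(("/a", "agg(a)"), ("/a/b", "agg(a/b)")))
    // All directory rows from the fsimage; "/empty" has no files beneath it:
    val dirRows = sc.parallelize(Seq(("/a", "dir(a)"), ("/a/b", "dir(a/b)"), ("/empty", "dir(empty)")))

    // Keep only directories that never appeared in fileAgg, then merge the disjoint sets:
    val result = fileAgg.union(dirRows.subtractByKey(fileAgg)).collect()
    result.foreach(println) // the two aggregates plus ("/empty","dir(empty)") -- each key exactly once
    spark.stop()
  }
}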
// DirectoryTree.scala
package com.sm
import scala.collection.mutable
import scala.collection.mutable.HashMap
/**
 * Prefix tree (trie) over directory paths, used to test whether a given path
 * falls under one of the registered Hive table directories.
 *
 * @author miracle
 * @date 2020/3/23
 */
object DirectoryTree {
  val treeNode: TreeNode = new TreeNode("/")

  /*
   * Look a directory path up in the tree.
   */
  def findInTree(directory: String, root: TreeNode): Boolean = {
    recursionfindInTree(directory.replaceFirst("/", ""), root.getNodeList)
  }

  def recursionfindInTree(directory: String, children: HashMap[String, TreeNode]): Boolean = {
    // Exhausting the query path, or reaching a leaf of the tree, counts as a match:
    // anything below a registered directory is treated as belonging to it.
    if (directory == null || "".compareTo(directory) == 0 || children.size == 0) {
      return true
    }
    var currentDirect: String = ""
    var lastDirect: String = ""
    val firstSeparator: Int = directory.indexOf("/")
    if (firstSeparator == -1) {
      currentDirect = directory
      lastDirect = null
    } else {
      currentDirect = directory.substring(0, firstSeparator)
      lastDirect = directory.substring(firstSeparator + 1)
    }
    val exists = children.get(currentDirect)
    if (exists.isDefined) {
      recursionfindInTree(lastDirect, exists.get.nodeList)
    } else {
      false
    }
  }

  // Renders the first level of the tree as a string (debugging aid).
  def println(): String = {
    val str: mutable.StringBuilder = new mutable.StringBuilder()
    treeNode.nodeList.foreach(str.append)
    str.mkString
  }
  /*
   * Build the tree from an array of directory paths.
   */
  def addToTree(list: Array[String]): TreeNode = {
    list.foreach(directory => {
      recursionAddToTree(directory.replaceFirst("/", ""), treeNode.getNodeList)
    })
    treeNode
  }

  def recursionAddToTree(directory: String, children: HashMap[String, TreeNode]): Unit = {
    if (directory == null || "".compareTo(directory) == 0) {
      return
    }
    var currentDirect: String = ""
    var lastDirect: String = ""
    val firstSeparator: Int = directory.indexOf("/")
    if (firstSeparator == -1) {
      currentDirect = directory
      lastDirect = null
    } else {
      currentDirect = directory.substring(0, firstSeparator)
      lastDirect = directory.substring(firstSeparator + 1)
    }
    val exists = children.get(currentDirect)
    if (exists.isDefined) {
      recursionAddToTree(lastDirect, exists.get.nodeList)
    } else {
      // Path component not present yet: create the node and keep descending.
      val newTreeNode: TreeNode = new TreeNode(currentDirect)
      children.put(currentDirect, newTreeNode)
      recursionAddToTree(lastDirect, newTreeNode.getNodeList)
    }
  }
}
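Note that findInTree uses prefix semantics: once the lookup reaches a leaf of the tree (children.size == 0), it returns true, so any path below a registered Hive table directory is treated as belonging to Hive. A small usage sketch (the object name and table paths are made up for illustration):

// DirectoryTreeDemo.scala -- usage sketch, not part of the job
package com.sm

object DirectoryTreeDemo {
  def main(args: Array[String]): Unit = {
    val root = DirectoryTree.addToTree(Array("/warehouse/db1/table1", "/warehouse/db2/table2"))
    // A file below a registered table directory matches (prefix semantics):
    println(DirectoryTree.findInTree("/warehouse/db1/table1/part-00000", root)) // true
    // A path that diverges from the tree does not:
    println(DirectoryTree.findInTree("/tmp/scratch/file", root)) // false
  }
}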
// TreeNode.scala
package com.sm
import scala.collection.mutable.HashMap
/**
 * A node of the directory tree: one path component plus a map of child nodes.
 * Serializable because the tree is shipped to executors via broadcast.
 *
 * @author miracle
 * @date 2020/3/23
 */
class TreeNode(urlParam: String) extends Serializable {
  var url: String = urlParam
  var nodeList: HashMap[String, TreeNode] = new HashMap[String, TreeNode]()

  def getUrl = url
  def getNodeList = nodeList
  def setUrl(value: String): Unit = url = value
  def setNodeList(value: HashMap[String, TreeNode]): Unit = nodeList = value
}