Raw data
Format: timestamp | json (nested)
1570384823445|{"cp_game_id":xxx,"category":"cp_api","event":{"event_time":"xxx","event_name":"money_flow"},"data":{"GAME_SERVER":"xxx","ROLE_ID":"xxx","ROLE_NAME":"xxx","ROLE":"","SCHOOL":"","COMBAT":0,"ROLE_RANK":xxx,"ROLE_VIP":xxx,"ITEM_ID":"","COUNT":0,"AFTER_COUNT":0,"MONEY_TYPE":"元宝","MONEY":xxx,"AFTER_MONEY":xxx,"ADD_OR_REDUCE":1,"REASON":"游历消耗元宝","SUB_REASON":""}}
Code
package com.sm.cleandata

import java.io.File
import java.util.Properties

import com.sm.conf.ConfigurationManager
import com.sm.constants.Constants
import com.sm.utils.DateUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructType}
import org.slf4j.LoggerFactory
/**
  * Created by LiuJinHe on 2019/10/8
  */
object LoadCpOdsToDwd {
  private val warehouseLocation: String = new File("spark-warehouse").getAbsolutePath
  private val logger = LoggerFactory.getLogger("LoadCpOdsToDwd")
  private var prop: Properties = new Properties()
  private var db: String = _
  private var table: String = _
  private var yesterday: String = _

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.spark_project.jetty").setLevel(Level.WARN)

    // Date of the data to pull, formatted like 2019-10-08; defaults to the previous day
    yesterday = DateUtils.getYesterdayDate
    if (args.length == 1) {
      yesterday = args(0)
    }

    // Initialize the SparkSession
    val spark = initSparkSession

    // Load the configuration
    prop = loadProp(Constants.HIVE_ODS)
    db = prop.getProperty("hive.ods.database")
    table = prop.getProperty("hive.ods.api.table")

    // Read the raw data for that day
    val dataFrame = spark.sql(s"select * from $db.$table where `date`='$yesterday'")
    /**
      * 1570384823445|
      * {
      *   "cp_game_id":xxx,
      *   "category":"cp_api",
      *   "event":{"event_time":"xxx","event_name":"xxx"},
      *   "data":{"GAME_SERVER":"xxx","ROLE_ID":"xxx","ROLE_NAME":"xxx","ROLE":"","SCHOOL":"","COMBAT":0,"ROLE_RANK":xxx,"ROLE_VIP":xxx,"ITEM_ID":"","COUNT":xxx,"AFTER_COUNT":xxx,"MONEY_TYPE":"元宝","MONEY":xxx,"AFTER_MONEY":xxx,"ADD_OR_REDUCE":1,"REASON":"xxx","SUB_REASON":""}
      * }
      */
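    // Print the schema of the raw ODS table; the output is shown in the comment below
    dataFrame.printSchema()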
    /**
      * root
      *  |-- line: string (nullable = true)
      *  |-- date: string (nullable = true)
      */
    // Import implicits and the Spark SQL functions
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // Split each line into the timestamp and the JSON payload
    val jsonDF = dataFrame.withColumn("line", split(col("line"), "\\|"))
      .select(
        col("line").getItem(0).alias("timeserver"),
        col("line").getItem(1).alias("json"),
        col("date")
      ).drop("line")
    // Parse the JSON, method one: get_json_object with JSON paths
    jsonDF.select($"timeserver",
      get_json_object($"json", "$.cp_game_id").as("cp_game_id"),
      get_json_object($"json", "$.category").as("category"),
      get_json_object($"json", "$.event.event_time").as("event_time"),
      get_json_object($"json", "$.event.event_name").as("event_name"),
      get_json_object($"json", "$.data").as("data"),
      $"date"
    ).show()
    /**
      * Result: only event is flattened here; data stays a JSON string and is parsed
      * in a second pass, once event_name tells us which payload layout to expect.
      * +-------------+----------+--------+-------------+----------+--------------------+----------+
      * |   timeserver|cp_game_id|category|   event_time|event_name|                data|      date|
      * +-------------+----------+--------+-------------+----------+--------------------+----------+
      * |1570464001730|       633|  cp_api|1570463997000|money_flow|{"GAME_SERVER":"5...|2019-10-08|
      * |1570464001730|       633|  cp_api|1570463998000|money_flow|{"GAME_SERVER":"5...|2019-10-08|
      * |1570464002082|       633|  cp_api|1570463997000| role_rank|{"GAME_SERVER":"5...|2019-10-08|
      * |1570464002082|       633|  cp_api|1570463999000| role_rank|{"GAME_SERVER":"5...|2019-10-08|
      * +-------------+----------+--------+-------------+----------+--------------------+----------+
      */
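    // Second stage for method one (a sketch, not in the original job): once event_name
    // is known, the nested data fields can be pulled with the same JSON paths. The
    // field names come from the sample money_flow record above.
    jsonDF
      .where(get_json_object($"json", "$.event.event_name") === "money_flow")
      .select(
        $"timeserver",
        get_json_object($"json", "$.data.MONEY_TYPE").as("money_type"),
        get_json_object($"json", "$.data.MONEY").as("money"),
        get_json_object($"json", "$.data.AFTER_MONEY").as("after_money"),
        get_json_object($"json", "$.data.REASON").as("reason"),
        $"date"
      ).show()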
    // Parse the JSON, method two: from_json with an explicit schema
    val jsonSchema = new StructType()
      .add("cp_game_id", StringType)
      .add("category", StringType)
      .add("event",
        new StructType()
          .add("event_time", StringType)
          .add("event_name", StringType))
      .add("data", StringType)

    jsonDF.select($"timeserver", from_json($"json", jsonSchema) as "jsons", $"date")
      .select(
        $"timeserver",
        $"jsons.cp_game_id",
        $"jsons.category",
        $"jsons.event.event_time",
        $"jsons.event.event_name",
        $"jsons.data",
        $"date"
      ).show()
    /**
      * Result: same as method one — only event is flattened, data stays a JSON
      * string to be parsed once event_name is known.
      * +-------------+----------+--------+-------------+----------+--------------------+----------+
      * |   timeserver|cp_game_id|category|   event_time|event_name|                data|      date|
      * +-------------+----------+--------+-------------+----------+--------------------+----------+
      * |1570464001730|       633|  cp_api|1570463997000|money_flow|{"GAME_SERVER":"5...|2019-10-08|
      * |1570464001730|       633|  cp_api|1570463998000|money_flow|{"GAME_SERVER":"5...|2019-10-08|
      * |1570464002082|       633|  cp_api|1570463997000| role_rank|{"GAME_SERVER":"5...|2019-10-08|
      * |1570464002082|       633|  cp_api|1570463999000| role_rank|{"GAME_SERVER":"5...|2019-10-08|
      * +-------------+----------+--------+-------------+----------+--------------------+----------+
      */
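    // Second stage for method two (a sketch, not in the original job): declare a schema
    // for the payload fields of interest and apply from_json to the data string. The
    // field names follow the sample money_flow record; everything is read as a string
    // for simplicity, which from_json tolerates even for numeric JSON values.
    val dataSchema = new StructType()
      .add("GAME_SERVER", StringType)
      .add("ROLE_ID", StringType)
      .add("MONEY_TYPE", StringType)
      .add("MONEY", StringType)
      .add("AFTER_MONEY", StringType)
      .add("ADD_OR_REDUCE", StringType)
      .add("REASON", StringType)
    jsonDF.select($"timeserver", from_json($"json", jsonSchema) as "jsons", $"date")
      .where($"jsons.event.event_name" === "money_flow")
      .select($"timeserver", from_json($"jsons.data", dataSchema) as "d", $"date")
      .select($"timeserver", $"d.MONEY_TYPE", $"d.MONEY", $"d.REASON", $"date")
      .show()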
    /* Parse the JSON, method three: pure SQL
      select
        tmp.cp_game_id,
        tmp.category,
        get_json_object(tmp.event, "$.event_name") event_name,
        get_json_object(tmp.event, "$.event_time") event_time,
        tmp.time_server,
        tmp.data,
        tmp.`date`
      from
      (
        select
          get_json_object(split(line, '\\|')[1], "$.cp_game_id") cp_game_id,
          get_json_object(split(line, '\\|')[1], "$.category") category,
          get_json_object(split(line, '\\|')[1], "$.event") event,
          split(line, '\\|')[0] time_server,
          get_json_object(split(line, '\\|')[1], "$.data") data,
          `date`
        from sm_data.ods_cp_api_log
      ) as tmp;
    */
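    // Writing to DWD (a sketch, not shown in the original job): the dynamic-partition
    // settings in initSparkSession point at a partitioned insert into Hive. The
    // target table name dwd_cp_api_log below is hypothetical.
    val parsedDF = jsonDF.select(
      $"timeserver",
      get_json_object($"json", "$.cp_game_id").as("cp_game_id"),
      get_json_object($"json", "$.category").as("category"),
      get_json_object($"json", "$.event.event_time").as("event_time"),
      get_json_object($"json", "$.event.event_name").as("event_name"),
      get_json_object($"json", "$.data").as("data"),
      $"date"
    )
    // insertInto matches columns by position, so `date` (the partition column) goes last
    // parsedDF.write.mode("overwrite").insertInto(s"$db.dwd_cp_api_log")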

    spark.stop()
  }

  // Load a properties file from the classpath; the parameter is a resource
  // path, not a database name, so it is renamed from `db` to avoid shadowing
  private def loadProp(path: String): Properties = {
    val props = new Properties()
    val in = ConfigurationManager.getClass.getClassLoader.getResourceAsStream(path)
    props.load(in)
    in.close() // don't leak the stream
    props
  }
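  // The properties file behind Constants.HIVE_ODS would look something like this
  // (values inferred from the SQL in method three; treat them as an assumption):
  //   hive.ods.database=sm_data
  //   hive.ods.api.table=ods_cp_api_log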

  def initSparkSession: SparkSession = SparkSession.builder()
    .appName(s"${this.getClass.getSimpleName}")
    .master(Constants.SPARK_LOCAL_MODE)
    .config("spark.sql.warehouse.dir", warehouseLocation)
    // Dynamic partitioning for partitioned inserts into Hive
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", 2000)
    // Kryo serialization with generous buffers
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer", "1024m")
    .config("spark.kryoserializer.buffer.max", "2046m")
    .config("spark.io.compression.codec", "snappy")
    // The three settings below are legacy Spark 1.x flags and are effectively
    // no-ops on Spark 2.x ("tungsten-sort" is just an alias of the sort manager)
    .config("spark.sql.codegen", "true")
    .config("spark.sql.unsafe.enabled", "true")
    .config("spark.shuffle.manager", "tungsten-sort")
    .enableHiveSupport()
    .getOrCreate()
}