package com.ctyun.dwi

import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import com.shujia.utils.Geography

object DwiResRegnMergelocationMskDay {
  def main(args: Array[String]): Unit = {
    // Build the merged-location table from the ODS-layer data
    val spark: SparkSession = SparkSession
      .builder()
      .appName("DwiResRegnMergelocationMskDay")
      .enableHiveSupport() // enable Hive support
      .getOrCreate()

    // Import implicit conversions and the SQL function library
    import spark.implicits._
    import org.apache.spark.sql.functions._

    /**
     * UDF for Spark SQL: given the longitude/latitude of two points,
     * compute the distance between them.
     */
    val calculateLength: UserDefinedFunction = udf((longi1: Double, lati1: Double, longi2: Double, lati2: Double) => {
      Geography.calculateLength(longi1, lati1, longi2, lati2)
    })

    // Read the OIDD data from the ODS layer in Hive
    val oidd: DataFrame = spark.table("ods.ods_oidd")

    // Previous-record window: partition by phone number (mdn), order by start time
    val w = Window.partitionBy($"mdn").orderBy($"start_t")

    // 1. Split the raw start_time field (one comma-separated string holding both timestamps) into two columns
    oidd
      // withColumn adds a column to the DataFrame
      .withColumn("start_t", split($"start_time", ",")(1)) // business start time
      .withColumn("end_t", split($"start_time", ",")(0)) // business end time
      // 2. Order by start time and pull the previous record's values forward with lag
      .withColumn("last_lg", lag($"longi", 1).over(w)) // previous record's longitude
      .withColumn("last_lat", lag($"lati", 1).over(w)) // previous record's latitude
      // 3. Distance between the current and previous positions
      .withColumn("distance", calculateLength($"longi", $"lati", $"last_lg", $"last_lat"))
      // Save the result to files
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("/daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/")
  }
  /**
   * 1. Package the code into a jar with Maven and upload it.
   * 2. If the Spark history server is running, grant the job user access to
   *    /user/spark/applicationHistory with an ACL:
   *    hdfs dfs -setfacl -R -m user:dwi:rwx /user/spark/applicationHistory
   * 3. Submit the job with spark-submit:
   *    spark-submit --master yarn --deploy-mode client --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar 20220527
   * 4. Check the size of the files in the output directory:
   *    hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
   */
}
[dwi@master jars]$ spark-submit --master local --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar
[dwi@master jars]$ hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
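Geography.calculateLength ships in common-1.0.jar and its source is not shown here. As a rough sketch of the assumed contract (a haversine great-circle distance in metres; the real helper may differ), it could look like this:

object GeographySketch {
  // Haversine distance in metres between (longi1, lati1) and (longi2, lati2).
  // Assumption: this mirrors Geography.calculateLength's argument order and unit.
  def calculateLength(longi1: Double, lati1: Double, longi2: Double, lati2: Double): Double = {
    val R = 6371000.0 // mean Earth radius in metres
    val dLat = math.toRadians(lati2 - lati1)
    val dLon = math.toRadians(longi2 - longi1)
    val a = math.pow(math.sin(dLat / 2), 2) +
      math.cos(math.toRadians(lati1)) * math.cos(math.toRadians(lati2)) * math.pow(math.sin(dLon / 2), 2)
    2 * R * math.asin(math.sqrt(a))
  }
}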
package com.ctyun.dwi

import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import com.shujia.utils.Geography

object DwiResRegnMergelocationMskDay {
  def main(args: Array[String]): Unit = {
    // Build the merged-location table from the ODS-layer data
    val spark: SparkSession = SparkSession
      .builder()
      .appName("DwiResRegnMergelocationMskDay")
      .enableHiveSupport() // enable Hive support
      .config("spark.sql.shuffle.partitions", "20") // 20 shuffle partitions instead of the default 200
      .getOrCreate()

    // Import implicit conversions and the SQL function library
    import spark.implicits._
    import org.apache.spark.sql.functions._

    /**
     * UDF for Spark SQL: given the longitude/latitude of two points,
     * compute the distance between them.
     */
    val calculateLength: UserDefinedFunction = udf((longi1: Double, lati1: Double, longi2: Double, lati2: Double) => {
      Geography.calculateLength(longi1, lati1, longi2, lati2)
    })

    // Read the OIDD data from the ODS layer in Hive
    val oidd: DataFrame = spark.table("ods.ods_oidd")

    // Previous-record window: partition by phone number (mdn), order by start time
    val w = Window.partitionBy($"mdn").orderBy($"start_t")

    // 1. Split the raw start_time field (one comma-separated string holding both timestamps) into two columns
    oidd
      // withColumn adds a column to the DataFrame
      .withColumn("start_t", split($"start_time", ",")(1)) // business start time
      .withColumn("end_t", split($"start_time", ",")(0)) // business end time
      // 2. Order by start time and pull the previous record's values forward with lag
      .withColumn("last_lg", lag($"longi", 1).over(w)) // previous record's longitude
      .withColumn("last_lat", lag($"lati", 1).over(w)) // previous record's latitude
      // 3. Distance between the current and previous positions
      .withColumn("distance", calculateLength($"longi", $"lati", $"last_lg", $"last_lat"))
      // Save the result to files
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("/daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/")
  }
  /**
   * 1. Package the code into a jar with Maven and upload it.
   * 2. If the Spark history server is running, grant the job user access to
   *    /user/spark/applicationHistory with an ACL:
   *    hdfs dfs -setfacl -R -m user:dwi:rwx /user/spark/applicationHistory
   * 3. Submit the job with spark-submit:
   *    spark-submit --master yarn --deploy-mode client --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar 20220527
   * 4. Check the size of the files in the output directory:
   *    hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
   */
}
[dwi@master jars]$ spark-submit --master local --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar
[dwi@master jars]$ hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
[dwi@master jars]$ sz part-00019-f049eef9-9d20-4118-8288-ce153d9c9d17-c000.csv
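The part-00019 suffix above is the direct effect of spark.sql.shuffle.partitions=20: the window operation shuffles the data into 20 partitions, and each partition is written as one part file (part-00000 through part-00019), instead of the 200 files the default setting would produce. A quick spark-shell sketch of the same effect (with adaptive query execution off, so Spark does not coalesce partitions):

spark.conf.set("spark.sql.shuffle.partitions", "20")
val shuffled = spark.range(1000).groupBy($"id" % 10).count() // any shuffle will do
println(shuffled.rdd.getNumPartitions) // 20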
package com.ctyun.dwi

import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import com.shujia.utils.Geography

object DwiResRegnMergelocationMskDay {
  def main(args: Array[String]): Unit = {
    // Build the merged-location table from the ODS-layer data
    val spark: SparkSession = SparkSession
      .builder()
      .appName("DwiResRegnMergelocationMskDay")
      .enableHiveSupport() // enable Hive support
      .config("spark.sql.shuffle.partitions", "20") // 20 shuffle partitions instead of the default 200
      .getOrCreate()

    // Import implicit conversions and the SQL function library
    import spark.implicits._
    import org.apache.spark.sql.functions._

    /**
     * UDF for Spark SQL: given the longitude/latitude of two points,
     * compute the distance between them.
     */
    val calculateLength: UserDefinedFunction = udf((longi1: Double, lati1: Double, longi2: Double, lati2: Double) => {
      Geography.calculateLength(longi1, lati1, longi2, lati2)
    })

    // Read the OIDD data from the ODS layer in Hive
    val oidd: DataFrame = spark.table("ods.ods_oidd")

    // Previous-record window: partition by phone number (mdn), order by start time
    val w = Window.partitionBy($"mdn").orderBy($"start_t")

    // 1. Split the raw start_time field (one comma-separated string holding both timestamps) into two columns
    oidd
      // withColumn adds a column to the DataFrame
      .withColumn("start_t", split($"start_time", ",")(1)) // business start time
      .withColumn("end_t", split($"start_time", ",")(0)) // business end time
      // 2. Order by start time and pull the previous record's values forward with lag
      .withColumn("last_lg", lag($"longi", 1).over(w)) // previous record's longitude
      .withColumn("last_lat", lag($"lati", 1).over(w)) // previous record's latitude
      .withColumn("last_end_time", lag($"end_t", 1).over(w)) // previous record's end time
      // 3. Seconds elapsed between the previous record's end and the current record's start
      .withColumn("diff_time", unix_timestamp($"start_t", "yyyyMMddHHmmss") - unix_timestamp($"last_end_time", "yyyyMMddHHmmss"))
      // 4. Distance between the two positions; the first record of each mdn has no
      //    predecessor (lag returns null), so fall back to a placeholder value
      .withColumn("distance", when($"last_lg".isNull, 1).otherwise(calculateLength($"longi", $"lati", $"last_lg", $"last_lat")))
      // 5. Speed = distance / elapsed seconds (m/s if calculateLength returns metres), rounded to three decimals
      .withColumn("speed", round($"distance" / $"diff_time", 3))
      // Save the result to files
      .write
      .format("csv")
      .option("sep", "\t")
      .mode(SaveMode.Overwrite)
      .save("/daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/")
  }
  /**
   * 1. Package the code into a jar with Maven and upload it.
   * 2. If the Spark history server is running, grant the job user access to
   *    /user/spark/applicationHistory with an ACL:
   *    hdfs dfs -setfacl -R -m user:dwi:rwx /user/spark/applicationHistory
   * 3. Submit the job with spark-submit:
   *    spark-submit --master yarn --deploy-mode client --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar 20220527
   * 4. Check the size of the files in the output directory:
   *    hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
   */
}
[dwi@master jars]$ spark-submit --master local[*] --class com.ctyun.dwi.DwiResRegnMergelocationMskDay --jars common-1.0.jar dwi-1.0.jar
[dwi@master jars]$ hdfs dfs -du -h /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/
[dwi@master jars]$ sz part-00019-f049eef9-9d20-4118-8288-ce153d9c9d17-c000.csv
[dwi@master jars]$ ls
common-1.0.jar
dwi-1.0.jar
part-00019-f049eef9-9d20-4118-8288-ce153d9c9d17-c000.csv
[dwi@master jars]$ rm part-00019-f049eef9-9d20-4118-8288-ce153d9c9d17-c000.csv
[dwi@master jars]$ ls
common-1.0.jar dwi-1.0.jar
[dwi@master jars]$ hdfs dfs -get /daas/motl/dwi/dwi_res_regn_mergelocation_msk_d/part-00019-2a560f0e-c4e2-46cc-97e2-7f604cfb0918-c000.csv
[dwi@master jars]$ ls
common-1.0.jar
dwi-1.0.jar
part-00019-2a560f0e-c4e2-46cc-97e2-7f604cfb0918-c000.csv
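To see the final version's logic end to end, here is a tiny run on hand-made rows in spark-shell (where spark.implicits._ is already in scope); the data is hypothetical but uses the same yyyyMMddHHmmss layout as above. Note how lag yields null for the first record of each mdn, which is exactly the case the when($"last_lg".isNull, ...) guard covers:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val demo = Seq(
  ("mdnA", "20220527100000", "20220527100500", 120.1, 30.1),
  ("mdnA", "20220527101000", "20220527101500", 120.2, 30.2)
).toDF("mdn", "start_t", "end_t", "longi", "lati")

val w = Window.partitionBy($"mdn").orderBy($"start_t")

demo
  .withColumn("last_end_time", lag($"end_t", 1).over(w))
  .withColumn("diff_time", unix_timestamp($"start_t", "yyyyMMddHHmmss") - unix_timestamp($"last_end_time", "yyyyMMddHHmmss"))
  .show(false)
// row 1: last_end_time = null, so diff_time = null
// row 2: diff_time = 300 (10:05:00 -> 10:10:00)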