Big-data export shell scripts, plus a PowerShell script for processing Excel
Shell scripts:
1 Get the data storage path (an OSS path in this case):
cat make-csv.sh
#!/bin/bash
#
set -x
shopt -s expand_aliases
alias hive2='/usr/lib/hive-current/bin/hive -hivevar bucket=oss://cdp-pri-lakehouse-emr-stg'
cat <<EOF
=== table.txt === eg:
dwc.dwc_fact_afs_bdc_oppty_follow_detail_link_t,20240220
dwc.dwc_fact_afs_dmo2_opportunities_follow_full_t,20240221
dws.dws_fact_afs_bdc_accident_vehicle_clue_t
dws.dws_fact_afs_bdc_renewal_policy_premium_t
EOF
[ -d ./myosspath ] || mkdir -p myosspath
osspathCsv=./myosspath/osspath-partition-$(date +%Y%m%d_%H:%M).csv
#tableName=$1
#partitionName=$2
IFS=","
while read -r tableName partitionName
do
    if [ -n "${tableName}" ];then
        if [ -n "${partitionName}" ];then
            partitionExistOrNot=$(hive2 -e "show partitions ${tableName}" | grep "${partitionName}")
            if [ -n "${partitionExistOrNot}" ];then
                partitionLocation=$(hive2 -e "DESCRIBE FORMATTED ${tableName} PARTITION(pday=${partitionName})" | grep Location | awk '{print $2}')
                partitionDirName=$(basename "${partitionLocation}")
                tableLocation=$(dirname "${partitionLocation}")
                echo "${tableLocation},${partitionDirName}" >> "${osspathCsv}"
            else
                echo "${tableName}[${partitionName}] === No" >> "${osspathCsv}"
            fi
        else
            tableLocation=$(hive2 -e "DESCRIBE FORMATTED ${tableName}" | grep Location | awk '{print $2}')
            echo "${tableLocation}" >> "${osspathCsv}"
        fi
    fi
done < table.txt
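A minimal usage sketch (the input rows are taken from the script's own banner; the output filename carries a run timestamp):
cat > table.txt <<'EOF'
dwc.dwc_fact_afs_bdc_oppty_follow_detail_link_t,20240220
dws.dws_fact_afs_bdc_accident_vehicle_clue_t
EOF
bash make-csv.sh
# partitioned rows yield "<table OSS dir>,pday=<value>"; non-partitioned rows yield just the table OSS dir
cat ./myosspath/osspath-partition-*.csv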
Note: the script above is meant for looking up storage paths in each environment separately. When source and destination live in the same environment, for example importing from the masked (tuomin) databases into the unmasked ones inside stg, the optimized version below resolves both sides in one pass:
cat make-csv-tuomin.sh
#!/bin/bash
#
set -x
shopt -s expand_aliases
alias hive2='/usr/lib/hive-current/bin/hive -hivevar bucket=oss://cdp-pri-lakehouse-emr-stg'
cat <<EOF
=== table_tuomin.txt === eg:
cdl_tuomin.cdl_dim_com_voc_ji_config_mn_map_t,cdl.cdl_dim_com_voc_ji_config_mn_map_t,20240301
dwc_tuomin.dwc_dim_com_dmx_user_t,dwc.dwc_dim_com_dmx_user_t,20240301
dwc_tuomin.dwc_fact_com_voc_ji_q_class_plus_t,dwc.dwc_fact_com_voc_ji_q_class_plus_t
EOF
[ -d ./myosspath ] || mkdir -p myosspath
osspathCsv=./myosspath/osspath-partition-tuomin-$(date +%Y%m%d_%H:%M).csv
#srcTableName=$1
#destTableName=$2
#partitionName=$3
IFS=","
while read -r srcTableName destTableName partitionName
do
    if [ -n "${srcTableName}" ];then # skip empty lines
        if [ -n "${partitionName}" ];then
            partitionExistOrNot=$(hive2 -e "show partitions ${srcTableName}" | grep "${partitionName}")
            if [ -n "${partitionExistOrNot}" ];then
                srcPartitionLocation=$(hive2 -e "DESCRIBE FORMATTED ${srcTableName} PARTITION(pday=${partitionName})" | grep Location | awk '{print $2}')
                partitionDirName=$(basename "${srcPartitionLocation}")
                srcTableLocation=$(dirname "${srcPartitionLocation}")
                destTableLocation=$(hive2 -e "DESCRIBE FORMATTED ${destTableName}" | grep Location | awk '{print $2}')
                echo "${srcTableLocation},${destTableLocation},${partitionDirName}" >> "${osspathCsv}"
            else
                echo "${srcTableName}[${partitionName}] === No" >> "${osspathCsv}"
            fi
        else
            srcTableLocation=$(hive2 -e "DESCRIBE FORMATTED ${srcTableName}" | grep Location | awk '{print $2}')
            destTableLocation=$(hive2 -e "DESCRIBE FORMATTED ${destTableName}" | grep Location | awk '{print $2}')
            echo "${srcTableLocation},${destTableLocation}" >> "${osspathCsv}"
        fi
    fi
done < table_tuomin.txt
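Each generated row holds the source table's OSS directory, the destination table's OSS directory, and (when the input row carried one) the partition directory name. A hypothetical row (paths illustrative):
oss://cdp-pri-lakehouse-emr-stg/dwc_tuomin/dwc_dim_com_dmx_user_t,oss://cdp-pri-lakehouse-emr-stg/dwc/dwc_dim_com_dmx_user_t,pday=20240301
This three-column format is exactly what the migration script in the next section consumes.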
2 Data migration:
cat export-data-from-orca-to-stg.sh
#!/bin/bash
#
set -x
shopt -s expand_aliases
alias hive2='/usr/lib/hive-current/bin/hive -hivevar bucket=oss://cdp-pri-lakehouse-emr-stg'
cat <<EOF
=== orca-prod-to-stg-osspath.csv === eg:
oss://prod/my_data_t,oss://stg/my_data_t,pday=xxxx
oss://prod/my_data_t,oss://stg/my_data_t,xxxx
oss://prod/my_data_t,oss://stg/my_data_t
EOF
mycsv=${1:-./orca-prod-to-stg-osspath.csv}
IFS=","
while read -r osspath_prod osspath_stg partition_name # partition_name is the directory the partition maps to
do
    prod_table_name=$(basename "${osspath_prod}")
    stg_table_name=$(basename "${osspath_stg}")
    stg_db_name=${stg_table_name%%_*}
    count=0
    if [ -z "${partition_name}" ];then
        files_list=$(ossutil -c .ossutilconfig-prod ls -s "${osspath_prod}/" | grep ^oss)
    else
        files_list=$(ossutil -c .ossutilconfig-prod ls -s "${osspath_prod}/${partition_name}/" | grep ^oss)
    fi
    total_count=$(echo "${files_list}" | grep -c ^oss)
    #cat <<EOF
    echo "${files_list}" | while read -r oss_file_path;do
        echo "=== Download data ==="
        ossutil -c .ossutilconfig-prod cp "${oss_file_path}" ./${prod_table_name}/${partition_name}/
        ((count++))
        echo "=== ${total_count} files in total, ${count} downloaded ==="
        ## After every 10 downloaded files, upload them in one batch and remove the local download directory
        if [ $((count % 10)) -eq 0 ];then
            echo "=== Upload data ==="
            hdfs dfs -put ./${prod_table_name}/* ${osspath_stg}/
            echo "=== Remove local data -- ./${prod_table_name} -- ==="
            [ -d "./${prod_table_name}" ] && rm -r ./${prod_table_name}
        fi
    done
    echo "=== Upload the last batch of files ==="
    [ -d "./${prod_table_name}" ] && hdfs dfs -put ./${prod_table_name}/* ${osspath_stg}/
    echo "=== Remove the last batch of local files ==="
    [ -d "./${prod_table_name}" ] && rm -r ./${prod_table_name}
    #EOF
    #cat <<EOF
    if [ -n "${partition_name}" ];then # only attach when the row actually carries a partition
        echo "=== Attach the partition to the table ==="
        hive2 -e """
        alter table ${stg_db_name}.${stg_table_name} drop if exists partition(pday=${partition_name#*=});
        alter table ${stg_db_name}.${stg_table_name} add partition(pday=${partition_name#*=}) location '${osspath_stg}/${partition_name}';
        """
    fi
    #EOF
done < "${mycsv}"
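A quick run sketch, assuming you have assembled a CSV in the three-column "prod path,stg path,partition dir" format shown in the banner (./my-paths.csv is a placeholder; with no argument the script falls back to ./orca-prod-to-stg-osspath.csv):
bash export-data-from-orca-to-stg.sh ./my-paths.csv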
Note: the .ossutilconfig-prod config file referenced above looks like this:
[Credentials]
language=CH
endpoint=oss-cn-beijing.aliyuncs.com
accessKeyID=LTAI5tJWfGroGgdXXXXXX
accessKeySecret=oXfZStPlgXr1TUwGbiXXXXXX
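Before kicking off a long migration it is worth sanity-checking the credentials with a plain listing (the bucket name below is a placeholder):
ossutil -c .ossutilconfig-prod ls -s oss://my-prod-bucket/
# any object URLs printed back confirm the AccessKey pair in the config works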
Spark export and batch-run scripts:
1 Export script
cat daoshu.sh
#!/bin/bash
#
if [ -z "$1" ];then
echo "Usage: $0 tableName"
echo "eg: $0 ads_rpt_bc4870_result_dr_t"
exit 1
fi
set -x
tableName=${1}
## queue: root.orca_bi, root.lakehouse_bi.others
## Note: no backslash line continuations inside the quoted string; the unquoted
## ${qarams} expansion below word-splits it into individual spark-shell arguments
qarams='
--driver-memory 10G
--num-executors 20
--executor-memory 16G
--executor-cores 2
--queue root.orca_bi
--conf spark.sql.hive.convertMetastoreParquet=false
--conf spark.sql.adaptive.skewJoin.enabled=true
--conf spark.sql.parquet.writeLegacyFormat=true
--conf spark.sql.debug.maxToStringFields=100
'
outputDir=oss://cdp-pri-orca01-emr-prod/orca01_dr_data/output/${tableName}_$(date +%Y%m%d%H%M)
cat<<EOF
#########
sql("select * from orca01_dr_data.${tableName}").coalesce(1).write.option("header", "true").csv("${outputDir}")
#########
EOF
spark-shell ${qarams} <<EOF
sql("select * from orca01_dr_data.${tableName}").coalesce(1).write.option("header", "true").csv("${outputDir}")
EOF
csvFilePath=$(ossutil -c .myossutilconfig ls -s "${outputDir}" | grep '\.csv$')
ossutil -c .myossutilconfig cp "${csvFilePath}" "${outputDir}/${tableName}.txt"
ossutil -c .myossutilconfig rm "${csvFilePath}"
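Invocation takes just the table name (the table below is the example from the script's own usage message); the final .txt lands next to the Spark output:
bash daoshu.sh ads_rpt_bc4870_result_dr_t
# -> oss://cdp-pri-orca01-emr-prod/orca01_dr_data/output/ads_rpt_bc4870_result_dr_t_<timestamp>/ads_rpt_bc4870_result_dr_t.txt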
2 Batch-run script
cat paoshu.sh
#!/bin/bash
#
if [ -z "$1" ] && [ -z "$2" ];then
echo "Usage: $0 pday fday [sql-script]"
echo "eg: $0 20240220 20240221 OR $0 20240220 20240221 my.sql"
exit 1
fi
set -x
pday=$1
fday=$2
sqlScript=$3
qarams='
--driver-memory 10G
--num-executors 20
--executor-memory 16G
--executor-cores 2
--queue root.datacenter_st78
--conf spark.sql.hive.convertMetastoreParquet=false
--conf spark.sql.adaptive.skewJoin.enabled=true
--conf spark.sql.parquet.writeLegacyFormat=true
'
if [ -z "$3" ];then
#cat <<EOF
spark-sql ${qarams} -d pday=${pday} -d fday=${fday}
#EOF
else
cat <<EOF
spark-sql ${qarams} -d pday=${pday} -d fday=${fday} -f ${sqlScript}
EOF
fi
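Both invocation styles from the usage message (my.sql is a placeholder for an HQL file that references the variables as ${pday} and ${fday}):
bash paoshu.sh 20240220 20240221          # interactive spark-sql with pday/fday predefined
bash paoshu.sh 20240220 20240221 my.sql   # run the given SQL file with the same variables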
PowerShell script:
param (
[Parameter(Mandatory=$true)]
[string]$excelPath
)
# Excel file path (the commented examples below are kept for reference)
#$excelPath = "D:\导数\手工表\dwc_fact_sal_ncs_os_funnel_mn_t 20240131全量.xlsx"
#$excelPath = "D:\导数\手工表\dwc_fact_sal_ncs_os_login_mn_t 20231231全量.xlsx"
$excelName = Split-Path -Leaf $excelPath
$excelNameWithoutExtension = [System.IO.Path]::GetFileNameWithoutExtension($excelName)
# Output text file path
$txtPath = "./$excelNameWithoutExtension.txt"
# Import the module required for reading Excel files
Import-Module ImportExcel -ErrorAction Stop
# Read the Excel file (row 1 is consumed as the header row, so only data rows are emitted) and export to txt
Import-Excel $excelPath -StartRow 1 | ForEach-Object {
    # Column names of the current row object
    $headers = $_.PSObject.Properties.Name
    # Collect the formatted values of the current row
    $rowData = @()
    # Walk every column name in the header
    foreach ($header in $headers) {
        # Read the cell for this column; fall back to an empty string when it is missing
        $cellValue = $_."$header"
        if ($null -eq $cellValue) {
            $cellValue = ""
        }
        # Append the value to $rowData
        $rowData += $cellValue
    }
    # Join $rowData with the \u0001 separator and append the line to the txt file
    Add-Content -Path $txtPath -Value ($rowData -join "`u{0001}") -Encoding UTF8
}
# Completion message
Write-Host "Conversion finished: 1. header row dropped; 2. fields joined with \u0001 rather than tabs; 3. thousands separators removed."
# ossutil executable and config file paths
$ossutilExePath = "D:\ossutil-v1.7.18-windows-amd64\ossutil64.exe"
$ossutilConfig = "D:\导数\ossutilconfig-prod-shougongbiao"
# Build the upload directory: split the base name on '_', dropping the first three and the last element
$parts = $excelNameWithoutExtension.Split('_')
$tableDir = $parts[3..($parts.Length - 2)] -join '_'
$pdayDir = "pday=" + $(Get-Date -Format 'yyyyMMdd')
$uploadDir = $tableDir + "/" + $pdayDir
# Upload the file
& $ossutilExePath -c $ossutilConfig cp $txtPath oss://e2ebi-sync/bi_to_cdp/manual_data_upload_to_cdp/$uploadDir/data.txt
# Report where the file landed
Write-Host "Uploaded the file to oss://e2ebi-sync/bi_to_cdp/manual_data_upload_to_cdp/$uploadDir/"
Note: the script above requires at least PowerShell 6 (the `u{} escape sequence does not exist in Windows PowerShell 5.1), and the ImportExcel module must be installed.
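A one-off setup sketch, assuming PowerShell 7 (pwsh) is on PATH; convert-and-upload.ps1 is a placeholder name for wherever you saved the script above:
# install the ImportExcel module for the current user, then run the conversion
pwsh -Command "Install-Module ImportExcel -Scope CurrentUser -Force"
pwsh ./convert-and-upload.ps1 -excelPath "D:\导数\手工表\my_manual_table.xlsx"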