Shell scripts:

1 Get the data storage path (an OSS path in this case):
cat make-csv.sh 

#!/bin/bash
#
set -x

shopt -s expand_aliases
alias hive2='/usr/lib/hive-current/bin/hive -hivevar bucket=oss://cdp-pri-lakehouse-emr-stg'

cat <<EOF
=== table.txt === eg:
dwc.dwc_fact_afs_bdc_oppty_follow_detail_link_t,20240220
dwc.dwc_fact_afs_dmo2_opportunities_follow_full_t,20240221
dws.dws_fact_afs_bdc_accident_vehicle_clue_t
dws.dws_fact_afs_bdc_renewal_policy_premium_t
EOF

[ -d ./myosspath ] || mkdir -p myosspath
osspathCsv=./myosspath/osspath-partition-$(date +%Y%m%d_%H:%M).csv

#tableName=$1
#partitionName=$2

IFS=","

while read -r tableName partitionName
do
  if [ -n "${tableName}" ];then # skip empty lines

    if [ -n "${partitionName}" ];then

      partitionExistOrNot=$(hive2 -e "show partitions ${tableName}" | grep "${partitionName}")

      if [ -n "${partitionExistOrNot}" ];then
        # a partition location looks like <table location>/pday=<partitionName>
        partitionLocation=$(hive2 -e "DESCRIBE FORMATTED ${tableName} PARTITION(pday=${partitionName})" | grep Location | awk '{print $2}')
        partitionDirName=$(basename ${partitionLocation})
        tableLocation=$(dirname ${partitionLocation})
        echo ${tableLocation},${partitionDirName} >> ${osspathCsv}
      else
        echo "${tableName}[${partitionName}] === No" >> ${osspathCsv}
      fi

    else
      tableLocation=$(hive2 -e "DESCRIBE FORMATTED ${tableName}" | grep Location | awk '{print $2}')
      echo ${tableLocation} >> ${osspathCsv}
    fi

  fi
done < table.txt
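
A minimal usage sketch, assuming table.txt sits next to the script and the node can reach the Hive metastore (the output line shown is illustrative):

bash make-csv.sh
# each successful line in the output CSV is "<table location>,<partition dir name>", eg:
# oss://cdp-pri-lakehouse-emr-stg/.../dwc.db/dwc_fact_afs_bdc_oppty_follow_detail_link_t,pday=20240220
cat ./myosspath/osspath-partition-*.csv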

Note: the script above is for fetching data storage paths across different environments, each fetched separately. If source and destination are in the same environment, for example importing from the masked (tuomin) database into the unmasked one within stg, the optimized version below can be used:

cat make-csv-tuomin.sh 
#!/bin/bash
#
set -x

shopt -s expand_aliases
alias hive2='/usr/lib/hive-current/bin/hive -hivevar bucket=oss://cdp-pri-lakehouse-emr-stg'

cat <<EOF
=== table-tuomin.txt === eg:
cdl_tuomin.cdl_dim_com_voc_ji_config_mn_map_t,cdl.cdl_dim_com_voc_ji_config_mn_map_t,20240301
dwc_tuomin.dwc_dim_com_dmx_user_t,dwc.dwc_dim_com_dmx_user_t,20240301
dwc_tuomin.dwc_fact_com_voc_ji_q_class_plus_t,dwc.dwc_fact_com_voc_ji_q_class_plus_t
EOF

[ -d ./myosspath ] || mkdir -p myosspath
osspathCsv=./myosspath/osspath-partition-tuomin-$(date +%Y%m%d_%H:%M).csv

#srcTableName=$1
#destTableName=$2
#partitionName=$3

IFS=","

while read -r srcTableName destTableName partitionName
do
  if [ -n "${srcTableName}" ];then # skip empty lines

    if [ -n "${partitionName}" ];then

      partitionExistOrNot=$(hive2 -e "show partitions ${srcTableName}" | grep "${partitionName}")

      if [ -n "${partitionExistOrNot}" ];then
        srcPartitionLocation=$(hive2 -e "DESCRIBE FORMATTED ${srcTableName} PARTITION(pday=${partitionName})" | grep Location | awk '{print $2}')
        partitionDirName=$(basename ${srcPartitionLocation})
        srcTableLocation=$(dirname ${srcPartitionLocation})
        destTableLocation=$(hive2 -e "DESCRIBE FORMATTED ${destTableName}" | grep Location | awk '{print $2}')
        echo ${srcTableLocation},${destTableLocation},${partitionDirName} >> ${osspathCsv}
      else
        echo "${srcTableName}[${partitionName}] === No" >> ${osspathCsv}
      fi

    else
      srcTableLocation=$(hive2 -e "DESCRIBE FORMATTED ${srcTableName}" | grep Location | awk '{print $2}')
      destTableLocation=$(hive2 -e "DESCRIBE FORMATTED ${destTableName}" | grep Location | awk '{print $2}')
      echo ${srcTableLocation},${destTableLocation} >> ${osspathCsv}
    fi

  fi
done < table-tuomin.txt
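
Usage mirrors make-csv.sh; a minimal sketch (the output line shown is illustrative):

bash make-csv-tuomin.sh
# each line is "<src table location>,<dest table location>[,<partition dir>]", eg:
# oss://.../dwc_tuomin.db/dwc_dim_com_dmx_user_t,oss://.../dwc.db/dwc_dim_com_dmx_user_t,pday=20240301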
2 Data migration:
cat export-data-from-orca-to-stg.sh 
#!/bin/bash
#
set -x
shopt -s expand_aliases
alias hive2='/usr/lib/hive-current/bin/hive -hivevar bucket=oss://cdp-pri-lakehouse-emr-stg'

cat <<EOF
=== orca-prod-to-stg-osspath.csv === eg:
  oss://prod/my_data_t,oss://stg/my_data_t,pday=xxxx
  oss://prod/my_data_t,oss://stg/my_data_t,xxxx
  oss://prod/my_data_t,oss://stg/my_data_t
EOF

mycsv=${1:-./orca-prod-to-stg-osspath.csv}

IFS=","

while read -r osspath_prod osspath_stg partition_name # partition_name is the directory name backing the partition
do
    prod_table_name=$(basename ${osspath_prod})
    stg_table_name=$(basename ${osspath_stg})
    stg_db_name=${stg_table_name%%_*}
    count=0
    if [ -z "${partition_name}" ];then
        files_list=$(ossutil -c .ossutilconfig-prod ls -s ${osspath_prod}/ | grep ^oss)
        total_count=$(ossutil -c .ossutilconfig-prod ls -s ${osspath_prod}/ | grep ^oss | wc -l)
    else
        files_list=$(ossutil -c .ossutilconfig-prod ls -s ${osspath_prod}/${partition_name}/ | grep ^oss)
        total_count=$(ossutil -c .ossutilconfig-prod ls -s ${osspath_prod}/${partition_name}/ | grep ^oss | wc -l)
    fi

    # quoting files_list keeps one file path per line for read
    echo "${files_list}" | while read -r oss_file_path;do
        echo "=== Download data ==="
        ossutil -c .ossutilconfig-prod cp ${oss_file_path} ./${prod_table_name}/${partition_name}/

        ((count++))
        echo "=== downloaded ${count} of ${total_count} files ==="

        ## after every 10 downloaded files, upload the batch and remove the local directory
        if [ $((count % 10)) -eq 0 ];then
                echo "=== Upload data ==="
                hdfs dfs -put ./${prod_table_name}/* ${osspath_stg}/

                echo "=== Remove local data -- ./${prod_table_name} -- ==="
                [ -d "./${prod_table_name}" ] && rm -r ./${prod_table_name}
        fi
    done

    echo "=== upload the last batch of files ==="
    [ -d "./${prod_table_name}" ] && hdfs dfs -put ./${prod_table_name}/* ${osspath_stg}/

    echo "=== remove the last batch of files ==="
    [ -d "./${prod_table_name}" ] && rm -r ./${prod_table_name}

    if [ -n "${partition_name}" ];then
        echo "=== attach the partition to the table ==="
        hive2 -e "
alter table ${stg_db_name}.${stg_table_name} drop if exists partition(pday=${partition_name#*=});
alter table ${stg_db_name}.${stg_table_name} add partition(pday=${partition_name#*=}) location '${osspath_stg}/${partition_name}';
"
    fi
done < ${mycsv}

Note: the .ossutilconfig-prod config file referenced above looks like this:
[Credentials]
language=CH
endpoint=oss-cn-beijing.aliyuncs.com
accessKeyID=LTAI5tJWfGroGgdXXXXXX
accessKeySecret=oXfZStPlgXr1TUwGbiXXXXXX
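
With the config file in place, a run of the migration script might look like this (a sketch; both ossutil and hdfs must be available on the stg node, and the CSV is the one produced in step 1):

# default input is ./orca-prod-to-stg-osspath.csv; a different CSV can be passed as $1
bash export-data-from-orca-to-stg.sh ./orca-prod-to-stg-osspath.csv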

Spark data-export and job-run scripts:

1 Data-export script
cat daoshu.sh 
#!/bin/bash
#
if [ -z "$1" ];then
    echo "Usage: $0 tableName"
    echo "eg: $0 ads_rpt_bc4870_result_dr_t"
    exit 1
fi

set -x

tableName=${1}

## queue: root.orca_bi, root.lakehouse_bi.others
params='
--driver-memory 10G
--num-executors 20
--executor-memory 16G
--executor-cores 2
--queue root.orca_bi
--conf spark.sql.hive.convertMetastoreParquet=false
--conf spark.sql.adaptive.skewJoin.enabled=true
--conf spark.sql.parquet.writeLegacyFormat=true
--conf spark.sql.debug.maxToStringFields=100
'

outputDir=oss://cdp-pri-orca01-emr-prod/orca01_dr_data/output/${tableName}_$(date +%Y%m%d%H%M)

# print the statement that is about to run
cat <<EOF
#########

sql("select * from orca01_dr_data.${tableName}").coalesce(1).write.option("header", "true").csv("${outputDir}")

#########
EOF

# ${params} is expanded unquoted on purpose so each flag becomes its own argument
spark-shell ${params} <<EOF
sql("select * from orca01_dr_data.${tableName}").coalesce(1).write.option("header", "true").csv("${outputDir}")
EOF

csvFilePath=$(ossutil -c .myossutilconfig ls -s ${outputDir} | grep '\.csv$')

ossutil -c .myossutilconfig cp ${csvFilePath} ${outputDir}/${tableName}.txt
ossutil -c .myossutilconfig rm ${csvFilePath}
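
A minimal usage sketch, reusing the table name from the script's own usage message:

# exports orca01_dr_data.ads_rpt_bc4870_result_dr_t to a single CSV on OSS,
# then renames it to ads_rpt_bc4870_result_dr_t.txt in the same output directory
bash daoshu.sh ads_rpt_bc4870_result_dr_t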
2 Job-run script
cat paoshu.sh
#!/bin/bash
#
if [ -z "$1" ] || [ -z "$2" ];then
    echo "Usage: $0 pday fday [sql-script]"
    echo "eg: $0 20240220 20240221 OR $0 20240220 20240221 my.sql"
    exit 1
fi

set -x

pday=$1
fday=$2
sqlScript=$3

params='
--driver-memory 10G
--num-executors 20
--executor-memory 16G
--executor-cores 2
--queue root.datacenter_st78
--conf spark.sql.hive.convertMetastoreParquet=false
--conf spark.sql.adaptive.skewJoin.enabled=true
--conf spark.sql.parquet.writeLegacyFormat=true
'

if [ -z "$3" ];then
    # no SQL file given: open an interactive spark-sql session with pday/fday defined
    spark-sql ${params} -d pday=${pday} -d fday=${fday}
else
    # run the given SQL file with pday/fday defined
    spark-sql ${params} -d pday=${pday} -d fday=${fday} -f ${sqlScript}
fi
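
Usage, following the script's own usage message (inside my.sql the variables defined with -d are typically referenced as ${pday} and ${fday}):

# interactive spark-sql session with pday/fday pre-defined
bash paoshu.sh 20240220 20240221
# run a SQL file with the same variables
bash paoshu.sh 20240220 20240221 my.sql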

PowerShell script:

param (
    [Parameter(Mandatory=$true)]
    [string]$excelPath
)

# Excel file path, eg:
#$excelPath = "D:\导数\手工表\dwc_fact_sal_ncs_os_funnel_mn_t 20240131全量.xlsx"
#$excelPath = "D:\导数\手工表\dwc_fact_sal_ncs_os_login_mn_t 20231231全量.xlsx"

$excelName = Split-Path -Leaf $excelPath
$excelNameWithoutExtension = [System.IO.Path]::GetFileNameWithoutExtension($excelName)

# output text file path
$txtPath = "./$excelNameWithoutExtension.txt"

# import the module required for reading Excel files
Import-Module ImportExcel -ErrorAction Stop

# Read the Excel file. Row 1 is treated as the header row, so only data rows
# are exported to the txt file
Import-Excel $excelPath -StartRow 1 | ForEach-Object {
    # collect the column names from the row object's properties
    $headers = $_.PSObject.Properties.Name

    # array holding the formatted values of the current row
    $rowData = @()

    # iterate over every column name
    foreach ($header in $headers) {
        # read the cell value for this column; fall back to an empty string if null
        $cellValue = $_."$header"
        if ($null -eq $cellValue) {
            $cellValue = ""
        }
        # append the value to $rowData
        $rowData += $cellValue
    }

    # join the row with \u0001 as the field separator and append it to the txt file
    Add-Content -Path $txtPath -Value ($rowData -join "`u{0001}") -Encoding UTF8
}

# completion message
Write-Host "Data converted to text: 1. header row removed; 2. fields separated with \u0001; 3. thousands separators stripped."

# ossutil binary and config file paths
$ossutilExePath = "D:\ossutil-v1.7.18-windows-amd64\ossutil64.exe"
$ossutilConfig =  "D:\导数\ossutilconfig-prod-shougongbiao"

# Build the upload directory from the file name: split on '_' and drop the
# first three elements and the last one
$parts = $excelNameWithoutExtension.Split('_')
$tableDir = $parts[3..($parts.Length - 2)] -join '_'
$pdayDir = "pday=" + $(Get-Date -Format 'yyyyMMdd')
$uploadDir = $tableDir + "/" + $pdayDir

# upload the file
& $ossutilExePath -c $ossutilConfig  cp $txtPath  oss://e2ebi-sync/bi_to_cdp/manual_data_upload_to_cdp/$uploadDir/data.txt

# completion message
Write-Host "Uploaded the file to oss://e2ebi-sync/bi_to_cdp/manual_data_upload_to_cdp/$uploadDir/"

Note: the script above must run on PowerShell 6 or later, and the ImportExcel module needs to be installed.
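
A minimal invocation from a shell, assuming the script is saved as convert-excel.ps1 (the file name is hypothetical; the xlsx path reuses the example from the script's comments):

pwsh -File ./convert-excel.ps1 -excelPath "D:\导数\手工表\dwc_fact_sal_ncs_os_funnel_mn_t 20240131全量.xlsx"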