Setting the queue
set hive.execution.engine=mr;
SET mapreduce.job.queuename=root.users.liuwenchao;
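For reference, the queue property is engine-specific: mapreduce.job.queuename applies when the engine is mr, as above, while on the Tez engine the equivalent property is tez.queue.name. A minimal sketch:
-- MR engine, as above
set hive.execution.engine=mr;
set mapreduce.job.queuename=root.users.liuwenchao;
-- Tez equivalent, for comparison
-- set hive.execution.engine=tez;
-- set tez.queue.name=root.users.liuwenchao;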
Configuration example
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.parallel=true;
set hive.auto.convert.join=true;
set hive.mapjoin.smalltable.filesize=25000000;
set mapreduce.map.memory.mb=3048;
set mapreduce.reduce.memory.mb=3048;
set yarn.nodemanager.vmem-pmem-ratio=3;
-- note: yarn.nodemanager.vmem-pmem-ratio is read by the NodeManager at startup and normally cannot be overridden per job
set hive.auto.convert.join=false; -- disable map join (the default is true), because the following error was thrown:
Caused by: java.lang.IllegalStateException: ManagerFactory [org.apache.logging.log4j.core.appender.RandomAccessFileManager$RandomAccessFileManagerFactory@7d710482] unable to create manager for [/var/log/hive/operation_logs/1fd2540e-80cf-4553-893c-7a9a819fae3a/dev_20210126104003_b541ec4d-44df-49f6-8609-a5179e460734] with data [org.apache.logging.log4j.core.appender.RandomAccessFileManager$FactoryData@7261773c]
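If map joins are still wanted for particular queries after turning automatic conversion off, the explicit MAPJOIN hint can be used instead once hint processing is enabled; a minimal sketch, with fact_t and dim_t as illustrative table names (not from the original):
set hive.auto.convert.join=false; -- automatic conversion stays off
set hive.ignore.mapjoin.hint=false; -- honor explicit MAPJOIN hints
select /*+ MAPJOIN(d) */ f.id, d.name
from fact_t f
join dim_t d on f.id = d.id;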
Parameter tuning
Small-file merge parameters
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=128000000;
set hive.merge.smallfiles.avgsize=128000000;
set mapred.max.split.size=128000000;
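These parameters only act on jobs that run after they are set, so an existing table full of small files can be compacted by rewriting it in place, which triggers the merge stage. A sketch, with some_db.some_table as a placeholder name:
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=128000000;
set hive.merge.smallfiles.avgsize=128000000;
-- rewriting the table in place lets the merge step produce ~128 MB files
insert overwrite table some_db.some_table
select * from some_db.some_table;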
Example
beeline -u 'jdbc:hive2://node03-bigdata-prod-bj1.ybm100.top:10000/;principal=hive/node03-bigdata-prod-bj1.ybm100.top@YBM100.COM' -e "
set hive.exec.parallel=true;
set mapreduce.map.memory.mb=2048;
set mapreduce.reduce.memory.mb=2048;
set hive.merge.mapfiles=true;
-- merge small files at the end of map-only tasks
set hive.merge.mapredfiles=true;
-- merge small files at the end of map-reduce tasks
set hive.merge.size.per.task=128000000;
-- target merged file size of about 128 MB
set hive.merge.smallfiles.avgsize=128000000;
-- when the average output file size is below 128 MB, start a separate map-reduce job to merge the files
select
pref
,standard_library_id -- standard library id
from xyy_bigdata_ods.saas_xyy_saas_product_saas_product_baseinfo
where data_is_del = 0 or data_is_del is null -- 1 means physically deleted; 0 and NULL mean not physically deleted
group by pref,standard_library_id
) t3
on t1.product_code = t3.pref;"
Passing parameters into SQL from a shell script
Method 1: when the SQL file contains special characters
Execute through a shell script; the SQL file is as follows:
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.parallel=true;
set hive.auto.convert.join = true;
set hive.mapjoin.smalltable.filesize=25000000;
set mapreduce.map.memory.mb=3072;
set mapreduce.reduce.memory.mb=2048;
drop table xyy_temp_data.tmp_b_event_detail_2;
create table xyy_temp_data.tmp_b_event_detail_2
as
select user_id,
dt,
action_type,
property_name,
property_value,
a.event_id
from
(
select user_id,
event_id,
dt,
action_type
from xyy_snowground.b_event_2
where dt='${day}'
)a
join
(
select event_id,
property_name,
property_value
from xyy_snowground.b_eventdetail_2
where dt='${day}'
)b
on a.event_id=b.event_id;
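For reference: a value passed with -hivevar day=... can be referenced either as ${day}, as in this file, or with the explicit namespace ${hivevar:day}; both substitute the same value. A minimal sketch:
select count(*) from xyy_snowground.b_event_2 where dt = '${day}';
-- equivalent, explicitly namespaced form
select count(*) from xyy_snowground.b_event_2 where dt = '${hivevar:day}';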
--- Landing table for PC visitor counts and average daily visitors by hour
CREATE TABLE if not exists xyy_app_data.gdd_pc_visitor_platform(
create_time string comment 'date',
hour_time string comment 'hour of day',
erp_org_code string comment 'ERP branch code',
provice_name string comment 'branch name',
branch_code string comment 'EC branch code',
product_code string comment 'ERP Shennong product code',
barcode string comment 'EC product code',
user_id string comment 'visitor ID'
)
comment 'Gongduoduo benefits platform - PC visitors'
partitioned by (dt string )
stored as parquet;
insert overwrite table xyy_app_data.gdd_pc_visitor_platform partition (dt ='${day}')
select
t1.create_time -- date
,t1.hour_time -- hour of day
,t2.erp_org_code -- ERP branch code
,t2.branch_name as provice_name -- branch name
,t1.branch_code -- EC branch code
,t1.product_code -- ERP Shennong product code
,t1.barcode -- EC product code
,t1.user_id -- visitor ID
from xyy_test.tmp_gdd_pc_visitor_platform t1
left join (select
branch_code -- EC branch code
,erp_org_code -- ERP branch code
,branch_name -- branch name
from xyy_bigdata_dim.dim_branch_ss
WHERE dt = date_add(current_date(), -1) and erp_org_code is not null
AND branch_code NOT IN ( 'XS777777', 'XS666666')
group by
branch_code
,erp_org_code
,branch_name
) t2
on t1.branch_code=t2.branch_code ;
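The group by in the t2 subquery deduplicates dim_branch_ss before the left join so that a single branch_code cannot fan out rows from t1. An equivalent dedup that keeps exactly one row per branch_code could use row_number(); a sketch, not part of the original job:
select branch_code, erp_org_code, branch_name
from (
select branch_code, erp_org_code, branch_name,
row_number() over (partition by branch_code order by erp_org_code) as rn
from xyy_bigdata_dim.dim_branch_ss
where dt = date_add(current_date(), -1) and erp_org_code is not null
and branch_code not in ('XS777777', 'XS666666')
) x
where rn = 1;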
The SQL above is saved as gdd_exposure_flow.sql. The contents of gdd_exposure_flow.sh are as follows:
< dev@node04-bigdata-prod-bj1:~/azkaban/app_dag/gdd/right_sys >$ more gdd_exposure_flow.sh
#! /bin/sh
if [ $# -eq 1 ]; then
day=$1
else
day=`date -d '1 day ago' +%Y%m%d`
fi
echo $day
beeline -u 'jdbc:hive2://node03-bigdata-prod-bj1.ybm100.top:10000/;principal=hive/node03-bigdata-prod-bj1.ybm100.top@YBM100.COM' -hivevar day=$day -f /home/dev/azkaban/app_dag/gdd/right_sys/gdd_exposure_flow.sql
Run the script with: sh gdd_exposure_flow.sh
Method 2: when the SQL contains special characters, put the shell and the SQL in one file
Because the SQL sits inside the double-quoted -e string, the shell expands $day directly (note the '"$day"' pattern in the partition clause below), so -hivevar is not needed.
< dev@node04-bigdata-prod-bj1:~/azkaban/app_dag/gdd >$ more gdd_ec_order_product_direction_flow_detail.sh
#! /bin/sh
if [ $# -eq 1 ]; then
day=$1
else
day=`date -d '1 day ago' +%Y-%m-%d`
fi
echo $day
beeline -u 'jdbc:hive2://node03-bigdata-prod-bj1.ybm100.top:10000/;principal=hive/node03-bigdata-prod-bj1.ybm100.top@YBM100.COM' -e "
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.parallel=true;
set hive.auto.convert.join = true;
set hive.mapjoin.smalltable.filesize=25000000;
set mapreduce.map.memory.mb=3072;
-- order history log table
CREATE TABLE if not exists xyy_app_data.gdd_ec_tb_order_log
(
order_no string comment 'order number',
status int comment 'status',
update_time1 string comment 'order transition time',
update_time string comment 'order transition time 2',
system_time string comment 'system time'
)
partitioned by (dt string )
stored as parquet;
insert overwrite table xyy_app_data.gdd_ec_tb_order_log partition (dt='"$day"')
select
tt.order_no -- order number
,tt.status -- status
,tt.update_time1 -- order transition time
,tt.update_time -- order transition time 2
,current_date() as system_time -- system time
from
(
select
ty.order_no,
ty.status,
ty.update_time1,
ty.update_time,
-- row_number()over(partition by ty.order_no,ty.update_time1 order by ty.update_time desc) num
row_number()over(partition by ty.order_no order by ty.update_time desc,ty.status desc ) num
from (
select
order_no,
status,
update_time,
substr(update_time,1,10) as update_time1
from xyy_bigdata_ods.ec_tb_order_operate_log
)ty
) tt
where tt.num = 1 and tt.status in (1,2,3,7)
group by
tt.order_no
,tt.status
,tt.update_time1
,tt.update_time ;"
The script is saved as gdd_ec_tb_order_log.sh. Run it with: sh gdd_ec_tb_order_log.sh
Building an incremental table from a snapshot table
--- Compute incremental data from the snapshot table: the increment for May 3. invalidate metadata xyy_test.incr_order_detail_partitions
-- id is the primary key of each partition snapshot; an MD5 hash over all columns detects whether a row changed, so increments are identified by primary key plus MD5
insert overwrite table xyy_test.incr_order_detail_partitions partition (dt = '2020-05-03')
select
m1.id ----- id key
,m1.create_time ----- order time
,m1.order_no ----- order number
,m1.branch_code ----- region code
,m1.variety_num ----- number of varieties
,m1.money ----- amount actually paid
,m1.status ----- order status
,m1.barcode ----- product code
,m1.product_price ----- unit price
,m1.product_amount ----- product quantity
,m1.md5_flag ----- md5 flag
from
(
select
id ----- id key
,create_time ----- order time
,order_no ----- order number
,branch_code ----- region code
,variety_num ----- number of varieties
,money ----- amount actually paid
,status ----- order status
,barcode ----- product code
,product_price ----- unit price
,product_amount ----- product quantity
,md5(concat(id,create_time,order_no,branch_code,variety_num,money,status,barcode,product_price,product_amount)) as md5_flag
from xyy_test.order_detail_partitions
where dt ='2020-05-03' ---- snapshot data for May 3
) m1
left join
(
select
id ----- id key
,create_time ----- order time
,order_no ----- order number
,branch_code ----- region code
,variety_num ----- number of varieties
,money ----- amount actually paid
,status ----- order status
,barcode ----- product code
,product_price ----- unit price
,product_amount ----- product quantity
,md5(concat(id,create_time,order_no,branch_code,variety_num,money,status,barcode,product_price,product_amount)) as md5_flag
from xyy_test.order_detail_partitions
where dt ='2020-05-02' --- snapshot data for May 2
) m2
on m1.id = m2.id
where m1.md5_flag != m2.md5_flag or m2.id is null;
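One caveat with this md5 comparison: concat() returns NULL as soon as any column is NULL, which silently makes md5_flag NULL and defeats the change detection. A common safeguard (a sketch, not part of the original script) is concat_ws over null-protected, explicitly cast columns:
select
id
,md5(concat_ws('|'
,cast(id as string)
,nvl(create_time,''),nvl(order_no,''),nvl(branch_code,'')
,nvl(cast(variety_num as string),''),nvl(cast(money as string),'')
,nvl(cast(status as string),''),nvl(barcode,'')
,nvl(cast(product_price as string),''),nvl(cast(product_amount as string),''))) as md5_flag
from xyy_test.order_detail_partitions
where dt = '2020-05-03';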
Building a zipper (history chain) table from the incremental table
---- zipper table
drop table xyy_test.d_order_detail_partitions;
CREATE TABLE xyy_test.d_order_detail_partitions(
id bigint comment 'id key',
create_time string comment 'order time',
order_no string comment 'order number',
branch_code string comment 'region code',
variety_num int comment 'number of varieties',
money double comment 'amount actually paid',
status int comment 'order status',
barcode string comment 'product code',
product_price double comment 'unit price',
product_amount int comment 'product quantity',
start_time string comment 'chain start time',
end_time string comment 'chain end time',
load_time string comment 'data load time'
)
comment 'zipper (history chain) table'
stored as parquet;
--- initialize the data
insert into xyy_test.d_order_detail_partitions --- the zipper table
select
t1.id ----- id key
,t1.create_time ----- order time
,t1.order_no ----- order number
,t1.branch_code ----- region code
,t1.variety_num ----- number of varieties
,t1.money ----- amount actually paid
,t1.status ----- order status
,t1.barcode ----- product code
,t1.product_price ----- unit price
,t1.product_amount ----- product quantity
,'1598-01-01' as start_time ----- start time
,'9999-12-31' as end_time ----- end time
,'2020-05-02' as load_time ----- data load time
--from xyy_test.incr_order_detail_partitions t1
from xyy_test.order_detail_partitions t1
where dt ='2020-05-02';
--- invalidate metadata xyy_test.d_order_detail_partitions
--- rebuild the chain; overwrite rather than append, since the first branch below re-emits every existing row
insert overwrite table xyy_test.d_order_detail_partitions
select
m1.id ----- id key
,m1.create_time ----- order time
,m1.order_no ----- order number
,m1.branch_code ----- region code
,m1.variety_num ----- number of varieties
,m1.money ----- amount actually paid
,m1.status ----- order status
,m1.barcode ----- product code
,m1.product_price ----- unit price
,m1.product_amount ----- product quantity
,m1.start_time
,m1.end_time
,m1.load_time
from (
--- rows matched against the increment: close out their chain window
select
t1.id ----- id key
,t1.create_time ----- order time
,t1.order_no ----- order number
,t1.branch_code ----- region code
,t1.variety_num ----- number of varieties
,t1.money ----- amount actually paid
,t1.status ----- order status
,t1.barcode ----- product code
,t1.product_price ----- unit price
,t1.product_amount ----- product quantity
,t1.start_time ----- start time: matched rows keep their previous start time
,case when t2.id is not null and t1.end_time > '2020-05-03' then '2020-05-02' else t1.end_time end as end_time ----- end time: matched rows get their chain closed
,t1.load_time ----- matched rows keep their previous load time
from xyy_test.d_order_detail_partitions t1 -- the zipper table
left join
(
select * from xyy_test.incr_order_detail_partitions --- the incremental table
where dt ='2020-05-03'
) t2
on t1.id = t2.id
union all
--- newly arrived incremental rows
select
t1.id ----- id key
,t1.create_time ----- order time
,t1.order_no ----- order number
,t1.branch_code ----- region code
,t1.variety_num ----- number of varieties
,t1.money ----- amount actually paid
,t1.status ----- order status
,t1.barcode ----- product code
,t1.product_price ----- unit price
,t1.product_amount ----- product quantity
,'2020-05-03' as start_time ----- start time for new rows: the closed end time + 1 day
,'9999-12-31' as end_time ----- end time: the default open-ended value
,'2020-05-03' as load_time ----- load time: the system date minus 1 day
from xyy_test.incr_order_detail_partitions t1 ----- the incremental table
where dt ='2020-05-03'
) m1;
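Once the chain is maintained this way, the table can be queried "as of" any date by filtering on the validity window; a minimal usage sketch:
-- state of every order as of 2020-05-03
select *
from xyy_test.d_order_detail_partitions
where start_time <= '2020-05-03'
and end_time >= '2020-05-03';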
Example: full join
create table xyy_temp_data.data_20200615_temp
(
id int,
provice string,
amount double
);
create table xyy_temp_data.data_20200615_2temp
(
id int,
provice string,
amount double
);
insert into xyy_temp_data.data_20200615_temp values (1,'北京',200),(2,'河北',300),(3,'河南',900);
insert into xyy_temp_data.data_20200615_2temp values (4,'黑龙江',1000),(5,'湖北',680),(3,'河南',2000);
select t1.id,t1.provice,t1.amount ,t2.id,t2.provice,t2.amount as sale_amount
from xyy_temp_data.data_20200615_temp t1
full join
xyy_temp_data.data_20200615_2temp t2
on t1.id = t2.id;
--- Result:
+--------+-------------+------------+--------+-------------+--------------+--+
| t1.id | t1.provice | t1.amount | t2.id | t2.provice | sale_amount |
+--------+-------------+------------+--------+-------------+--------------+--+
| 1 | 北京 | 200.0 | NULL | NULL | NULL |
| 2 | 河北 | 300.0 | NULL | NULL | NULL |
| 3 | 河南 | 900.0 | 3 | 河南 | 2000.0 |
| NULL | NULL | NULL | 4 | 黑龙江 | 1000.0 |
| NULL | NULL | NULL | 5 | 湖北 | 680.0 |
+--------+-------------+------------+--------+-------------+--------------+--+
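A common follow-up to a full join is to collapse the two key columns into one, since each side is NULL where it had no match; coalesce() keeps whichever side is present. A sketch on the same temp tables:
select coalesce(t1.id, t2.id) as id
,coalesce(t1.provice, t2.provice) as provice
,nvl(t1.amount, 0) + nvl(t2.amount, 0) as total_amount
from xyy_temp_data.data_20200615_temp t1
full join xyy_temp_data.data_20200615_2temp t2
on t1.id = t2.id;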
Launch 10 processes at once to backfill partitioned data, 10 partitions per batch
#!/bin/bash
set -e
yesterday=`date -d "-1 day" +"%Y-%m-%d"`
execdate=$yesterday
#beeline -u 'jdbc:hive2://node07-bigdata-prod-bj1.ybm100.top:10000/;principal=hive/node07-bigdata-prod-bj1.ybm100.top@YBM100.COM' --hivevar execdate=$execdate -f /home/dev/azkaban/dws_sh/work_by_tkyin/sql/dws_b2b_inventory_turnover_info_ss.sql
count=0
for (( i=0; i<=365; i++ ))
do
execdate=`date -d "$yesterday -$i day" +"%Y-%m-%d"`
echo $execdate
nohup hive --hivevar execdate=$execdate -f /home/dev/tkyin/a.sql > /home/dev/tkyin/log/$execdate.log 2>&1 &
count=$(($count+1))
flag=$(($count%10))
# after every 10 background jobs, pause so roughly 10 partitions run at a time
if [ $flag -eq 0 ]; then
sleep 60
fi
done