1.使用multi-table-insert写法对union all进行优化
(1)原SQL(目标是往一个新表user_new里,插入最大值和最小值的数据)
insert into table user_new
select sex, max(age) as stat, 'max' as class
from user
group by sex
union all
select sex, min(age) as stat, 'min' as class
from user
group by sex;
(2)优化SQL(完成相同的目标,使用multi-table-insert写法)
from user
insert into table user_new
select sex, max(age) as stat, 'max' as class
group by sex
insert into table user_new
select sex, min(age) as stat, 'min' as class
group by sex;
2.使用grouping sets替代union all进行优化
(1)原SQL(目标是进行2个group by,但后一个的分组字段是前者的子集)
select * from
(
select sex, age, count(1) as user_num
from user
group by sex, age
union all
select sex, null as age, count(1) as user_num
from user
group by sex
) a;
select sex, age, count(1)
from user
group by sex, age
grouping sets((sex),(sex,age));
3.使用子查询对count(distinct)可能造成的数据倾斜进行优化
(1)原SQL(统计每种性别有多少种不同年龄)
select sex, count(distinct age) as age_num
from sex
group by sex;
(2)优化SQL(完成相同的目标,使用子查询避免数据出现倾斜而导致性能问题)
select sex, count(1) as age_num
from (
select sex, age, count(1) as num
group by sex, age
) a;
4.使用left join替代not in完成取A表中没有但B表中有的数据
(1)原SQL(查找订单日期不在2020-10-23日的用户的user_id)
select user_id
from user
where order_id not in(
select order_id from order
where order_date != '2020-10-23'
);
(2)优化SQL(完成相同的目标,用left join去实现,避免not in不走索引带来的问题)
select a.user_id
from user a
left join
order b
on a.order_id = b.order_id and b.order_date != '2020-10-23'
where b.user_id is null;
总结
- 在有数据倾斜的情况下,第一种写法的SQL方式更优。
- 当数据大到一定的量级时,第一种写法的SQL有两个作业,可以把处理逻辑分散到两个阶段中,即第一个阶段先处理一部分数据,缩小数据量,第二个阶段在已经缩小的数据集上继续处理。
- 而第二种写法的SQL,经过Map阶段处理的数据还非常多时,所有的数据却都需要交给一个Reduce节点去处理,就好比千军万马过独木桥一样,不仅无法利用到分布式集群的优势,还要浪费大量时间在等待,而这个等待的时间远比第一种写法的SQL多个MapReduce所延长的流程导致额外花费的时间还多。
- 在Hive3.0中即使遇到数据倾斜,第二种写法的SQL将hive.optimize.countdistinct设置为true,则整个写法也能达到第一种写法的SQL的效果。
开窗函数
select name,orderdate,cost,
sum(cost) over() as sample1,--所有行相加
sum(cost) over(partition by name) as sample2,--按name分组,组内数据相加
sum(cost) over(partition by name order by orderdate) as sample3,--按name分组,组内数据累加
sum(cost) over(partition by name order by orderdate rows between UNBOUNDED PRECEDING and current row ) as sample4 ,--和sample3一样,由起点到当前行的聚合
sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING and current row) as sample5, --当前行和前面一行做聚合
sum(cost) over(partition by name order by orderdate rows between 1 PRECEDING AND 1 FOLLOWING ) as sample6,--当前行和前边一行及后面一行
sum(cost) over(partition by name order by orderdate rows between current row and UNBOUNDED FOLLOWING ) as sample7 --当前行及后面所有行
from t_window;