1.常用函数
from_unixtime(time_stamp, 'yyyy-MM-dd hh:mm:ss')
unix_timestamp(time_string)
to_date(time_string)
datediff(end_date,start_date)
date_add(string start_date,int days)
date_sub(string start_date,int days)
if(condition, true_result, false_result)
substr(field, start, length)
get_json_object(json_string, '$.key_name')
map_field('key') e.g. people_info: {"":"","":""} people_info('phone_brand')
coalesce(expression_1, expression_2,...) 常和full join连用,用以合并两个表两列的非空值,如果expression_1为null,则返回expression_2,以此类推
2.示例
有user_info,user_refund,user_trade三张表,分别是用户信息、用户退款、和用户交易表。用户信息表中包含用户性别、所在城市、用户名、以及保存其它信息的两个json字段(分别为map格式与string)。用户交易表包含用户名、交易额、交易时间。用户退款表包含用户名、退款金额、退款日期。
对函数的应用
例1
用户Alice 2018年的平均支付金额以及2018年最大支付日期和最小支付日期的间隔
select avg(pay_amount) as avg_amount,
datediff(
max(
from_unixtime(pay_time,'yyyy-MM-dd')
),
min(
from_unixtime(pay_time,'yyyy-MM-dd')
)
)
from user_trade
where user_name = 'Alice'
and year(dt) = '2018'
例2
2018年购买的商品品类在两个以上的用户数
select count(temp.user_name)
from (
select user_name, count(distinct goods_category) as num
from user_trade
where year(dt) = '2018'
group by user_name
having count(distinct goods_category) > 2
) temp
例3
激活时间在2018年,年龄段20-30岁和30-40岁的用户婚姻状况分布
select temp.age_type,
if(temp.marriage_status=1, '已婚', '未婚'),
count(distinct temp.user_id)
from
(
select case when age < 20 then '20岁以下'
case when age >=20 and age < 30 then '20-30岁'
case when age >=30 and age < 40 then '30-40岁'
else '40岁以上' end as age_type,
get_json_object(extra1, '$.marriage_status') as marriage_status,
user_id
from user_info
where to_date(firstactivetime) between '2018-01-01' and '2018-12-31'
) temp
连表查询示例
例1
找出在表1不在表2的用户
--hive sql in不能嵌套子查询
select a.user_id, a.user_name
from user_list_1 a
left join user_list_2 b on a.user_id = b.user_id
where b.user_id is null
例2
追加表2信息到表1
--字段名和字段顺序必须全都一致
--union 连接时会将前后去重且排序,但速度更慢,建议使用union all在内部去哄
select user_id,
user_name,
from user_list_1
union all
select user_id,
user_name,
from user_list_2
例3
2019年每个用户的支付和退款金额汇总
--解法1,使用union all
select temp.user_name,
sum(temp.pay_amount),
sum(temp.refund_amount)
from
(
select user_name,
sum(pay_amount),
0 as refund_amount -- union all需要列名一致,顺序统一,所以以0占位,不影响求和的计算结果
from user_trade
where year(dt) = 2019
group by user_name
union all
select user_name,
0 as pay_amount,
sum(refund_amount)
from user_refund
where year(dt) = 2019
group by user_name
) temp
group by temp.user_name
--解法2,上述问题以full join实现,但是较之union all效率要差很多
--用户可能只在一张表中出现,用coalese可以确保呈现出所有用户的数据
select coalesce(a.user_name, b.user_name),
if(a.pay_amount is null, 0, a.pay_amount),
if(b.refund_amoutn is null, 0, b.refund_amount)
from
(
select user_name,
sum(pay_amount) as pay_amount
from user_trade
where year(dt) = 2019
group by user_name
) a full join
(
select user_name,
sum(refund_amount) as refund_amount
from user_refund
where year(dt) = 2019
group by user_name
) b on a.user_name = b.user_name
例4
首次激活时间在2017年,但是一直没有支付的用户年龄段分布
select a.age_level,
count(a.user_name)
from
(
select user_name,
case when age < 20 then '20岁以下'
when age >=20 and age<30 then '20-30岁'
when age >=30 and age<40 then '30-40岁'
else '40岁以上' end as age_level
from user_info
where year(firstactivetime) = 2017
) a
left join
(
select distinct user_name
from user_trade
where year(dt) >= 2017
) b on a.user_name = b.user_name
where b.user_name is null
group by a.age_level
例5
2018年、2019年交易的用户,其激活时间的分布
select hour(firstactivetime),
count(a.user_name)
(
select distinct user_name
from user_trade
where year(dt) between 2018 and 2019
) a
left join
(
select user_name,
firstactivetime
from user_info
) b on a.user_name = b.user_name
group by hour(firstactivetime)
窗口函数
例1
2018年、2019年每个月的支付总额和截至当月的本年度累计支付总额
select a.year,
a.month,
a.pay_amount,
sum(a.pay_amount) over(partition by a.year order by a.month)
from
(
select year(dt) year
month(dt) month,
sum(pay_amount) as pay_amount
from user_trade
where year(dt) in (2018, 2019)
group by year(dt)
month(dt)
) a
例2
2018年每个月的近三个月移动平均支付金额
select a.month,
a.pay_amount,
--x following表示到当前行下面几行
avg(a.pay_amount) over(order by a.month rows between 2 preceding and current row)
from
(
select month(dt),
sum(pay_amount) as pay_amount
from user_trade
where year(dt) = 2018
group by month(dt)
) a
例3
2019年1月,用户购买商品品类数量排名
select user_name,
count(distinct goods_category),
row_number() over(order by count(distinct goods_category))
from user_trade
where substr(dt,1,7) = '2019-01'
group by user_name
例4
选出2019年支付金额排名在第10,20,30名的用户
select a.user_name
a.rank_num
from
(
select user_name,
sum(pay_amount),
rank() over(order by sum(pay_amount) desc) as rank_num
from user_trade
where year(dt) = 2019
group by user_name
) a
where a.rank_num in (10,20,30)
例5
将2019年1月的支付用户,按照支付金额分成5组
--ntile(n) over()
select user_name,
sum(pay_amount) pay_amount,
ntile(5) over(order by sum(pay_amount) desc) as level
from user_trade
where substr(dt,1,7) = '2019-01'
group by user_name
例6
偏移分析窗口函数 lag/lead(expression, offset, default)
lag/lead指的是当前行处于lag/lead的状态,即前者是向前偏移,后者是向后推
支付时间间隔超过100天的用户数
select count(distinct user_name)
from
(
select user_name,
dt,
lead(dt) over(partition by user_name order by dt) next_dt
from user_trade
where dt>'0'
) a
where a.lead_dt is not null and datediff(a.next_dt, a.dt) > 100
例7
每个城市、不同性别、2018年支付金额最高的TOP3用户
--4.筛选分组排序的结果
select uit.user_name,
uit.city,
uit.sex,
uit.pay_amount,
uit,ranking
(
select ut.user_name,
ui.city,
ui.sex,
ut.pay_amount,
--3.有了支付信息和用户信息就开始分组排序
row_number() over(partition by ui.city, ui.sex order by ut.pay_amount desc) as ranking
from
(
--21取出2018年每个人的支付总额
select user_name,
sum(pay_amount) pay_amount
from user_trade
where year(dt) = '2018'
group by user_name
) ut
--2.既然要按照城市和性别分组,就需要与用户信息连接
left join user_info ui on ut.user_name = ui.user_name
) uit
where uit.ranking < 4
例8
每个手机品牌退款金额前25%的用户
select *
from
(
select ur.user_name,
ui.extra2('phonebrand') as phonebrand,
ur.refund_amount,
ntile(4) over(partition by ui.extra('phonebrand') order by ur.refund_amount) as tile
from
(
select user_name,
sum(refund_amount) as refund_amount
from user_refund
where dt > '0'
group by user_name
) ur left join user_info ui
on ur.user_name = ui.user_name
) uri
where uri.tile = 1
3.建议
- 查询的筛选条件中必须包含分区字段,因为Hive中表的数据量很大,通常会以某个字段为基准进行分区,避免每次要筛选整张表。
- ORDER BY在select之后执行,所以必须使用重命名后的列名(如果有的话)。执行顺序:FROM - WHERE - GROUP BY - HAVING - SELECT - ORDER BY
- 聚合函数不可嵌套
- 先去重,再做表连接,尽量缩小数据集,好的sql语句不在意长短,而在于执行效率