编号函数 对窗口中的数据进行编号 一般编号有排序
RANK() 排序相同时会重复,总数不会变
DENSE_RANK() 排序相同时会重复,总数会减少
ROW_NUMBER() 会根据顺序计算编号 不关注排序字段的值
1 示例
1.1 数据
孙悟空 语文 87
孙悟空 数学 95
孙悟空 英语 68
娜娜 语文 94
娜娜 数学 56
娜娜 英语 84
宋宋 语文 64
宋宋 数学 86
宋宋 英语 84
婷婷 语文 65
婷婷 数学 85
婷婷 英语 78
1.2 建表导入数据
create table if not exists tb_stu(
name string ,
subject string ,
score double
)
row format delimited fields terminated by "\t" ;
load data local inpath "/hive/data/stu.txt" into table tb_stu ;
0: jdbc:hive2://linux01:10000> select * from tb_stu ;
OK
+--------------+-----------------+---------------+
| tb_stu.name | tb_stu.subject | tb_stu.score |
+--------------+-----------------+---------------+
| 孙悟空 | 语文 | 87.0 |
| 孙悟空 | 数学 | 95.0 |
| 孙悟空 | 英语 | 68.0 |
| 娜娜 | 语文 | 94.0 |
| 娜娜 | 数学 | 56.0 |
| 娜娜 | 英语 | 84.0 |
| 宋宋 | 语文 | 64.0 |
| 宋宋 | 数学 | 86.0 |
| 宋宋 | 英语 | 84.0 |
| 婷婷 | 语文 | 65.0 |
| 婷婷 | 数学 | 85.0 |
| 婷婷 | 英语 | 78.0 |
+--------------+-----------------+---------------+
1.3 rank()
会判断排序字段的值 ,值相同标号一致 ,总数量不会变化
select
subject ,
name ,
score ,
rank() over(partition by subject order by score desc)
from
tb_stu ;
+----------+-------+--------+----------------+
| subject | name | score | rank_window_0 |
+----------+-------+--------+----------------+
| 数学 | 孙悟空 | 95.0 | 1 |
| 数学 | 宋宋 | 86.0 | 2 |
| 数学 | 婷婷 | 85.0 | 3 |
| 数学 | 娜娜 | 56.0 | 4 |
| 英语 | 宋宋 | 84.0 | 1 |
| 英语 | 娜娜 | 84.0 | 1 |
| 英语 | 婷婷 | 78.0 | 3 |
| 英语 | 孙悟空 | 68.0 | 4 |
| 语文 | 娜娜 | 94.0 | 1 |
| 语文 | 孙悟空 | 87.0 | 2 |
| 语文 | 婷婷 | 65.0 | 3 |
| 语文 | 宋宋 | 64.0 | 4 |
+----------+-------+--------+----------------+
1.4 dense_rank()
select
subject ,
name ,
score ,
rank() over(partition by subject order by score desc) ,
dense_rank() over(partition by subject order by score desc)
from
tb_stu ;
+----------+-------+--------+----------------+----------------------+
| subject | name | score | rank_window_0 | dense_rank_window_1 |
+----------+-------+--------+----------------+----------------------+
| 数学 | 孙悟空 | 95.0 | 1 | 1 |
| 数学 | 宋宋 | 86.0 | 2 | 2 |
| 数学 | 婷婷 | 85.0 | 3 | 3 |
| 数学 | 娜娜 | 56.0 | 4 | 4 |
| 英语 | 宋宋 | 84.0 | 1 | 1 |
| 英语 | 娜娜 | 84.0 | 1 | 1 |
| 英语 | 婷婷 | 78.0 | 3 | 2 |
| 英语 | 孙悟空 | 68.0 | 4 | 3 |
| 语文 | 娜娜 | 94.0 | 1 | 1 |
| 语文 | 孙悟空 | 87.0 | 2 | 2 |
| 语文 | 婷婷 | 65.0 | 3 | 3 |
| 语文 | 宋宋 | 64.0 | 4 | 4 |
+----------+-------+--------+----------------+----------------------+
1.5 row_number()
全局所有的数据编号
select
* ,
row_number() over()
from
tb_stu ;
+--------------+-----------------+---------------+----------------------+
| tb_stu.name | tb_stu.subject | tb_stu.score | row_number_window_0 |
+--------------+-----------------+---------------+----------------------+
| 婷婷 | 英语 | 78.0 | 1 |
| 婷婷 | 数学 | 85.0 | 2 |
| 婷婷 | 语文 | 65.0 | 3 |
| 宋宋 | 英语 | 84.0 | 4 |
| 宋宋 | 数学 | 86.0 | 5 |
| 宋宋 | 语文 | 64.0 | 6 |
| 娜娜 | 英语 | 84.0 | 7 |
| 娜娜 | 数学 | 56.0 | 8 |
| 娜娜 | 语文 | 94.0 | 9 |
| 孙悟空 | 英语 | 68.0 | 10 |
| 孙悟空 | 数学 | 95.0 | 11 |
| 孙悟空 | 语文 | 87.0 | 12 |
+--------------+-----------------+---------------+----------------------+
分区内标号
select
* ,
row_number() over(partition by subject order by score)
from
tb_stu ;
+--------------+-----------------+---------------+----------------------+
| tb_stu.name | tb_stu.subject | tb_stu.score | row_number_window_0 |
+--------------+-----------------+---------------+----------------------+
| 婷婷 | 数学 | 85.0 | 1 |
| 宋宋 | 数学 | 86.0 | 2 |
| 娜娜 | 数学 | 56.0 | 3 |
| 孙悟空 | 数学 | 95.0 | 4 |
| 婷婷 | 英语 | 78.0 | 1 |
| 宋宋 | 英语 | 84.0 | 2 |
| 娜娜 | 英语 | 84.0 | 3 |
| 孙悟空 | 英语 | 68.0 | 4 |
| 婷婷 | 语文 | 65.0 | 1 |
| 宋宋 | 语文 | 64.0 | 2 |
| 娜娜 | 语文 | 94.0 | 3 |
| 孙悟空 | 语文 | 87.0 | 4 |
+--------------+-----------------+---------------+----------------------+
2 练习一 连续n天销售记录的店铺案例
2.1 数据
a,2017-02-05,200
a,2017-02-06,300
a,2017-02-07,200
a,2017-02-08,400
a,2017-02-10,600
b,2017-02-05,200
b,2017-02-06,300
b,2017-02-08,200
b,2017-02-09,400
b,2017-02-10,600
c,2017-01-31,200
c,2017-02-01,300
c,2017-02-02,200
c,2017-02-03,400
c,2017-02-10,600
a,2017-03-01,200
a,2017-03-02,300
a,2017-03-03,200
a,2017-03-04,400
a,2017-03-05,600
2.2 建表导入数据
create table tb_shop (
name string ,
dt string ,
money double
)
row format delimited fields terminated by "," ;
load data local inpath "/hive/data/shop.txt" into table tb_shop ;
+---------------+-------------+----------------+
| tb_shop.name | tb_shop.dt | tb_shop.money |
+---------------+-------------+----------------+
| a | 2017-02-05 | 200.0 |
| a | 2017-02-06 | 300.0 |
| a | 2017-02-07 | 200.0 |
| a | 2017-02-08 | 400.0 |
| a | 2017-02-10 | 600.0 |
| b | 2017-02-05 | 200.0 |
| b | 2017-02-06 | 300.0 |
| b | 2017-02-08 | 200.0 |
| b | 2017-02-09 | 400.0 |
| b | 2017-02-10 | 600.0 |
| c | 2017-01-31 | 200.0 |
| c | 2017-02-01 | 300.0 |
| c | 2017-02-02 | 200.0 |
| c | 2017-02-03 | 400.0 |
| c | 2017-02-10 | 600.0 |
| a | 2017-03-01 | 200.0 |
| a | 2017-03-02 | 300.0 |
| a | 2017-03-03 | 200.0 |
| a | 2017-03-04 | 400.0 |
| a | 2017-03-05 | 600.0 |
+---------------+-------------+----------------+\
2.3 SQL实现
1) 店铺分组 按照日期排序
select
* ,
row_number() over(partition by name order by dt) n
from
tb_shop ;
+---------------+-------------+----------------+-----+
| tb_shop.name | tb_shop.dt | tb_shop.money | n | diff
+---------------+-------------+----------------+-----+
| a | 2017-02-05 | 200.0 | 1 |2017-02-04
| a | 2017-02-06 | 300.0 | 2 |2017-02-04
| a | 2017-02-07 | 200.0 | 3 |2017-02-04
| a | 2017-02-08 | 400.0 | 4 |2017-02-04
| a | 2017-02-10 | 600.0 | 5 |2017-02-05
| a | 2017-03-01 | 200.0 | 6 |
| a | 2017-03-02 | 300.0 | 7 |
| a | 2017-03-03 | 200.0 | 8 |
| a | 2017-03-04 | 400.0 | 9 |
| a | 2017-03-05 | 600.0 | 10 |
| b | 2017-02-05 | 200.0 | 1 |
| b | 2017-02-06 | 300.0 | 2 |
| b | 2017-02-08 | 200.0 | 3 |
| b | 2017-02-09 | 400.0 | 4 |
| b | 2017-02-10 | 600.0 | 5 |
| c | 2017-01-31 | 200.0 | 1 |
| c | 2017-02-01 | 300.0 | 2 |
| c | 2017-02-02 | 200.0 | 3 |
| c | 2017-02-03 | 400.0 | 4 |
| c | 2017-02-10 | 600.0 | 5 |
+---------------+-------------+----------------+-----+
2) 日期和编号求差值
0: jdbc:hive2://linux01:10000> select date_sub('2020-06-21' , 1) ;
OK
+-------------+
| _c0 |
+-------------+
| 2020-06-20 |
+-------------+
1 row selected (0.169 seconds)
0: jdbc:hive2://linux01:10000> select date_sub('2020-06-21' , 2) ;
OK
+-------------+
| _c0 |
+-------------+
| 2020-06-19 |
+-------------+
select
name ,
dt ,
money ,
n ,
date_sub(dt , n) diff
from
(select
* ,
row_number() over(partition by name order by dt) n
from
tb_shop) t ;
+-------+-------------+--------+-----+-------------+
| name | dt | money | n | diff |
+-------+-------------+--------+-----+-------------+
| a | 2017-02-05 | 200.0 | 1 | 2017-02-04 |
| a | 2017-02-06 | 300.0 | 2 | 2017-02-04 |
| a | 2017-02-07 | 200.0 | 3 | 2017-02-04 |
| a | 2017-02-08 | 400.0 | 4 | 2017-02-04 |
| a | 2017-02-10 | 600.0 | 5 | 2017-02-05 |
| a | 2017-03-01 | 200.0 | 6 | 2017-02-23 |
| a | 2017-03-02 | 300.0 | 7 | 2017-02-23 |
| a | 2017-03-03 | 200.0 | 8 | 2017-02-23 |
| a | 2017-03-04 | 400.0 | 9 | 2017-02-23 |
| a | 2017-03-05 | 600.0 | 10 | 2017-02-23 |
| b | 2017-02-05 | 200.0 | 1 | 2017-02-04 |
| b | 2017-02-06 | 300.0 | 2 | 2017-02-04 |
| b | 2017-02-08 | 200.0 | 3 | 2017-02-05 |
| b | 2017-02-09 | 400.0 | 4 | 2017-02-05 |
| b | 2017-02-10 | 600.0 | 5 | 2017-02-05 |
| c | 2017-01-31 | 200.0 | 1 | 2017-01-30 |
| c | 2017-02-01 | 300.0 | 2 | 2017-01-30 |
| c | 2017-02-02 | 200.0 | 3 | 2017-01-30 |
| c | 2017-02-03 | 400.0 | 4 | 2017-01-30 |
| c | 2017-02-10 | 600.0 | 5 | 2017-02-05 |
+-------+-------------+--------+-----+-------------+
3) 名字 差值分组聚合 统计次数 过滤掉小于3的数据
select
name ,
diff ,
count(*) cnt
from
(select
name ,
dt ,
money ,
n ,
date_sub(dt , n) diff
from
(select
* ,
row_number() over(partition by name order by dt) n
from
tb_shop)t) t2
group by name , diff
having cnt >= 3 ;
+-------+-------------+------+
| name | diff | cnt |
+-------+-------------+------+
| a | 2017-02-04 | 4 |
| a | 2017-02-05 | 1 |
| a | 2017-02-23 | 5 |
| b | 2017-02-04 | 2 |
| b | 2017-02-05 | 3 |
| c | 2017-01-30 | 4 |
| c | 2017-02-05 | 1 |
+-------+-------------+------+
+-------+-------------+------+
| name | diff | cnt |
+-------+-------------+------+
| a | 2017-02-04 | 4 |
| a | 2017-02-23 | 5 |
| b | 2017-02-05 | 3 |
| c | 2017-01-30 | 4 |
+-------+-------------+------+
4) 去重重复名字的店铺
with x as (select
name ,
diff ,
count(*) cnt
from
(select
name ,
dt ,
money ,
n ,
date_sub(dt , n) diff
from
(select
* ,
row_number() over(partition by name order by dt) n
from
tb_shop)t) t2
group by name , diff
having cnt >= 3)
select
distinct name
from x ;
+-------+
| name |
+-------+
| a |
| b |
| c |
+-------+
3 练习二 连续命中案例
3.1 数据
打地鼠游戏 求连续命中次数在3次以上的用户
用户 打击顺次 命中(1)
u01,1,1
u01,2,0
u01,3,1
u01,4,1
u01,5,0
u01,6,1
u02,1,1
u02,2,1
u02,3,0
u02,4,1
u02,5,1
u02,6,0
u02,7,0
u02,8,1
u02,9,1
u03,1,1
u03,2,1
u03,3,1
u03,4,1
u03,5,1
u03,6,0
3.2 建表导入数据
create table ds (
uid string ,
seq int ,
mz int
)
row format delimited fields terminated by "," ;
load data local inpath "/hive/data/ds.txt" into table ds ;
+---------+---------+--------+
| ds.uid | ds.seq | ds.mz |
+---------+---------+--------+
| u01 | 1 | 1 |
| u01 | 2 | 0 |
| u01 | 3 | 1 |
| u01 | 4 | 1 |
| u01 | 5 | 0 |
| u01 | 6 | 1 |
| u02 | 1 | 1 |
| u02 | 2 | 1 |
| u02 | 3 | 0 |
| u02 | 4 | 1 |
| u02 | 5 | 1 |
| u02 | 6 | 0 |
| u02 | 7 | 0 |
| u02 | 8 | 1 |
| u02 | 9 | 1 |
| u03 | 1 | 1 |
| u03 | 2 | 1 |
| u03 | 3 | 1 |
| u03 | 4 | 1 |
| u03 | 5 | 1 |
| u03 | 6 | 0 |
+---------+---------+--------+
3.3 实现
select
distinct uid
from
(select
uid ,
count(*) cnt
from
(select
uid ,
(seq - n) diff
from
(select
* ,
row_number() over(partition by uid order by seq) n
from
(
select
*
from
ds
where mz != 0
) t ) t2) t3
group by uid , diff
having cnt >= 3) t4 ;