文章目录
- 高级
- 连续
- 求连续3天排放量大于100的id
- 数据准备
- 逻辑
- 答案
- 小鹏汽车
- 最长连续登陆天数
- 基本
- 用一条 SQL 语句查询 学生表每门课都大于 80 分的学生姓名
- 解决办法一: having
- 解决办法二:not in
- topN 问题
- 连续问题(7 天连续登陆)
- 留存问题
- trap
- union all 无序,如想有序,设置别名后用别名排序
高级
连续
求连续3天排放量大于100的id
数据准备
show databases;
create database interview_questions;
create table if not exists interview_questions.senlinlianxu
(
id string,
dt string,
lowcarbon string
);
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1001', '2021-12-12', '123');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1002', '2021-12-12', '45');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1001', '2021-12-13', '43');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1001', '2021-12-13', '45');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1001', '2021-12-13', '23');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1002', '2021-12-14', '45');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1001', '2021-12-14', '230');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1002', '2021-12-15', '45');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1001', '2021-12-15', '23');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1003', '2021-12-17', '101');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1003', '2021-12-18', '101');
INSERT INTO TABLE interview_questions.senlinlianxu VALUES ('1003', '2021-12-19', '101');
truncate table interview_questions.senlinlianxu;
select * from interview_questions.senlinlianxu;
逻辑
–某维度连续x次符合某条件,比如连续3天登录、连续5年续保。
–其他条件好筛选,怎么判断连续呢?先求出rank值,因为rank值肯定是连续的,只要值和rank值的差是一样的,就说明值是连续的。比如日期和rank的差是一样的,说明日期是连续的
–判断一串数是否连续,就找一组连续的数,然后借力。类似于尺子
- 1、每人每日的排放量,每人每日排放量>100
- 2、根据人分组,根据日期排序,并求出每个组的rank
- 3、每个人组内,算出日期和rank的差。
- 4、每个人组内,根据差分组,求sum
- 5、筛选出sum>=3的
答案
select id, sub, count(*) count
from (select id, dt, date_sub(dt, rank) sub
from (select id, dt, rank() over (partition by id order by dt asc ) rank
from (select id, dt, sum(lowcarbon) lowcarbonTotal
from interview_questions.senlinlianxu
group by id, dt
having lowcarbonTotal > 100) t1) t2) t3
group by id, sub
having count >= 3;
小鹏汽车
小鹏汽车充电有两种类型,快充、慢充,有如下数据:
车辆ID 充电时间 充电类型
a 20200601 19:21:09 1
a 20200611 11:30:09 1
a 20200621 21:10:09 0
a 20200701 19:01:09 1
a 20200701 20:30:09 1
a 20200701 21:00:09 0
a 20200702 20:30:00 1
a 20200703 09:01:09 1
a 20200704 12:05:09 1
b 20200706 12:20:09 0
其中1为快充,0为慢充,求每辆车最长连续快充次数
以上例子结果为
a 3
b 0
SELECT
vcl_id,
max(cnt) as rn_group --最大次数
from(
select
vcl_id,
rn1 - rn2 as rn_group, --间距
count(1) as cnt --间距出现的次数
from (
select
vcl_id,
time,
type,
row_number() over (partition by vcl_id order by time asc) as rn1,
row_number() over (partition by vcl_id,type order by time asc) as rn2
from tableName t
) t1
group by
vcl_id,
rn1 - rn2
)t2
group by vcl_id
最长连续登陆天数
基本
用一条 SQL 语句查询 学生表每门课都大于 80 分的学生姓名
解决办法一: having
思路:如果最小的课程都大于80分,那他所有的课程分数,肯定都大于80分!
SELECT name
FROM xuesheng
GROUP BY name
HAVING MIN(score)> 80
解决办法二:not in
可以用反向思维,先查询出表里面有小于 80 分的 name,然后用 not in 去除掉
代码实现
SELECT DISTINCT name
FROM xuesheng
WHERE name NOT IN
(SELECT DISTINCT name
FROM xuesheng
WHERE score <=80);
topN 问题
案例:查询各科成绩前两名的记录
代码实现
row_number() over( partition by 课程 order by 成绩 desc) as rank
...
where rank <= 2 -- 前两名
连续问题(7 天连续登陆)
实现思路:
因为每天用户登录次数可能不止一次,所以需要先将用户每天的登录日期去重。
再用row_number() over(partition by _ order by _)函数将用户id分组,按照登陆时间进行排序。
计算登录日期减去第二步骤得到的结果值,用户连续登陆情况下,每次相减的结果都相同。
按照id和日期分组并求和,筛选大于等于7的即为连续7天登陆的用户。
代码实现
SELECT user_id, MAX(count_val) AS max_count -- 查出了最大连续登陆,where>=7,即7天连续
FROM (
-- group by相同日期
SELECT user_id, symbol_date, COUNT(*) AS count_val
FROM (
-- 日期减rank,连续登陆的话,会得到相同日期
SELECT user_id, log_date, date_sub(log_date, CAST(rn AS INT)) AS symbol_date
FROM (
-- 打上rank标识
SELECT user_id, log_date, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY log_date) AS rn
FROM user_logging_format
) c
) d
GROUP BY user_id, symbol_date
) e
GROUP BY user_id;
留存问题
留存率是衡量用户质量的最重要指标之一。
留存率指标中,通常需要关注次日留存、3日留存、7日留存和月留存。对新增用户而言,需要关注更细颗粒度的数据,也就是7日内每天的留存率。
代码实现
select
dd
, count( if(id=lead_id and datediff(dd,lead_dd)=1 ,id, null ) ) as '1 日留存'
, count( if(id=lead_id7 and datediff(dd,lead_dd7)=7 ,id, null ) ) as '7 日留存'
from
(
select
id, dd
, lead(dd,1) over(partition by id order by dd asc ) as lead_dd
, lead(id,1) over(partition by id order by dd asc ) as lead_id
, lead(dd,7) over(partition by id order by dd asc ) as lead_dd7
, lead(id,7) over(partition by id order by dd asc ) as lead_id7
from
(
select 'slm' as id, '2018-12-26' as dd
union all select 'slm' as id, '2018-12-27' as dd
union all select 'slm' as id, '2018-12-28' as dd
union all select 'hh ' as id, '2018-12-26' as dd
union all select 'hh ' as id, '2018-12-28' as dd ) aa
trap
union all 无序,如想有序,设置别名后用别名排序
select col1,0 as px
union all
select col2,1 as px
union all
select col3,2 as px
order by px