SQL强化
文章目录
- SQL强化
- hive
- SQL执行顺序
- 行转列(转置)
- 连续N天登陆
- 窗口函数
- N日留存率
- 分组内top前几
- join系列
- join优化
- 带条件的聚合统计
- mysql
- 列传行
- postgres
- pg 窗口函数row_number()
hive
SQL执行顺序
--举例:
select
a.sex,
b.city,
count(1) as cnt,
sum(salary) as sum1
from table1 a
join table2 b on a.id=b.id
where a.name=b.name
group by a.sex,b.city
having cnt>=2
order by a.sex,b.city
limit 10
--或者是
select distinct
a.sex,
b.city,
a.age
from table1 a
join table2 b on a.id=b.id
where a.name=b.name
order by a.sex,b.city
limit 10
上面的SQL语句的执行顺序是: from (去加载table1 和 table2这2个表 ) -> join -> on -> where -> group by->select 后面的聚合函数count,sum -> having -> distinct -> order by -> limit
行转列(转置)
- 行转列的常规做法是,group by+sum(if())【或count(if())】
题一
已知
year | month | amount |
1991 | 1 | 1.1 |
1991 | 2 | 1.2 |
1991 | 3 | 1.3 |
1991 | 4 | 1.4 |
1992 | 1 | 2.1 |
1992 | 2 | 2.2 |
1992 | 3 | 2.3 |
1992 | 4 | 2.4 |
查成这样一个结果
year | m1 | m2 | m3 | m4 |
1991 | 1.1 | 1.2 | 1.3 | 1.4 |
1992 | 2.1 | 2.2 | 2.3 | 2.4 |
解答
use test_sql;
set hive.exec.mode.local.auto=true; //开启本地执行,加速执行
create table table2(year int,month int ,amount double) ;
insert overwrite table table2 values
(1991,1,1.1),
(1991,2,1.2),
(1991,3,1.3),
(1991,4,1.4),
(1992,1,2.1),
(1992,2,2.2),
(1992,3,2.3),
(1992,4,2.4);
select * from table2;
--行转列
--常规做法是,group by+sum(if())
--扩展,当case when里面的分支只有两个,并且是互斥,则等价于if语句,也就是【case when month=1 then amount else 0 end】 等价于 【if(month=1,amount,0)】
select year,
sum(if(month=1,amount,0)) m1,
sum(if(month=2,amount,0)) m2,
sum(if(month=3,amount,0)) m3,
sum(if(month=4,amount,0)) m4
from table2
group by year;
或者
select year,
sum(case when month=1 then amount else 0 end) as m1,
sum(case when month=2 then amount else 0 end) as m2,
sum(case when month=3 then amount else 0 end) as m3,
sum(case when month=4 then amount else 0 end) as m4
from table2 as t
group by year;
题二
表table如下:
DDate | shengfu |
2015-05-09 | 胜 |
2015-05-09 | 胜 |
2015-05-09 | 负 |
2015-05-09 | 负 |
2015-05-10 | 胜 |
2015-05-10 | 负 |
2015-05-10 | 负 |
如果要生成下列结果, 该如何写sql语句?
DDate | 胜 | 负 |
2015-05-09 | 2 | 2 |
2015-05-10 | 1 | 2 |
--建表
create table table1(DDate string, shengfu string) ;
insert overwrite table table1 values ('2015-05-09', "胜"),
('2015-05-09', "胜"),
('2015-05-09', "负"),
('2015-05-09', "负"),
('2015-05-10', "胜"),
('2015-05-10', "负"),
('2015-05-10', "负");
--注意:count 对应else必须为null,count是统计不为0的行数;sum 对应else为null和0都行,做累加
select DDate,
count(case when shengfu = '胜' then 1 else null end) `胜`,
count(case when shengfu = '负' then 1 else null end) `负`
from table1
group by DDate;
题三
假设tableA如表5, tableB如表6,
表5
qq号(字段名:qq) | 游戏(字段名:game) |
10000 | a |
10000 | b |
10000 | c |
20000 | c |
20000 | d |
表6
qq号(字段名:qq) | 游戏(字段名:game) |
10000 | a_b_c |
20000 | c_d |
请写出以下sql逻辑:
a, 将tableA输出为tableB的格式; 【行转列】
b, 将tableB输出为tableA的格式; 【列转行】
create table tableA(qq string, game string)
insert overwrite table tableA values
(10000, 'a'),
(10000, 'b'),
(10000, 'c'),
(20000, 'c'),
(20000, 'd');
create table tableB(qq string, game string) ;
insert overwrite table tableB values
(10000, 'a_b_c'),
(20000, 'c_d');
--将tableA输出为tableB的格式;
select qq,
concat_ws('_', collect_list(game)) game
from tableA
group by qq;
--将tableB输出为tableA的格式; 需要借助【侧视图+explode函数】
select qq,
tmp.game
from tableB lateral view explode(split(game, '_')) tmp as game;
连续N天登陆
思路分析过程
- 思路一
--核心代码
->distinct
-> row_number
-> date_sub(dt,rn) as dt2
-> group by dt2,name
-> having count(1)>=N天
-> distinct name
-> count(name)
- 思路二
--核心代码
->distinct
->date_add(dt,N-1) as date2
->lead(dt,N-1) over(partition by userid order by dt) as date3
->where date2=date3
->distinct
set hive.exec.mode.local.auto=true;
/*--OPPO*/
create table game(name string,`date` string);
insert overwrite table game values
('张三','2021-01-01'),
('张三','2021-01-02'),
('张三','2021-01-03'),
('张三','2021-01-02'),
('张三','2021-01-07'),
('张三','2021-01-08'),
('张三','2021-01-09'),
('李四','2021-01-01'),
('李四','2021-01-02'),
('王五','2021-01-03'),
('王五','2021-01-02'),
('王五','2021-01-02');
select * from game;
--方案1
with t1 as (
select distinct name,`date` from game
),
t2 as (
select *,
row_number() over (partition by name order by `date`) as rn
from t1
),
t3 as (
select *,
date_sub(`date`,rn) as temp
from t2
),
t4 as (
select name,temp ,
count(1) as cnt
from t3
group by name,temp
having count(1)>=3
)
select distinct name from t4;
--方案2
with t1 as (
select distinct name,`date` from game
),
t2 as (
select *,
date_add(`date`,2) as date2,
lead(`date`,2) over(partition by name order by `date`) as date3
from t1
)
select distinct name from t2 where date2=date3;
窗口函数
窗口函数的最大特点是有over()关键字,代表定义窗口
函数名(字段名)over(partition by xxx,yyy order by zzz)
- 聚合类的窗口函数
sum() count()/avg()/max()/min() - 排序类的窗口函数
row_number,rank,dense_rank - 偏移类的,跨行的
lag,lead - 【了解】first_value和last_value
- 【了解】ntile
题一
create table score(cid int ,sname string,score int);
insert overwrite table score values
(1,'张三',60),
(1,'李四',70),
(1,'王五',80),
(1,'赵六',90),
(2,'安安',80),
(2,'平平',90);
select * from score;
-- 第一类、 聚合类的窗口函数
-- 计算同一个班级内,每个同学和比他分数低的总分是多少
sum(score) over (partition by cid order by score ) as `累加分数1`,
-- 上句等价于
sum(score) over (partition by cid order by score rows between unbounded preceding and current row ) as `累加分数2`,
--思考:如何计算每个同学,在班级内,包括他自己,和比他低一名的,和比他高1名的,这3个分数求和
sum(score) over(partition by cid order by score rows between 1 preceding and 1 following) as `累加分数3`
-- 第二类、 排序类的窗口函数
-- 同一个班内,按分数排序打上序号
row_number() over (partition by cid order by score) as `分数序号排名`,
-- 考虑并列
rank() over (partition by cid order by score) as `分数序号排名2`,
dense_rank() over (partition by cid order by score) as `分数序号排名3` 并列后序号,自然顺序
-- 第三类、 偏移类的,跨行的
select *,
-- 同一班内,考得比自己低1名的分数是多少
lag(score, 1) over (partition by cid order by score) as `低一名的分数`,
-- 同一班内,考得比自己低1名的分数是多少,如果找不到,则显示为0
lag(score, 1, 0) over (partition by cid order by score) as `低一名的分数2`,
-- 同一班内,考得比自己低2名的分数是多少
lag(score, 2) over (partition by cid order by score) as `低2名的分数`,
-- 同一班内,考得比自己高1名的分数是多少
lead(score, 1) over (partition by cid order by score) as `高一名的分数`
from score;
题二
Emp表的表数据如下:
NAME | MONTH | AMT |
张三 | 01 | 100 |
李四 | 02 | 120 |
王五 | 03 | 150 |
赵六 | 04 | 500 |
张三 | 05 | 400 |
李四 | 06 | 350 |
王五 | 07 | 180 |
赵六 | 08 | 400 |
问题:请写出可以得到以下的结果SQL
NAME | 总金额 | 排名 | 占比 |
赵六 | 900 | 1 | 40.91% |
张三 | 500 | 2 | 22.73% |
李四 | 470 | 3 | 21.36% |
王五 | 330 | 4 | 15.00% |
create table emp(name string , month string, amt int);
insert overwrite table emp values ('张三', '01', 100),
('李四', '02', 120),
('王五', '03', 150),
('赵六', '04', 500),
('张三', '05', 400),
('李四', '06', 350),
('王五', '07', 180),
('赵六', '08', 400);
---
with t1 as (
select name,
sum(amt) as sum_amt
from emp
group by name
),
t2 as (
select *,
row_number() over (partition by null order by sum_amt desc) as rn, --排名
sum_amt/sum(sum_amt) over () as rate
from t1
)
select name, sum_amt, rn, concat(round(rate*100,2),'%') rate from t2
题三
题目描述:
在第一题员工表的基础上,统计每年入职总数以及截至本年累计入职总人数。
截至本年累计入职总人数=本年总入职人数 + 本年之前所有年的总入职人数之和
结果
select *,
sum(cnt) over (order by year1) cnt2
from
(select year(hiredate) as year1,count(empno) as cnt from emp group by year(hiredate)) a;
题四
需求:根据商品大类Goods_type对商品金额price从小到大排序,前30%为低挡,30%~85%为中档,高于85%为高档打上标签。
select goods_type,
goods_name,
price,
case when rate<0.3 then '低档'
when rate>=0.3 and rate<0.85 then '中档'
when rate>=0.85 then '高档'
end as type
from (
select *,
-- 每个在组内的排名
row_number() over (partition by goods_type order by price)
-- 每个组内有多少个
/count(1) over(partition by goods_type) as rate --比例
from goods
) as t1
;
N日留存率
- 核心代码
-> where 日期 in (首日,1天后,7天后)
-> group by 用户
-> count(if(日期=首日,1,null)) as cnt
count(if(日期=1天后,1,null)) as cnt2
count(if(日期=7天后,1,null)) as cnt8
->having cnt>0
->count(user_id) as 首日总数
count(if(cnt2>0,1,null)) as 次日留存数
count(if(cnt8>0,1,null)) as 7日留存数
->次日留存数/首日总数 as 次日留存率
7日留存数/首日总数 as 7日留存率
**需求:**一条sql统计出以下指标 (4.1号uv,4.1号在4.2号的留存uv,4.1号在4.8号的留存uv);
uv就是不同的用户数
create table if not exists tb_cuid_1d
(
cuid string comment '用户的唯一标识',
os string comment '平台',
soft_version string comment '版本',
event_day string comment '日期',
visit_time int comment '用户访问时间戳',
duration decimal comment '用户访问时长',
ext array<string> comment '扩展字段'
);
insert overwrite table tb_cuid_1d values
(1,'android',1,'2020-04-01',1234567,100,`array`('')),
(1,'android',1,'2020-04-02',1234567,100,`array`('')),
(1,'android',1,'2020-04-08',1234567,100,`array`('')),
(2,'android',1,'2020-04-01',1234567,100,`array`('')),
(3,'android',1,'2020-04-02',1234567,100,`array`(''));
**方案1:**性能快,步骤稍微复杂
- 先按用户分组,得到每个用户的各相关日期的登录情况。
select '2020-04-01的留存情况' as type,
count(cuid) as uv_4_1,
count(case when cnt_4_2>0 then 1 else null end) as uv_4_2,
count(case when cnt_4_8>0 then 1 else null end) as uv_4_8
from
(select cuid,
count(case when event_day='2020-04-01' then 1 else null end) as cnt_4_1,
count(case when event_day='2020-04-02' then 1 else null end) as cnt_4_2,
count(case when event_day='2020-04-08' then 1 else null end) as cnt_4_8
from tb_cuid_1d
--提前过滤数据
where event_day in ('2020-04-01','2020-04-02','2020-04-08')
group by cuid
-- 2020-04-01必须登录,剔除掉2020-04-01没登录的
having cnt_4_1>0) as t;
效果如下
- 再对上面的用户汇总
select count(cnt) as uv,
count(if(cnt2!=0,1,null)) as uv2, //4月2日留存数
count(if(cnt8!=0,1,null)) as uv8,
-- 留存率
uv2/uv as rate_4_2, //4月2日留存率
uv8/uv as rate_4_8
- 最后再用 【后续日期的留存数】除以【首日总数】,就是【留存率】
**方案2:**性能慢,步骤比较简单
select count(a.cuid) uv,
count(b.cuid) uv2,
count(c.cuid) uv7,
count(b.cuid)/count(a.cuid) as rate_4_2,
count(c.cuid)/count(a.cuid) as rate_4_8
from (select distinct event_day, cuid from tb_cuid_1d where event_day='首日') as a
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='次日') as b on a.cuid=b.cuid
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='7日后') as c on a.cuid=c.cuid;
分组内top前几
- 需求常见词:
- 【每组xxx内按yyy排序的前n个zzz】
- 【每组xxx内按yyy排序的第1个zzz】
- 【每组xxx内按yyy排序的最后1个zzz】
- 特点是yyy和zzz是不同的字段。
- 比如:班内按性别分组,组内按身高排序,的前3个学生姓名
- 公式:row_number() over(partition by 组名 order by yyy) as rn,再筛选rn<=N名
题一
员工表结构
员工表数据
题目描述
求出每个部门工资最高的前三名员工,并计算这些员工的工资占所属部门总工资的百分比。
结果
create table emp(empno string ,ename string,hiredate string,sal int ,deptno string);
insert overwrite table emp values
('7521', 'WARD', '1981-2-22', 1250, 30),
('7566', 'JONES', '1981-4-2', 2975, 20),
('7876', 'ADAMS', '1987-7-13', 1100, 20),
('7369', 'SMITH', '1980-12-17', 800, 20),
('7934', 'MILLER', '1982-1-23', 1300, 10),
('7844', 'TURNER', '1981-9-8', 1500, 30),
('7782', 'CLARK', '1981-6-9', 2450, 10),
('7839', 'KING', '1981-11-17', 5000, 10),
('7902', 'FORD', '1981-12-3', 3000, 20),
('7499', 'ALLEN', '1981-2-20', 1600, 30),
('7654', 'MARTIN', '1981-9-28', 1250, 30),
('7900', 'JAMES', '1981-12-3', 950, 30),
('7788', 'SCOTT', '1987-7-13', 3000, 20),
('7698', 'BLAKE', '1981-5-1', 2850, 30);
--求出每个部门工资最高的前三名员工,并计算这些员工的工资占所属部门总工资的百分比。
select a.empno,
a.sal,
a.deptno,
a.rn,
a.sum_sal,
round(a.sal/a.sum_sal,2) as rate
from
(select *,
--每个部门工资排名
row_number() over (partition by deptno order by sal desc) as rn,
--每个部门的总工资
sum(sal) over(partition by deptno ) as sum_sal
from emp) a
where rn<=3;
join系列
【区分 inner /left / right / full / left semi / left anti join 的特点】
有以下银行信息表
银行黑名单信息表
left join
使用left join对所有用户,如果也在黑名单中,则标记为YES,否则标记为NO。
id | name | sex | age | flag |
1 | 张三 | 男 | 20 | YES |
2 | 李四 | 男 | 29 | YES |
3 | 王五 | 男 | 21 | YES |
4 | 赵六 | 女 | 28 | NO |
5 | 田七 | 女 | 22 | NO |
select a.*,
if(b.user_id is not null, 'YES', 'NO') flag
from all_users a
left join black_list b
on a.id = b.user_id;
right join
对上面的问题,使用right join再做一次。
id | name | sex | age | flag |
1 | 张三 | 男 | 20 | YES |
2 | 李四 | 男 | 29 | YES |
3 | 王五 | 男 | 21 | YES |
4 | 赵六 | 女 | 28 | NO |
5 | 田七 | 女 | 22 | NO |
select b.*,
if(a.user_id is not null, 'YES', 'NO') flag
from black_list a
right join all_users b
on a.user_id = b.id;
left semi join
使用left semi join对所有用户,如果也在黑名单中,则挑选出来。
id | name | sex | age |
1 | 张三 | 男 | 20 |
2 | 李四 | 男 | 29 |
3 | 王五 | 男 | 21 |
-- left semi join结果只显示左表字段,不会显示右表字段
select *
from all_users a
left semi join black_list b on a.id = b.user_id;
-- 他跟left join的关系是:
select a.*
from all_users a
left join black_list b on a.id = b.user_id
where b.user_id is not null;
-- 也等价于
select a.*
from all_users a
join black_list b on a.id = b.user_id;
left anti join
使用left anti join对所有用户,如果不在黑名单中,则挑选出来。
id | name | sex | age |
4 | 赵六 | 女 | 28 |
5 | 田七 | 女 | 22 |
select *
from all_users a
left anti join black_list b on a.id = b.user_id;
--他跟left join的关系是:
select a.*
from all_users a
left join black_list b on a.id = b.user_id
where b.user_id is null;
full join
用户银行存款信息表(user_id ‘用户id’,amount ‘存款金额’)
用户银行负债信息表(user_id ‘用户id’,amount ‘负债金额’)
使用full join,展示用户的存款金额和负债金额。
user_id | deposit_amount | debt_amount |
1 | 2000 | 0 |
2 | 2900 | 0 |
3 | 2100 | 3400 |
4 | 0 | 2800 |
5 | 0 | 2200 |
select coalesce(a.user_id,b.user_id) as user_id_new,
coalesce(a.amount,0) as deposit_amount,
coalesce(b.amount,0) as debt_amount
from deposit a
full join debt b on a.user_id=b.user_id;
-- coalesce返回第一个不为null的值
select coalesce(123,456) as x; 123
select coalesce(null,456) as x; 456
select nvl(123,0) as x; 123
select coalesce(null,null,456) as x; 456
题一:
需求:某个游戏中的元宝分为,付费元宝和免费元宝。玩家购买商城道具时候,可以使用付费元宝也可以使用免费元宝。请使用HIve SQL语句计算出2021-01-01至2021-01-07期间各个角色当日消耗元宝的付费免费比例(付费免费比 = 付费元宝消耗量 / 免费元宝消耗量)。
现有表结构如下:
desc | dm_paid_buy; |
#dm_paid_buy | 角色使用付费元宝购买商城道具时候记录一条 |
time | bigint | #购买的时间戳 |
server_id | string | #服务器ID |
role_id | int | #角色ID |
cost | int | #购买对应道具消耗的付费元宝数量 |
item_id | int | #购买对应道具的id |
amount | int | #购买对应道具的数量 |
p_date | string | #登录日期,yyyy-MM-dd |
desc | dm_free_buy; |
#dm_free_buy | 角色使用免费元宝购买商城道具时候记录一条 |
time | bigint | #购买的时间戳 |
server_id | string | #服务器ID |
role_id | int | #角色ID |
cost | int | #购买对应道具消耗的免费元宝数量 |
item_id | int | #购买对应道具的id |
amount | int | #购买对应道具的数量 |
p_date | string | #登录日期,yyyy-MM-dd |
结果输出
p_date | server_id | role_id | 付费免费比 |
2021-01-01 | 123 | 10098 | 0 |
2021-01-01 | 120 | 10098 | 0.4 |
2021-01-02 | 123 | 10098 | 0.2 |
use interview_db;
set hive.exec.mode.local.auto=true;
create table if not exists dm_paid_buy
(
`time` bigint comment '#购买的时间戳',
server_id string comment '#服务器ID',
role_id int comment '#角色ID',
cost int comment '#购买对应道具消耗的付费元宝数量',
item_id int comment '#购买对应道具的id',
amount int comment '#购买对应道具的数量',
p_date string comment '#登录日期, yyyy-MM-dd'
) comment '角色使用付费元宝购买商城道具时候记录一条';
insert overwrite table dm_paid_buy values
(1234567,120,10098,2,3,4,'2021-01-01'),
(1234567,120,10098,4,3,5,'2021-01-01'),
(1234567,123,10098,3,3,2,'2021-01-02'),
(1234567,123,10098,2,3,2,'2021-01-02');
---
create table if not exists dm_free_buy
(
`time` bigint comment '#购买的时间戳',
server_id string comment '#服务器ID',
role_id int comment '#角色ID',
cost int comment '#购买对应道具消耗的免费元宝数量',
item_id int comment '#购买对应道具的id',
amount int comment '#购买对应道具的数量',
p_date string comment '#登录日期, yyyy-MM-dd'
) comment '角色使用免费元宝购买商城道具时候记录一条';
insert overwrite table dm_free_buy values
(1234567,123,10098,8,3,4,'2021-01-01'),
(1234567,123,10098,5,3,5,'2021-01-01'),
(1234567,120,10098,6,3,4,'2021-01-01'),
(1234567,120,10098,9,3,5,'2021-01-01'),
(1234567,123,10098,18,3,2,'2021-01-02'),
(1234567,123,10098,7,3,2,'2021-01-02');
select coalesce(a.p_date, b.p_date) p_date,
coalesce(a.server_id, b.server_id) server_id,
coalesce(a.role_id, b.role_id) role_id,
round(nvl(a.cost, 0) / b.cost, 2) as rate
from (select p_date, server_id, role_id, sum(cost) cost
from dm_paid_buy
where p_date between '2021-01-01' and '2021-01-07'
group by p_date, server_id, role_id) a
full join (select p_date, server_id, role_id, sum(cost) cost
from dm_free_buy
wherep_date between '2021-01-01' and '2021-01-07'
group by p_date, server_id, role_id) b
on a.p_date=b.p_date and a.server_id=b.server_id and a.role_id=b.role_id;
-- 使用with as 来重构上面的逻辑
with a as (
select p_date,server_id,role_id,
sum(cost) as cost
from dm_paid_buy
where p_date between '2021-01-01' and '2021-01-07'
group by p_date,server_id,role_id
),
b as(
select p_date,server_id,role_id,
sum(cost) as cost
from dm_free_buy
where p_date between '2021-01-01' and '2021-01-07'
group by p_date,server_id,role_id
)
select coalesce(a.p_date,b.p_date) as p_date,
coalesce(a.server_id,b.server_id) as server_id,
coalesce(a.role_id,b.role_id) as role_id,
coalesce(a.cost,0) as a_cost,
coalesce(b.cost,0) as b_cost,
coalesce(a.cost,0)/b.cost as rate
from a
full join b
on a.p_date=b.p_date and a.server_id=b.server_id and a.role_id=b.role_id;
join优化
join前请三思,能否先减小数据量,再join
通用口诀:先用where或group by减少数据量,再join
-- 优化前,慢:
select
b.userid,
b.username,
sum(xx),
count(xx)
from a【比如交易明细事实表】
join b【比如客户维度信息表】
on a.userid = b.userid
where b.yy < 条件
group by b.userid, b.username
having count (a.xx) > 10;
-- 优化后,快:
select b.userid,
b.username,
s,
cnt
--【提前对a减少数据量】
from (select userid,
sum(xx) s,
count(xx) cnt
from a group by userid having count(a.xx) >10) a
-- 【提前对b减少数据量】
join (select * from b where b.yy<条件) b
on a.userid = b.userid;
题一
表T1的数据结构:
字段英文名 | 字段中文名 | 类型 | 主键标志 | 注释 |
Rec_no | 记录号 | CHAR(3) | Y | |
Ci_no | 客户号 | CHAR(6) | N | |
Cust_Type | 客户类型 | CHAR(2) | N | |
Cre_dt | 开户日期 | Date | N | |
Cus_sts | 客户状态 | Char(1) | N | Y-正常 N-无效 |
表T1的数据
Rec_no | ci_no | cust_type | cre_dt | cus_sts |
123 | 111111 | 01 | 2010-11-15 | Y |
234 | 222222 | 02 | 2011-09-01 | Y |
345 | 333333 | 02 | 2012-01-09 | Y |
456 | 444444 | 01 | 2012-09-08 | Y |
表T2的数据结构:
字段英文名 | 字段中文名 | 类型 | 主键标志 | 注释 |
Ci_no | 客户号 | CHAR(6) | Y | |
AC_no | 客户账号 | CHAR(9) | Y | |
Bal | 账号余额 | DECIMAL(15,2) | N |
表T2的数据:
Ci_no char(6) | Ac_no char(9) | Bal decimal(15,2) |
222222 | 123456888 | 1000.28 |
222222 | 123456999 | 886 |
333333 | 123454321 | 5000 |
请编写sql统计在9月份开户且账户余额不为0的有效客户数。(8分)
-- 传统的写法。
select count(distinct t1.ci_no) as cnt
from t1
join t2 on t1.ci_no=t2.ci_no
where month(t1.cre_dt)=9
and t1.cus_sts='Y'
and bal>0;
-- 方案2
select count(t1.ci_no) as cnt
-- 9月份开户,有效客户
from (select * from t1 where month(cre_dt)=9 and cus_sts='Y') t1
-- 账户余额不为0
join (select ci_no from t2 group by ci_no having sum(bal)>0) t2
on t1.ci_no=t2.ci_no;
带条件的聚合统计
- 一般的做法是group by xx,yy 再多次的sum(if(…))
- 好处是避免多次加载表,一次性得到多个指标,可以只加载一次表就得到多个指标。
题一
查询课程编号“2”的成绩比课程编号“1”低的所有同学的学号、姓名。【这是行转列的衍生题】
create table student(sid int, sname string, gender string, class_id int);
insert overwrite table student
values (1, '张三', '女', 1),
(2, '李四', '女', 1),
(3, '王五', '男', 2);
create table course (cid int, cname string, teacher_id int);
insert overwrite table course
values (1, '生物', 1),
(2, '体育', 1),
(3, '物理', 2);
create table score (sid int, student_id int, course_id int, number int);
insert overwrite table score
values (1, 1, 1, 58),
(4, 1, 2, 50),
(2, 1, 2, 68),
(3, 2, 2, 89);
-- 优化sql
select sid, sname
from
(select student_id,
sum(if(course_id=2,number,0)) as pe, --体育
sum(if(course_id=1,number,0)) as bio --生物
from score
group by student_id
having pe<bio) as t
join student on t1.student_id = sid;
问题二
问题:用一条SQL生成完整的用户画像表,包含如下字段:
user_id, sex, age, d7order_num, d14_order_num,后面两个字段分别为近7天订单数量,近14天订单数量。
create table sale_order(
order_id bigint comment '订单ID',
user_id bigint comment '用户ID',
order_status int ,
create_time string,
last_update_time string,
product_id bigint,
product_num bigint
);
create table user_info(
user_id bigint comment '用户ID,唯一主键',
sex string,
age int
);
-- 优化
select u.user_id,
u.sex,
u.age,
s.d7order_num,
s.d14order_num
from user_info u
left join (select user_id,
-- 近7天订单数量
count(distinct if(to_date(create_time) between data_sub(`current_date()`,6) and `current_date()`,order_id,null)) as d7order_num,
-- 近14天订单数量
count(distinct if(to_date(create_time) between data_sub(`current_date()`,13) and `current_date()`,order_id,null)) as d14order_num
from sale_order
where create_time >= data_sub(`current_date()`,13)
group by user_id) s
on u.user_id = s.user_id;
mysql
列传行
- 行转列的做法是,substring_index(截取字符串的函数) + help_topic_id(截取字符串的函数) + help_topic_id(mysql自带帮助主题表)
已知:
user01
role01
查询:
解答:
SELECT t.user_name, group_concat(role_name SEPARATOR '|') AS role_names
from (SELECT user_name, SUBSTRING_INDEX(SUBSTRING_INDEX(role_ids, '|', help_topic_id + 1), '|', -1) AS role_id
FROM user01
JOIN mysql.help_topic
on help_topic_id < (LENGTH(role_ids) - LENGTH(REPLACE(role_ids, '|', '')) + 1)) t
JOIN role01 on t.role_id = role01.role_id
GROUP BY t.user_name;
-- 注:
-- (LENGTH(role_ids) - LENGTH(REPLACE(role_ids, '|', '')) +1 ) 为字符串分割后的长度
-- 即:分隔前长度-去掉分隔符长度+1
postgres
pg 窗口函数row_number()