mysql计算人员排名

转载

mob64ca141677f9 2024-10-29 06:46:09

文章标签 mysql计算人员排名 hive sql 数据库 ci 文章分类 MySQL 数据库

SQL强化

文章目录

SQL强化

hive

SQL执行顺序
行转列(转置)
连续N天登陆
窗口函数
N日留存率
分组内top前几
join系列
join优化
带条件的聚合统计

mysql

列传行

postgres

pg 窗口函数row_number()

hive

SQL执行顺序

--举例：
select 
       a.sex,
       b.city,
       count(1) as cnt,
       sum(salary) as sum1
from table1 a
join table2 b on a.id=b.id
where a.name=b.name
group by a.sex,b.city
having cnt>=2
order by a.sex,b.city
limit 10

--或者是
select distinct
       a.sex,
       b.city,
       a.age
from table1 a
join table2 b on a.id=b.id
where a.name=b.name
order by a.sex,b.city
limit 10

上面的SQL语句的执行顺序是: from (去加载table1 和 table2这2个表 ) -> join -> on -> where -> group by->select 后面的聚合函数count,sum -> having -> distinct -> order by -> limit

行转列(转置)

行转列的常规做法是，group by+sum(if())【或count(if())】

题一

已知

year	month	amount
1991	1	1.1
1991	2	1.2
1991	3	1.3
1991	4	1.4
1992	1	2.1
1992	2	2.2
1992	3	2.3
1992	4	2.4

查成这样一个结果

year	m1	m2	m3	m4
1991	1.1	1.2	1.3	1.4
1992	2.1	2.2	2.3	2.4

解答

use test_sql;
set hive.exec.mode.local.auto=true;		//开启本地执行，加速执行
create table table2(year int,month int ,amount double) ;
insert overwrite table table2 values
           (1991,1,1.1),
           (1991,2,1.2),
           (1991,3,1.3),
           (1991,4,1.4),
           (1992,1,2.1),
           (1992,2,2.2),
           (1992,3,2.3),
           (1992,4,2.4);
select * from table2;


--行转列
--常规做法是，group by+sum(if())
--扩展，当case when里面的分支只有两个，并且是互斥，则等价于if语句，也就是【case when month=1 then amount else 0 end】 等价于 【if(month=1,amount,0)】
select year,
       sum(if(month=1,amount,0)) m1,
       sum(if(month=2,amount,0)) m2,
       sum(if(month=3,amount,0)) m3,
       sum(if(month=4,amount,0)) m4
from table2
group by year;
或者
select year,
	   sum(case when month=1 then amount else 0 end) as m1,
	   sum(case when month=2 then amount else 0 end) as m2,
	   sum(case when month=3 then amount else 0 end) as m3,
	   sum(case when month=4 then amount else 0 end) as m4
from table2 as t
group by year;

题二

表table如下：

DDate	shengfu
2015-05-09	胜
2015-05-09	胜
2015-05-09	负
2015-05-09	负
2015-05-10	胜
2015-05-10	负
2015-05-10	负

如果要生成下列结果, 该如何写sql语句?

DDate	胜	负
2015-05-09	2	2
2015-05-10	1	2

--建表
create table table1(DDate string, shengfu string) ;
insert overwrite table table1 values ('2015-05-09', "胜"),
       ('2015-05-09', "胜"),
       ('2015-05-09', "负"),
       ('2015-05-09', "负"),
       ('2015-05-10', "胜"),
       ('2015-05-10', "负"),
       ('2015-05-10', "负");

--注意：count 对应else必须为null,count是统计不为0的行数；sum 对应else为null和0都行，做累加
select DDate,
       count(case when shengfu = '胜' then 1 else null end) `胜`,
       count(case when shengfu = '负' then 1 else null end) `负`
from table1
group by DDate;

题三

假设tableA如表5, tableB如表6,

表5

qq号（字段名：qq）	游戏（字段名：game）
10000	a
10000	b
10000	c
20000	c
20000	d

表6

qq号（字段名：qq）	游戏（字段名：game）
10000	a_b_c
20000	c_d

请写出以下sql逻辑：

a, 将tableA输出为tableB的格式；【行转列】

b, 将tableB输出为tableA的格式; 【列转行】

create table tableA(qq string, game string) 
insert overwrite table tableA values 
       (10000, 'a'),
       (10000, 'b'),
       (10000, 'c'),
       (20000, 'c'),
       (20000, 'd');

create table tableB(qq string, game string) ;
insert overwrite table tableB values 
(10000, 'a_b_c'),
(20000, 'c_d');
       
--将tableA输出为tableB的格式；  
select qq,
       concat_ws('_', collect_list(game)) game
from tableA
group by qq;   
       
--将tableB输出为tableA的格式; 需要借助【侧视图+explode函数】   
select qq,
       tmp.game
from tableB lateral view explode(split(game, '_')) tmp as game;

连续N天登陆

思路分析过程

思路一

mysql计算人员排名_ci

--核心代码
->distinct
-> row_number 
-> date_sub(dt,rn) as dt2
-> group by dt2,name 
-> having count(1)>=N天 
-> distinct name 
-> count(name)

思路二

mysql计算人员排名_ci_02

--核心代码
->distinct
->date_add(dt,N-1) as date2
->lead(dt,N-1) over(partition by userid order by dt) as date3
->where date2=date3
->distinct

set hive.exec.mode.local.auto=true;
/*--OPPO*/
create table game(name string,`date` string);
insert overwrite table game  values
                       ('张三','2021-01-01'),
                       ('张三','2021-01-02'),
                       ('张三','2021-01-03'),
                       ('张三','2021-01-02'),

                       ('张三','2021-01-07'),
                       ('张三','2021-01-08'),
                       ('张三','2021-01-09'),

                       ('李四','2021-01-01'),
                       ('李四','2021-01-02'),
                       ('王五','2021-01-03'),
                       ('王五','2021-01-02'),
                       ('王五','2021-01-02');

select * from game;
--方案1
with t1 as (
    select distinct name,`date` from game
),
     t2 as (
         select *,
                row_number() over (partition by name order by `date`) as rn
         from t1
     ),
     t3 as (
         select *,
                date_sub(`date`,rn) as temp
         from t2
     ),
     t4 as (
         select name,temp ,
                count(1) as cnt
         from t3
         group by name,temp
         having count(1)>=3
     )
select distinct name from t4;

--方案2
with t1 as (
    select distinct name,`date` from game
),
    t2 as (
        select *,
               date_add(`date`,2) as date2,
               lead(`date`,2) over(partition by name order by `date`) as date3
          from t1
    )
select distinct name from t2 where date2=date3;

窗口函数

窗口函数的最大特点是有over()关键字,代表定义窗口

函数名（字段名）over(partition by xxx,yyy order by zzz)

聚合类的窗口函数
sum() count()/avg()/max()/min()
排序类的窗口函数
row_number，rank，dense_rank
偏移类的，跨行的
lag,lead
【了解】first_value和last_value
【了解】ntile

题一

create table score(cid int ,sname string,score int);
insert overwrite table score values
(1,'张三',60),
(1,'李四',70),
(1,'王五',80),
(1,'赵六',90),
(2,'安安',80),
(2,'平平',90);
select * from score;
-- 第一类、 聚合类的窗口函数
-- 计算同一个班级内，每个同学和比他分数低的总分是多少
	sum(score) over (partition by cid order by score )  as `累加分数1`,
-- 上句等价于
	sum(score) over (partition by cid order by score rows between unbounded preceding and current row ) as `累加分数2`,
--思考：如何计算每个同学，在班级内，包括他自己，和比他低一名的，和比他高1名的，这3个分数求和
	sum(score) over(partition by cid order by score rows between 1 preceding and 1 following) as `累加分数3`
    
-- 第二类、 排序类的窗口函数
-- 同一个班内，按分数排序打上序号
	row_number() over (partition by cid order by score)  as `分数序号排名`,
-- 考虑并列
	rank()       over (partition by cid order by score)  as `分数序号排名2`, 
	dense_rank() over (partition by cid order by score)  as `分数序号排名3`	并列后序号，自然顺序
	
-- 第三类、 偏移类的，跨行的
select *,
       -- 同一班内，考得比自己低1名的分数是多少
       lag(score, 1) over (partition by cid order by score)     as `低一名的分数`,
       -- 同一班内，考得比自己低1名的分数是多少,如果找不到，则显示为0
       lag(score, 1, 0) over (partition by cid order by score)  as `低一名的分数2`,
       -- 同一班内，考得比自己低2名的分数是多少
       lag(score, 2) over (partition by cid order by score)     as `低2名的分数`,
       -- 同一班内，考得比自己高1名的分数是多少
       lead(score, 1) over (partition by cid order by score)    as `高一名的分数`
from score;

题二

Emp表的表数据如下：

NAME	MONTH	AMT
张三	01	100
李四	02	120
王五	03	150
赵六	04	500
张三	05	400
李四	06	350
王五	07	180
赵六	08	400

问题：请写出可以得到以下的结果SQL

NAME	总金额	排名	占比
赵六	900	1	40.91%
张三	500	2	22.73%
李四	470	3	21.36%
王五	330	4	15.00%

create table emp(name string , month string, amt int);
insert overwrite table emp values ('张三', '01', 100),
                                  ('李四', '02', 120),
                                  ('王五', '03', 150),
                                  ('赵六', '04', 500),
                                  ('张三', '05', 400),
                                  ('李四', '06', 350),
                                  ('王五', '07', 180),
                                  ('赵六', '08', 400);
---
with t1 as (
    select name,
           sum(amt) as sum_amt
        from emp
        group by name
),
t2 as (
    select *,
    row_number() over (partition by null order by sum_amt desc) as rn, --排名
    sum_amt/sum(sum_amt) over () as rate
    from t1
)
select name, sum_amt, rn, concat(round(rate*100,2),'%') rate from t2

题三

题目描述：

在第一题员工表的基础上，统计每年入职总数以及截至本年累计入职总人数。

截至本年累计入职总人数=本年总入职人数 + 本年之前所有年的总入职人数之和

结果

mysql计算人员排名_mysql计算人员排名_03

select *,
       sum(cnt) over (order by year1) cnt2
from
(select year(hiredate) as year1,count(empno) as cnt from emp group by year(hiredate)) a;

题四

mysql计算人员排名_数据库_04

需求：根据商品大类Goods_type对商品金额price从小到大排序，前30％为低挡，30％～85％为中档，高于85％为高档打上标签。

select goods_type,
       goods_name,
       price,
       case when rate<0.3 then '低档'
            when rate>=0.3 and rate<0.85 then '中档'
            when rate>=0.85 then '高档'
       end    as type
from (
         select *,
    			-- 每个在组内的排名
                row_number() over (partition by goods_type order by price)
    			-- 每个组内有多少个
                    /count(1) over(partition by goods_type) as rate --比例
         from goods
     ) as t1
;

N日留存率

mysql计算人员排名_sql_05

核心代码

-> where 日期 in (首日,1天后,7天后)
-> group by 用户
-> count(if(日期=首日,1,null))  as cnt
   count(if(日期=1天后,1,null)) as cnt2
   count(if(日期=7天后,1,null)) as cnt8
->having cnt>0
->count(user_id) as 首日总数
  count(if(cnt2>0,1,null)) as 次日留存数
  count(if(cnt8>0,1,null)) as 7日留存数
->次日留存数/首日总数 as 次日留存率
  7日留存数/首日总数 as 7日留存率

**需求：**一条sql统计出以下指标（4.1号uv，4.1号在4.2号的留存uv，4.1号在4.8号的留存uv）;

uv就是不同的用户数

create table if not exists tb_cuid_1d
(
    cuid         string comment '用户的唯一标识',
    os           string comment '平台',
    soft_version string comment '版本',
    event_day    string comment '日期',
    visit_time    int comment '用户访问时间戳',
    duration     decimal comment '用户访问时长',
    ext          array<string> comment '扩展字段'
);
insert overwrite table tb_cuid_1d values
                                      (1,'android',1,'2020-04-01',1234567,100,`array`('')),
                                      (1,'android',1,'2020-04-02',1234567,100,`array`('')),
                                      (1,'android',1,'2020-04-08',1234567,100,`array`('')),
                                      (2,'android',1,'2020-04-01',1234567,100,`array`('')),
                                      (3,'android',1,'2020-04-02',1234567,100,`array`(''));

**方案1：**性能快，步骤稍微复杂

先按用户分组，得到每个用户的各相关日期的登录情况。

select '2020-04-01的留存情况' as type,
       count(cuid) as uv_4_1,
       count(case when cnt_4_2>0 then 1 else null end) as uv_4_2,
       count(case when cnt_4_8>0 then 1 else null end) as uv_4_8
from
(select cuid,
       count(case when event_day='2020-04-01' then 1 else null end) as cnt_4_1,
       count(case when event_day='2020-04-02' then 1 else null end) as cnt_4_2,
       count(case when event_day='2020-04-08' then 1 else null end) as cnt_4_8
from tb_cuid_1d
--提前过滤数据
where event_day in ('2020-04-01','2020-04-02','2020-04-08')
group by cuid
-- 2020-04-01必须登录，剔除掉2020-04-01没登录的
having cnt_4_1>0) as t;

效果如下

mysql计算人员排名_sql_06

再对上面的用户汇总

select count(cnt) as uv,
       count(if(cnt2!=0,1,null)) as uv2,	//4月2日留存数
       count(if(cnt8!=0,1,null)) as uv8，
       -- 留存率
       uv2/uv as rate_4_2，	//4月2日留存率
       uv8/uv as rate_4_8

mysql计算人员排名_mysql计算人员排名_07

最后再用【后续日期的留存数】除以【首日总数】，就是【留存率】

**方案2：**性能慢，步骤比较简单

select count(a.cuid) uv,
       count(b.cuid) uv2,
       count(c.cuid) uv7,
       count(b.cuid)/count(a.cuid) as rate_4_2,
       count(c.cuid)/count(a.cuid) as rate_4_8
from (select distinct event_day, cuid from tb_cuid_1d where event_day='首日') as a
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='次日') as b on a.cuid=b.cuid
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='7日后') as c on a.cuid=c.cuid;

分组内top前几

需求常见词：

【每组xxx内按yyy排序的前n个zzz】
【每组xxx内按yyy排序的第1个zzz】
【每组xxx内按yyy排序的最后1个zzz】
特点是yyy和zzz是不同的字段。
比如：班内按性别分组，组内按身高排序，的前3个学生姓名

公式：row_number() over(partition by 组名 order by yyy) as rn，再筛选rn<=N名

题一

员工表结构

mysql计算人员排名_sql_08

员工表数据

mysql计算人员排名_数据库_09

题目描述

求出每个部门工资最高的前三名员工，并计算这些员工的工资占所属部门总工资的百分比。

结果

mysql计算人员排名_mysql计算人员排名_10

create table emp(empno string ,ename string,hiredate string,sal int ,deptno string);
insert overwrite table emp values
('7521', 'WARD', '1981-2-22', 1250, 30),
('7566', 'JONES', '1981-4-2', 2975, 20),
('7876', 'ADAMS', '1987-7-13', 1100, 20),
('7369', 'SMITH', '1980-12-17', 800, 20),
('7934', 'MILLER', '1982-1-23', 1300, 10),
('7844', 'TURNER', '1981-9-8', 1500, 30),
('7782', 'CLARK', '1981-6-9', 2450, 10),
('7839', 'KING', '1981-11-17', 5000, 10),
('7902', 'FORD', '1981-12-3', 3000, 20),
('7499', 'ALLEN', '1981-2-20', 1600, 30),
('7654', 'MARTIN', '1981-9-28', 1250, 30),
('7900', 'JAMES', '1981-12-3', 950, 30),
('7788', 'SCOTT', '1987-7-13', 3000, 20),
('7698', 'BLAKE', '1981-5-1', 2850, 30);

--求出每个部门工资最高的前三名员工，并计算这些员工的工资占所属部门总工资的百分比。
select a.empno,
       a.sal,
       a.deptno,
       a.rn,
       a.sum_sal,
       round(a.sal/a.sum_sal,2) as rate
from
(select *,
--每个部门工资排名
         row_number() over (partition by deptno order by sal desc) as rn, 
--每个部门的总工资
         sum(sal) over(partition by deptno ) as sum_sal 
from emp) a
where rn<=3;

join系列

【区分 inner /left / right / full / left semi / left anti join 的特点】

有以下银行信息表

mysql计算人员排名_ci_11

银行黑名单信息表

mysql计算人员排名_mysql计算人员排名_12

left join

使用left join对所有用户，如果也在黑名单中，则标记为YES，否则标记为NO。

id	name	sex	age	flag
1	张三	男	20	YES
2	李四	男	29	YES
3	王五	男	21	YES
4	赵六	女	28	NO
5	田七	女	22	NO

select a.*,
       if(b.user_id is not null, 'YES', 'NO') flag
from all_users a
left join black_list b
on a.id = b.user_id;

right join

对上面的问题，使用right join再做一次。

id	name	sex	age	flag
1	张三	男	20	YES
2	李四	男	29	YES
3	王五	男	21	YES
4	赵六	女	28	NO
5	田七	女	22	NO

select b.*,
       if(a.user_id is not null, 'YES', 'NO') flag
from black_list a
right join all_users b
on a.user_id = b.id;

left semi join

使用left semi join对所有用户，如果也在黑名单中，则挑选出来。

id	name	sex	age
1	张三	男	20
2	李四	男	29
3	王五	男	21

-- left semi join结果只显示左表字段，不会显示右表字段
select *
from all_users a
left semi join black_list b on a.id = b.user_id;
-- 他跟left join的关系是:
select a.*
from all_users a
left join black_list b on a.id = b.user_id
where b.user_id is not null;
-- 也等价于
select a.*
from all_users a
join black_list b on a.id = b.user_id;

left anti join

使用left anti join对所有用户，如果不在黑名单中，则挑选出来。

id	name	sex	age
4	赵六	女	28
5	田七	女	22

select *
from all_users a
left anti join black_list b on a.id = b.user_id;
--他跟left join的关系是:
select a.*
from all_users a
left join black_list b on a.id = b.user_id
where b.user_id is null;

full join

用户银行存款信息表(user_id ‘用户id’,amount ‘存款金额’)

mysql计算人员排名_ci_13

用户银行负债信息表(user_id ‘用户id’,amount ‘负债金额’)

mysql计算人员排名_ci_14

使用full join，展示用户的存款金额和负债金额。

user_id	deposit_amount	debt_amount
1	2000	0
2	2900	0
3	2100	3400
4	0	2800
5	0	2200

select coalesce(a.user_id,b.user_id) as user_id_new,
       coalesce(a.amount,0) as deposit_amount,
       coalesce(b.amount,0) as debt_amount
from deposit a
full join debt b on a.user_id=b.user_id;

-- coalesce返回第一个不为null的值
select coalesce(123,456) as x;	123
select coalesce(null,456) as x;	456
select nvl(123,0) as x;	123
select coalesce(null,null,456) as x; 456

题一：

需求：某个游戏中的元宝分为，付费元宝和免费元宝。玩家购买商城道具时候，可以使用付费元宝也可以使用免费元宝。请使用HIve SQL语句计算出2021-01-01至2021-01-07期间各个角色当日消耗元宝的付费免费比例（付费免费比 = 付费元宝消耗量 / 免费元宝消耗量）。

现有表结构如下：

desc	dm_paid_buy;
#dm_paid_buy	角色使用付费元宝购买商城道具时候记录一条

time	bigint	#购买的时间戳
server_id	string	#服务器ID
role_id	int	#角色ID
cost	int	#购买对应道具消耗的付费元宝数量
item_id	int	#购买对应道具的id
amount	int	#购买对应道具的数量
p_date	string	#登录日期,yyyy-MM-dd

desc	dm_free_buy;
#dm_free_buy	角色使用免费元宝购买商城道具时候记录一条

time	bigint	#购买的时间戳
server_id	string	#服务器ID
role_id	int	#角色ID
cost	int	#购买对应道具消耗的免费元宝数量
item_id	int	#购买对应道具的id
amount	int	#购买对应道具的数量
p_date	string	#登录日期,yyyy-MM-dd

结果输出

p_date	server_id	role_id	付费免费比
2021-01-01	123	10098	0
2021-01-01	120	10098	0.4
2021-01-02	123	10098	0.2

use interview_db;
set hive.exec.mode.local.auto=true;

create table if not exists dm_paid_buy
(
    `time`    bigint comment '#购买的时间戳',
    server_id string comment '#服务器ID',
    role_id   int comment '#角色ID',
    cost      int comment '#购买对应道具消耗的付费元宝数量',
    item_id   int comment '#购买对应道具的id',
    amount    int comment '#购买对应道具的数量',
    p_date    string comment '#登录日期, yyyy-MM-dd'
) comment '角色使用付费元宝购买商城道具时候记录一条';
insert overwrite table dm_paid_buy values
(1234567,120,10098,2,3,4,'2021-01-01'),
(1234567,120,10098,4,3,5,'2021-01-01'),
(1234567,123,10098,3,3,2,'2021-01-02'),
(1234567,123,10098,2,3,2,'2021-01-02');

---
create table if not exists dm_free_buy
(
    `time`    bigint comment '#购买的时间戳',
    server_id string comment '#服务器ID',
    role_id   int comment '#角色ID',
    cost      int comment '#购买对应道具消耗的免费元宝数量',
    item_id   int comment '#购买对应道具的id',
    amount    int comment '#购买对应道具的数量',
    p_date    string comment '#登录日期, yyyy-MM-dd'
) comment '角色使用免费元宝购买商城道具时候记录一条';
insert overwrite table dm_free_buy values
(1234567,123,10098,8,3,4,'2021-01-01'),
(1234567,123,10098,5,3,5,'2021-01-01'),
(1234567,120,10098,6,3,4,'2021-01-01'),
(1234567,120,10098,9,3,5,'2021-01-01'),
(1234567,123,10098,18,3,2,'2021-01-02'),
(1234567,123,10098,7,3,2,'2021-01-02');

select coalesce(a.p_date, b.p_date)         p_date,
       coalesce(a.server_id, b.server_id)   server_id,
       coalesce(a.role_id, b.role_id)       role_id,
       round(nvl(a.cost, 0) / b.cost, 2) as rate
from (select p_date, server_id, role_id, sum(cost) cost
      from dm_paid_buy
      where p_date between '2021-01-01' and '2021-01-07'
      group by p_date, server_id, role_id) a
full join (select p_date, server_id, role_id, sum(cost) cost
      from dm_free_buy
      wherep_date between '2021-01-01' and '2021-01-07'
      group by p_date, server_id, role_id) b
    on a.p_date=b.p_date and a.server_id=b.server_id and a.role_id=b.role_id;
    
-- 使用with as 来重构上面的逻辑
with a as (
    select p_date,server_id,role_id,
           sum(cost) as cost
    from dm_paid_buy
    where p_date between '2021-01-01' and '2021-01-07'
    group by p_date,server_id,role_id
),
    b as(
        select p_date,server_id,role_id,
               sum(cost) as cost
        from dm_free_buy
        where p_date between '2021-01-01' and '2021-01-07'
        group by p_date,server_id,role_id
    )
select coalesce(a.p_date,b.p_date) as p_date,
       coalesce(a.server_id,b.server_id) as server_id,
       coalesce(a.role_id,b.role_id) as role_id,
       coalesce(a.cost,0) as a_cost,
       coalesce(b.cost,0) as b_cost,
       coalesce(a.cost,0)/b.cost as rate
from  a
full join b
on a.p_date=b.p_date and a.server_id=b.server_id and a.role_id=b.role_id;

join优化

join前请三思，能否先减小数据量，再join

通用口诀：先用where或group by减少数据量，再join

-- 优化前,慢:
select 
  b.userid, 
  b.username, 
  sum(xx), 
  count(xx) 
from a【比如交易明细事实表】 
join b【比如客户维度信息表】
on a.userid = b.userid 
where b.yy < 条件 
group by b.userid, b.username 
having count (a.xx) > 10;

-- 优化后，快：
select b.userid,
	   b.username, 
       s,
       cnt 
      --【提前对a减少数据量】
from (select userid,
      		 sum(xx) s,
      	     count(xx) cnt
		from a group by userid having count(a.xx) >10) a 
	  -- 【提前对b减少数据量】
join (select * from b where b.yy<条件) b
on a.userid = b.userid;

题一

表T1的数据结构：

字段英文名	字段中文名	类型	主键标志	注释
Rec_no	记录号	CHAR(3)	Y
Ci_no	客户号	CHAR(6)	N
Cust_Type	客户类型	CHAR(2)	N
Cre_dt	开户日期	Date	N
Cus_sts	客户状态	Char(1)	N	Y-正常 N-无效

表T1的数据

Rec_no	ci_no	cust_type	cre_dt	cus_sts
123	111111	01	2010-11-15	Y
234	222222	02	2011-09-01	Y
345	333333	02	2012-01-09	Y
456	444444	01	2012-09-08	Y

表T2的数据结构：

字段英文名	字段中文名	类型	主键标志	注释
Ci_no	客户号	CHAR(6)	Y
AC_no	客户账号	CHAR(9)	Y
Bal	账号余额	DECIMAL(15，2)	N

表T2的数据：

Ci_no char(6)	Ac_no char(9)	Bal decimal(15,2)
222222	123456888	1000.28
222222	123456999	886
333333	123454321	5000

请编写sql统计在9月份开户且账户余额不为0的有效客户数。（8分）

-- 传统的写法。
select count(distinct t1.ci_no) as cnt
from t1
join t2 on t1.ci_no=t2.ci_no
where month(t1.cre_dt)=9
and t1.cus_sts='Y'
and bal>0;

-- 方案2
select count(t1.ci_no) as cnt
	-- 9月份开户，有效客户
from (select * from t1 where month(cre_dt)=9 and cus_sts='Y') t1
	-- 账户余额不为0
join (select ci_no from t2 group by ci_no having sum(bal)>0) t2
on t1.ci_no=t2.ci_no;

带条件的聚合统计

一般的做法是group by xx,yy 再多次的sum(if(…))
好处是避免多次加载表，一次性得到多个指标，可以只加载一次表就得到多个指标。

题一

查询课程编号“2”的成绩比课程编号“1”低的所有同学的学号、姓名。【这是行转列的衍生题】

create table student(sid int, sname string, gender string, class_id int);
insert overwrite table student
values (1, '张三', '女', 1),
       (2, '李四', '女', 1),
       (3, '王五', '男', 2);

create table  course (cid int, cname string, teacher_id int);
insert overwrite table course
values (1, '生物', 1),
       (2, '体育', 1),
       (3, '物理', 2);

create table score (sid int, student_id int, course_id int, number int);
insert overwrite table score
values (1, 1, 1, 58),
       (4, 1, 2, 50),
       (2, 1, 2, 68),
       (3, 2, 2, 89);

-- 优化sql
select sid, sname
from 
(select student_id,
       sum(if(course_id=2,number,0)) as pe, --体育
       sum(if(course_id=1,number,0)) as bio --生物
from score
group by student_id
having pe<bio) as t
join student on t1.student_id = sid;

问题二

问题：用一条SQL生成完整的用户画像表，包含如下字段：

user_id, sex, age, d7order_num, d14_order_num，后面两个字段分别为近7天订单数量，近14天订单数量。

create table sale_order(
    order_id bigint comment '订单ID',
    user_id bigint comment '用户ID',
    order_status int ,
    create_time string,
    last_update_time string,
    product_id bigint,
    product_num bigint
);
create table user_info(
    user_id bigint comment '用户ID,唯一主键',
    sex string,
    age int
);

-- 优化
select u.user_id,
	   u.sex,
       u.age,
       s.d7order_num,
       s.d14order_num
from user_info u
left join (select user_id,
           		  -- 近7天订单数量
                  count(distinct if(to_date(create_time) between data_sub(`current_date()`,6) and `current_date()`,order_id,null)) as d7order_num,
           		  -- 近14天订单数量
                  count(distinct if(to_date(create_time) between data_sub(`current_date()`,13) and `current_date()`,order_id,null)) as d14order_num
           from sale_order
           where create_time >= data_sub(`current_date()`,13)
           group by user_id) s 
on u.user_id = s.user_id;

mysql

列传行

行转列的做法是，substring_index（截取字符串的函数） + help_topic_id（截取字符串的函数） + help_topic_id（mysql自带帮助主题表）

已知：

user01

mysql计算人员排名_数据库_15

role01

mysql计算人员排名_数据库_16

查询：

mysql计算人员排名_hive_17

解答：

SELECT t.user_name, group_concat(role_name SEPARATOR '|') AS role_names
from (SELECT user_name, SUBSTRING_INDEX(SUBSTRING_INDEX(role_ids, '|', help_topic_id + 1), '|', -1) AS role_id
      FROM user01
               JOIN mysql.help_topic
                    on help_topic_id < (LENGTH(role_ids) - LENGTH(REPLACE(role_ids, '|', '')) + 1)) t
         JOIN role01 on t.role_id = role01.role_id
GROUP BY t.user_name;

-- 注：
-- (LENGTH(role_ids) - LENGTH(REPLACE(role_ids, '|', '')) +1 ) 为字符串分割后的长度
-- 即：分隔前长度-去掉分隔符长度+1