SQL强化

文章目录

  • SQL强化
  • hive
  • SQL执行顺序
  • 行转列(转置)
  • 连续N天登陆
  • 窗口函数
  • N日留存率
  • 分组内top前几
  • join系列
  • join优化
  • 带条件的聚合统计
  • mysql
  • 列传行
  • postgres
  • pg 窗口函数row_number()

hive

SQL执行顺序

--举例:
select 
       a.sex,
       b.city,
       count(1) as cnt,
       sum(salary) as sum1
from table1 a
join table2 b on a.id=b.id
where a.name=b.name
group by a.sex,b.city
having cnt>=2
order by a.sex,b.city
limit 10

--或者是
select distinct
       a.sex,
       b.city,
       a.age
from table1 a
join table2 b on a.id=b.id
where a.name=b.name
order by a.sex,b.city
limit 10

上面的SQL语句的执行顺序是: from (去加载table1 和 table2这2个表 ) -> join -> on -> where -> group by->select 后面的聚合函数count,sum -> having -> distinct -> order by -> limit

行转列(转置)

  • 行转列的常规做法是,group by+sum(if())【或count(if())】

题一

已知

year

month

amount

1991

1

1.1

1991

2

1.2

1991

3

1.3

1991

4

1.4

1992

1

2.1

1992

2

2.2

1992

3

2.3

1992

4

2.4

查成这样一个结果

year

m1

m2

m3

m4

1991

1.1

1.2

1.3

1.4

1992

2.1

2.2

2.3

2.4

解答

use test_sql;
set hive.exec.mode.local.auto=true;		//开启本地执行,加速执行
create table table2(year int,month int ,amount double) ;
insert overwrite table table2 values
           (1991,1,1.1),
           (1991,2,1.2),
           (1991,3,1.3),
           (1991,4,1.4),
           (1992,1,2.1),
           (1992,2,2.2),
           (1992,3,2.3),
           (1992,4,2.4);
select * from table2;


--行转列
--常规做法是,group by+sum(if())
--扩展,当case when里面的分支只有两个,并且是互斥,则等价于if语句,也就是【case when month=1 then amount else 0 end】 等价于 【if(month=1,amount,0)】
select year,
       sum(if(month=1,amount,0)) m1,
       sum(if(month=2,amount,0)) m2,
       sum(if(month=3,amount,0)) m3,
       sum(if(month=4,amount,0)) m4
from table2
group by year;
或者
select year,
	   sum(case when month=1 then amount else 0 end) as m1,
	   sum(case when month=2 then amount else 0 end) as m2,
	   sum(case when month=3 then amount else 0 end) as m3,
	   sum(case when month=4 then amount else 0 end) as m4
from table2 as t
group by year;

题二

表table如下:

DDate

shengfu

2015-05-09

2015-05-09

2015-05-09

2015-05-09

2015-05-10

2015-05-10

2015-05-10

如果要生成下列结果, 该如何写sql语句?

DDate

2015-05-09

2

2

2015-05-10

1

2

--建表
create table table1(DDate string, shengfu string) ;
insert overwrite table table1 values ('2015-05-09', "胜"),
       ('2015-05-09', "胜"),
       ('2015-05-09', "负"),
       ('2015-05-09', "负"),
       ('2015-05-10', "胜"),
       ('2015-05-10', "负"),
       ('2015-05-10', "负");

--注意:count 对应else必须为null,count是统计不为0的行数;sum 对应else为null和0都行,做累加
select DDate,
       count(case when shengfu = '胜' then 1 else null end) `胜`,
       count(case when shengfu = '负' then 1 else null end) `负`
from table1
group by DDate;

题三

假设tableA如表5, tableB如表6,

表5

qq号(字段名:qq)

游戏(字段名:game)

10000

a

10000

b

10000

c

20000

c

20000

d

表6

qq号(字段名:qq)

游戏(字段名:game)

10000

a_b_c

20000

c_d

请写出以下sql逻辑:

a, 将tableA输出为tableB的格式; 【行转列】

b, 将tableB输出为tableA的格式; 【列转行】

create table tableA(qq string, game string) 
insert overwrite table tableA values 
       (10000, 'a'),
       (10000, 'b'),
       (10000, 'c'),
       (20000, 'c'),
       (20000, 'd');

create table tableB(qq string, game string) ;
insert overwrite table tableB values 
(10000, 'a_b_c'),
(20000, 'c_d');
       
--将tableA输出为tableB的格式;  
select qq,
       concat_ws('_', collect_list(game)) game
from tableA
group by qq;   
       
--将tableB输出为tableA的格式; 需要借助【侧视图+explode函数】   
select qq,
       tmp.game
from tableB lateral view explode(split(game, '_')) tmp as game;

连续N天登陆

思路分析过程

  • 思路一

mysql计算人员排名_ci

--核心代码
->distinct
-> row_number 
-> date_sub(dt,rn) as dt2
-> group by dt2,name 
-> having count(1)>=N天 
-> distinct name 
-> count(name)
  • 思路二

mysql计算人员排名_ci_02

--核心代码
->distinct
->date_add(dt,N-1) as date2
->lead(dt,N-1) over(partition by userid order by dt) as date3
->where date2=date3
->distinct
set hive.exec.mode.local.auto=true;
/*--OPPO*/
create table game(name string,`date` string);
insert overwrite table game  values
                       ('张三','2021-01-01'),
                       ('张三','2021-01-02'),
                       ('张三','2021-01-03'),
                       ('张三','2021-01-02'),

                       ('张三','2021-01-07'),
                       ('张三','2021-01-08'),
                       ('张三','2021-01-09'),

                       ('李四','2021-01-01'),
                       ('李四','2021-01-02'),
                       ('王五','2021-01-03'),
                       ('王五','2021-01-02'),
                       ('王五','2021-01-02');

select * from game;
--方案1
with t1 as (
    select distinct name,`date` from game
),
     t2 as (
         select *,
                row_number() over (partition by name order by `date`) as rn
         from t1
     ),
     t3 as (
         select *,
                date_sub(`date`,rn) as temp
         from t2
     ),
     t4 as (
         select name,temp ,
                count(1) as cnt
         from t3
         group by name,temp
         having count(1)>=3
     )
select distinct name from t4;

--方案2
with t1 as (
    select distinct name,`date` from game
),
    t2 as (
        select *,
               date_add(`date`,2) as date2,
               lead(`date`,2) over(partition by name order by `date`) as date3
          from t1
    )
select distinct name from t2 where date2=date3;

窗口函数

窗口函数的最大特点是有over()关键字,代表定义窗口

函数名(字段名)over(partition by xxx,yyy order by zzz)

  • 聚合类的窗口函数
    sum() count()/avg()/max()/min()
  • 排序类的窗口函数
    row_number,rank,dense_rank
  • 偏移类的,跨行的
    lag,lead
  • 【了解】first_value和last_value
  • 【了解】ntile

题一

create table score(cid int ,sname string,score int);
insert overwrite table score values
(1,'张三',60),
(1,'李四',70),
(1,'王五',80),
(1,'赵六',90),
(2,'安安',80),
(2,'平平',90);
select * from score;
-- 第一类、 聚合类的窗口函数
-- 计算同一个班级内,每个同学和比他分数低的总分是多少
	sum(score) over (partition by cid order by score )  as `累加分数1`,
-- 上句等价于
	sum(score) over (partition by cid order by score rows between unbounded preceding and current row ) as `累加分数2`,
--思考:如何计算每个同学,在班级内,包括他自己,和比他低一名的,和比他高1名的,这3个分数求和
	sum(score) over(partition by cid order by score rows between 1 preceding and 1 following) as `累加分数3`
    
-- 第二类、 排序类的窗口函数
-- 同一个班内,按分数排序打上序号
	row_number() over (partition by cid order by score)  as `分数序号排名`,
-- 考虑并列
	rank()       over (partition by cid order by score)  as `分数序号排名2`, 
	dense_rank() over (partition by cid order by score)  as `分数序号排名3`	并列后序号,自然顺序
	
-- 第三类、 偏移类的,跨行的
select *,
       -- 同一班内,考得比自己低1名的分数是多少
       lag(score, 1) over (partition by cid order by score)     as `低一名的分数`,
       -- 同一班内,考得比自己低1名的分数是多少,如果找不到,则显示为0
       lag(score, 1, 0) over (partition by cid order by score)  as `低一名的分数2`,
       -- 同一班内,考得比自己低2名的分数是多少
       lag(score, 2) over (partition by cid order by score)     as `低2名的分数`,
       -- 同一班内,考得比自己高1名的分数是多少
       lead(score, 1) over (partition by cid order by score)    as `高一名的分数`
from score;

题二

Emp表的表数据如下:

NAME

MONTH

AMT

张三

01

100

李四

02

120

王五

03

150

赵六

04

500

张三

05

400

李四

06

350

王五

07

180

赵六

08

400

问题:请写出可以得到以下的结果SQL

NAME

总金额

排名

占比

赵六

900

1

40.91%

张三

500

2

22.73%

李四

470

3

21.36%

王五

330

4

15.00%

create table emp(name string , month string, amt int);
insert overwrite table emp values ('张三', '01', 100),
                                  ('李四', '02', 120),
                                  ('王五', '03', 150),
                                  ('赵六', '04', 500),
                                  ('张三', '05', 400),
                                  ('李四', '06', 350),
                                  ('王五', '07', 180),
                                  ('赵六', '08', 400);
---
with t1 as (
    select name,
           sum(amt) as sum_amt
        from emp
        group by name
),
t2 as (
    select *,
    row_number() over (partition by null order by sum_amt desc) as rn, --排名
    sum_amt/sum(sum_amt) over () as rate
    from t1
)
select name, sum_amt, rn, concat(round(rate*100,2),'%') rate from t2

题三

题目描述:

在第一题员工表的基础上,统计每年入职总数以及截至本年累计入职总人数。

截至本年累计入职总人数=本年总入职人数 + 本年之前所有年的总入职人数之和

结果

mysql计算人员排名_mysql计算人员排名_03

select *,
       sum(cnt) over (order by year1) cnt2
from
(select year(hiredate) as year1,count(empno) as cnt from emp group by year(hiredate)) a;

题四

mysql计算人员排名_数据库_04

需求:根据商品大类Goods_type对商品金额price从小到大排序,前30%为低挡,30%~85%为中档,高于85%为高档打上标签。

select goods_type,
       goods_name,
       price,
       case when rate<0.3 then '低档'
            when rate>=0.3 and rate<0.85 then '中档'
            when rate>=0.85 then '高档'
       end    as type
from (
         select *,
    			-- 每个在组内的排名
                row_number() over (partition by goods_type order by price)
    			-- 每个组内有多少个
                    /count(1) over(partition by goods_type) as rate --比例
         from goods
     ) as t1
;

N日留存率

mysql计算人员排名_sql_05

  • 核心代码
-> where 日期 in (首日,1天后,7天后)
-> group by 用户
-> count(if(日期=首日,1,null))  as cnt
   count(if(日期=1天后,1,null)) as cnt2
   count(if(日期=7天后,1,null)) as cnt8
->having cnt>0
->count(user_id) as 首日总数
  count(if(cnt2>0,1,null)) as 次日留存数
  count(if(cnt8>0,1,null)) as 7日留存数
->次日留存数/首日总数 as 次日留存率
  7日留存数/首日总数 as 7日留存率

**需求:**一条sql统计出以下指标 (4.1号uv,4.1号在4.2号的留存uv,4.1号在4.8号的留存uv);

uv就是不同的用户数

create table if not exists tb_cuid_1d
(
    cuid         string comment '用户的唯一标识',
    os           string comment '平台',
    soft_version string comment '版本',
    event_day    string comment '日期',
    visit_time    int comment '用户访问时间戳',
    duration     decimal comment '用户访问时长',
    ext          array<string> comment '扩展字段'
);
insert overwrite table tb_cuid_1d values
                                      (1,'android',1,'2020-04-01',1234567,100,`array`('')),
                                      (1,'android',1,'2020-04-02',1234567,100,`array`('')),
                                      (1,'android',1,'2020-04-08',1234567,100,`array`('')),
                                      (2,'android',1,'2020-04-01',1234567,100,`array`('')),
                                      (3,'android',1,'2020-04-02',1234567,100,`array`(''));

**方案1:**性能快,步骤稍微复杂

  • 先按用户分组,得到每个用户的各相关日期的登录情况。
select '2020-04-01的留存情况' as type,
       count(cuid) as uv_4_1,
       count(case when cnt_4_2>0 then 1 else null end) as uv_4_2,
       count(case when cnt_4_8>0 then 1 else null end) as uv_4_8
from
(select cuid,
       count(case when event_day='2020-04-01' then 1 else null end) as cnt_4_1,
       count(case when event_day='2020-04-02' then 1 else null end) as cnt_4_2,
       count(case when event_day='2020-04-08' then 1 else null end) as cnt_4_8
from tb_cuid_1d
--提前过滤数据
where event_day in ('2020-04-01','2020-04-02','2020-04-08')
group by cuid
-- 2020-04-01必须登录,剔除掉2020-04-01没登录的
having cnt_4_1>0) as t;

效果如下

mysql计算人员排名_sql_06

  • 再对上面的用户汇总
select count(cnt) as uv,
       count(if(cnt2!=0,1,null)) as uv2,	//4月2日留存数
       count(if(cnt8!=0,1,null)) as uv8,
       -- 留存率
       uv2/uv as rate_4_2,	//4月2日留存率
       uv8/uv as rate_4_8

mysql计算人员排名_mysql计算人员排名_07

  • 最后再用 【后续日期的留存数】除以【首日总数】,就是【留存率】

**方案2:**性能慢,步骤比较简单

select count(a.cuid) uv,
       count(b.cuid) uv2,
       count(c.cuid) uv7,
       count(b.cuid)/count(a.cuid) as rate_4_2,
       count(c.cuid)/count(a.cuid) as rate_4_8
from (select distinct event_day, cuid from tb_cuid_1d where event_day='首日') as a
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='次日') as b on a.cuid=b.cuid
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='7日后') as c on a.cuid=c.cuid;

分组内top前几

  • 需求常见词:
  • 【每组xxx内按yyy排序的前n个zzz】
  • 【每组xxx内按yyy排序的第1个zzz】
  • 【每组xxx内按yyy排序的最后1个zzz】
  • 特点是yyy和zzz是不同的字段。
  • 比如:班内按性别分组,组内按身高排序,的前3个学生姓名
  • 公式:row_number() over(partition by 组名 order by yyy) as rn,再筛选rn<=N名

题一

员工表结构

mysql计算人员排名_sql_08

员工表数据

mysql计算人员排名_数据库_09

题目描述

求出每个部门工资最高的前三名员工,并计算这些员工的工资占所属部门总工资的百分比。

结果

mysql计算人员排名_mysql计算人员排名_10

create table emp(empno string ,ename string,hiredate string,sal int ,deptno string);
insert overwrite table emp values
('7521', 'WARD', '1981-2-22', 1250, 30),
('7566', 'JONES', '1981-4-2', 2975, 20),
('7876', 'ADAMS', '1987-7-13', 1100, 20),
('7369', 'SMITH', '1980-12-17', 800, 20),
('7934', 'MILLER', '1982-1-23', 1300, 10),
('7844', 'TURNER', '1981-9-8', 1500, 30),
('7782', 'CLARK', '1981-6-9', 2450, 10),
('7839', 'KING', '1981-11-17', 5000, 10),
('7902', 'FORD', '1981-12-3', 3000, 20),
('7499', 'ALLEN', '1981-2-20', 1600, 30),
('7654', 'MARTIN', '1981-9-28', 1250, 30),
('7900', 'JAMES', '1981-12-3', 950, 30),
('7788', 'SCOTT', '1987-7-13', 3000, 20),
('7698', 'BLAKE', '1981-5-1', 2850, 30);

--求出每个部门工资最高的前三名员工,并计算这些员工的工资占所属部门总工资的百分比。
select a.empno,
       a.sal,
       a.deptno,
       a.rn,
       a.sum_sal,
       round(a.sal/a.sum_sal,2) as rate
from
(select *,
--每个部门工资排名
         row_number() over (partition by deptno order by sal desc) as rn, 
--每个部门的总工资
         sum(sal) over(partition by deptno ) as sum_sal 
from emp) a
where rn<=3;

join系列

【区分 inner /left / right / full / left semi / left anti join 的特点】

有以下银行信息表

mysql计算人员排名_ci_11

银行黑名单信息表

mysql计算人员排名_mysql计算人员排名_12

left join

使用left join对所有用户,如果也在黑名单中,则标记为YES,否则标记为NO。

id

name

sex

age

flag

1

张三

20

YES

2

李四

29

YES

3

王五

21

YES

4

赵六

28

NO

5

田七

22

NO

select a.*,
       if(b.user_id is not null, 'YES', 'NO') flag
from all_users a
left join black_list b
on a.id = b.user_id;

right join

对上面的问题,使用right join再做一次。

id

name

sex

age

flag

1

张三

20

YES

2

李四

29

YES

3

王五

21

YES

4

赵六

28

NO

5

田七

22

NO

select b.*,
       if(a.user_id is not null, 'YES', 'NO') flag
from black_list a
right join all_users b
on a.user_id = b.id;

left semi join

使用left semi join对所有用户,如果也在黑名单中,则挑选出来。

id

name

sex

age

1

张三

20

2

李四

29

3

王五

21

-- left semi join结果只显示左表字段,不会显示右表字段
select *
from all_users a
left semi join black_list b on a.id = b.user_id;
-- 他跟left join的关系是:
select a.*
from all_users a
left join black_list b on a.id = b.user_id
where b.user_id is not null;
-- 也等价于
select a.*
from all_users a
join black_list b on a.id = b.user_id;

left anti join

使用left anti join对所有用户,如果不在黑名单中,则挑选出来。

id

name

sex

age

4

赵六

28

5

田七

22

select *
from all_users a
left anti join black_list b on a.id = b.user_id;
--他跟left join的关系是:
select a.*
from all_users a
left join black_list b on a.id = b.user_id
where b.user_id is null;

full join

用户银行存款信息表(user_id ‘用户id’,amount ‘存款金额’)

mysql计算人员排名_ci_13

用户银行负债信息表(user_id ‘用户id’,amount ‘负债金额’)

mysql计算人员排名_ci_14

使用full join,展示用户的存款金额和负债金额。

user_id

deposit_amount

debt_amount

1

2000

0

2

2900

0

3

2100

3400

4

0

2800

5

0

2200

select coalesce(a.user_id,b.user_id) as user_id_new,
       coalesce(a.amount,0) as deposit_amount,
       coalesce(b.amount,0) as debt_amount
from deposit a
full join debt b on a.user_id=b.user_id;

-- coalesce返回第一个不为null的值
select coalesce(123,456) as x;	123
select coalesce(null,456) as x;	456
select nvl(123,0) as x;	123
select coalesce(null,null,456) as x; 456

题一:

需求:某个游戏中的元宝分为,付费元宝和免费元宝。玩家购买商城道具时候,可以使用付费元宝也可以使用免费元宝。请使用HIve SQL语句计算出2021-01-01至2021-01-07期间各个角色当日消耗元宝的付费免费比例(付费免费比 = 付费元宝消耗量 / 免费元宝消耗量)。

现有表结构如下:

desc

dm_paid_buy;

#dm_paid_buy

角色使用付费元宝购买商城道具时候记录一条

time

bigint

#购买的时间戳

server_id

string

#服务器ID

role_id

int

#角色ID

cost

int

#购买对应道具消耗的付费元宝数量

item_id

int

#购买对应道具的id

amount

int

#购买对应道具的数量

p_date

string

#登录日期,yyyy-MM-dd

desc

dm_free_buy;

#dm_free_buy

角色使用免费元宝购买商城道具时候记录一条

time

bigint

#购买的时间戳

server_id

string

#服务器ID

role_id

int

#角色ID

cost

int

#购买对应道具消耗的免费元宝数量

item_id

int

#购买对应道具的id

amount

int

#购买对应道具的数量

p_date

string

#登录日期,yyyy-MM-dd

结果输出

p_date

server_id

role_id

付费免费比

2021-01-01

123

10098

0

2021-01-01

120

10098

0.4

2021-01-02

123

10098

0.2

use interview_db;
set hive.exec.mode.local.auto=true;

create table if not exists dm_paid_buy
(
    `time`    bigint comment '#购买的时间戳',
    server_id string comment '#服务器ID',
    role_id   int comment '#角色ID',
    cost      int comment '#购买对应道具消耗的付费元宝数量',
    item_id   int comment '#购买对应道具的id',
    amount    int comment '#购买对应道具的数量',
    p_date    string comment '#登录日期, yyyy-MM-dd'
) comment '角色使用付费元宝购买商城道具时候记录一条';
insert overwrite table dm_paid_buy values
(1234567,120,10098,2,3,4,'2021-01-01'),
(1234567,120,10098,4,3,5,'2021-01-01'),
(1234567,123,10098,3,3,2,'2021-01-02'),
(1234567,123,10098,2,3,2,'2021-01-02');

---
create table if not exists dm_free_buy
(
    `time`    bigint comment '#购买的时间戳',
    server_id string comment '#服务器ID',
    role_id   int comment '#角色ID',
    cost      int comment '#购买对应道具消耗的免费元宝数量',
    item_id   int comment '#购买对应道具的id',
    amount    int comment '#购买对应道具的数量',
    p_date    string comment '#登录日期, yyyy-MM-dd'
) comment '角色使用免费元宝购买商城道具时候记录一条';
insert overwrite table dm_free_buy values
(1234567,123,10098,8,3,4,'2021-01-01'),
(1234567,123,10098,5,3,5,'2021-01-01'),
(1234567,120,10098,6,3,4,'2021-01-01'),
(1234567,120,10098,9,3,5,'2021-01-01'),
(1234567,123,10098,18,3,2,'2021-01-02'),
(1234567,123,10098,7,3,2,'2021-01-02');

select coalesce(a.p_date, b.p_date)         p_date,
       coalesce(a.server_id, b.server_id)   server_id,
       coalesce(a.role_id, b.role_id)       role_id,
       round(nvl(a.cost, 0) / b.cost, 2) as rate
from (select p_date, server_id, role_id, sum(cost) cost
      from dm_paid_buy
      where p_date between '2021-01-01' and '2021-01-07'
      group by p_date, server_id, role_id) a
full join (select p_date, server_id, role_id, sum(cost) cost
      from dm_free_buy
      wherep_date between '2021-01-01' and '2021-01-07'
      group by p_date, server_id, role_id) b
    on a.p_date=b.p_date and a.server_id=b.server_id and a.role_id=b.role_id;
    
-- 使用with as 来重构上面的逻辑
with a as (
    select p_date,server_id,role_id,
           sum(cost) as cost
    from dm_paid_buy
    where p_date between '2021-01-01' and '2021-01-07'
    group by p_date,server_id,role_id
),
    b as(
        select p_date,server_id,role_id,
               sum(cost) as cost
        from dm_free_buy
        where p_date between '2021-01-01' and '2021-01-07'
        group by p_date,server_id,role_id
    )
select coalesce(a.p_date,b.p_date) as p_date,
       coalesce(a.server_id,b.server_id) as server_id,
       coalesce(a.role_id,b.role_id) as role_id,
       coalesce(a.cost,0) as a_cost,
       coalesce(b.cost,0) as b_cost,
       coalesce(a.cost,0)/b.cost as rate
from  a
full join b
on a.p_date=b.p_date and a.server_id=b.server_id and a.role_id=b.role_id;

join优化

join前请三思,能否先减小数据量,再join

通用口诀:先用where或group by减少数据量,再join

-- 优化前,慢:
select 
  b.userid, 
  b.username, 
  sum(xx), 
  count(xx) 
from a【比如交易明细事实表】 
join b【比如客户维度信息表】
on a.userid = b.userid 
where b.yy < 条件 
group by b.userid, b.username 
having count (a.xx) > 10;

-- 优化后,快:
select b.userid,
	   b.username, 
       s,
       cnt 
      --【提前对a减少数据量】
from (select userid,
      		 sum(xx) s,
      	     count(xx) cnt
		from a group by userid having count(a.xx) >10) a 
	  -- 【提前对b减少数据量】
join (select * from b where b.yy<条件) b
on a.userid = b.userid;

题一

表T1的数据结构:

字段英文名

字段中文名

类型

主键标志

注释

Rec_no

记录号

CHAR(3)

Y

Ci_no

客户号

CHAR(6)

N

Cust_Type

客户类型

CHAR(2)

N

Cre_dt

开户日期

Date

N

Cus_sts

客户状态

Char(1)

N

Y-正常 N-无效

表T1的数据

Rec_no

ci_no

cust_type

cre_dt

cus_sts

123

111111

01

2010-11-15

Y

234

222222

02

2011-09-01

Y

345

333333

02

2012-01-09

Y

456

444444

01

2012-09-08

Y

表T2的数据结构:

字段英文名

字段中文名

类型

主键标志

注释

Ci_no

客户号

CHAR(6)

Y

AC_no

客户账号

CHAR(9)

Y

Bal

账号余额

DECIMAL(15,2)

N

表T2的数据:

Ci_no char(6)

Ac_no char(9)

Bal decimal(15,2)

222222

123456888

1000.28

222222

123456999

886

333333

123454321

5000

请编写sql统计在9月份开户且账户余额不为0的有效客户数。(8分)

-- 传统的写法。
select count(distinct t1.ci_no) as cnt
from t1
join t2 on t1.ci_no=t2.ci_no
where month(t1.cre_dt)=9
and t1.cus_sts='Y'
and bal>0;

-- 方案2
select count(t1.ci_no) as cnt
	-- 9月份开户,有效客户
from (select * from t1 where month(cre_dt)=9 and cus_sts='Y') t1
	-- 账户余额不为0
join (select ci_no from t2 group by ci_no having sum(bal)>0) t2
on t1.ci_no=t2.ci_no;

带条件的聚合统计

  • 一般的做法是group by xx,yy 再多次的sum(if(…))
  • 好处是避免多次加载表,一次性得到多个指标,可以只加载一次表就得到多个指标。

题一

查询课程编号“2”的成绩比课程编号“1”低的所有同学的学号、姓名。【这是行转列的衍生题】

create table student(sid int, sname string, gender string, class_id int);
insert overwrite table student
values (1, '张三', '女', 1),
       (2, '李四', '女', 1),
       (3, '王五', '男', 2);

create table  course (cid int, cname string, teacher_id int);
insert overwrite table course
values (1, '生物', 1),
       (2, '体育', 1),
       (3, '物理', 2);

create table score (sid int, student_id int, course_id int, number int);
insert overwrite table score
values (1, 1, 1, 58),
       (4, 1, 2, 50),
       (2, 1, 2, 68),
       (3, 2, 2, 89);

-- 优化sql
select sid, sname
from 
(select student_id,
       sum(if(course_id=2,number,0)) as pe, --体育
       sum(if(course_id=1,number,0)) as bio --生物
from score
group by student_id
having pe<bio) as t
join student on t1.student_id = sid;

问题二

问题:用一条SQL生成完整的用户画像表,包含如下字段:

user_id, sex, age, d7order_num, d14_order_num,后面两个字段分别为近7天订单数量,近14天订单数量。

create table sale_order(
    order_id bigint comment '订单ID',
    user_id bigint comment '用户ID',
    order_status int ,
    create_time string,
    last_update_time string,
    product_id bigint,
    product_num bigint
);
create table user_info(
    user_id bigint comment '用户ID,唯一主键',
    sex string,
    age int
);

-- 优化
select u.user_id,
	   u.sex,
       u.age,
       s.d7order_num,
       s.d14order_num
from user_info u
left join (select user_id,
           		  -- 近7天订单数量
                  count(distinct if(to_date(create_time) between data_sub(`current_date()`,6) and `current_date()`,order_id,null)) as d7order_num,
           		  -- 近14天订单数量
                  count(distinct if(to_date(create_time) between data_sub(`current_date()`,13) and `current_date()`,order_id,null)) as d14order_num
           from sale_order
           where create_time >= data_sub(`current_date()`,13)
           group by user_id) s 
on u.user_id = s.user_id;

mysql

列传行

  • 行转列的做法是,substring_index(截取字符串的函数) + help_topic_id(截取字符串的函数) + help_topic_id(mysql自带帮助主题表)

已知:

user01

mysql计算人员排名_数据库_15

role01

mysql计算人员排名_数据库_16


查询:

mysql计算人员排名_hive_17

解答:

SELECT t.user_name, group_concat(role_name SEPARATOR '|') AS role_names
from (SELECT user_name, SUBSTRING_INDEX(SUBSTRING_INDEX(role_ids, '|', help_topic_id + 1), '|', -1) AS role_id
      FROM user01
               JOIN mysql.help_topic
                    on help_topic_id < (LENGTH(role_ids) - LENGTH(REPLACE(role_ids, '|', '')) + 1)) t
         JOIN role01 on t.role_id = role01.role_id
GROUP BY t.user_name;

-- 注:
-- (LENGTH(role_ids) - LENGTH(REPLACE(role_ids, '|', '')) +1 ) 为字符串分割后的长度
-- 即:分隔前长度-去掉分隔符长度+1

postgres

pg 窗口函数row_number()

mysql计算人员排名_ci_18