sql执行顺序:from -> on -> join -> where -> group by -> 聚集函数 -> having -> having -> select ->distinct -> union -> order by -> limit
(1)分组求最大值
1、单表分组最大
//在每一個州中找出最大面積的國家,列出洲份 continent, 國家名字 name 及面積 area
SELECT continent, name, area
FROM world x
WHERE area >= ALL( //子查询相当于查找每一个州的所有area =>笛卡尔乘积,X里的每一个area都会与y的area比较(前提是continent相等)
SELECT area FROM world y
WHERE y.continent=x.continent
AND area>0
)
//有些國家的人口是同洲份的所有其他國的3倍或以上。列出 國家名字name 和 洲份 continent。
SELECT name,continent FROM world x
WHERE x.population / 3 >= ALL(//子查询相当于找到每州中所有符合的population
SELECT population FROM world y
WHERE y.continent = x.continent AND population >0 AND y.name != x.name
)
2、两表分组求最大
//每个部门工资最高的员工
方法1
SELECT Department.name AS 'Department',Employee.name AS 'Employee',Salary
FROM Employee JOIN Department ON Employee.DepartmentId = Department.Id
WHERE (Employee.DepartmentId , Salary) IN (
SELECT
DepartmentId, MAX(Salary)
FROM Employee GROUP BY DepartmentId
);
方法2
select Department.Name Department,e1.Name Employee,e1.Salary Salary
from Employee e1 join Department on DepartmentId=Department.Id
where Salary >=ALL(
select Salary
from Employee e2
where e1.DepartmentId=e2.DepartmentId
);
3、分组求topN
SQL写法
SELECT d.Name AS 'Department', e1.Name AS 'Employee', e1.Salary
FROM Employee e1 JOIN Department d ON e1.DepartmentId = d.Id
WHERE 3 > (
SELECT COUNT(DISTINCT e2.Salary)
FROM Employee e2
WHERE e1.Salary < e2.Salary AND e1.DepartmentId = e2.DepartmentId
);
取分组前3的代码
SELECT e1.Salary
FROM Employee AS e1
WHERE 3 >(
SELECT count(DISTINCT e2.Salary)
FROM Employee AS e2
WHERE e1.Salary <e2.Salary AND e2.DepartmentId = e1.DepartmentId
);
HQL写法
select name,score
from(
select name,score,row_number() over(partition by name order by score desc) rank
from stu_score
)tmp_table
where tmp_table.rank<=3;
(2)利用row_number()和over()分组求topN
select name,score,rank
from (
select name,score,row_number() over(partition by name order by score desc) rank
from stu_score
)t
where rank<=3
解析:利用排序函数row_number()和开窗函数over(),其中排序函数row_number()是顺序排序,dense_rank()是连续排序,rank()函数是跳跃排序。
(3)利用row_number()和over()求用户连续登录最长天数
select uid,max(continuous_days)
from(
select uid,date_sub(date1-rank),count(1) as continuous_days
from(
select uid,date1,row_number() over(partition by uid order by date1) rank
from user_login
)
group by uid,date_sub(date1-rank)
)
group by uid;
解析:其中date_sub(2019-12-31,1)=2019-12-30
(4)求留存率
1、先求每天的活跃用户数,记为A表
select dayNo,count(distinct(uid)) num1
from user_login
group by dayNo
2、每天留存数,记为B表
select u.dayNo,count(distinct(s.uid)) num2
from userInfo u1 left join userInfo u2 on u1.uid=u2.uid;
where datediff(u2.dayNo,u1.dayNo)=1
group by u.dayNo
3、留层率
select t1.date,t2.num2/t1.num2
from (
select date,count(distinct(user_id)) num1
from login
group by date
) t1 join (
select u1.date,count(distinct(u2.user_id)) num2
from login u1 left join login u2 on u1.user_id=u2.user_id;
where datediff(u2.date,u1.date)=1
group by u1.date
) t2 on t1.date=t2.date
union
select date,0.000 as p
from login
where date not in(
select t2.date
from t2
)
解析:利用A表和B表即可求出哪天的留存率。datediff(2019-12-30,2019-12-31)=1,若求3天留存,则将datediff(2019-12-30,2019-12-31)=3
(5)case when与group by
1、利用case when构造多个字段
FName FCourse FScore FName 语文 数学 英语
张三 语文 95.0 张三 95 0 0
张三 数学 80.0 张三 0 80.0 0
张三 英语 70.0 => 张三 0 0 70
李四 语文 88.0 李四 88.0 0 0
李四 数学 90.0 李四 0 87.0 0
李四 英语 70.0 李四 0 0 86.0
答案:
FName 语文 数学 英语
张三 95 80 70
李四 88 87 86
select
FName ,
max(case FCourse when '语文' then FScore else 0 end) as '语文',
max(case FCourse when '数学' then FScore else 0 end) as '数学',
max(case FCourse when '英语' then FScore else 0 end) as '英语'
from score
group by FName
2、利用case when构造单个字段
解法一:
select FName,
case
when FCourse='数学' then '理科' else '文科' end as '科别', sum(FScore) as '总分'
from score
group by FName,case when FCourse='数学' then '理科' else '文科' end
order by FName
解法二
select FName, FType as '科别', sum(FScore) as '总分'
from (
select FName, FCourse, FScore, FType = case when FCourse in ('语文','英语') then '文科' else '理科' end
from score
) as a
group by FName,FType
order by FName
(6)登录表:t_login: user_id,login_time,login_ip,支付表:t_pay: user_id,pay_time,pay_money
1、用1个SQL统计出 登录总次数、登录总人数、登录但未支付的总人数
select count(1),count(distinct a.user_id),
count(distinct case when b.user_id is null then a.user_id end)
from t_login as a left join t_pay as b on a.user_id = b.user_id
或者
select,count1,count2,count(distinct(user_id))
from t_login t3 join (
select t1.user_id,t2.pay_time,count(login_ip) count1,
count(distinct(t1.user_id)) count2
from t_login t1 left join t_pay t2 on t1.user_id=t2.user_id
)t on t.user_id=t3.user_id
where pay_time=null;
2、查询出每个用户最近的一条登录数据 user_id,login_time,login_ip
select user_id,login_time,login_ip
(select user_id,login_time,login_ip,
row_number over(partition by user_id order by login_time desc) new_time
from t_login) t
where new_time=1;
3、按总支付金额范围分组统计 支付的总人数、支付的平均金额
select
case
when pay_money >= 1 and pay_money <= 100 then '1~100'
when pay_money >= 101 and pay_money <= 1000 then '101~1000'
when pay_money >= 1001 then '1001+' end pay_limits,
count(distinct(user_id)),
avg(pay_money)
from
t_pay
group by
case
when pay_money >= 1 and pay_money <= 100 then '1~100'
when pay_money >= 101 and pay_money <= 1000 then '101~1000'
when pay_money >= 1001 then '1001+' end
(7)多表join
select t1.name,t2.name,t3.name
from (city t1 join city t2 on t1.id=t2.parentID) join city t3 on t2.id=t3.parentID
(8)利用explode行转列
===》
解析:explode(col_name):将hive中的一列的array或者map分成多行,如果是字符串类型则利用split将其变成数组;表movie_info与虚表temp_table利用lateral view进行相关联,其中temp_table表中的字段为category_type
(9)列转行
将列表中一个id可能会占用多行转换为每个user占一行的目标表格式
解析:concat(string1,string2,... ):连接括号内字符串,数量不限;
concat_ws(separator,string1,string2,... ):连接括号内字符串或者数组,数量不限,连接符为separator。
collect_set(字段):此函数只接受基本类型,主要是将字段的值进行去重汇总,产生array类型字段。