1.常用SQL语法
1.1 字符
substr(staff_leave_date,1,10)
concat(string1, ..., stringN) → varchar--拼接多个字符串
length(string) → bigint
length(cast(start_time as varchar))
replace(string, search, replace) → varchar
trim(string) → varchar--删除左右两侧的空格
split_part(ip,':',1)--以冒号分割取第一部分(索引从1开始;取第二部分用索引2)
upper/lower(string) → varchar--大小写
###mid(str,pos,len)--从指定位置截取指定长度的字符串 --错误
substr(str,start,length)--截取指定长度字符串
1.2 日期
current_date -> date
now() → timestamp
from_unixtime(create_time)--int/bigint→ timestamp
substr(staff_leave_date,1,10)
date_format(from_unixtime(status_update_time),'%Y-%m-%d')--调整时期格式'2017-11-25'
date_format(cast(audit_time as timestamp),'%Y-%m-%d %H:%i:%s')--2018-04-13 22:09:16
date_add('day', -1, CURRENT_DATE)--当前日期-1
date_trunc('month',from_unixtime(create_time))--当前月份的第一天,2001-08-22 03:04:05.321→2001-08-01 00:00:00.000
date_diff(unit, timestamp1, timestamp2) → bigint
date_diff('day',cast(substr(audit_at,1,10) as date),cast(current_timestamp as date)) as "库存时间"
day_of_week(x) → bigint
Returns the ISO day of the week from x. The value ranges from 1 (Monday) to 7 (Sunday).
week_of_year(x) → bigint
This is an alias for week()
1.3 json
json_extract(json, json_path) → json
json_extract_scalar(json, json_path) → varchar
{"chang_status_remark":["ChangeApply.id = 56798765","Apply.id = 345876"],"source_status":[3,0],"status_update_time":[98765,234589]}
json_extract(change_info,'$.source_status') --[3,0]
json_extract_scalar(change_info,'$.source_status[1]')--0
json_extract_scalar(json, '$.store.book[0].author')
1.4 关联
表间关联
t01
inner/left/right join
t02
on t01.id = t02.id
left/right 未关联到的字段置为null
多部分数据拼接 union:对结果去重(不保证排序) union all:直接返回合并的结果,速度快
union中的每个查询必须包含相同的列,表达式及聚合函数;
只能使用一条order by子句,且必须放在最后一个查询后面
1.5 条件
两分类:
if(staff_status =1,'在职','离职') as "在职状态"
等价于
case when staff_status=1 then '在职'
else '离职'
end as "在职状态"
多分类:
case when level=1 then 'A'
when level=2 then 'B'
when level=3 then 'C'
else 'D'
end as "等级"
coalesce(value1, value2[, ...])--返回第一个非NULL的值
Coalesce(vehicle_finance_loan_order_is_micro_loan, 0)
1.6 正则
store_main_store_name not like '%测试%'
and store_main_store_name not like '%test%'
等价于
not regexp_like(store_main_store_name, '(测试|test)')--"|"这个符号一定是英文状态下的
regexp_like(location_name, '(徐州|宿迁|盐城|苏州|南京|淮安|常州|无锡|南通|扬州|泰州|镇江|连云港)')
regexp_like(字段名, '(str1|str2|...)')--全模糊匹配
regexp_like(字段名, '^(str1|str2|...)')--右模糊匹配
regexp_like(字段名, '(str1|str2|...)$')--左模糊匹配
regexp_extract_all(string, pattern) → array<varchar>
regexp_extract_all('1a 2b 14m', '\d+'); -- [1, 2, 14]
regexp_extract(string, pattern) → varchar
regexp_extract('1a 2b 14m', '\d+'); -- 1
1.7 转换
cast(value AS type) → type
cast('2017-01-01' as timestamp)--2017-01-01 00:00:00.0
cast(from_unixtime(appoint_actual_time) as timestamp)--2018-04-13 15:55:55.0
cast(1.22222 as decimal(10,2))--1.22
cast(substr(audit_at,1,10) as date)--2018-04-13
cast(business_key_ as int)--32971656
cast(start_time as varchar)
try_cast(value AS type) → type
将value转换为type指定的类型,如果转换错误,则返回NULL。
typeof(expr) → varchar
返回表达式expr数据类型的名称。
typeof(123); -- integer
1.8 聚合统计
group by user_id having
count()→ bigint
sum()
max()
min()
mean() --sql均值不是mean,也不是average,注意!!!python中是np.mean()
avg() --均值
distinct id
variance
stddev(x) → double
variance(x) → double
1.9 排序
order by score asc/desc
dense_rank() → bigint
percent_rank() → double
rank() → bigint
row_number
row_number() over(partition by user_id order by score asc)
sum(totalprice) over (partition by clerk order by orderdate) as rolling_sum
2. 创建时间的下一个整点
date_trunc('hour', date_add('hour', 1, cast(created_at as timestamp)))--created_at为string
3.from_unixtime(create_time/1000)--单位为秒,如何取到毫秒?
create_time from_unixtime(create_time/1000)
1602381660211 2020-10-11 10:01:00.0
1602381660381 2020-10-11 10:01:00.0
4. format_datetime
select create_time as bigint
,from_unixtime(create_time/1000) as "秒"--ms与s进制是1000
,format_datetime(from_unixtime(action_time/1000.0),'yyyy-MM-dd HH:mm:ss.SSS') as "毫秒"--毫秒的Joda模式是SSS('mmm'输出的是分钟);除以1000.0(而非1000)才能保留小数部分的毫秒
bigint | 秒 | 毫秒 |
1603236258992 | 2020-10-21 07:24:18.0 | 2020-10-21 07:24:18.024 |
create_time 秒 毫秒
1602381660211 2020-10-11 10:01:00.0 2020-10-11 10:01:00.001
1602381660381 2020-10-11 10:01:00.0 2020-10-11 10:01:00.001
5. presto sql 如何将sql脚本作为where查询条件
select *
from table
where query = 'select aa from tt where name ='北京' limit 1'
当作普通字符串处理,单引号记得用 ''转义一下,举例如下
select *
from table
where query = 'select aa from tt where name = ''北京'' limit 1'
6. 分组排序,选TOP
需求:如何在系统日志中找到每个员工访问量前三的IP,限定每个IP下访问量1K及以上
-- Top-3 IPs per employee by access count, restricted to IPs with >= 1000 hits.
select user_id
      ,ip
      ,num
from
(
    select user_id
          ,ip
          ,count(*) as num
          -- rank IPs within each employee; partition ONLY by user_id so the
          -- employee's IPs compete with each other
          ,row_number() over(partition by user_id order by count(*) desc) as rank
    from table                -- placeholder table name (original had it duplicated: "from table table")
    where dt = '2020-11-30'
    group by user_id, ip
    having count(*) >= 1000   -- keep only IPs with at least 1K hits, before ranking
) tt
where rank <= 3
order by user_id, num desc
此时rank 是对每个员工下不同IP分组计数降序排列,再筛选前三即可
这里注意,窗口函数分组不可以写为
row_number() over(partition by user_id,ip order by count(*) desc) rank
因为它的结果是对每个员工每个IP统计排序,rank 只有一种取值,等于1。
7.between..and ..注解
不同数据库对between..and ..的操作方式存在不同,自己去试了下
select distinct dt from table
where dt between '2020-11-01' and '2020-11-03'
结果如下:
dt
2020-11-02
2020-11-01
2020-11-03
说明presto sql对于between..and..是按照闭区间方式处理的([2020-11-01,2020-11-03])
mysql也是包含边界的
另between还有取反的操作
where dt not between 1 and 3
等价于dt<1或dt>3
8.行列转换,一般应用于聚合数据块的转换
列数据块
行数据块
列转行
-- Pivot long rows into columns: first collapse each city's (time, num) pairs
-- into a map, then read one map key per output column.
WITH agg AS (
    SELECT city, map_agg(time, num) AS kv
    FROM t
    GROUP BY city
)
SELECT city
     , kv['2021-02'] AS "2021-02"
     , kv['2021-03'] AS "2021-03"
     , kv['2021-04'] AS "2021-04"
FROM agg
其中:行列转换关键点在于建立一个(key,value)数组,或把数组的key抽出来当列名
SELECT city, map_agg(time, num) kv
FROM t
GROUP BY city
行转列
-- Unpivot: turn the wide columns "2021-02".."2021-04" back into (time, num) rows.
SELECT t1.city
,t2.time
,t2.num
FROM data t1
-- unnest zips the two arrays positionally, producing one row per element pair
CROSS JOIN unnest (
array['2021-02', '2021-03', '2021-04'],
-- NOTE: double quotes make these IDENTIFIERS (column refs on t1), not strings;
-- the single-quoted array above holds the literal labels
array["2021-02", "2021-03", "2021-04"]
) t2 (time, num)
其中:拉平数组UNNEST
文档说明:可替代hive中LATERAL VIEW explode
10.窗口函数
Presto 官网文档地址:Window Functions — Presto 0.277 Documentation
1、over (分析函数) 开窗函数over(),将聚合函数变成窗口函数来计算
包含三个分析子句: 分组(partition by),
排序(order by),
窗口(rows)
order by 对结果集排序
having 筛选分组后数据。
子句: range unbounded preceding
range between unbounded preceding and current row 的意思是从分区开始到当前行的记录
range between unbounded preceding and unbounded following 的意思是从分区的第一条记录到最后一条记录,也就是分区中的所有记录。
rows between 1 preceding and current row 是指当前行的上一行(rownum-1)到当前行的汇总
--unbounded:不受控制的,无限的
--preceding:在...之前
--following:在...之后
详细可阅读:
【原】Oracle开发专题之:分析函数(OVER) - pengpenglin - BlogJava
Oracle开发之:窗口函数 rows between unbounded preceding and current row _cnham的博客
2、排名函数:
ntile 分组/分桶函数
分组依据:
1、 每组的记录数不能大于它上一组的记录数,即编号小的桶放的记录数不能小于编号大的桶。
也就是说,第1组中的记录数只能大于等于第2组及以后各组中的记录数。
2、 所有组中的记录数要么都相同,要么从某一个记录较少的组(命名为X)开始后面所有组的记 录数都与该组(X组)的记录数相同。
每个组分得的记录条数依据:如果平均分则平均分,如果不平均,则第一组所分配的记录数为 (总条数 / 总组数)+1;剩下条数可均分则均分,如果不平均,则继续第一组分配规则
row_number 会为查询出来的每一行记录生成一个序号,依次排序且不会重复 从1开始
rank 与row_number 用法一致,但是考虑重复情况,如果字段值相同,则返回序号相同,下一条记录需要则需顺延一位,即跳跃排序,有两个第一名时接下来就是第三名
dense_rank 与rank用法一致,rank() 为跳跃排序,dense_rank() 为连续排序,即有两个第一名时接下来就是第二名
cume_dist 计算某个值在该组中的累积分布,即这个值在该组中的相对位置,返回的值范围大于0并且小于或等于1
计算方法:小于等于该行值的行数 / 总行数 (rank() 返回值就是小于等于该行值的行数)
percent_rank 用法与cume_dist类似,计算的为该行值在该组中的百分比位置
计算方法:(小于等于该行值的行数 - 1) / (总行数 - 1),即 (rank - 1) / (总行数 - 1)
窗口函数种类
- ranking 排名类
- analytic 分析类
- aggregate 聚合类
Function Type | SQL | DataFrame API | Description |
Ranking | rank | rank | rank值可能是不连续的 |
Ranking | dense_rank | denseRank | rank值一定是连续的 |
Ranking | percent_rank | percentRank | 相同的分组中 (rank -1) / ( count(score) - 1 ) |
Ranking | ntile | ntile | 将同一组数据循环的往n个桶中放,返回对应的桶的index,index从1开始 |
Ranking | row_number | rowNumber | 很单纯的行号,类似excel的行号 |
Analytic | cume_dist | cumeDist | |
Analytic | first_value | firstValue | 相同的分组中最小值 |
Analytic | last_value | lastValue | 相同的分组中最大值 |
Analytic | lag | lag | 取前n行数据 |
Analytic | lead | lead | 取后n行数据 |
Aggregate | min | min | 最小值 |
Aggregate | max | max | 最大值 |
Aggregate | sum | sum | 求和 |
Aggregate | avg | avg | 求平均 |
二、具体用法如下
count(...) over(partition by ... order by ...)--求分组后的总数。
sum(...) over(partition by ... order by ...)--求分组后的和。
max(...) over(partition by ... order by ...)--求分组后的最大值。
min(...) over(partition by ... order by ...)--求分组后的最小值。
avg(...) over(partition by ... order by ...)--求分组后的平均值。
rank() over(partition by ... order by ...)--rank值可能是不连续的。
dense_rank() over(partition by ... order by ...)--rank值是连续的。
first_value(...) over(partition by ... order by ...)--求分组内的第一个值。
last_value(...) over(partition by ... order by ...)--求分组内的最后一个值。
lag() over(partition by ... order by ...)--取出前n行数据。
lead() over(partition by ... order by ...)--取出后n行数据。
ratio_to_report() over(partition by ... order by ...)--Ratio_to_report() 括号中就是分子,over() 括号中就是分母。
percent_rank() over(partition by ... order by ...)--
三、实际例子
案例数据:/root/score.json/score.json,学生名字、课程、分数
{"name":"A","lesson":"Math","score":100}
{"name":"B","lesson":"Math","score":100}
{"name":"C","lesson":"Math","score":99}
{"name":"D","lesson":"Math","score":98}
{"name":"A","lesson":"E","score":100}
{"name":"B","lesson":"E","score":99}
{"name":"C","lesson":"E","score":99}
{"name":"D","lesson":"E","score":98}
-- Ranking-function showcase: identical partition/order in every window,
-- so the columns differ only in how each function handles ties.
SELECT name,
       lesson,
       score,
       ntile(2)       OVER (PARTITION BY lesson ORDER BY score DESC) AS ntile_2,
       ntile(3)       OVER (PARTITION BY lesson ORDER BY score DESC) AS ntile_3,
       row_number()   OVER (PARTITION BY lesson ORDER BY score DESC) AS row_number,
       rank()         OVER (PARTITION BY lesson ORDER BY score DESC) AS rank,
       dense_rank()   OVER (PARTITION BY lesson ORDER BY score DESC) AS dense_rank,
       percent_rank() OVER (PARTITION BY lesson ORDER BY score DESC) AS percent_rank
FROM score
ORDER BY lesson, name, score
输出结果完全一样,如下表所示
name | lesson | score | ntile_2 | ntile_3 | row_number | rank | dense_rank | percent_rank |
A | E | 100 | 1 | 1 | 1 | 1 | 1 | 0.0 |
B | E | 99 | 1 | 1 | 2 | 2 | 2 | 0.3333333333333333 |
C | E | 99 | 2 | 2 | 3 | 2 | 2 | 0.3333333333333333 |
D | E | 98 | 2 | 3 | 4 | 4 | 3 | 1.0 |
A | Math | 100 | 1 | 1 | 1 | 1 | 1 | 0.0 |
B | Math | 100 | 1 | 1 | 2 | 1 | 1 | 0.0 |
C | Math | 99 | 2 | 2 | 3 | 3 | 2 | 0.6666666666666666 |
D | Math | 98 | 2 | 3 | 4 | 4 | 3 | 1.0 |
需求:资金端capital_no有三个,求出每一个的放款笔数占比和放款金额占比
解决:
使用count( ) over(partition by ) 和sum( ) over(partition by) 进行分组统计 , 最后使用round函数求值。
-- Per-capital-channel share of loan count and loan amount for one day.
-- FIX: in the original, the trailing commas of the select lists were written
-- INSIDE the line comments (e.g. "dt -- 统计日期,"), and the inner SELECT
-- keyword was commented out, so the statement could not parse. Commas now
-- precede the comments.
SELECT
    dt,                -- reporting date
    capital_no,        -- capital channel
    order_num,         -- loans issued by this channel
    order_num_count,   -- total loans issued on the day
    round(order_num * 1.0000 / order_num_count * 100, 2) AS bishu_zhanbi,  -- count share, already in %
    amount,            -- amount issued by this channel
    amount_sum,        -- total amount issued on the day
    round(amount * 1.0000 / amount_sum * 100, 2) AS jine_zhanbi,           -- amount share, already in %
    'success_time在统计日期当日' mark
FROM
(
    -- success_time falls on the reporting date
    -- NOTE(review): the dt label '2021-07-13' differs from the filtered
    -- etl_date/success_time (2021-07-12) — confirm which date is intended
    SELECT
        b.capital_no,
        '2021-07-13' dt,
        count(a.loan_id) OVER (PARTITION BY b.capital_no, etl_date) AS order_num,   -- loans per channel
        sum(b.amount)    OVER (PARTITION BY b.capital_no, etl_date) AS amount,      -- amount per channel
        count(a.loan_id) OVER (PARTITION BY etl_date) AS order_num_count,           -- loans overall
        sum(b.amount)    OVER (PARTITION BY etl_date) AS amount_sum,                -- amount overall
        -- the window values are constant within each (capital_no, etl_date)
        -- partition, so keeping rk = 1 de-duplicates without losing data
        row_number() OVER (PARTITION BY b.capital_no, etl_date) AS rk
    FROM
    (
        SELECT
            loan_id,
            loan_amount,
            success_time,
            etl_date
        FROM
            dp_ods.o_hw_bu_hw_makeloan_record_s
        WHERE
            etl_date = date('2021-07-12')
            AND loan_sts = 1
            AND date(success_time) = date('2021-07-12')
    ) a
    LEFT JOIN (
        SELECT
            order_id,
            org_name,
            loan_amount / 100 AS amount,   -- convert cents to yuan
            capital_no
        FROM
            dp_ods.o_hw_bu_hw_user_order_s
        WHERE
            etl_date = date('2021-07-12')
    ) b ON a.loan_id = b.order_id
) a
WHERE
    rk = 1
注:
用开窗函数去重时,分区内每行都会重复出现窗口结果,最后必须用 where rk = 1 过滤
Sql 四大排名函数(ROW_NUMBER、RANK、DENSE_RANK、NTILE)简介 - 晓菜鸟
SQL SERVER中CUME_DIST和PERCENT_RANK函数_DePaul的博客
参照:【presto】函数大全_浮云6363的博客_presto函数大全 中文版