目录
- 一、聚合函数 Aggregate Functions (UDAF)
- 二、表生成函数 Table-Generating Functions (UDTF)
- 三、相关文章
- 四、下期预告
一、聚合函数 Aggregate Functions (UDAF)
count(*), count(expr), count(DISTINCT expr[, expr...])
- 返回结果: 返回行数
- 返回类型: bigint
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col)SELECT COUNT(col)FROM tmp; -- 结果为 2WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col)SELECT COUNT(case when col=2 then col end)FROM tmp; -- 结果为 0WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col)SELECT COUNT(DISTINCT col)FROM tmp; -- 结果为 1
sum(col), sum(DISTINCT col)
- 返回结果: 返回求和的值
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col)SELECT SUM(col)FROM tmp; -- 结果为 2WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col)SELECT SUM(case when col=2 then col end)FROM tmp; -- 结果为 NULLWITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col)SELECT SUM(DISTINCT col)FROM tmp; -- 结果为 1
avg(col), avg(DISTINCT col)
- 返回结果: 返回平均值
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT AVG(col)FROM tmp; -- 结果为 1.3333333333333333WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT AVG(case when col=2 then col else 2 end)FROM tmp; -- 结果为 2.0WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT AVG(DISTINCT col)FROM tmp; -- 结果为 1.5
min(col)
- 返回结果: 返回最小值
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT MIN(col)FROM tmp; -- 结果为 1WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT MIN(case when col=2 then col else 3 end)FROM tmp; -- 结果为 2
max(col)
- 返回结果: 返回最大值
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT MAX(col)FROM tmp; -- 结果为 2WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT MAX(case when col=2 then col else 3 end)FROM tmp; -- 结果为 3
variance(col), var_pop(col)
- 返回结果: 返回数值列方差
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT variance(col)FROM tmp; -- 结果为 0.25WITH tmp AS( SELECT '1' AS col UNION ALL SELECT '2' AS col UNION ALL SELECT 'a' AS col)SELECT variance(col)FROM tmp; -- 结果为 0.25
var_samp(col)
- 返回结果: 返回数值列的无偏样本方差
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT var_samp(col)FROM tmp; -- 结果为 0.5WITH tmp AS( SELECT '1' AS col UNION ALL SELECT '2' AS col UNION ALL SELECT 'a' AS col)SELECT variance(col)FROM tmp; -- 结果为 0.5
stddev_pop(col)
- 返回结果: 返回数值列的标准方差
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT stddev_pop(col)FROM tmp; -- 结果为 0.5WITH tmp AS( SELECT '1' AS col UNION ALL SELECT '2' AS col UNION ALL SELECT 'a' AS col)SELECT stddev_pop(col)FROM tmp; -- 结果为 0.5
stddev_samp(col)
- 返回结果: 返回样本标准差
- 返回类型: 1222222
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col)SELECT stddev_samp(col)FROM tmp; -- 结果为 0.7071067811865476WITH tmp AS( SELECT '1' AS col UNION ALL SELECT '2' AS col UNION ALL SELECT 'a' AS col)SELECT stddev_samp(col)FROM tmp; -- 结果为 0.7071067811865476
covar_pop(col1, col2)
- 返回结果: 返回协方差
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col1,1 AS col2 UNION ALL SELECT 2 AS col1,3 AS col2)SELECT covar_pop(col1, col2)FROM tmp; -- 结果为 0.5
covar_samp(col1, col2)
- 返回结果: 返回样本协方差
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col1,1 AS col2 UNION ALL SELECT 2 AS col1,3 AS col2)SELECT covar_samp(col1, col2)FROM tmp; -- 结果为 1.0
corr(col1, col2)
- 返回结果: 返回皮尔逊相关系数
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col1,1 AS col2 UNION ALL SELECT 2 AS col1,3 AS col2)SELECT corr(col1, col2)FROM tmp; -- 结果为 0.9999999999999999
percentile(BIGINT col, p)
- 返回结果: 返回分位数(p必须介于0和1之间;如果输入为非整数,请使用percentile_approx)
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col UNION ALL SELECT 10 AS col UNION ALL SELECT 20 AS col)SELECT percentile(col, 0.25)FROM tmp; -- 结果为 1.75WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col UNION ALL SELECT 10 AS col UNION ALL SELECT 20 AS col)SELECT percentile(col, array(0.5,0.75))FROM tmp; -- 结果为 [6.0,12.5]
percentile_approx(DOUBLE col, p [, B])
- 返回结果: 返回分位数(B参数以消耗内存为代价控制近似精度;较高的值产生更好的近似值,默认值为10000;当col中不同值的数目小于B时,这将给出精确的百分位值)
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1.1 AS col UNION ALL SELECT 2.1 AS col UNION ALL SELECT 10.1 AS col UNION ALL SELECT 20.1 AS col)SELECT percentile_approx(col, 0.25)FROM tmp; -- 结果为 1.1WITH tmp AS( SELECT 1.1 AS col UNION ALL SELECT 2.1 AS col UNION ALL SELECT 10.1 AS col UNION ALL SELECT 20.1 AS col)SELECT percentile_approx(col, array(0.5,0.75), 5)FROM tmp; -- 结果为 [2.1,10.1]
regr_avgx(y, x)
- 返回结果: 计算回归线的自变量的平均值(如果y,x其中之一或者全为NULL,则该行忽略,最终返回其他所有行的x的平均值)
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS y,1 AS x UNION ALL SELECT 2 AS y,3 AS x)SELECT regr_avgx(y, x)FROM tmp; -- 结果为 2
regr_avgy(y, x)
- 返回结果: 计算回归线的因变量的平均值(如果y,x其中之一或者全为NULL,则该行忽略,最终返回其他所有行的y的平均值)
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS y,1 AS x UNION ALL SELECT 2 AS y,3 AS x)SELECT regr_avgy(y, x)FROM tmp; -- 结果为 1.5
regr_count(y, x)
- 返回结果: 返回非空对的数量(y,x都不为NULL则为非空对)
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS y,1 AS x UNION ALL SELECT 2 AS y,NULL AS x)SELECT regr_count(y, x)FROM tmp; -- 结果为 1
regr_intercept(y, x)
- 返回结果: 返回可最佳拟合线性回归线的y轴截距b【b=AVG( y ) - REGR_SLOPE( y, x ) * AVG( x )】
- 返回类型: DOUBLE
WITH tmp AS( SELECT 2 AS y,1 AS x UNION ALL SELECT 3 AS y,2 AS x)SELECT regr_intercept(y, x)FROM tmp; -- 结果为 1
regr_r2(y, x)
- 返回结果: 返回回归线的确定系数(也称为R平方或拟合度)
- 返回类型: DOUBLE
WITH tmp AS( SELECT 2 AS y,1 AS x UNION ALL SELECT 3 AS y,2 AS x)SELECT regr_r2(y, x)FROM tmp; -- 结果为 1
regr_slope(y, x)
- 返回结果: 返回可最佳拟合线性回归线的斜率
- 返回类型: DOUBLE
WITH tmp AS( SELECT 2 AS y,1 AS x UNION ALL SELECT 3 AS y,2 AS x)SELECT regr_slope(y, x)FROM tmp; -- 结果为 1
regr_sxx(y, x)
- 返回结果: 返回线性回归模型中使用的独立表达式的平方和,可用于计算回归模型的统计有效性(regr_sxx(y, x) = regr_count(y, x) * var_pop(x))
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS y,1 AS x UNION ALL SELECT 2 AS y,3 AS x)SELECT regr_sxx(y, x)FROM tmp; -- 结果为 2.0
regr_sxy(y, x)
- 返回结果: 返回因变量与自变量的积和,可用于计算回归模型的统计有效性(regr_sxy(y, x) = regr_count(y, x) * covar_pop(y, x))
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS y,1 AS x UNION ALL SELECT 2 AS y,3 AS x)SELECT regr_sxy(y, x)FROM tmp; -- 结果为 1.0
regr_syy(y, x)
- 返回结果: 返回可以计算回归模型统计有效性的值(regr_syy(y, x) = regr_count(y, x) * var_pop(y))
- 返回类型: DOUBLE
WITH tmp AS( SELECT 1 AS y,1 AS x UNION ALL SELECT 2 AS y,3 AS x)SELECT regr_syy(y, x)FROM tmp; -- 结果为 0.5
histogram_numeric(col, b)
- 返回结果: 使用b个非均匀间隔的存储箱计算组中数值列的直方图。输出是一个大小为b的双值(x,y)坐标数组,表示bin中心和高度
- 返回类型: array
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 2 AS col UNION ALL SELECT 10 AS col UNION ALL SELECT 20 AS col)SELECT histogram_numeric(col, 3)FROM tmp; -- 结果为 [{"x":1.5,"y":2.0},{"x":10.0,"y":1.0},{"x":20.0,"y":1.0}]
collect_set(col)
- 返回结果: 返回一组删除了重复元素的对象
- 返回类型: array
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col UNION ALL SELECT 10 AS col UNION ALL SELECT 20 AS col)SELECT collect_set(col)FROM tmp; -- 结果为 [1,10,20]
collect_list(col)
- 返回结果: 返回具有重复项的对象列表
- 返回类型: array
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col UNION ALL SELECT 10 AS col UNION ALL SELECT 20 AS col)SELECT collect_list(col)FROM tmp; -- 结果为 [1,1,10,20]
ntile(INTEGER x)
- 返回结果: 将有序分区划分为x个组,称为bucket,并为分区中的每一行分配一个bucket编号。这样可以方便地计算三位数、四位数、十位数、百分位数和其他常见的汇总统计数据
- 返回类型: INTEGER
WITH tmp AS( SELECT 1 AS col UNION ALL SELECT 1 AS col UNION ALL SELECT 10 AS col UNION ALL SELECT 20 AS col)SELECT ntile(2) over(order by col) as bucket_idFROM tmp; -- 结果为 1 1 2 2
结果为:
bucket_id
1
1
2
2
二、表生成函数 Table-Generating Functions (UDTF)
explode(ARRAY a)
- 返回结果: 将数组分解为多行。返回具有单个列(col)的行集,数组中每个元素一行
- 返回类型: T
SELECT explode(array(1,2,3)) as col;
结果为:
col
123
WITH tmp AS( SELECT 'a' AS col1, ARRAY(1,2) AS col2)SELECT tmp.col1, tmp.col2, t.col3FROM tmp LATERAL VIEW explode (col2) t AS col3;
结果为:
col1 col2 col3
a [1,2] 1
a [1,2] 2
explode(MAP m)
- 返回结果: 将MAP分解为多行。返回包含两列(键、值)的行集,对于MAP中的每个键值对返回一行
- 返回类型: Tkey,Tvalue
SELECT explode(map('k1','v1','k2','v2')) as (key,value);
结果为:
key value
k1 v1
k2 v2
posexplode(ARRAY a)
- 返回结果: 使用int类型的附加位置列(项目在原始数组中的位置,从0开始)将数组分解为多行。返回包含两列(pos,val)的行集,数组中每个元素一行
- 返回类型: int,T
SELECT posexplode(array(1,2)) as (pos,val);
结果为:
pos val
0 1
1 2
inline(ARRAY> a)
- 返回结果: 将结构数组分解为多行。返回一个行集,该行集包含N列(N=结构中顶级元素的数目),数组中每个结构一行
- 返回类型: T1,...,Tn
SELECT inline(array(struct('a','uncle',1),struct('b','bean',2))) as (code,name,id);
结果为:
code name id
a uncle 1
b bean 2
stack(int r,T1 V1,...,Tn/r Vn)
- 返回结果: 将n个值V1,…,Vn拆分为r行。每一行都有n/r列。r必须是常数。
- 返回类型: T1,...,Tn/r
SELECT stack(2,'a','uncle',1,'b','bean',2);
结果为:
col1 col2 col3
a uncle 1
b bean 2
SELECT stack(2,1,2,3,4,5);
结果为:
col1 col2 col3
1234 NULL 3
json_tuple(string jsonStr,string k1,...,string kn)
- 返回结果: 接收JSON字符串和n个键,返回n个值的元组。这比get_json_object更高效,因为它只需要一次调用就可以获得多个值。
- 返回类型: string1,...,stringn
SELECT json_tuple('{"id":1,"name":"Uncle Bean"}', 'id', 'name') as (id,name);
结果为:
id name
1 Uncle Bean
parse_url_tuple(string urlStr,string p1,...,string pn)
- 返回结果: 接收URL字符串和n个URL部件名称,返回n个值的元组。这类似于parse_url,但可以从一个url中一次提取多个部分。有效的部件名称是:HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, USERINFO, QUERY:。
- 返回类型: string 1,...,stringn
SELECT json_tuple('{"id":1,"name":"Uncle Bean"}', 'id', 'name') as (id,name);
结果为:
host path
github.com /TheUncleWhoGrowsBeans/Melon-and-fruit-fields
三、相关文章
Hive函数大全(含例子)之数学函数(Mathematical Functions)
Hive函数大全(含例子)之字符串函数(String Functions)
Hive函数大全(含例子)之集合函数、日期函数、条件函数
Hive函数大全(含例子)之数据屏蔽函数、杂项函数、XML解析函数
四、下期预告
Hive函数大全(含例子)之窗口和分析函数