小技巧:测试函数的用法,可以专门准备一个专门的dual表
create table dual(x string);
insert into table dual values('');
其实:直接用常量来测试函数即可
select substr("abcdefg",1,3);
hive的所有函数手册:
- 常用内置函数
- 类型转换函数
select cast("5" as int) from dual;
select cast("2017-08-03" as date) ;
select cast(current_timestamp as date);
示例:
1 | 1995-05-05 13:30:59 | 1200.3 |
2 | 1994-04-05 13:30:59 | 2200 |
3 | 1996-06-01 12:20:30 | 80000.5 |
create table t_fun(id string,birthday string,salary string)
row format delimited fields terminated by ',';
select id,cast(birthday as date) as bir,cast(salary as float) from t_fun;
- 数学运算函数
select round(5.4) from dual; ## 5
select round(5.1345,3) from dual; ##5.135
select ceil(5.4) from dual; // select ceiling(5.4) from dual; ## 6
select floor(5.4) from dual; ## 5
select abs(-5.4) from dual; ## 5.4
select greatest(3,5,6) from dual; ## 6
select least(3,5,6) from dual;
示例:
有表如下:
select greatest(cast(s1 as double),cast(s2 as double),cast(s3 as double)) from t_fun2;
结果:
+---------+--+
| _c0 |
+---------+--+
| 2000.0 |
| 9800.0 |
+---------+--+
select max(age) from t_person; 聚合函数
select min(age) from t_person; 聚合函数
- 字符串函数
substr(string, int start) ## 截取子串
substring(string, int start)
示例:select substr("abcdefg",2) from dual;
substr(string, int start, int len)
substring(string, int start, int len)
示例:select substr("abcdefg",2,3) from dual;
concat(string A, string B...) ## 拼接字符串
concat_ws(string SEP, string A, string B...)
示例:select concat("ab","xy") from dual;
select concat_ws(".","192","168","33","44") from dual;
length(string A)
示例:select length("192.168.33.44") from dual;
split(string str, string pat)
示例:select split("192.168.33.44",".") from dual; 错误的,因为.号是正则语法中的特定字符
select split("192.168.33.44","\\.") from dual;
upper(string str) ##转大写
- 时间函数
select current_timestamp;
select current_date;
## 取当前时间的毫秒数时间戳
select unix_timestamp();
## unix时间戳转字符串
from_unixtime(bigint unixtime[, string format])
示例:select from_unixtime(unix_timestamp());
select from_unixtime(unix_timestamp(),"yyyy/MM/dd HH:mm:ss");
## 字符串转unix时间戳
unix_timestamp(string date, string pattern)
示例: select unix_timestamp("2017-08-10 17:50:30");
select unix_timestamp("2017/08/10 17:50:30","yyyy/MM/dd HH:mm:ss");
## 将字符串转成日期date
select to_date("2017-09-17 16:58:32");
- 表生成函数
- 行转列函数:explode()
假如有以下数据:
1,zhangsan,化学:物理:数学:语文 2,lisi,化学:数学:生物:生理:卫生 3,wangwu,化学:语文:英语:体育:生物 |
映射成一张表:
create table t_stu_subject(id int,name string,subjects array<string>)
row format delimited fields terminated by ','
collection items terminated by ':';
使用explode()对数组字段“炸裂”
然后,我们利用这个explode的结果,来求去重的课程:
select distinct tmp.sub from (select explode(subjects) as sub from t_stu_subject) tmp; |
- 表生成函数lateral view
select id,name,tmp.sub
from t_stu_subject lateral view explode(subjects) tmp as sub;
理解: lateral view 相当于两个表在join
左表:是原表
右表:是explode(某个集合字段)之后产生的表
而且:这个join只在同一行的数据间进行
那样,可以方便做更多的查询:
比如,查询选修了生物课的同学
select a.id,a.name,a.sub from
(select id,name,tmp.sub as sub from t_stu_subject lateral view explode(subjects) tmp as sub) a
where sub='生物';
- 集合函数
array_contains(Array<T>, value) 返回boolean值
示例:
select moive_name,array_contains(actors,'吴刚') from t_movie;
select array_contains(array('a','b','c'),'c') from dual;
sort_array(Array<T>) 返回排序后的数组
示例:
select sort_array(array('c','b','a')) from dual;
select 'haha',sort_array(array('c','b','a')) as xx from (select 0) tmp;
size(Array<T>) 返回一个int值
示例:
select moive_name,size(actors) as actor_number from t_movie;
size(Map<K.V>) 返回一个int值
map_keys(Map<K.V>) 返回一个数组
map_values(Map<K.V>) 返回一个数组
- 条件控制函数
- case when
语法:
CASE [ expression ]
WHEN condition1 THEN result1
WHEN condition2 THEN result2
...
WHEN conditionn THEN resultn
ELSE result
END
示例:
select id,name,
case
when age<28 then 'youngth'
when age>27 and age<40 then 'zhongnian'
else 'old'
end
from t_user;
- IF
select id,if(age>25,'working','worked') from t_user;
select moive_name,if(array_contains(actors,'吴刚'),'好电影','rom t_movie;
- json解析函数:表生成函数
json_tuple函数
示例:
select json_tuple(json,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid) from t_rating_json;
产生结果:
利用json_tuple从原始json数据表中,etl出一个详细信息表:
create table t_rate as select uid, movie, rate, year(from_unixtime(cast(ts as bigint))) as year, month(from_unixtime(cast(ts as bigint))) as month, day(from_unixtime(cast(ts as bigint))) as day, hour(from_unixtime(cast(ts as bigint))) as hour, minute(from_unixtime(cast(ts as bigint))) as minute, from_unixtime(cast(ts as bigint)) as ts from (select json_tuple(rateinfo,'movie','rate','timeStamp','uid') as(movie,rate,ts,uid) from t_json) tmp ; |
- 分析函数:row_number() over()——分组TOPN
- 需求
有如下数据:
1,18,a,male 2,19,b,male 3,22,c,female 4,16,d,female 5,30,e,male 6,26,f,female |
需要查询出每种性别中年龄最大的2条数据
- 实现:
使用row_number函数,对表中的数据按照性别分组,按照年龄倒序排序并进行标记
hql代码:
select id,age,name,sex,
row_number() over(partition by sex order by age desc) as rank
from t_rownumber
产生结果:
然后,利用上面的结果,查询出rank<=2的即为最终需求
select id,age,name,sex
from
(select id,age,name,sex,
row_number() over(partition by sex order by age desc) as rank
from t_rownumber) tmp
where rank<=2;
练习:求出电影评分数据中,每个用户评分最高的topn条数据
- 自定义函数
- 需求:
需要对json数据表中的json数据写一个自定义函数,用于传入一个json,返回一个数据值的数组
json原始数据表:
需要做ETL操作,将json数据变成普通表数据,插入另一个表中:
- 实现步骤:
1、开发JAVA的UDF类
public class ParseJson extends UDF{ // 重载 :返回值类型 和参数类型及个数,完全由用户自己决定 // 本处需求是:给一个字符串,返回一个数组 public String[] evaluate(String json) { String[] split = json.split("\""); String[] res = new String[]{split[3],split[7],split[11],split[15]}; return res; } } |
2、打jar包
在eclipse中使用export即可
- 上传jar包到运行hive所在的linux机器
- 在hive中创建临时函数:
在hive的提示符中:
hive> add jar /root/jsonparse.jar;
然后,在hive的提示符中,创建一个临时函数:
hive>CREATE TEMPORARY FUNCTION jsonp AS 'cn.edu360.hdp.hive.ParseJson';
- 开发hql语句,利用自定义函数,从原始表中抽取数据插入新表
insert into table t_rate select split(jsonp(json),',')[0], cast(split(jsonp(json),',')[1] as int), cast(split(jsonp(json),',')[2] as bigint), cast(split(jsonp(json),',')[3] as int) from t_rating_json; |
注:临时函数只在一次hive会话中有效,重启会话后就无效
如果需要经常使用该自定义函数,可以考虑创建永久函数:
拷贝jar包到hive的类路径中:
cp wc.jar apps/hive-1.2.1/lib/
创建了:
create function pfuncx as 'com.doit.hive.udf.UserInfoParser';
删除函数:
DROP TEMPORARY FUNCTION [IF EXISTS] function_name
DROP FUNCTION[IF EXISTS] function_name
- 综合查询案例
- 用hql来做wordcount
有以下文本文件:
hello tom hello jim hello rose hello tom tom love rose rose love jim jim love tom love is what what is love |
需要用hive做wordcount
-- 建表映射
create table t_wc(sentence string);
-- 导入数据
load data local inpath '/root/hivetest/xx.txt' into table t_wc;
hql答案:
SELECT word ,count(1) as cnts FROM ( SELECT explode(split(sentence, ' ')) AS word FROM t_wc ) tmp GROUP BY word order by cnts desc ; |
- 级联报表查询
有如下数据:
A,2015-01,5 A,2015-01,15 B,2015-01,5 A,2015-01,8 B,2015-01,25 A,2015-01,5 C,2015-01,10 C,2015-01,20 A,2015-02,4 A,2015-02,6 C,2015-02,30 C,2015-02,10 B,2015-02,10 B,2015-02,5 A,2015-03,14 A,2015-03,6 B,2015-03,20 B,2015-03,25 C,2015-03,10 C,2015-03,20 |
建表映射:
create table t_access_times(username string,month string,counts int) row format delimited fields terminated by ','; |
需要要开发hql脚本,来统计出如下累计报表:
用户 | 月份 | 月总额 | 累计到当月的总额 |
A | 2015-01 | 33 | 33 |
A | 2015-02 | 10 | 43 |
A | 2015-03 | 30 | 73 |
B | 2015-01 | 30 | 30 |
B | 2015-02 | 15 | 45 |