1、内置运算符
- 查看函数
show functions;
desc functions extended +;
- 关系运算符
# 大小比较
select 1>1;
select 1=1;
select 1>=1;
# 空值判断
select 'asd' is null;
select 'asd' is not null;
# 模糊查询
select 'itcast' like 'it____'; # _代表匹配一个字符
select 'itcast' like 'it%'; # _代表匹配任意个数字符
# 正则查询
select 'itcast' rlike 'it.';
select 'itcast' regexp 'it.';
- 算数运算符
# 加减乘除
select 1+1;
select 1/2;
select 1/0;
# 取整取余
select 3 div 2;
select 4 % 2;
# 位运算
0 0000
1 0001
2 0010
3 0011
4 0100
5 0101
6 0110
select 4 & 6; 0100 * 0110 0100
select 4 | 6 0100 | 0110 只要有一个位数是1 该位置上的数据就是1
- 逻辑运算符
# 与或非
select 1>1 and 1=1; and 所有条件成立
select 1>1 or 1=1; or 一个条件成则返回true
select !1>1; 取反
# 在范围内
select 1 in (2,3);
select 1 not in (2,3);
2、内置函数
- 字符串函数
-- 字符串长度函数
select length('asd');
-- 字符串反转函数
select reverse('asd');
-- 字符串连接函数
select concat('asd','zxc');
-- 分隔符字符串连接函数
select concat_ws('-','asd','zxc','aaa')
-- 字符串截取函数
select substr('itcast',2,2); -- 第一个截取字符的位置 第二个截取字符的个数
select substr('itcast',-2)
-- 字符串转大写函数
select upper('asd');
select lower('ASD');
-- 去空格函数
select trim(' aaa ');
select ltrim(' aaaa ');
select rtrim(' aaaa ');
-- 正则表达式替换函数
select regexp_replace('asd123','(\\w+)\d','aaa')
-- URL解析函数
select parse_url('http://www.baidu.com','HOST');
-- 割字符串函数
select split('asd,aaa',',')
- 时间日期函数
-- 获取当前日期
select `current_date`();
-- 获取当前时间戳
select `current_timestamp`();
-- UNIX时间戳转日期函数
select from_unixtime(1634528834);
-- 获取当前UNIX时间戳函数
select unix_timestamp();
-- 日期转UNIX时间戳函数
select unix_timestamp('2021-10-10 10:10:10');
-- 抽取日期函数
select to_date('2021-10-10 10:10:10');
-- 日期转年函数
select year('2021-10-10 10:10:10');
-- 日期比较函数
select datediff('2021-10-10 10:10:10','2021-10-12 10:10:10');
-- 日期增加函数
select date_add('2021-10-10 10:10:10',5);
select date_sub('2021-10-10 10:10:10',5);
- 数学函数
-- 取整函数
select round(3.1415);
-- 指定精度取整函数
select round(3.1415,2);
-- 向下取整函数
select `floor`(-1.1);
-- 向上取整函数
select ceil(1.1);
-- 进制转换函数
select conv()
-- 取随机数函数
select round(rand(3)*100);
-- 绝对值函数
select abs(-10)
- 集合函数
-- 集合元素数
select size(`array`('a','n','b'));
-- 取map集合keys函数
select map_keys(`map`('name','python','age',19));
-- 取map集合values函数
select map_values(`map`('name','python','age',19));
-- 判断数组是否包含指定元素
select array_contains(`array`(1,2,3),4);
select 1 in (1,2,3);
-- 数组排序函数
select sort_array(`array`(1,5,3,2));
select sort_array(`array`('g','a','f','c'));
- 条件函数
-- if条件判断
if(boolean testCondition, T valueTrue, T valueFalseOrNull)
-- 空判断函数
isnull( a )
-- 非空判断函数
isnotnull ( a )
-- 空值转换函数
nvl(T value, T default_value)
-- 非空查找函数
select coalesce(null,null,2);
-- 条件转换函数
CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END
select case 100 when 50 then 'tom' when 100 then 'mary' else 'tim' end;
select case sex when '男' then 'man' else 'women' end from student limit 3;
--nullif( a, b ):
-- 果a = b,则返回NULL;否则返回NULL。否则返回一个
select nullif(11,11);
select nullif(11,12);
--assert_true(condition)
--如果'condition'不为真,则引发异常,否则返回null
SELECT assert_true(11 >= 0);
SELECT assert_true(-1 >= 0);
- 类型转换
select cast('12' as int);
- 数据脱敏
select mask('asd123ZXC');
select mask('asd123ZXC','*','@','#');
-- 对前n个进行脱敏替换
select mask_first_n('asd123',3);
select mask_last_n('itcastpython',3);
-- 除了前n个字符,其余进行掩码处理
select mask_show_first_n('asdzxc',2);
select mask_show_last_n('qwertyu',2);
-- 返回字符串的hash编码。xxxxxxxxxx select mask('asd123ZXC');select mask('asd123ZXC','*','@','#');--对前n个进行脱敏替换select mask_first_n('asd123',3);select mask_last_n('itcastpython',3);--除了前n个字符,其余进行掩码处理select mask_show_first_n('asdzxc',2);select mask_show_last_n('qwertyu',2);--返回字符串的hash编码。-- 脱敏--对前n个进行脱敏替换--除了前n个字符,其余进行掩码处理--返回字符串的hash编码。
- 其他
-- 反射函数
select reflect("java.lang.Math","max",11,22);
-- 取哈希值函数
select hash("allen");
-- SHA-1加密
select sha1("allen");
--SHA-2 家族算法加密
select sha2("allen",224);
select sha2("allen",512);
-- MD5加密
select md5("allen");
--crc32加密:
select crc32("allen");
3、自定义函数(UDF)
- UDF 普通函数 输入一行输出一行
- UDAF 聚合函数 输入多行输出一行
- UDTF 输入一行输出多行
- explode(UDTF
-- NBA总冠军球队
create table the_nba_championship(
team_name string,
champion_year array<string>
) row format delimited
fields terminated by ','
collection items terminated by '|';
-- 使用explode函数对champion_year进行拆分
- lateral View
- 行列转换 concat
列比较
select count(case when gender='男' then id end) as '男',count(case when gender='女' then id end) as '女' from students
行比较
select gender,count(id) as count from student group by gender
- 行转列
create table row2col2(
col1 string,
col2 string,
col3 int
)row format delimited fields terminated by '\t';
- 列转行
create table col2row2(
col1 string,
col2 string,
col3 string
)row format delimited fields terminated by '\t';
- json数据处理
-- 单个字段
create table tb_json_test1 (
json string
);
get_json_object(json,'$.deviceType')
-- 多个字段
create table tb_json_test2 (
device string,
deviceType string,
signal double,
`time` string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
json_tuple(json,'device','deviceType')
--serder
create table tb_json_test2 (
device string,
deviceType string,
signal double,
`time` string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
-- 拓展--xml数据处理
desc function extended xpath;
SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/b/text()')
4、窗口函数
聚合函数、分析函数、排序函数 over(partition by 字段1... order by 字段1... rows 范围)
- 建表
---建表并且加载数据
create table website_pv_info(
cookieid string,
createtime string, --day
pv int
) row format delimited
fields terminated by ',';
create table website_url_info (
cookieid string,
createtime string, --访问时间
url string --访问页面
) row format delimited
fields terminated by ','
- 求出每个用户总pv数sum+group by普通常规聚合操作
select cookieid,sum(pv) from website_pv_info group by cookieid;
- 求出网站总的pv数 所有用户所有访问加起来
- 求出每个用户总pv数
select cookieid,sum(pv) over(partition by cookieid) from website_pv_info;
- 求出每个用户截止到当天,累积的总pv数
-- 求出每个用户截止到当天,累积的总pv数 between unbounded preceding and current row 等价 unbounded preceding
-- order by 默认有一个计算范围 unbounded preceding
select cookieid,sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row ) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime rows unbounded preceding) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime ) from website_pv_info;
- 自定义窗口范围
unbounded preceding
- 找出每个用户访问pv最多的Top3重复并列的不考虑
-- 排序 rank 遇到重复会生成不连续的序号 dense_rank连续序号 row_number生成行号
select *,rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,row_number() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
with tb1 as ( select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info )
from tb1
select *
where rank_num <= 3;
- 统计每个用户pv数最多的前3分之1天
with tb1 as (select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info)
from tb1
select *
where rank_num = 1;
- 分析函数
select *,lag(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- 向下取值
select *,lead(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- 取第一个值
select *,first_value(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- 去最后一个值 注意取值范围 因为orderby 所用取值范围需要增加向下的范围
select *,last_value(pv) over (partition by cookieid order by pv desc rows between unbounded preceding and unbounded following) as rank_num from website_pv_info;
select sum(pv) sum(sum(pv)) over() from website_pv_info
5、数据压缩
- 压缩算法
- map压缩
1)开启hive中间传输数据压缩功能
set hive.exec.compress.intermediate=true;
2)开启mapreduce中map输出压缩功能
set mapreduce.map.output.compress=true;
3)设置mapreduce中map输出数据的压缩方式
Set mapreduce.map.output.compress.codec= org.apache.hadoop.io.compress.SnappyCodec;
- reduce压缩
1)开启hive最终输出数据压缩功能
set hive.exec.compress.output=true;
2)开启mapreduce最终输出数据压缩
set mapreduce.output.fileoutputformat.compress=true;
3)设置mapreduce最终数据输出压缩方式
set mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
4)设置mapreduce最终数据输出压缩为块压缩
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
6、数据存储格式
- 行存储和列存储
- TextFILE
- ORC
- PAROUET
- 格式对比
TEXTfile
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE ;
orc
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS orc ;
parquet
create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET ;