-- Difference between count(*), count(1), count(column)
-- count(*) counts every row, including rows whose columns are all NULL.
select count(*) from score; -- NOTE(review): original note claimed this is slowest; modern Hive treats count(*) ~ count(1) -- verify with explain
select count(1) from score; -- count(1) also counts every row (the literal 1 is never NULL)
select count (score.score) from score; -- count(col) skips NULL values of col; empty string and 0 are NOT NULL and are counted
-- Logical execution order of a Hive query:
-- from -> join -> where -> group by -> having -> select (incl. over()/distinct) -> order by -> limit
-- In Hive WHERE, in / not in / exists / not exists subqueries are supported (inequality subqueries historically were not)
-- Use explain to inspect the SQL execution plan
explain select count (score.score) from score;
-- Add 'extended' for the detailed execution plan
explain extended select count (score.score) from score;
-- A column in another database's table is reachable as db_name.table_name.column
select a.bonus from fen_biao_tong.emp a;
-- nvl(col, 0): substitute 0 for NULL before doing arithmetic
select nvl(a.bonus,0) from fen_biao_tong.emp a;
-- Conditional function: if(condition, value_if_true, value_if_false)
select if(1>2,666,888); -- returns 888
select if(1>0,if(-1>0,000,666),888); -- returns 666 (nested ifs evaluate inside-out)
select score ,if(score>95,'优秀',if(score>85,'良好',if(score>60,'及格','不及格')))from score;
-- case when — scenario 1: grade banding, same idea as the nested if() above
-- NOTE(review): the pass threshold here is 75 while the if() example used 60 — confirm which is intended
select score,
case when score>95 then '优秀'
when score>85 then '良好'
when score>75 then '及格'
else '不及格'
end as `等级`
from score;
-- scenario 2: case <expr> when <value> form — map specific names to nicknames
select
name,
case name when "施笑槐" then "槐ge"
when "吕金鹏" then "鹏ge"
when "单乐蕊" then "蕊jie"
else "算了,不叫了!!!"
end as `喊你一声你敢答应吗?`
from bigdata17.students2;
-- Date functions (essential)
-- Patterns are Java SimpleDateFormat: yyyy = year (YYYY is week-year!),
-- MM = month, mm = MINUTE, dd = day. The original used YYYY and mm, which
-- silently produce wrong dates; fixed to yyyy / MM throughout.
-- current date
select current_date; -- e.g. 2022-06-06
-- date n days BEFORE the given date (fixed: missing semicolon)
select date_sub(current_date,1); -- e.g. 2022-06-05
-- date n days AFTER the given date
select date_add('2000-01-01',10); -- 2000-01-11
-- days between two dates: first argument minus second
select datediff('2021-06-05','2020-06-04'); -- 366
-- current unix timestamp in seconds
select unix_timestamp(); -- e.g. 1654521341
-- datetime string -> date part
select to_date('2021-06-05 13:34:12');
-- unix timestamp -> formatted string in the current time zone (fixed: missing semicolon, YYYY -> yyyy)
select from_unixtime(1654521341,'yyyy-MM-dd');
select from_unixtime(unix_timestamp(),'yyyy-MM-dd');
-- any separator works in the output pattern
select from_unixtime(unix_timestamp(),'yyyy|MM|dd');
-- reformat a date given in an arbitrary format: parse to seconds, then re-format
-- '2021年01月14日' -> '2021-01-14' (fixed: mm -> MM, and output pattern matched to the stated target)
select unix_timestamp('2021年01月14日','yyyy年MM月dd日');
select from_unixtime(unix_timestamp('2021年01月14日','yyyy年MM月dd日'),'yyyy-MM-dd');
-- "04牛2021数加16逼" -> "2021/04/16" (fixed: mm -> MM, output uses '/' as the target shows)
select from_unixtime(unix_timestamp("04牛2021数加16逼","MM牛yyyy数加dd逼"),'yyyy/MM/dd');
-- String functions
select concat('123','456');
select concat('woshi','一个','大帅哥!!!'); -- fixed: missing semicolon
select concat('123','456',null); -- concat returns NULL if ANY argument is NULL
-- join strings with a custom separator
select concat_ws('**','a','v','c'); -- a**v**c
select concat_ws('+_+','00','^','^','00',null); -- concat_ws silently skips NULL arguments (fixed: missing semicolon)
-- cast(col as string): concat_ws only accepts strings (or arrays of strings)
select concat_ws("|",cast(id as string),name,cast(age as string),gender,clazz)
from bigdata17.students limit 10;
-- substring: HQL string positions start at 1; take 4 chars from position 1
select substring('abcdefg',1,4); -- abcd
-- '2021/01/14' -> '2021-01-14' by slicing and rejoining
select concat_ws("-",substring('2021/01/14',1,4),substring('2021/01/14',6,2),substring('2021/01/14',9,2));
-- prefer real date functions for date reformatting
select from_unixtime(unix_timestamp('2021/01/14','yyyy/MM/dd'),'yyyy-MM-dd');
-- split(str, ",") returns an array of the comma-separated pieces
select split("sdfs,werw,weger,werwetg,423w,werhe,erw,ewergw,777,777,777,888,88,888,833,33,",",");
-- array indexes start at 0; take the 4th element
select split("sdfs,werw,weger,werwetg,423w,werhe,erw,ewergw,777,777,777,888,88,888,833,33,",",")[3];
-- count occurrences of each substring: explode the array, then group
select b.a `字段名`,count(b.a) `字段的个数` from (select explode(split("sdfs,werw,weger,werwetg,423w,werhe,erw,ewergw,777,777,777,888,88,888,833,33,",",")) a) b
group by b.a order by `字段的个数`;
-- Create the demo table for the window-function examples below
create table t_fraction(
name string,
subject string,
score int)
row format delimited fields terminated by ","
lines terminated by '\n';
-- Load data from the local filesystem
load data local inpath'/usr/local/soft/atao_file/t_fraction' into table t_fraction;
select * from t_fraction;
-- over() with no partition/order: one big window over the whole table
select t_fraction.*,sum(score) over () from t_fraction; -- total sum repeated on every row
select t_fraction.*,min(score) over () from t_fraction; -- minimum
select t_fraction.*,max(score) over () from t_fraction; -- maximum
select t_fraction.*,avg(score) over () from t_fraction; -- average
select t_fraction.*,count(score) over () from t_fraction; -- count
select t_fraction.*,lag(score) over () from t_fraction; -- previous row's score; NOTE(review): with no order by, "previous row" is nondeterministic
-- Aggregations over different partitions and frames
-- partition by name
select name,subject,score,sum(score) over() as sum1, sum(score) over (partition by name ) as sum2 from t_fraction;
-- partition by subject
select name,subject,score,sum(score) over() as sum1, sum(score) over (partition by subject ) as sum2 from t_fraction;
-- frame: partition start up to the current row (running total)
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3
from t_fraction;
-- frame: previous row + current row
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3,
sum(score) over (partition by subject order by score rows between 1 preceding and current row) as sum4
from t_fraction;
-- frame: previous row + current row + next row
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3,
sum(score) over (partition by subject order by score rows between 1 preceding and current row) as sum4,
sum(score) over (partition by subject order by score rows between 1 preceding and 1 following) as sum5
from t_fraction;
-- frame: current row through the end of the partition
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3,
sum(score) over (partition by subject order by score rows between 1 preceding and current row) as sum4,
sum(score) over (partition by subject order by score rows between 1 preceding and 1 following) as sum5,
sum(score) over (partition by subject order by score rows between current row and unbounded following) as sum6
from t_fraction;
-- Aggregate window functions in practice
-- Create the table and load data.
-- Fixed: the original ran `drop table business` AFTER create and BEFORE load,
-- so the load targeted a dropped table; drop-if-exists now comes first so the
-- script is re-runnable top to bottom.
drop table if exists business;
create table business
(
name string,
orderdate string,
cost int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
load data local inpath '/usr/local/soft/atao_file/business.txt' into table business;
select * from business;
-- Q1: customers who bought during 2017-04, plus the total number of such customers
select distinct name, date_format(orderdate,'yyyy-MM'),count(*) over ( ) from business where date_format(orderdate,'yyyy-MM') = '2017-04';
-- Q2: purchase details plus each customer's MONTHLY total
-- (fixed: the original partitioned by name only, which is an all-time total)
select name,orderdate,cost,sum(cost) over(partition by name, date_format(orderdate,'yyyy-MM')) from business;
-- Q3: running total of cost per customer in date order
select name,orderdate,sum(cost) over (partition by name order by date_format(orderdate,'yyyy-MM-dd') rows between unbounded preceding and current row ) from business;
-- Q4: each customer's previous purchase date
select name ,orderdate,lag(orderdate) over (partition by name order by date_format(orderdate,'yyyy-MM-dd')) from business;
-- Q5: orders in the earliest 20% of the time range (first of 5 ntile buckets)
select * from (select name,orderdate,cost,ntile(5) over(order by orderdate) sortgroup_num from business) t where t.sortgroup_num=1;
-- Hive: analyze student scores
-- Create the table and load the data
create table score2
(
name string,
subject string,
score int
) row format delimited fields terminated by "\t";
load data local inpath '/usr/local/soft/atao_file/score2' into table score2;
select * from score2;
-- Requirement 1: rank students within EACH SUBJECT.
-- Fixed: the original partitioned by name, which ranks each student's own
-- subjects instead of ranking students per subject.
-- RANK()         ties share a rank, next rank is skipped   (1,1,3)
-- DENSE_RANK()   ties share a rank, no gap afterwards      (1,1,2)
-- ROW_NUMBER()   unique sequence regardless of ties        (1,2,3)
-- PERCENT_RANK() (rank-1)/(rows in partition - 1): fraction of peers outranked
--                (added: the original comment documented it but never used it)
select name,subject,score,
rank() over (partition by subject order by score desc),
dense_rank() over (partition by subject order by score desc ),
row_number() over (partition by subject order by score desc ),
percent_rank() over (partition by subject order by score desc )
from score2;
-- Hive: rows-to-columns (explode an array / map)
-- Create the table and load data.
-- Fixed: the original dropped the table AFTER creating it and then loaded
-- into the dropped table; drop-if-exists now comes first.
drop table if exists testarray2;
create table testArray2(
name string,
weight array<string>
)row format delimited
fields terminated by '\t'
COLLECTION ITEMS terminated by ',';
load data local inpath'/usr/local/soft/atao_file/testArray.txt' into table testarray2 ;
select * from testarray2 ;
-- lateral view explode: one output row per array element, keeping name
select name,col1 from testarray2 lateral view explode(weight) t1 as col1;
select explode(weight) from testarray2;
-- exploding a map yields (key, value) pairs (fixed: missing semicolon)
select explode(map('key1',1,'key2',2,'key3',3)) as (key,value);
select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
select value from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
select name,col1,col2 from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
-- posexplode additionally returns each element's index within the array
select name,pos,col1 from testarray2 lateral view posexplode(weight) t1 as pos,col1;
-- Hive: columns-to-rows (collapse rows into an array)
-- Create the table and load the data
create table testLieToLine(
name string,
col1 int
)row format delimited
fields terminated by '\t';
load data local inpath '/usr/local/soft/atao_file/testLieToLine' into table testLieToLine;
select * from testlietoline; -- fixed: missing semicolon broke the next statement
-- collect_list(col) keeps duplicates; collect_set(col) deduplicates
select name,collect_list(col1) from testlietoline group by name;
-- join the collected values with '|'; concat_ws needs strings, hence the cast
select name,concat_ws('|',collect_list(cast(col1 as string))) from testlietoline group by name;
-- NOTE(review): table f is only created further down this script, so this
-- select fails when the file runs top to bottom; commented out (same query
-- appears again after f is created).
-- select * from f;
-- Key topic: the "consecutive activity" problem.
-- Common in e-commerce / logistics / banking: total amount over consecutive
-- trading days, streak length, streak start/end dates, gap between streaks.
-- Create the table and load the data
create table deal_tb(
id string
,datestr string
,amount string
)row format delimited fields terminated by ',';
load data local inpath '/usr/local/soft/atao_file/deal_tb' into table deal_tb;
select * from deal_tb;
-- Reference solution
-- Step 1: aggregate to one row per user per day (fixed: missing semicolon)
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr;
-- Step 2: number each user's days in date order (fixed: missing semicolon)
SELECT t1.id AS id, t1.datestr AS datestr, t1.sum_amount AS sum_amount, row_number() OVER (PARTITION BY t1.id ORDER BY t1.datestr) AS rn
FROM (
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr
) t1;
-- Step 3: date - row_number is constant within a consecutive run, so it
-- serves as the streak key (fixed: missing semicolon)
SELECT tt1.id AS id, tt1.datestr AS datestr, tt1.sum_amount AS sum_amount
, date_sub(tt1.datestr, tt1.rn) AS grp
FROM (
SELECT t1.id AS id, t1.datestr AS datestr, t1.sum_amount AS sum_amount, row_number() OVER (PARTITION BY t1.id ORDER BY t1.datestr) AS rn
FROM (
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr
) t1
) tt1;
-- Step 4: per (user, streak key): total amount, streak length, start/end
-- dates, and the difference to the previous streak's key as a gap indicator
SELECT ttt1.id, ttt1.grp
, round(sum(ttt1.sum_amount), 2) AS user_sum_amount
, count(1) AS user_days
, min(ttt1.datestr) AS user_start_date
, max(ttt1.datestr) AS user_end_date
, datediff(ttt1.grp, lag(ttt1.grp, 1) OVER (PARTITION BY ttt1.id ORDER BY ttt1.grp)) AS interval_days
FROM (
SELECT tt1.id AS id, tt1.datestr AS datestr, tt1.sum_amount AS sum_amount
, date_sub(tt1.datestr, tt1.rn) AS grp
FROM (
SELECT t1.id AS id, t1.datestr AS datestr, t1.sum_amount AS sum_amount, row_number() OVER (PARTITION BY t1.id ORDER BY t1.datestr) AS rn
FROM (
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr
) t1
) tt1
) ttt1
GROUP BY ttt1.id, ttt1.grp;
-- Walk-through of the same solution, step by step
-- Step 1: group by user and date so each user has one row per day
select id,datestr,sum(amount) from deal_tb group by id,datestr ;
-- Step 2: row_number per user in date order; for consecutive days,
-- date minus row number stays constant, identifying the streak
select t.id,t.datestr,t.sum,row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t ;
-- datediff(end_date, start_date) = 0 means the same day
-- Step 3: compute the streak key.
-- Fixed: the original used day(datestr) - flag, which breaks whenever a
-- streak crosses a month boundary (day() restarts at 1); date_sub is correct
-- for any date range, matching the reference solution above.
select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from (select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1 ;
-- Step 4: streak length, start and end per (id, streak key)
select
ttt1.id,
count(1) over (partition by ttt1.id ,ttt1.flag),
min(ttt1.datestr) over (partition by ttt1.id,ttt1.flag order by ttt1.datestr ),
max(ttt1.datestr) over (partition by ttt1.id,ttt1.flag )
from (
select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from (select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1 ) ttt1 ;
-- ========================================================================================
-- One row per streak: length, start/end dates, and the previous streak's
-- end date (for computing the gap between streaks).
-- Fixed from the original:
--  * group by (id, streak key) replaces distinct over window output —
--    window functions run BEFORE distinct, so lag() saw duplicate rows;
--  * lag(end_time) is now partitioned by id and ordered by start_time
--    (it was partitioned by end_time itself, which always yields NULL);
--  * day(datestr) - rn replaced by date_sub(datestr, rn) so streaks
--    spanning a month boundary are detected correctly.
select
tttt1.id,
tttt1.sum_time,
tttt1.start_time,
tttt1.end_time,
lag(tttt1.end_time) over (partition by tttt1.id order by tttt1.start_time) last_end_time
from
(
select
ttt1.id id,
count(1) sum_time,
min(ttt1.datestr) start_time,
max(ttt1.datestr) end_time
from (
select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from (select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1 ) ttt1
group by ttt1.id, ttt1.flag ) tttt1 order by tttt1.id,tttt1.start_time;
-- Second attempt: streak total, length, start and end per (id, streak key).
-- Fixed from the original:
--  * trailing comma before FROM was a syntax error;
--  * day(datestr) - flag replaced by date_sub (month-boundary bug);
--  * every window column now has an explicit alias.
-- distinct works here because all selected columns are constant within a
-- (id, flag) partition.
select
distinct *
from
(
select
ttt1.id id,
sum(ttt1.sum) over (partition by ttt1.id,ttt1.flag) sum_amount,
count(1) over (partition by ttt1.id, ttt1.flag) days,
min(ttt1.datestr) over (partition by ttt1.id, ttt1.flag ) start_time,
max(ttt1.datestr) over (partition by ttt1.id, ttt1.flag) end_time
from
(select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from
(select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1) ttt1 ) tttt1 order by tttt1.id,tttt1.start_time;
-- Final result: per streak — total amount, streak length, start/end dates,
-- the streak key, and the previous streak's key (datediff of the two keys
-- indicates the gap between streaks).
-- Fixed from the original:
--  * group by replaces distinct: windows are evaluated BEFORE distinct, so
--    lag(flag) mostly returned the same streak's key from a duplicate row;
--  * lag() now has an explicit order by (it had none — nondeterministic);
--  * day(datestr) - rn replaced by date_sub (month-boundary bug);
--  * dangling trailing comma removed.
select
ttt1.id id,
round(sum(ttt1.sum),2) sum_amount,
count(1) days,
min(ttt1.datestr) start_time,
max(ttt1.datestr) end_time,
ttt1.flag,
lag(ttt1.flag) over (partition by ttt1.id order by ttt1.flag) last_flag
from
(select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from
(select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1) ttt1
group by ttt1.id, ttt1.flag
order by id, start_time;
-- Homework
-- SQL: in Hive, output the trailing-7-day average cost in the format below
-- output format:
-- 2018-06-01~2018-06-07 12.29
-- ...
-- 2018-08-10~2018-08-16 80.67
create table f
(
date_time string,
cost string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- load the data
load data local inpath'/usr/local/soft/atao_file/f' into table f;
select * from f;
-- First attempt at the 7-day average.
-- Fixed from the original: 'rows 6 preceding' was misspelled 'row 6 perceding'
-- (syntax error), and the patterns 'yyyymmdd'/'yyyy-mm-dd' used mm = MINUTES
-- instead of MM = month. Raw date_time looks like '2018/6/1' (not zero-padded).
select avg(f.cost) over (order by from_unixtime(unix_timestamp(f.date_time,'yyyy/MM/dd'),'yyyy-MM-dd') rows 6 preceding) from f;
-- sample raw value: 2018/6/1
-- Zero-padding probes (the bare result literals after the semicolons in the
-- original were syntax errors; moved into comments).
-- when length = 8 (e.g. '2018/6/1'):
select substring(f.date_time,1,4) from f; -- 2018
select concat('0',substring(f.date_time,6,1)) from f; -- 06
select concat('0',substring(f.date_time,8,1)) from f; -- 01
-- when length = 9 (e.g. '2018/6/15'):
select substring(f.date_time,1,4) from f; -- 2018
select concat('0',substring(f.date_time,6,1)) from f; -- 06
select substring(f.date_time,8,2) from f; -- 15
-- string length decides which padding branch applies
select LENGTH(f.date_time) from f;
select if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))) from f;
-- NOTE(review): the padding assumes the month is always one digit; a value
-- like '2018/12/1' would be mangled — confirm against the data.
-- Final query: for each day, the "start~end" label of the trailing 7-day
-- window plus the rounded average cost over that window.
-- Fixed from the original:
--  * 'yyyymmdd'/'yyyy-mm-dd' used mm (MINUTES) instead of MM (month), so
--    every normalized date collapsed into January;
--  * the label printed "end~start"; the required sample output
--    2018-06-01~2018-06-07 puts the earlier date first;
--  * a CTE replaces three identical copies of the normalization expression.
with norm as (
select
from_unixtime(unix_timestamp(if(length(f.date_time) = 8,
concat(substring(f.date_time,1,4),'0',substring(f.date_time,6,1),'0',substring(f.date_time,8,1)),
concat(substring(f.date_time,1,4),'0',substring(f.date_time,6,1),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd') dt,
cost
from f
)
select concat(date_sub(dt,6),'~',dt),
round(avg(cost) over (order by dt rows 6 preceding),2)
from norm;
-- normalized 'yyyy-MM-dd' date on its own
select from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd')
from f;
-- normalized date minus six days (the window's start)
select date_sub(from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd'),6)
from f;
-- the "start~end" label on its own
select concat(date_sub(from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd'),6),'~',
from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd')) from f;
-- the zero-padded 'yyyyMMdd' string on its own
-- NOTE(review): the padding assumes a single-digit month ('2018/12/1' would
-- be mangled); parsing with unix_timestamp(date_time,'yyyy/MM/dd') directly
-- is a simpler alternative — confirm against the data.
select if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))) from f;