-- Difference between count(*), count(1), count(column)
-- count(*) counts every row, including rows whose columns are all NULL.
select count(*) from score; -- NOTE(review): original note claimed this is slowest; modern Hive treats count(*) ~ count(1) -- verify with explain
select count(1) from score; -- count(1) also counts every row (the literal 1 is never NULL)
select count (score.score) from score; -- count(col) skips NULL values of col; empty string and 0 are NOT NULL and are counted
-- Logical execution order of a Hive query:
-- from -> join -> where -> group by -> having -> select (incl. over()/distinct) -> order by -> limit
-- In Hive WHERE, in / not in / exists / not exists subqueries are supported (inequality subqueries historically were not)
-- Use explain to inspect the SQL execution plan
explain select count (score.score) from score;
-- Add 'extended' for the detailed execution plan
explain extended select count (score.score) from score;
-- A column in another database's table is reachable as db_name.table_name.column
select a.bonus from fen_biao_tong.emp a;
-- nvl(col, 0): substitute 0 for NULL before doing arithmetic
select nvl(a.bonus,0) from fen_biao_tong.emp a;
-- Conditional function: if(condition, value_if_true, value_if_false)
select if(1>2,666,888); -- returns 888
select if(1>0,if(-1>0,000,666),888); -- returns 666 (nested ifs evaluate inside-out)
select score ,if(score>95,'优秀',if(score>85,'良好',if(score>60,'及格','不及格')))from score;
-- case when — scenario 1: grade banding, same idea as the nested if() above
-- NOTE(review): the pass threshold here is 75 while the if() example used 60 — confirm which is intended
select score,
case when score>95 then '优秀'
when score>85 then '良好'
when score>75 then '及格'
else '不及格'
end as `等级`
from score;
-- scenario 2: case <expr> when <value> form — map specific names to nicknames
select
name,
case name when "施笑槐" then "槐ge"
when "吕金鹏" then "鹏ge"
when "单乐蕊" then "蕊jie"
else "算了,不叫了!!!"
end as `喊你一声你敢答应吗?`
from bigdata17.students2;
-- Date functions (essential)
-- Patterns are Java SimpleDateFormat: yyyy = year (YYYY is week-year!),
-- MM = month, mm = MINUTE, dd = day. The original used YYYY and mm, which
-- silently produce wrong dates; fixed to yyyy / MM throughout.
-- current date
select current_date; -- e.g. 2022-06-06
-- date n days BEFORE the given date (fixed: missing semicolon)
select date_sub(current_date,1); -- e.g. 2022-06-05
-- date n days AFTER the given date
select date_add('2000-01-01',10); -- 2000-01-11
-- days between two dates: first argument minus second
select datediff('2021-06-05','2020-06-04'); -- 366
-- current unix timestamp in seconds
select unix_timestamp(); -- e.g. 1654521341
-- datetime string -> date part
select to_date('2021-06-05 13:34:12');
-- unix timestamp -> formatted string in the current time zone (fixed: missing semicolon, YYYY -> yyyy)
select from_unixtime(1654521341,'yyyy-MM-dd');
select from_unixtime(unix_timestamp(),'yyyy-MM-dd');
-- any separator works in the output pattern
select from_unixtime(unix_timestamp(),'yyyy|MM|dd');
-- reformat a date given in an arbitrary format: parse to seconds, then re-format
-- '2021年01月14日' -> '2021-01-14' (fixed: mm -> MM, and output pattern matched to the stated target)
select unix_timestamp('2021年01月14日','yyyy年MM月dd日');
select from_unixtime(unix_timestamp('2021年01月14日','yyyy年MM月dd日'),'yyyy-MM-dd');
-- "04牛2021数加16逼" -> "2021/04/16" (fixed: mm -> MM, output uses '/' as the target shows)
select from_unixtime(unix_timestamp("04牛2021数加16逼","MM牛yyyy数加dd逼"),'yyyy/MM/dd');
-- String functions
select concat('123','456');
select concat('woshi','一个','大帅哥!!!'); -- fixed: missing semicolon
select concat('123','456',null); -- concat returns NULL if ANY argument is NULL
-- join strings with a custom separator
select concat_ws('**','a','v','c'); -- a**v**c
select concat_ws('+_+','00','^','^','00',null); -- concat_ws silently skips NULL arguments (fixed: missing semicolon)
-- cast(col as string): concat_ws only accepts strings (or arrays of strings)
select concat_ws("|",cast(id as string),name,cast(age as string),gender,clazz)
from bigdata17.students limit 10;
-- substring: HQL string positions start at 1; take 4 chars from position 1
select substring('abcdefg',1,4); -- abcd
-- '2021/01/14' -> '2021-01-14' by slicing and rejoining
select concat_ws("-",substring('2021/01/14',1,4),substring('2021/01/14',6,2),substring('2021/01/14',9,2));
-- prefer real date functions for date reformatting
select from_unixtime(unix_timestamp('2021/01/14','yyyy/MM/dd'),'yyyy-MM-dd');
-- split(str, ",") returns an array of the comma-separated pieces
select split("sdfs,werw,weger,werwetg,423w,werhe,erw,ewergw,777,777,777,888,88,888,833,33,",",");
-- array indexes start at 0; take the 4th element
select split("sdfs,werw,weger,werwetg,423w,werhe,erw,ewergw,777,777,777,888,88,888,833,33,",",")[3];
-- count occurrences of each substring: explode the array, then group
select b.a `字段名`,count(b.a) `字段的个数` from (select explode(split("sdfs,werw,weger,werwetg,423w,werhe,erw,ewergw,777,777,777,888,88,888,833,33,",",")) a) b
group by b.a order by `字段的个数`;
-- Create the demo table for the window-function examples below
create table t_fraction(
name string,
subject string,
score int)
row format delimited fields terminated by ","
lines terminated by '\n';
-- Load data from the local filesystem
load data local inpath'/usr/local/soft/atao_file/t_fraction' into table t_fraction;
select * from t_fraction;
-- over() with no partition/order: one big window over the whole table
select t_fraction.*,sum(score) over () from t_fraction; -- total sum repeated on every row
select t_fraction.*,min(score) over () from t_fraction; -- minimum
select t_fraction.*,max(score) over () from t_fraction; -- maximum
select t_fraction.*,avg(score) over () from t_fraction; -- average
select t_fraction.*,count(score) over () from t_fraction; -- count
select t_fraction.*,lag(score) over () from t_fraction; -- previous row's score; NOTE(review): with no order by, "previous row" is nondeterministic
-- Aggregations over different partitions and frames
-- partition by name
select name,subject,score,sum(score) over() as sum1, sum(score) over (partition by name ) as sum2 from t_fraction;
-- partition by subject
select name,subject,score,sum(score) over() as sum1, sum(score) over (partition by subject ) as sum2 from t_fraction;
-- frame: partition start up to the current row (running total)
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3
from t_fraction;
-- frame: previous row + current row
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3,
sum(score) over (partition by subject order by score rows between 1 preceding and current row) as sum4
from t_fraction;
-- frame: previous row + current row + next row
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3,
sum(score) over (partition by subject order by score rows between 1 preceding and current row) as sum4,
sum(score) over (partition by subject order by score rows between 1 preceding and 1 following) as sum5
from t_fraction;
-- frame: current row through the end of the partition
select name,subject,score,sum(score) over() as sum1,
sum(score) over (partition by subject ) as sum2,
sum(score) over (partition by subject order by score rows between unbounded preceding and current row ) as sum3,
sum(score) over (partition by subject order by score rows between 1 preceding and current row) as sum4,
sum(score) over (partition by subject order by score rows between 1 preceding and 1 following) as sum5,
sum(score) over (partition by subject order by score rows between current row and unbounded following) as sum6
from t_fraction;
-- Aggregate window functions in practice
-- Create the table and load data.
-- Fixed: the original ran `drop table business` AFTER create and BEFORE load,
-- so the load targeted a dropped table; drop-if-exists now comes first so the
-- script is re-runnable top to bottom.
drop table if exists business;
create table business
(
name string,
orderdate string,
cost int
)ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
load data local inpath '/usr/local/soft/atao_file/business.txt' into table business;
select * from business;
-- Q1: customers who bought during 2017-04, plus the total number of such customers
select distinct name, date_format(orderdate,'yyyy-MM'),count(*) over ( ) from business where date_format(orderdate,'yyyy-MM') = '2017-04';
-- Q2: purchase details plus each customer's MONTHLY total
-- (fixed: the original partitioned by name only, which is an all-time total)
select name,orderdate,cost,sum(cost) over(partition by name, date_format(orderdate,'yyyy-MM')) from business;
-- Q3: running total of cost per customer in date order
select name,orderdate,sum(cost) over (partition by name order by date_format(orderdate,'yyyy-MM-dd') rows between unbounded preceding and current row ) from business;
-- Q4: each customer's previous purchase date
select name ,orderdate,lag(orderdate) over (partition by name order by date_format(orderdate,'yyyy-MM-dd')) from business;
-- Q5: orders in the earliest 20% of the time range (first of 5 ntile buckets)
select * from (select name,orderdate,cost,ntile(5) over(order by orderdate) sortgroup_num from business) t where t.sortgroup_num=1;
-- Hive: analyze student scores
-- Create the table and load the data
create table score2
(
name string,
subject string,
score int
) row format delimited fields terminated by "\t";
load data local inpath '/usr/local/soft/atao_file/score2' into table score2;
select * from score2;
-- Requirement 1: rank students within EACH SUBJECT.
-- Fixed: the original partitioned by name, which ranks each student's own
-- subjects instead of ranking students per subject.
-- RANK()         ties share a rank, next rank is skipped   (1,1,3)
-- DENSE_RANK()   ties share a rank, no gap afterwards      (1,1,2)
-- ROW_NUMBER()   unique sequence regardless of ties        (1,2,3)
-- PERCENT_RANK() (rank-1)/(rows in partition - 1): fraction of peers outranked
--                (added: the original comment documented it but never used it)
select name,subject,score,
rank() over (partition by subject order by score desc),
dense_rank() over (partition by subject order by score desc ),
row_number() over (partition by subject order by score desc ),
percent_rank() over (partition by subject order by score desc )
from score2;
-- Hive: rows-to-columns (explode an array / map)
-- Create the table and load data.
-- Fixed: the original dropped the table AFTER creating it and then loaded
-- into the dropped table; drop-if-exists now comes first.
drop table if exists testarray2;
create table testArray2(
name string,
weight array<string>
)row format delimited
fields terminated by '\t'
COLLECTION ITEMS terminated by ',';
load data local inpath'/usr/local/soft/atao_file/testArray.txt' into table testarray2 ;
select * from testarray2 ;
-- lateral view explode: one output row per array element, keeping name
select name,col1 from testarray2 lateral view explode(weight) t1 as col1;
select explode(weight) from testarray2;
-- exploding a map yields (key, value) pairs (fixed: missing semicolon)
select explode(map('key1',1,'key2',2,'key3',3)) as (key,value);
select key from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
select value from (select explode(map('key1',1,'key2',2,'key3',3)) as (key,value)) t;
select name,col1,col2 from testarray2 lateral view explode(map('key1',1,'key2',2,'key3',3)) t1 as col1,col2;
-- posexplode additionally returns each element's index within the array
select name,pos,col1 from testarray2 lateral view posexplode(weight) t1 as pos,col1;
-- Hive: columns-to-rows (collapse rows into an array)
-- Create the table and load the data
create table testLieToLine(
name string,
col1 int
)row format delimited
fields terminated by '\t';
load data local inpath '/usr/local/soft/atao_file/testLieToLine' into table testLieToLine;
select * from testlietoline; -- fixed: missing semicolon broke the next statement
-- collect_list(col) keeps duplicates; collect_set(col) deduplicates
select name,collect_list(col1) from testlietoline group by name;
-- join the collected values with '|'; concat_ws needs strings, hence the cast
select name,concat_ws('|',collect_list(cast(col1 as string))) from testlietoline group by name;
-- NOTE(review): table f is only created further down this script, so this
-- select fails when the file runs top to bottom; commented out (same query
-- appears again after f is created).
-- select * from f;
-- Key topic: the "consecutive activity" problem.
-- Common in e-commerce / logistics / banking: total amount over consecutive
-- trading days, streak length, streak start/end dates, gap between streaks.
-- Create the table and load the data
create table deal_tb(
id string
,datestr string
,amount string
)row format delimited fields terminated by ',';
load data local inpath '/usr/local/soft/atao_file/deal_tb' into table deal_tb;
select * from deal_tb;
-- Reference solution
-- Step 1: aggregate to one row per user per day (fixed: missing semicolon)
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr;
-- Step 2: number each user's days in date order (fixed: missing semicolon)
SELECT t1.id AS id, t1.datestr AS datestr, t1.sum_amount AS sum_amount, row_number() OVER (PARTITION BY t1.id ORDER BY t1.datestr) AS rn
FROM (
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr
) t1;
-- Step 3: date - row_number is constant within a consecutive run, so it
-- serves as the streak key (fixed: missing semicolon)
SELECT tt1.id AS id, tt1.datestr AS datestr, tt1.sum_amount AS sum_amount
, date_sub(tt1.datestr, tt1.rn) AS grp
FROM (
SELECT t1.id AS id, t1.datestr AS datestr, t1.sum_amount AS sum_amount, row_number() OVER (PARTITION BY t1.id ORDER BY t1.datestr) AS rn
FROM (
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr
) t1
) tt1;
-- Step 4: per (user, streak key): total amount, streak length, start/end
-- dates, and the difference to the previous streak's key as a gap indicator
SELECT ttt1.id, ttt1.grp
, round(sum(ttt1.sum_amount), 2) AS user_sum_amount
, count(1) AS user_days
, min(ttt1.datestr) AS user_start_date
, max(ttt1.datestr) AS user_end_date
, datediff(ttt1.grp, lag(ttt1.grp, 1) OVER (PARTITION BY ttt1.id ORDER BY ttt1.grp)) AS interval_days
FROM (
SELECT tt1.id AS id, tt1.datestr AS datestr, tt1.sum_amount AS sum_amount
, date_sub(tt1.datestr, tt1.rn) AS grp
FROM (
SELECT t1.id AS id, t1.datestr AS datestr, t1.sum_amount AS sum_amount, row_number() OVER (PARTITION BY t1.id ORDER BY t1.datestr) AS rn
FROM (
SELECT id, datestr, sum(amount) AS sum_amount
FROM deal_tb
GROUP BY id, datestr
) t1
) tt1
) ttt1
GROUP BY ttt1.id, ttt1.grp;
-- Walk-through of the same solution, step by step
-- Step 1: group by user and date so each user has one row per day
select id,datestr,sum(amount) from deal_tb group by id,datestr ;
-- Step 2: row_number per user in date order; for consecutive days,
-- date minus row number stays constant, identifying the streak
select t.id,t.datestr,t.sum,row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t ;
-- datediff(end_date, start_date) = 0 means the same day
-- Step 3: compute the streak key.
-- Fixed: the original used day(datestr) - flag, which breaks whenever a
-- streak crosses a month boundary (day() restarts at 1); date_sub is correct
-- for any date range, matching the reference solution above.
select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from (select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1 ;
-- Step 4: streak length, start and end per (id, streak key)
select
ttt1.id,
count(1) over (partition by ttt1.id ,ttt1.flag),
min(ttt1.datestr) over (partition by ttt1.id,ttt1.flag order by ttt1.datestr ),
max(ttt1.datestr) over (partition by ttt1.id,ttt1.flag )
from (
select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from (select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1 ) ttt1 ;
-- ========================================================================================
-- One row per streak: length, start/end dates, and the previous streak's
-- end date (for computing the gap between streaks).
-- Fixed from the original:
--  * group by (id, streak key) replaces distinct over window output —
--    window functions run BEFORE distinct, so lag() saw duplicate rows;
--  * lag(end_time) is now partitioned by id and ordered by start_time
--    (it was partitioned by end_time itself, which always yields NULL);
--  * day(datestr) - rn replaced by date_sub(datestr, rn) so streaks
--    spanning a month boundary are detected correctly.
select
tttt1.id,
tttt1.sum_time,
tttt1.start_time,
tttt1.end_time,
lag(tttt1.end_time) over (partition by tttt1.id order by tttt1.start_time) last_end_time
from
(
select
ttt1.id id,
count(1) sum_time,
min(ttt1.datestr) start_time,
max(ttt1.datestr) end_time
from (
select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from (select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1 ) ttt1
group by ttt1.id, ttt1.flag ) tttt1 order by tttt1.id,tttt1.start_time;
-- Second attempt: streak total, length, start and end per (id, streak key).
-- Fixed from the original:
--  * trailing comma before FROM was a syntax error;
--  * day(datestr) - flag replaced by date_sub (month-boundary bug);
--  * every window column now has an explicit alias.
-- distinct works here because all selected columns are constant within a
-- (id, flag) partition.
select
distinct *
from
(
select
ttt1.id id,
sum(ttt1.sum) over (partition by ttt1.id,ttt1.flag) sum_amount,
count(1) over (partition by ttt1.id, ttt1.flag) days,
min(ttt1.datestr) over (partition by ttt1.id, ttt1.flag ) start_time,
max(ttt1.datestr) over (partition by ttt1.id, ttt1.flag) end_time
from
(select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from
(select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1) ttt1 ) tttt1 order by tttt1.id,tttt1.start_time;
-- Final result: per streak — total amount, streak length, start/end dates,
-- the streak key, and the previous streak's key (datediff of the two keys
-- indicates the gap between streaks).
-- Fixed from the original:
--  * group by replaces distinct: windows are evaluated BEFORE distinct, so
--    lag(flag) mostly returned the same streak's key from a duplicate row;
--  * lag() now has an explicit order by (it had none — nondeterministic);
--  * day(datestr) - rn replaced by date_sub (month-boundary bug);
--  * dangling trailing comma removed.
select
ttt1.id id,
round(sum(ttt1.sum),2) sum_amount,
count(1) days,
min(ttt1.datestr) start_time,
max(ttt1.datestr) end_time,
ttt1.flag,
lag(ttt1.flag) over (partition by ttt1.id order by ttt1.flag) last_flag
from
(select tt1.id id,tt1.datestr datestr, tt1.sum sum, date_sub(tt1.datestr,tt1.flag) flag from
(select t.id id,t.datestr datestr,t.sum sum,
row_number() over (partition by id order by datestr) flag from
(select id,datestr,sum(amount) sum from deal_tb group by id,datestr) t) tt1) ttt1
group by ttt1.id, ttt1.flag
order by id, start_time;
-- Homework
-- SQL: in Hive, output the trailing-7-day average cost in the format below
-- output format:
-- 2018-06-01~2018-06-07 12.29
-- ...
-- 2018-08-10~2018-08-16 80.67
create table f
(
date_time string,
cost string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- load the data
load data local inpath'/usr/local/soft/atao_file/f' into table f;
select * from f;
-- First attempt at the 7-day average.
-- Fixed from the original: 'rows 6 preceding' was misspelled 'row 6 perceding'
-- (syntax error), and the patterns 'yyyymmdd'/'yyyy-mm-dd' used mm = MINUTES
-- instead of MM = month. Raw date_time looks like '2018/6/1' (not zero-padded).
select avg(f.cost) over (order by from_unixtime(unix_timestamp(f.date_time,'yyyy/MM/dd'),'yyyy-MM-dd') rows 6 preceding) from f;
-- sample raw value: 2018/6/1
-- Zero-padding probes (the bare result literals after the semicolons in the
-- original were syntax errors; moved into comments).
-- when length = 8 (e.g. '2018/6/1'):
select substring(f.date_time,1,4) from f; -- 2018
select concat('0',substring(f.date_time,6,1)) from f; -- 06
select concat('0',substring(f.date_time,8,1)) from f; -- 01
-- when length = 9 (e.g. '2018/6/15'):
select substring(f.date_time,1,4) from f; -- 2018
select concat('0',substring(f.date_time,6,1)) from f; -- 06
select substring(f.date_time,8,2) from f; -- 15
-- string length decides which padding branch applies
select LENGTH(f.date_time) from f;
select if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))) from f;
-- NOTE(review): the padding assumes the month is always one digit; a value
-- like '2018/12/1' would be mangled — confirm against the data.
-- Final query: for each day, the "start~end" label of the trailing 7-day
-- window plus the rounded average cost over that window.
-- Fixed from the original:
--  * 'yyyymmdd'/'yyyy-mm-dd' used mm (MINUTES) instead of MM (month), so
--    every normalized date collapsed into January;
--  * the label printed "end~start"; the required sample output
--    2018-06-01~2018-06-07 puts the earlier date first;
--  * a CTE replaces three identical copies of the normalization expression.
with norm as (
select
from_unixtime(unix_timestamp(if(length(f.date_time) = 8,
concat(substring(f.date_time,1,4),'0',substring(f.date_time,6,1),'0',substring(f.date_time,8,1)),
concat(substring(f.date_time,1,4),'0',substring(f.date_time,6,1),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd') dt,
cost
from f
)
select concat(date_sub(dt,6),'~',dt),
round(avg(cost) over (order by dt rows 6 preceding),2)
from norm;
-- normalized 'yyyy-MM-dd' date on its own
select from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd')
from f;
-- normalized date minus six days (the window's start)
select date_sub(from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd'),6)
from f;
-- the "start~end" label on its own
select concat(date_sub(from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd'),6),'~',
from_unixtime(unix_timestamp(if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))),'yyyyMMdd'),'yyyy-MM-dd')) from f;
-- the zero-padded 'yyyyMMdd' string on its own
-- NOTE(review): the padding assumes a single-digit month ('2018/12/1' would
-- be mangled); parsing with unix_timestamp(date_time,'yyyy/MM/dd') directly
-- is a simpler alternative — confirm against the data.
select if(LENGTH(f.date_time) = 8,
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),concat('0',substring(f.date_time,8,1))),
concat(substring(f.date_time,1,4),concat('0',substring(f.date_time,6,1)),substring(f.date_time,8,2))) from f;