-- Classroom Hive SQL examples - final version
-- Quick sanity-check queries against t_user.
select * from t_user;
select age, count(*) from t_user group by age;
-- Enable local mode (run small jobs inside the Hive JVM instead of submitting MR to the cluster).
-- FIX: the original set this to FALSE, which contradicts the "enable local mode" note;
-- the threshold settings below only take effect when auto local mode is on.
set hive.exec.mode.local.auto=true;
set hive.exec.mode.local.auto.inputbytes.max=50000000; -- max total input bytes (~50 MB) for local mode
set hive.exec.mode.local.auto.input.files.max=10;      -- max number of input files for local mode
set hive.mapred.local.mem = 10000;                     -- memory for local-mode child JVMs
select id, count(*) from t_user group by id;
/*
1,张三,8000.0,2019-9-9,1,抽烟-喝酒-烫头,中国银行|123456-建设银行|22334455,北京-10010
2,李四,9000.0,2019-8-9,0,抽烟-喝酒-烫头,中国银行|123456-建设银行|22334455,郑州-45000
3,王五,7000.0,2019-7-9,1,喝酒-烫头,中国银行|123456-建设银行|22334455,北京-10010
4,赵6,100.0,2019-10-9,0,抽烟-烫头,中国银行|123456-建设银行|22334455,郑州-45000
5,于谦,1000.0,2019-10-9,0,抽烟-喝酒,中国银行|123456-建设银行|22334455,北京-10010
6,郭德纲,1000.0,2019-10-9,1,抽烟-烫头,中国银行|123456-建设银行|22334455,天津-20010
*/
create table t_person (
id string, -- user id
name string, -- user name
salary double, -- salary
birthday date, -- date of birth
sex char(1), -- gender flag (sample data uses 1/0)
hobbies array<string>, -- hobbies
cards map<string,string>, -- bank cards: bank name -> card number
addr struct<city:string,zipCode:string> -- address (city + zip code)
)row format delimited -- the lines below configure the delimiters
fields terminated by ',' -- separator between columns [important]
COLLECTION items terminated by '-' -- separator between array elements, between map entries, and between struct members
map keys terminated by '|' -- separator between a map key and its value
LINES terminated by '\n'; -- separator between rows
-- With LOCAL the path is read from the Linux host running the Hive server.
-- Adding OVERWRITE replaces the table contents: all existing rows are wiped first - dangerous.
load data local inpath '/opt/data/person.txt' overwrite into table t_person;
-- Without LOCAL the path refers to HDFS (the file is moved into the table directory).
load data inpath '/person.txt' into table t_person;
-- hdfs dfs -mv /person.txt /user/hive/wareh...
-- SQL的规范:
-- 1. 书写规范(顺序)
-- select ... from ... where ... group by ... having ... order by ... limit ...
-- 2. 执行顺序(SQL)
-- from ... where ... group by ... having ... select ... order by ... limit
--select * from (
--select sex sx,count(*) per_ct from t_person group by sex having count(*) > 2
--) c where c.per_ct > 2
-- Querying complex column types: array indexing, map key lookup, struct field access.
select hobbies[0], a.cards['建设银行'], addr.city, sex, name
from t_person a
where addr.city = '郑州' and array_contains(hobbies, '抽烟');
-- How WHERE works:
-- every expression written after WHERE must evaluate to a boolean.
select * from t_person where length(name) = 2;
select * from t_person where 1 = 1;
select * from t_person where name like '赵%';
select * from t_person where salary between 1000 and 7000; -- salary >= 1000 and salary <= 7000
/*
String sex ;
sex = 1;
if (sex == 1 )
{
输出
}
*/
-- order by sorts ascending by default; when several columns should sort descending,
-- each one needs its own desc keyword.
select * from t_person order by salary desc ,sex desc ,id desc ;
-- limit: top-N rows
select * from t_person limit 3;
-- Deduplication
-- two options: distinct and group by
select count(distinct salary) from t_person ;
select salary from t_person group by salary;
-- Table joins: department table
create table dept(
deptno int,
dname string,
loc string
)
row format delimited fields terminated by '\t';
-- Employee table:
create table emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by '\t';
-- Load the data
load data local inpath '/opt/data/dept.txt' into table dept;
load data local inpath '/opt/data/emp.txt' into table emp;
-- Verify the data
select * from dept;
select * from emp;
-- Requirement: the department name of every employee
select * from dept;
select * from emp;
-- Left join:
-- a JOIN written without an ON condition degenerates into a Cartesian product
-- (row count = left rows x right rows).
-- With LEFT JOIN the left table drives the result: its rows always appear, and a row
-- repeats once per matching row on the right.
-- When the right table has no match, all of its output columns are NULL.
-- left join
select t1.*, t2.dname from emp t1 left join dept t2 on t1.deptno = t2.deptno;
-- right join
select * from emp t1 right join dept t2 on t1.deptno = t2.deptno;
-- inner join
select * from emp t1 join dept t2 on t1.deptno = t2.deptno;
-- GROUP BY
-- Grouping on a column produces one output row per distinct value of that column.
-- In a grouped query the select list may only contain:
-- 1. the grouping columns
-- 2. aggregate functions
-- 3. constant values
select
deptno -- the grouping column itself
,count(*) -- head count per department (counts rows)
,count(comm) -- employees with a commission: count(col) skips NULLs
,123 -- a constant (literal) is allowed; it is only materialized at output time
from emp group by deptno
having count(*) > 3; -- select + group by produce an intermediate result, then having filters the groups
-- expressions after HAVING compare an aggregate result against some value
-- List every function registered in this Hive session.
show FUNCTIONS ;
-- Functions can be called without a FROM clause.
select abs(-10),'abc' ;
-- Short / extended help text for a single function.
desc function abs;
desc function extended sum;
-- 1. array_contains(column, value)
select hobbies from t_person a where array_contains(a.hobbies, '抽烟');
-- 2. length(column)
-- NOTE: a bare * cannot follow another select item; qualify it as a.*
select length(a.name), a.* from t_person a;
-- 3. concat(col, col) - string concatenation [important]
select concat(name,'的工资是:',salary) from t_person;
-- 4. year(date), month(date) - date part extraction
select year(birthday ),month(birthday ) from t_person a;
-- 5. date_add(date, n) / date_sub(date, n) - date arithmetic [important]
select date_add(a.birthday ,-25),a.birthday from t_person a;
-- 6. current date
select sex,count(*),current_date() from t_person group by sex order by count(*);
-- 7. datediff - number of days between two dates
-- days between today and 2023-01-01
select datediff(current_date(),'2023-01-01');
-- 8. nvl(value, default_value) [important]
-- returns default_value when value is NULL, otherwise returns value itself
select nvl(comm,0) as cm, emp.* from emp; -- ifnull
-- hive_day2 homework
/*
-- 在本案例中总共有三份数据,分别是:
-- (1)users.dat 数据格式为: 2,M,56,16,70072
-- 对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
-- 用户id 性别 年龄 职业 城市邮编
-- (2)movies.dat 数据格式为: 2,Jumanji (1995),Adventure|Children's|Fantasy
-- 对应字段为:MovieID BigInt, Title String, Genres String
-- 电影id 标题 类型
-- (3)ratings.dat 数据格式为: 1,1193,5,978300760
-- 对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
-- 用户id 电影id 评分 时间戳
*/
-- Dimension table: one row per user.
create table users(
UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday2/users.dat' into table users;
-- Dimension table: one row per movie.
create table movies(
MovieID BigInt, Title String, Genres String
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday2/movies.dat' into table movies;
-- Fact table: one row per (user, movie) rating event.
create table ratings(
UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday2/ratings.dat' into table ratings;
-- Sanity checks.
select * from users;
select * from movies;
select * from ratings;
/*
2. 求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数) order by limit
提示:按照电影名进行分组统计,求出每部电影的评分次数并按照评分次数降序排序
*/
-- Top 10 most-rated movies: join each rating to its title, count ratings per title,
-- then keep the 10 largest counts.
select
    m.Title,
    count(*) ct
from ratings r
left join movies m
    on r.movieid = m.MovieID
group by m.Title
order by ct desc
limit 10;
/*
3. 分别求男性,女性当中评分(评分最多)的10部电影(性别,电影名,影评分)
-- 女生和男生数据分别查询即可,男生一个select 女生一个select
提示:三表联合查询,按照性别过滤条件,电影名作为分组条件,影评分作为排序条件进行查询
*/
select * from users;
select * from movies;
-- Male top-10: ratings joined to users (gender filter) and movies (title).
select * from
(
select 'M',t3.title,count(*) ct from ratings t1
left join users t2 on t1.userid = t2.UserID
left join movies t3 on t1.movieid = t3.movieid
where t2.gender = 'M' group by t3.title
order by ct desc limit 10
) c1
union all -- a LIMIT-terminated select cannot sit directly before UNION; UNION also requires matching column count and types on both sides
-- Female top-10, same shape as the male branch.
select * from
(
select 'F',t3.title,count(*) ct from ratings t1
left join users t2 on t1.userid = t2.UserID
left join movies t3 on t1.movieid = t3.movieid
where t2.gender = 'F' group by t3.title
order by ct desc limit 10) c2
;
/*
4. 求movieid = 2116这部电影各年龄的平均影评(年龄,影评分)
提示:t_user和t_rating表进行联合查询,用movieid=2116作为过滤条件,用年龄作为分组条件
*/
-- Average rating of movie 2116 per age group: join ratings to users,
-- filter on the movie id, group by age.
select t2.age, avg(Rating) from ratings t1
left join users t2 on t1.userid = t2.userid
where movieid = 2116
group by t2.age;
-- Aggregate functions
-- max / min / sum / avg / count
-- max: largest value in the group (whole table when there is no GROUP BY)
select sex, max(salary) from t_person group by sex; -- highest salary per gender
-- min: smallest value in the group (whole table when there is no GROUP BY)
select sex, min(salary) from t_person group by sex;
-- sum: total of the group; the summed column must be numeric
select sex, sum(salary) from t_person group by sex;
--..
-- collect_list / collect_set
create table t_visit_video (
username string,
video_name string,
video_date date
)row format delimited fields terminated by ',';
load data local inpath '/opt/data/visit_data.txt' into table t_visit_video;
-- Requirement: which videos has each user watched?
-- collect_list gathers the group's values into an array (duplicates kept)
select username,collect_list(video_name) from t_visit_video group by username ;
-- collect_set gathers the group's values into an array with duplicates removed
select username,collect_set(video_name) from t_visit_video group by username ;
-- concat_ws joins an array into one string using the given separator
select concat_ws('|',b.hobbies ) from t_person b;
select username,concat_ws(',',collect_list(video_name)) from t_visit_video group by username ;
-- explode: turns an array (or map) into one row per element
select * from t_person;
select distinct hb from (
select explode(a.hobbies) hb from t_person a
) c;
-- lateral view joins the exploded rows back to their source row
select t1.*,t2.hobby from t_person t1
lateral view explode(hobbies) t2 as hobby;
-- Requirement: how many people like each hobby?
select t2.hobby,count(*) from t_person t1
lateral view explode(hobbies) t2 as hobby -- behaves like a per-row derived table of hobbies
group by t2.hobby
;
desc function extended split;
-- Character frequency: split the string into single characters, explode, count per character.
select c.cr ,count(*) from
(
select explode(split('ajkldskfjalidjfojvjalkdjfqifojmnalsdjfkfjglskdjfg','')) as cr
) c group by c.cr;
-- Subqueries
-- Requirement: which people in t_person earn more than the average salary?
select * from t_person where salary > (
select avg(salary) from t_person -- single-row, single-column value: the table-wide average
);
-- Requirement: show everyone's info together with the average salary
-- NOTE(review): Hive traditionally only supports subqueries in FROM/WHERE; a scalar
-- subquery in the select list may not parse on all Hive versions - verify on the target version.
select *,salary - (select avg(salary) from t_person) -- table-wide average
,(select avg(salary) from t_person) -- table-wide average
from t_person ;
-- Sorting
-- order by
select * from t_person order by salary; -- global sort, ascending by default (first row smallest, last row largest)
select * from t_person order by name; --[asc] -- strings sort too, not only numbers
select * from t_person order by salary desc; -- desc flips the order to descending
-- sort by
-- unlike order by, sort by is NOT a global sort: rows are ordered within each reducer partition
-- job.setNumReduceTasks = 2
-- session-level setting: how many reducers the generated MR job uses, i.e. how many partitions
set mapreduce.job.reduces = 3;
-- insert overwrite local directory exports the query result onto the local server
insert overwrite local directory '/opt/data/soutby'
select * from t_person sort by salary;
-- Why use sort by?
-- With very large data a single reducer becomes the bottleneck of a global sort;
-- sort by spreads the work across several reducers, each sorting its own partition.
select * from t_person sort by salary desc limit 2 ;
-- distribute by controls which reducer each row is sent to
insert overwrite local directory '/opt/data/dis_sortby'
select salary ,',',sex from t_person distribute by sex sort by salary;
-- Window functions:
create table t_person_wd (
fname string,
fcity string,
fage int,
fsalary int
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/kaichuang.txt' into table t_person_wd;
-- row_number() over(partition-spec, order-spec)
select *,row_number() over() rn from t_person_wd ;
-- with a partition spec and an order spec
select *,
row_number() over(partition by fcity) rn ,
row_number() over(partition by fcity order by fsalary ) rn2
from t_person_wd;
-- Requirement: using a window function, pick the highest-paid person of each city
-- row_number used for top-1-per-group dedup
select * from
(
select *,ROW_NUMBER() over (partition by fcity order by fsalary desc) rn
from t_person_wd
) t1
where t1.rn = 1;
-- rank() over(): ranking
-- without partition/order specs, rank treats every row as its own group
-- given an order spec, it ranks the whole table
-- ties share a rank but still consume rank numbers, so ranks are skipped after a tie;
-- the last rank equals the group's row count
select *,rank() over() from t_person_wd;
select
*,
rank() over(order by fsalary ),
rank() over(partition by fcity order by fsalary )
from t_person_wd;
-- dense_rank() over(order by ...)
-- ties share a rank and no numbers are skipped: ranks increase by 1
select
*,
dense_rank() over(order by fsalary ),
rank() over(order by fsalary )
-- dense_rank() over(partition by fcity order by fsalary )
from t_person_wd;
-- Employees with a department id; used for the per-department top-N exercise.
CREATE TABLE `employee` (
`id` int ,
`name` varchar(255) ,
`salary` int ,
`departmentid` int
) row format delimited
fields terminated by '\t';
load data local inpath '/opt/data/employee.txt' overwrite into table employee;
select * from employee;
CREATE TABLE `department` (
`id` int ,
`name` varchar(255)
) row format delimited
fields terminated by '\t';
load data local inpath '/opt/data/department.txt' overwrite into table department;
select * from department;
-- Top 3 salaries per department (dense_rank keeps ties without gaps).
select * from
(select *,DENSE_RANK() over(partition by departmentid order by salary desc) rk from employee ) a
where rk <=3
;
-- NOTE(review): tmp_t3 is not created anywhere in this file; this assumes a ranked
-- temp table with (id, rank_sa) built elsewhere - verify before running.
select d.name ,t4.name as employee,t4.salary from tmp_t3 t3
left join employee t4 on t4.id = t3.id
left join department d on t4.departmentid = d.id
where t3.rank_sa < 3 ;
-- Requirement: show every person together with the max / avg / total salary of their city
select *
,max(fsalary) over (partition by fcity) -- max salary within the city
,avg(fsalary) over (partition by fcity)
,sum(fsalary) over (partition by fcity)
from t_person_wd;
select *
,sum(fsalary) over (partition by fcity order by fage ) -- running (cumulative) sum within the city
from t_person_wd;
-- Hive table categories:
-- Managed table: a plain "create table table_name"
--   Hive owns both the metadata (MySQL) and the data directory (HDFS)
select * from t_user;
-- Dropping a managed table removes its metadata AND its HDFS directory
drop table t_user;
-- External table: "create external table table_name"
--   Hive only manages the metadata
create external table t_personout(
id int,
name string,
salary double,
birthday date,
sex char(1),
hobbies array<string>,
cards map<string,string>,
addr struct<city:string,zipCode:string>
)row format delimited
fields terminated by ',' -- column separator
collection items terminated by '-' -- separator for array elements, struct members, and between map entries
map keys terminated by '|'
lines terminated by '\n'
location '/user/hive/warehouse/test_hive.db/t_personout';
load data local inpath '/opt/data/person.txt' into table t_personout;
select * from t_personout;
-- Dropping an external table removes only the MySQL metadata; the HDFS data stays untouched
drop table t_personout;
select * from t_personout; -- this now fails: the table metadata was removed by the drop above
create external table t_personout2(
id int,
name string,
salary double,
birthday date,
sex char(1),
hobbies array<string>,
cards map<string,string>,
addr struct<city:string,zipCode:string>
)row format delimited
fields terminated by ',' -- column separator
collection items terminated by '-' -- separator for array elements, struct members, and between map entries
map keys terminated by '|'
lines terminated by '\n'
location '/file';
select * from t_personout2;
create table tb_student (
Student_name String , Term int, Total_score int
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday3/student.txt' into table tb_student;
select * from tb_student order by term;
-- hive day3 homework, question 1 - two choices:
-- row_number: if several students tie for first place, keep just one of them
-- rank: if several students tie for first place, keep all of them
select * from (
select *,rank() over(partition by term order by total_score desc) rk from tb_student
) c where rk = 1;
-- hive day3 homework, question 2:
-- how many times did each student improve across terms 1-4:
-- a term counts as an improvement when its score beats the previous term's score
-- lag() over(): value of the previous row within the ordered partition
select student_name , -- case when total_score > nvl(last_sc,0) then 1 else 0 end
count( if(total_score > nvl(last_sc,999),total_score,null) )
,sum( if(total_score > nvl(last_sc,999),1,0) )
from (
select *,lag(total_score) over(partition by Student_name order by term) last_sc from tb_student
) c group by student_name;
-- the 999 sentinel makes the first term never count as an improvement
-- (assumes scores stay below 999 - verify against the data)
-- Partitioned tables:
-- what to keep in mind when creating one?
-- the partition column must NOT appear inside the create-table column list;
-- the column named in "partitioned by" still behaves as a regular queryable column
create table t_user_part(
id string,
name string,
birth date,
salary double
)
partitioned by(city string) -- partition column: one sub-directory per city value
row format delimited
fields terminated by ','
lines terminated by '\n';
select * from t_user_part;
-- FIX: loading into a partitioned table requires a partition spec
-- (the original omitted it; the query below expects city='bj')
load data local inpath '/opt/data/test_partition/bj.txt' into table t_user_part partition(city='bj');
load data local inpath '/opt/data/test_partition/zz.txt' into table t_user_part partition(city='zz');
-- filtering on the partition column prunes the scan to one directory
select * from t_user_part where city = 'bj';
alter table t_user_part drop partition (city='zz');
-- Bucketed tables
drop table stu_buck;
create table stu_buck(id int, name string)
clustered by(id) into 4 buckets
row format delimited fields terminated by ',';
set mapreduce.job.reduces = -1; -- -1 = default: normally 1 reducer, or one per bucket when writing a bucketed table
-- NOTE(review): a plain LOAD may not re-hash rows into buckets on older Hive versions;
-- insert ... select is the reliable way to populate a bucketed table - verify on the target version.
load data local inpath '/opt/data/std.txt' into table stu_buck;
desc formatted t_user_part;
-- Sampling queries:
-- the tablesample keyword turns a statement into a sampling query
-- configured via: bucket x out of y on col_name
-- bucket: sample by bucketing
-- x: which bucket to read
-- y: how many buckets to split the data into (bucket sizes are not guaranteed equal)
-- col_name: the column whose hash assigns rows to buckets
-- x must not exceed y: with 4 buckets there is no 5th bucket to read
select * from stu_buck tablesample(bucket 4 out of 4 on id );
select * from emp tablesample(bucket 4 out of 10 on empno );
-- User-defined functions (UDF)
-- list every function known to the session
show functions;
desc function day;
-- a UDF is a user-written piece of Java code registered as a SQL function
-- for a first-time import these two statements are enough:
add jar /opt/app/funcHello.jar;
create temporary function hello_udf as "function.MyUDF";
-- after recompiling the UDF it must be re-imported into the hive session to take effect:
-- drop first, then add again
delete jar /opt/app/funcHello.jar;
drop temporary function hello_udf;
add jar /opt/app/funcHello.jar;
create temporary function hello_udf as "function.MyUDF";
-- length()
select hello_udf(name,cast(salary as int)), t_person.* from t_person ; -- star qualified: bare * cannot follow another select item
show functions ;
desc function hello_udf;
-- Non-deterministic functions
delete jar /opt/app/funcHello.jar;
drop temporary function get_num;
add jar /opt/app/funcHello.jar;
create temporary function get_num as "function.NumberUDF";
select get_num() as num,uuid(), t_person.* from t_person;
-- @UDFType(deterministic = false) -- marks the function as non-deterministic
-- @UDFType(deterministic = true) -- marks the function as deterministic
-- deterministic functions may be cached: length('abc') -> 3 is reused for equal inputs
-- without calling evaluate() again
-- mark a function deterministic only when equal inputs always yield equal outputs
-- examples of non-deterministic functions:
select CURRENT_date() ,rand(), t_person.* from t_person;
-- User-defined table functions (UDTF)
-- UDTF is the function category that explode() belongs to
select myudtf('zs,lisi,wangwu',',') from t_person;
-- expected result:
/*
zs
lisi
wangwu
*/
show functions;
-- posexplode also emits the element position: [100,90,95] -> (0,100) (1,90) (2,95)
select posexplode(hobbies ) from t_person;
delete jar /opt/app/funcHello.jar;
drop temporary function my_zhalie;
add jar /opt/app/funcHello.jar;
create temporary function my_zhalie as "function.MyUDTF";
select my_zhalie('zs,lisi,wangwu',',');
select * from movies
lateral view my_zhalie(genres ,'\\|') t2 as genre,len;
delete jar /opt/app/funcHello.jar;
drop temporary function my_zhalie;
add jar /opt/app/funcHello.jar;
create temporary function jiami as "function.PhoneUDF";
desc function jiami;
Select jiami('13838384380') union all
Select jiami('13838384') union all
Select jiami('13838384cc') ;
show create table t_person;
select * from t_person ;
-- Flattened copy of t_person: one row per (person, hobby), filled below via explode.
CREATE TABLE `t_dwd_person`(
`id` string,
`name` string,
`salary` double,
`birthday` date,
`sex` char(1),
hobby String
);
insert into t_dwd_person
select a.id ,a.name ,a.salary , a.birthday ,a.sex,b.hobby from t_person a
lateral view explode(a.hobbies ) b as hobby;
-- Who drinks more, men or women?
select sex,count(if(hobby='喝酒' ,sex,null)) from t_dwd_person group by sex;
-- NOTE(review): t_dwd_person already exists (created above), so this CTAS will fail
-- unless the earlier table is dropped first - kept here only as the CTAS alternative.
create table t_dwd_person as
select a.id ,a.name ,a.salary , a.birthday ,a.sex,b.hobby from t_person a
lateral view explode(a.hobbies ) b as hobby;
-- delete from tmp_dwd_person where id = 1;
-- Simulated DELETE: copy every row except id = 1, then overwrite the source table.
create table tmp_no_id_1 as
select * from t_person where id <> 1;
insert overwrite table t_person
select * from tmp_no_id_1 ;
select * from t_person;
-- update
-- Simulated UPDATE: rewrite the rows into a temp table, then overwrite the source.
show create table t_person;
create table tmp_ps_update as
select
`id` ,
`name` ,
`salary` ,
`birthday` ,
case when sex = 1 then '男'
when sex = 0 then '女'
else '未知' end sex,
-- if(sex in(1,2) ,if(sex = 1,'男','女'),'其他')
`hobbies` ,
`cards` ,
`addr`
from t_person ;
insert overwrite table t_person
select * from tmp_ps_update
;
select * from t_person;
--- Map-side compression:
-- enable compression of intermediate (shuffle) data in the MR job
set hive.exec.compress.intermediate=true;
-- enable compression of MapTask output/spill files
set mapreduce.map.output.compress=true;
-- codec used for the map output:
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
select count(*) from emp;
-- Reduce-side compression:
-- enable hive output compression
set hive.exec.compress.output=true;
-- enable MapReduce output compression
set mapreduce.output.fileoutputformat.compress=true;
-- codec for the final output
set mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
-- compress in BLOCK units
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- Compression test:
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_no_compress' select * from ratings a distribute by a.movieid sort by a.userid desc;
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_snappy_compress' select * from emp distribute by deptno sort by empno desc;
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_no_compress_rat' select * from ratings a distribute by a.movieid sort by a.userid desc;
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_compress_rat' select * from ratings a distribute by a.movieid sort by a.userid desc;
-- Hive file formats:
-- Row-oriented storage; textfile is the default, stored exactly like the raw input (human readable)
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as textfile;
load data local inpath '/opt/data/hive_file_type/log.data' into table log_text;
-- orc: column-oriented storage
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
-- delimiter spec
row format delimited fields terminated by '\t'
-- stored as picks the table's storage format
stored as orc
-- columnar formats take a compression setting; disabled here for a fair size comparison
tblproperties("orc.compress"="NONE");
-- load data local inpath '/opt/data/hive_file_type/log.data' into table log_orc;
insert into log_orc
select * from log_text;
-- parquet: column-oriented storage
create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as parquet ;
insert into log_parquet
select * from log_text;
select * from log_text; -- 27.263
select * from log_orc; -- 0.308
select * from log_parquet; -- 0.468
-- ORC with ZLIB compression (better ratio, slower than snappy).
create table log_orc_zlib(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="ZLIB");
insert into log_orc_zlib
select * from log_text;
-- ORC with SNAPPY compression (faster, slightly larger files).
create table log_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="SNAPPY");
insert into log_orc_snappy
select * from log_text;
select count(ip ) from log_orc_snappy;
select count(ip ) from log_orc_zlib;
-- Video fact table for the homework data set.
create table t_video(
videoId string,
uploader string,
age int,
category array<string>,
length int, -- video duration
views int, -- view count
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&";
load data inpath '/hive_homework/readydata' overwrite into table t_video;
select * from t_video;
-- Uploader table.
create table t_user(
uploader string,
videos int, -- number of uploaded videos
friends int)
row format delimited
fields terminated by "\t" ;
load data inpath '/hive_homework/user' into table t_user;
select * from t_user;
-- 1. Top 10 videos by view count (requirement is vague; take the simplest reading)
select * from t_video order by views desc limit 10 ;
-- 2. Top 10 categories by total views
select t2.categ,sum(t1.views) sv from t_video t1
lateral view explode(category) t2 as categ
group by t2.categ
order by sv desc
limit 10;
-- 3. Categories of the top-20 most-viewed videos, and how many of those 20 fall in each
select t2.categ,count(t1.videoId) from(
select category,videoId,views from t_video a order by views desc limit 20
) t1 -- (videoId, category array) of the top-20 videos
LATERAL VIEW explode(t1.category) t2 as categ
group by t2.categ;
-- 4. Rank the categories of the videos related to the top-50 most-viewed videos
-- step: collect the related-video ids of the view-count top 50
select *,rank() over(order by ct desc ) from (
select t6.categ,count(*) ct from
(
select t3.related,t4.category from
(
select related from -- related video ids taken from the view-count top 50
(select relatedId,views from t_video order by views desc limit 50 ) t1
lateral view explode(t1.relatedId) t2 as related ) t3
left join t_video t4 on t3.related = t4.videoId
) t5 lateral view explode(t5.category) t6 as categ
group by t6.categ ) t7 ;
-- Second approach, materialized step by step:
-- 1) temp table with the top-50 videos (related-id array, views, video id)
create table tmp_relatedId_from_top50 as
select relatedId,views,videoid from t_video order by views desc limit 50 ;
select * from tmp_relatedId_from_top50;
-- 2) temp table with every related video id (joins back to t_video.videoid)
create table tmp_relatedId as
select t2.related from tmp_relatedId_from_top50 t1
lateral view explode(t1.relatedId) t2 as related;
select * from tmp_relatedId;
-- 3) temp table pairing each related id with its category array from t_video
create table tmp_relatedId_category as
select t1.related,t2.category /* the category array from t_video */
from tmp_relatedId t1 left join t_video t2 on t1.related = t2.videoid;
select * from tmp_relatedId_category;
-- 4) temp table with the category arrays exploded into single categories
create table tmp_category_for_related as
select categ from tmp_relatedId_category t1
lateral view explode(category) t2 as categ ;
select * from tmp_category_for_related;
-- 5) rank the categories by how often they appear
select categ,count(*) ct from tmp_category_for_related group by categ order by ct desc ;
select *,rank() over(order by ct desc ) from(
select categ,count(*) ct from tmp_category_for_related group by categ
) t1 ;
-- 5. Top-10 uploaders by upload count, plus the top-20 of their videos by view count
-- First reading (the simple one): top 20 overall across those 10 uploaders
select * from (
select uploader from t_user order by videos desc limit 10
) t1 left join t_video t2 on t1.uploader = t2.uploader
order by t2.views desc limit 20;
-- Second reading: per uploader, that uploader's top-20 videos by views
select t3.uploader,t3.videoid,t3.views from (
select t2.uploader,t2.videoid,t2.views,rank() over(partition by t2.uploader order by t2.views desc ) rk from (
select uploader,videos from t_user order by videos desc limit 10
) t1 left join t_video t2 on t1.uploader = t2.uploader
) t3 where t3.rk <=20;
-- 6. Top 10 videos by view count within every category
-- NOTE: rank() lets ties share a number, so a category can return more than 10 rows
select * from (
select *,rank()over (partition by t2.categ order by t1.views desc) rk from t_video t1
lateral view explode(category) t2 as categ
) t3 where t3.rk <= 10;