-- Classroom Hive SQL examples - final version
-- Quick sanity-check queries against t_user.
select * from t_user;
select age, count(*) from t_user group by age;
-- Enable local mode (run small jobs inside the Hive JVM instead of submitting MR to the cluster).
-- FIX: the original set this to FALSE, which contradicts the "enable local mode" note;
-- the threshold settings below only take effect when auto local mode is on.
set hive.exec.mode.local.auto=true;
set hive.exec.mode.local.auto.inputbytes.max=50000000; -- max total input bytes (~50 MB) for local mode
set hive.exec.mode.local.auto.input.files.max=10;      -- max number of input files for local mode
set hive.mapred.local.mem = 10000;                     -- memory for local-mode child JVMs
select id, count(*) from t_user group by id;
/*
1,张三,8000.0,2019-9-9,1,抽烟-喝酒-烫头,中国银行|123456-建设银行|22334455,北京-10010
2,李四,9000.0,2019-8-9,0,抽烟-喝酒-烫头,中国银行|123456-建设银行|22334455,郑州-45000
3,王五,7000.0,2019-7-9,1,喝酒-烫头,中国银行|123456-建设银行|22334455,北京-10010
4,赵6,100.0,2019-10-9,0,抽烟-烫头,中国银行|123456-建设银行|22334455,郑州-45000
5,于谦,1000.0,2019-10-9,0,抽烟-喝酒,中国银行|123456-建设银行|22334455,北京-10010
6,郭德纲,1000.0,2019-10-9,1,抽烟-烫头,中国银行|123456-建设银行|22334455,天津-20010
*/
create table t_person (
id string, -- user id
name string, -- user name
salary double, -- salary
birthday date, -- date of birth
sex char(1), -- gender flag (sample data uses 1/0)
hobbies array<string>, -- hobbies
cards map<string,string>, -- bank cards: bank name -> card number
addr struct<city:string,zipCode:string> -- address (city + zip code)
)row format delimited -- the lines below configure the delimiters
fields terminated by ',' -- separator between columns [important]
COLLECTION items terminated by '-' -- separator between array elements, between map entries, and between struct members
map keys terminated by '|' -- separator between a map key and its value
LINES terminated by '\n'; -- separator between rows
-- With LOCAL the path is read from the Linux host running the Hive server.
-- Adding OVERWRITE replaces the table contents: all existing rows are wiped first - dangerous.
load data local inpath '/opt/data/person.txt' overwrite into table t_person;
-- Without LOCAL the path refers to HDFS (the file is moved into the table directory).
load data inpath '/person.txt' into table t_person;
-- hdfs dfs -mv /person.txt /user/hive/wareh...
-- SQL的规范:
-- 1. 书写规范(顺序)
-- select ... from ... where ... group by ... having ... order by ... limit ...
-- 2. 执行顺序(SQL)
-- from ... where ... group by ... having ... select ... order by ... limit
--select * from (
--select sex sx,count(*) per_ct from t_person group by sex having count(*) > 2
--) c where c.per_ct > 2
-- Querying complex column types: array indexing, map key lookup, struct field access.
select hobbies[0], a.cards['建设银行'], addr.city, sex, name
from t_person a
where addr.city = '郑州' and array_contains(hobbies, '抽烟');
-- How WHERE works:
-- every expression written after WHERE must evaluate to a boolean.
select * from t_person where length(name) = 2;
select * from t_person where 1 = 1;
select * from t_person where name like '赵%';
select * from t_person where salary between 1000 and 7000; -- salary >= 1000 and salary <= 7000
/*
String sex ;
sex = 1;
if (sex == 1 )
{
输出
}
*/
-- order by sorts ascending by default; when several columns should sort descending,
-- each one needs its own desc keyword.
select * from t_person order by salary desc ,sex desc ,id desc ;
-- limit: top-N rows
select * from t_person limit 3;
-- Deduplication
-- two options: distinct and group by
select count(distinct salary) from t_person ;
select salary from t_person group by salary;
-- Table joins: department table
create table dept(
deptno int,
dname string,
loc string
)
row format delimited fields terminated by '\t';
-- Employee table:
create table emp(
empno int,
ename string,
job string,
mgr int,
hiredate string,
sal double,
comm double,
deptno int)
row format delimited fields terminated by '\t';
-- Load the data
load data local inpath '/opt/data/dept.txt' into table dept;
load data local inpath '/opt/data/emp.txt' into table emp;
-- Verify the data
select * from dept;
select * from emp;
-- Requirement: the department name of every employee
select * from dept;
select * from emp;
-- Left join:
-- a JOIN written without an ON condition degenerates into a Cartesian product
-- (row count = left rows x right rows).
-- With LEFT JOIN the left table drives the result: its rows always appear, and a row
-- repeats once per matching row on the right.
-- When the right table has no match, all of its output columns are NULL.
-- left join
select t1.*, t2.dname from emp t1 left join dept t2 on t1.deptno = t2.deptno;
-- right join
select * from emp t1 right join dept t2 on t1.deptno = t2.deptno;
-- inner join
select * from emp t1 join dept t2 on t1.deptno = t2.deptno;
-- GROUP BY
-- Grouping on a column produces one output row per distinct value of that column.
-- In a grouped query the select list may only contain:
-- 1. the grouping columns
-- 2. aggregate functions
-- 3. constant values
select
deptno -- the grouping column itself
,count(*) -- head count per department (counts rows)
,count(comm) -- employees with a commission: count(col) skips NULLs
,123 -- a constant (literal) is allowed; it is only materialized at output time
from emp group by deptno
having count(*) > 3; -- select + group by produce an intermediate result, then having filters the groups
-- expressions after HAVING compare an aggregate result against some value
-- List every function registered in this Hive session.
show FUNCTIONS ;
-- Functions can be called without a FROM clause.
select abs(-10),'abc' ;
-- Short / extended help text for a single function.
desc function abs;
desc function extended sum;
-- 1. array_contains(column, value)
select hobbies from t_person a where array_contains(a.hobbies, '抽烟');
-- 2. length(column)
-- NOTE: a bare * cannot follow another select item; qualify it as a.*
select length(a.name), a.* from t_person a;
-- 3. concat(col, col) - string concatenation [important]
select concat(name,'的工资是:',salary) from t_person;
-- 4. year(date), month(date) - date part extraction
select year(birthday ),month(birthday ) from t_person a;
-- 5. date_add(date, n) / date_sub(date, n) - date arithmetic [important]
select date_add(a.birthday ,-25),a.birthday from t_person a;
-- 6. current date
select sex,count(*),current_date() from t_person group by sex order by count(*);
-- 7. datediff - number of days between two dates
-- days between today and 2023-01-01
select datediff(current_date(),'2023-01-01');
-- 8. nvl(value, default_value) [important]
-- returns default_value when value is NULL, otherwise returns value itself
select nvl(comm,0) as cm, emp.* from emp; -- ifnull
-- hive_day2 homework
/*
-- 在本案例中总共有三份数据,分别是:
-- (1)users.dat 数据格式为: 2,M,56,16,70072
-- 对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
-- 用户id 性别 年龄 职业 城市邮编
-- (2)movies.dat 数据格式为: 2,Jumanji (1995),Adventure|Children's|Fantasy
-- 对应字段为:MovieID BigInt, Title String, Genres String
-- 电影id 标题 类型
-- (3)ratings.dat 数据格式为: 1,1193,5,978300760
-- 对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
-- 用户id 电影id 评分 时间戳
*/
-- Dimension table: one row per user.
create table users(
UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday2/users.dat' into table users;
-- Dimension table: one row per movie.
create table movies(
MovieID BigInt, Title String, Genres String
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday2/movies.dat' into table movies;
-- Fact table: one row per (user, movie) rating event.
create table ratings(
UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday2/ratings.dat' into table ratings;
-- Sanity checks.
select * from users;
select * from movies;
select * from ratings;
/*
2. 求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数) order by limit
提示:按照电影名进行分组统计,求出每部电影的评分次数并按照评分次数降序排序
*/
-- Top 10 most-rated movies: join each rating to its title, count ratings per title,
-- then keep the 10 largest counts.
select
    m.Title,
    count(*) ct
from ratings r
left join movies m
    on r.movieid = m.MovieID
group by m.Title
order by ct desc
limit 10;
/*
3. 分别求男性,女性当中评分(评分最多)的10部电影(性别,电影名,影评分)
-- 女生和男生数据分别查询即可,男生一个select 女生一个select
提示:三表联合查询,按照性别过滤条件,电影名作为分组条件,影评分作为排序条件进行查询
*/
select * from users;
select * from movies;
-- Male top-10: ratings joined to users (gender filter) and movies (title).
select * from
(
select 'M',t3.title,count(*) ct from ratings t1
left join users t2 on t1.userid = t2.UserID
left join movies t3 on t1.movieid = t3.movieid
where t2.gender = 'M' group by t3.title
order by ct desc limit 10
) c1
union all -- a LIMIT-terminated select cannot sit directly before UNION; UNION also requires matching column count and types on both sides
-- Female top-10, same shape as the male branch.
select * from
(
select 'F',t3.title,count(*) ct from ratings t1
left join users t2 on t1.userid = t2.UserID
left join movies t3 on t1.movieid = t3.movieid
where t2.gender = 'F' group by t3.title
order by ct desc limit 10) c2
;
/*
4. 求movieid = 2116这部电影各年龄的平均影评(年龄,影评分)
提示:t_user和t_rating表进行联合查询,用movieid=2116作为过滤条件,用年龄作为分组条件
*/
-- Average rating of movie 2116 per age group: join ratings to users,
-- filter on the movie id, group by age.
select t2.age, avg(Rating) from ratings t1
left join users t2 on t1.userid = t2.userid
where movieid = 2116
group by t2.age;
-- Aggregate functions
-- max / min / sum / avg / count
-- max: largest value in the group (whole table when there is no GROUP BY)
select sex, max(salary) from t_person group by sex; -- highest salary per gender
-- min: smallest value in the group (whole table when there is no GROUP BY)
select sex, min(salary) from t_person group by sex;
-- sum: total of the group; the summed column must be numeric
select sex, sum(salary) from t_person group by sex;
--..
-- collect_list / collect_set
create table t_visit_video (
username string,
video_name string,
video_date date
)row format delimited fields terminated by ',';
load data local inpath '/opt/data/visit_data.txt' into table t_visit_video;
-- Requirement: which videos has each user watched?
-- collect_list gathers the group's values into an array (duplicates kept)
select username,collect_list(video_name) from t_visit_video group by username ;
-- collect_set gathers the group's values into an array with duplicates removed
select username,collect_set(video_name) from t_visit_video group by username ;
-- concat_ws joins an array into one string using the given separator
select concat_ws('|',b.hobbies ) from t_person b;
select username,concat_ws(',',collect_list(video_name)) from t_visit_video group by username ;
-- explode: turns an array (or map) into one row per element
select * from t_person;
select distinct hb from (
select explode(a.hobbies) hb from t_person a
) c;
-- lateral view joins the exploded rows back to their source row
select t1.*,t2.hobby from t_person t1
lateral view explode(hobbies) t2 as hobby;
-- Requirement: how many people like each hobby?
select t2.hobby,count(*) from t_person t1
lateral view explode(hobbies) t2 as hobby -- behaves like a per-row derived table of hobbies
group by t2.hobby
;
desc function extended split;
-- Character frequency: split the string into single characters, explode, count per character.
select c.cr ,count(*) from
(
select explode(split('ajkldskfjalidjfojvjalkdjfqifojmnalsdjfkfjglskdjfg','')) as cr
) c group by c.cr;
-- Subqueries
-- Requirement: which people in t_person earn more than the average salary?
select * from t_person where salary > (
select avg(salary) from t_person -- single-row, single-column value: the table-wide average
);
-- Requirement: show everyone's info together with the average salary
-- NOTE(review): Hive traditionally only supports subqueries in FROM/WHERE; a scalar
-- subquery in the select list may not parse on all Hive versions - verify on the target version.
select *,salary - (select avg(salary) from t_person) -- table-wide average
,(select avg(salary) from t_person) -- table-wide average
from t_person ;
-- Sorting
-- order by
select * from t_person order by salary; -- global sort, ascending by default (first row smallest, last row largest)
select * from t_person order by name; --[asc] -- strings sort too, not only numbers
select * from t_person order by salary desc; -- desc flips the order to descending
-- sort by
-- unlike order by, sort by is NOT a global sort: rows are ordered within each reducer partition
-- job.setNumReduceTasks = 2
-- session-level setting: how many reducers the generated MR job uses, i.e. how many partitions
set mapreduce.job.reduces = 3;
-- insert overwrite local directory exports the query result onto the local server
insert overwrite local directory '/opt/data/soutby'
select * from t_person sort by salary;
-- Why use sort by?
-- With very large data a single reducer becomes the bottleneck of a global sort;
-- sort by spreads the work across several reducers, each sorting its own partition.
select * from t_person sort by salary desc limit 2 ;
-- distribute by controls which reducer each row is sent to
insert overwrite local directory '/opt/data/dis_sortby'
select salary ,',',sex from t_person distribute by sex sort by salary;
-- Window functions:
create table t_person_wd (
fname string,
fcity string,
fage int,
fsalary int
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/kaichuang.txt' into table t_person_wd;
-- row_number() over(partition-spec, order-spec)
select *,row_number() over() rn from t_person_wd ;
-- with a partition spec and an order spec
select *,
row_number() over(partition by fcity) rn ,
row_number() over(partition by fcity order by fsalary ) rn2
from t_person_wd;
-- Requirement: using a window function, pick the highest-paid person of each city
-- row_number used for top-1-per-group dedup
select * from
(
select *,ROW_NUMBER() over (partition by fcity order by fsalary desc) rn
from t_person_wd
) t1
where t1.rn = 1;
-- rank() over(): ranking
-- without partition/order specs, rank treats every row as its own group
-- given an order spec, it ranks the whole table
-- ties share a rank but still consume rank numbers, so ranks are skipped after a tie;
-- the last rank equals the group's row count
select *,rank() over() from t_person_wd;
select
*,
rank() over(order by fsalary ),
rank() over(partition by fcity order by fsalary )
from t_person_wd;
-- dense_rank() over(order by ...)
-- ties share a rank and no numbers are skipped: ranks increase by 1
select
*,
dense_rank() over(order by fsalary ),
rank() over(order by fsalary )
-- dense_rank() over(partition by fcity order by fsalary )
from t_person_wd;
-- Employees with a department id; used for the per-department top-N exercise.
CREATE TABLE `employee` (
`id` int ,
`name` varchar(255) ,
`salary` int ,
`departmentid` int
) row format delimited
fields terminated by '\t';
load data local inpath '/opt/data/employee.txt' overwrite into table employee;
select * from employee;
CREATE TABLE `department` (
`id` int ,
`name` varchar(255)
) row format delimited
fields terminated by '\t';
load data local inpath '/opt/data/department.txt' overwrite into table department;
select * from department;
-- Top 3 salaries per department (dense_rank keeps ties without gaps).
select * from
(select *,DENSE_RANK() over(partition by departmentid order by salary desc) rk from employee ) a
where rk <=3
;
-- NOTE(review): tmp_t3 is not created anywhere in this file; this assumes a ranked
-- temp table with (id, rank_sa) built elsewhere - verify before running.
select d.name ,t4.name as employee,t4.salary from tmp_t3 t3
left join employee t4 on t4.id = t3.id
left join department d on t4.departmentid = d.id
where t3.rank_sa < 3 ;
-- Requirement: show every person together with the max / avg / total salary of their city
select *
,max(fsalary) over (partition by fcity) -- max salary within the city
,avg(fsalary) over (partition by fcity)
,sum(fsalary) over (partition by fcity)
from t_person_wd;
select *
,sum(fsalary) over (partition by fcity order by fage ) -- running (cumulative) sum within the city
from t_person_wd;
-- Hive table categories:
-- Managed table: a plain "create table table_name"
--   Hive owns both the metadata (MySQL) and the data directory (HDFS)
select * from t_user;
-- Dropping a managed table removes its metadata AND its HDFS directory
drop table t_user;
-- External table: "create external table table_name"
--   Hive only manages the metadata
create external table t_personout(
id int,
name string,
salary double,
birthday date,
sex char(1),
hobbies array<string>,
cards map<string,string>,
addr struct<city:string,zipCode:string>
)row format delimited
fields terminated by ',' -- column separator
collection items terminated by '-' -- separator for array elements, struct members, and between map entries
map keys terminated by '|'
lines terminated by '\n'
location '/user/hive/warehouse/test_hive.db/t_personout';
load data local inpath '/opt/data/person.txt' into table t_personout;
select * from t_personout;
-- Dropping an external table removes only the MySQL metadata; the HDFS data stays untouched
drop table t_personout;
select * from t_personout; -- this now fails: the table metadata was removed by the drop above
create external table t_personout2(
id int,
name string,
salary double,
birthday date,
sex char(1),
hobbies array<string>,
cards map<string,string>,
addr struct<city:string,zipCode:string>
)row format delimited
fields terminated by ',' -- column separator
collection items terminated by '-' -- separator for array elements, struct members, and between map entries
map keys terminated by '|'
lines terminated by '\n'
location '/file';
select * from t_personout2;
create table tb_student (
Student_name String , Term int, Total_score int
)row format delimited
fields terminated by ',';
load data local inpath '/opt/data/hive_hwday3/student.txt' into table tb_student;
select * from tb_student order by term;
-- hive day3 homework, question 1 - two choices:
-- row_number: if several students tie for first place, keep just one of them
-- rank: if several students tie for first place, keep all of them
select * from (
select *,rank() over(partition by term order by total_score desc) rk from tb_student
) c where rk = 1;
-- hive day3 homework, question 2:
-- how many times did each student improve across terms 1-4:
-- a term counts as an improvement when its score beats the previous term's score
-- lag() over(): value of the previous row within the ordered partition
select student_name , -- case when total_score > nvl(last_sc,0) then 1 else 0 end
count( if(total_score > nvl(last_sc,999),total_score,null) )
,sum( if(total_score > nvl(last_sc,999),1,0) )
from (
select *,lag(total_score) over(partition by Student_name order by term) last_sc from tb_student
) c group by student_name;
-- the 999 sentinel makes the first term never count as an improvement
-- (assumes scores stay below 999 - verify against the data)
-- Partitioned tables:
-- what to keep in mind when creating one?
-- the partition column must NOT appear inside the create-table column list;
-- the column named in "partitioned by" still behaves as a regular queryable column
create table t_user_part(
id string,
name string,
birth date,
salary double
)
partitioned by(city string) -- partition column: one sub-directory per city value
row format delimited
fields terminated by ','
lines terminated by '\n';
select * from t_user_part;
-- FIX: loading into a partitioned table requires a partition spec
-- (the original omitted it; the query below expects city='bj')
load data local inpath '/opt/data/test_partition/bj.txt' into table t_user_part partition(city='bj');
load data local inpath '/opt/data/test_partition/zz.txt' into table t_user_part partition(city='zz');
-- filtering on the partition column prunes the scan to one directory
select * from t_user_part where city = 'bj';
alter table t_user_part drop partition (city='zz');
-- Bucketed tables
drop table stu_buck;
create table stu_buck(id int, name string)
clustered by(id) into 4 buckets
row format delimited fields terminated by ',';
set mapreduce.job.reduces = -1; -- -1 = default: normally 1 reducer, or one per bucket when writing a bucketed table
-- NOTE(review): a plain LOAD may not re-hash rows into buckets on older Hive versions;
-- insert ... select is the reliable way to populate a bucketed table - verify on the target version.
load data local inpath '/opt/data/std.txt' into table stu_buck;
desc formatted t_user_part;
-- Sampling queries:
-- the tablesample keyword turns a statement into a sampling query
-- configured via: bucket x out of y on col_name
-- bucket: sample by bucketing
-- x: which bucket to read
-- y: how many buckets to split the data into (bucket sizes are not guaranteed equal)
-- col_name: the column whose hash assigns rows to buckets
-- x must not exceed y: with 4 buckets there is no 5th bucket to read
select * from stu_buck tablesample(bucket 4 out of 4 on id );
select * from emp tablesample(bucket 4 out of 10 on empno );
-- User-defined functions (UDF)
-- list every function known to the session
show functions;
desc function day;
-- a UDF is a user-written piece of Java code registered as a SQL function
-- for a first-time import these two statements are enough:
add jar /opt/app/funcHello.jar;
create temporary function hello_udf as "function.MyUDF";
-- after recompiling the UDF it must be re-imported into the hive session to take effect:
-- drop first, then add again
delete jar /opt/app/funcHello.jar;
drop temporary function hello_udf;
add jar /opt/app/funcHello.jar;
create temporary function hello_udf as "function.MyUDF";
-- length()
select hello_udf(name,cast(salary as int)), t_person.* from t_person ; -- star qualified: bare * cannot follow another select item
show functions ;
desc function hello_udf;
-- Non-deterministic functions
delete jar /opt/app/funcHello.jar;
drop temporary function get_num;
add jar /opt/app/funcHello.jar;
create temporary function get_num as "function.NumberUDF";
select get_num() as num,uuid(), t_person.* from t_person;
-- @UDFType(deterministic = false) -- marks the function as non-deterministic
-- @UDFType(deterministic = true) -- marks the function as deterministic
-- deterministic functions may be cached: length('abc') -> 3 is reused for equal inputs
-- without calling evaluate() again
-- mark a function deterministic only when equal inputs always yield equal outputs
-- examples of non-deterministic functions:
select CURRENT_date() ,rand(), t_person.* from t_person;
-- User-defined table functions (UDTF)
-- UDTF is the function category that explode() belongs to
select myudtf('zs,lisi,wangwu',',') from t_person;
-- expected result:
/*
zs
lisi
wangwu
*/
show functions;
-- posexplode also emits the element position: [100,90,95] -> (0,100) (1,90) (2,95)
select posexplode(hobbies ) from t_person;
delete jar /opt/app/funcHello.jar;
drop temporary function my_zhalie;
add jar /opt/app/funcHello.jar;
create temporary function my_zhalie as "function.MyUDTF";
select my_zhalie('zs,lisi,wangwu',',');
select * from movies
lateral view my_zhalie(genres ,'\\|') t2 as genre,len;
delete jar /opt/app/funcHello.jar;
drop temporary function my_zhalie;
add jar /opt/app/funcHello.jar;
create temporary function jiami as "function.PhoneUDF";
desc function jiami;
Select jiami('13838384380') union all
Select jiami('13838384') union all
Select jiami('13838384cc') ;
show create table t_person;
select * from t_person ;
-- Flattened copy of t_person: one row per (person, hobby), filled below via explode.
CREATE TABLE `t_dwd_person`(
`id` string,
`name` string,
`salary` double,
`birthday` date,
`sex` char(1),
hobby String
);
insert into t_dwd_person
select a.id ,a.name ,a.salary , a.birthday ,a.sex,b.hobby from t_person a
lateral view explode(a.hobbies ) b as hobby;
-- Who drinks more, men or women?
select sex,count(if(hobby='喝酒' ,sex,null)) from t_dwd_person group by sex;
-- NOTE(review): t_dwd_person already exists (created above), so this CTAS will fail
-- unless the earlier table is dropped first - kept here only as the CTAS alternative.
create table t_dwd_person as
select a.id ,a.name ,a.salary , a.birthday ,a.sex,b.hobby from t_person a
lateral view explode(a.hobbies ) b as hobby;
-- delete from tmp_dwd_person where id = 1;
-- Simulated DELETE: copy every row except id = 1, then overwrite the source table.
create table tmp_no_id_1 as
select * from t_person where id <> 1;
insert overwrite table t_person
select * from tmp_no_id_1 ;
select * from t_person;
-- update
-- Simulated UPDATE: rewrite the rows into a temp table, then overwrite the source.
show create table t_person;
create table tmp_ps_update as
select
`id` ,
`name` ,
`salary` ,
`birthday` ,
case when sex = 1 then '男'
when sex = 0 then '女'
else '未知' end sex,
-- if(sex in(1,2) ,if(sex = 1,'男','女'),'其他')
`hobbies` ,
`cards` ,
`addr`
from t_person ;
insert overwrite table t_person
select * from tmp_ps_update
;
select * from t_person;
--- Map-side compression:
-- enable compression of intermediate (shuffle) data in the MR job
set hive.exec.compress.intermediate=true;
-- enable compression of MapTask output/spill files
set mapreduce.map.output.compress=true;
-- codec used for the map output:
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
select count(*) from emp;
-- Reduce-side compression:
-- enable hive output compression
set hive.exec.compress.output=true;
-- enable MapReduce output compression
set mapreduce.output.fileoutputformat.compress=true;
-- codec for the final output
set mapreduce.output.fileoutputformat.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
-- compress in BLOCK units
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- Compression test:
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_no_compress' select * from ratings a distribute by a.movieid sort by a.userid desc;
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_snappy_compress' select * from emp distribute by deptno sort by empno desc;
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_no_compress_rat' select * from ratings a distribute by a.movieid sort by a.userid desc;
insert overwrite local directory -- export to the local server (the linux host running hive)
'/opt/data/result_compress_rat' select * from ratings a distribute by a.movieid sort by a.userid desc;
-- Hive file formats:
-- Row-oriented storage; textfile is the default, stored exactly like the raw input (human readable)
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as textfile;
load data local inpath '/opt/data/hive_file_type/log.data' into table log_text;
-- orc: column-oriented storage
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
-- delimiter spec
row format delimited fields terminated by '\t'
-- stored as picks the table's storage format
stored as orc
-- columnar formats take a compression setting; disabled here for a fair size comparison
tblproperties("orc.compress"="NONE");
-- load data local inpath '/opt/data/hive_file_type/log.data' into table log_orc;
insert into log_orc
select * from log_text;
-- parquet: column-oriented storage
create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as parquet ;
insert into log_parquet
select * from log_text;
select * from log_text; -- 27.263
select * from log_orc; -- 0.308
select * from log_parquet; -- 0.468
-- ORC with ZLIB compression (better ratio, slower than snappy).
create table log_orc_zlib(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="ZLIB");
insert into log_orc_zlib
select * from log_text;
-- ORC with SNAPPY compression (faster, slightly larger files).
create table log_orc_snappy(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
row format delimited fields terminated by '\t'
stored as orc
tblproperties("orc.compress"="SNAPPY");
insert into log_orc_snappy
select * from log_text;
select count(ip ) from log_orc_snappy;
select count(ip ) from log_orc_zlib;
-- Video fact table for the homework data set.
create table t_video(
videoId string,
uploader string,
age int,
category array<string>,
length int, -- video duration
views int, -- view count
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&";
load data inpath '/hive_homework/readydata' overwrite into table t_video;
select * from t_video;
-- Uploader table.
create table t_user(
uploader string,
videos int, -- number of uploaded videos
friends int)
row format delimited
fields terminated by "\t" ;
load data inpath '/hive_homework/user' into table t_user;
select * from t_user;
-- 1. Top 10 videos by view count (requirement is vague; take the simplest reading)
select * from t_video order by views desc limit 10 ;
-- 2. Top 10 categories by total views
select t2.categ,sum(t1.views) sv from t_video t1
lateral view explode(category) t2 as categ
group by t2.categ
order by sv desc
limit 10;
-- 3. Categories of the top-20 most-viewed videos, and how many of those 20 fall in each
select t2.categ,count(t1.videoId) from(
select category,videoId,views from t_video a order by views desc limit 20
) t1 -- (videoId, category array) of the top-20 videos
LATERAL VIEW explode(t1.category) t2 as categ
group by t2.categ;
-- 4. Rank the categories of the videos related to the top-50 most-viewed videos
-- step: collect the related-video ids of the view-count top 50
select *,rank() over(order by ct desc ) from (
select t6.categ,count(*) ct from
(
select t3.related,t4.category from
(
select related from -- related video ids taken from the view-count top 50
(select relatedId,views from t_video order by views desc limit 50 ) t1
lateral view explode(t1.relatedId) t2 as related ) t3
left join t_video t4 on t3.related = t4.videoId
) t5 lateral view explode(t5.category) t6 as categ
group by t6.categ ) t7 ;
-- Second approach, materialized step by step:
-- 1) temp table with the top-50 videos (related-id array, views, video id)
create table tmp_relatedId_from_top50 as
select relatedId,views,videoid from t_video order by views desc limit 50 ;
select * from tmp_relatedId_from_top50;
-- 2) temp table with every related video id (joins back to t_video.videoid)
create table tmp_relatedId as
select t2.related from tmp_relatedId_from_top50 t1
lateral view explode(t1.relatedId) t2 as related;
select * from tmp_relatedId;
-- 3) temp table pairing each related id with its category array from t_video
create table tmp_relatedId_category as
select t1.related,t2.category /* the category array from t_video */
from tmp_relatedId t1 left join t_video t2 on t1.related = t2.videoid;
select * from tmp_relatedId_category;
-- 4) temp table with the category arrays exploded into single categories
create table tmp_category_for_related as
select categ from tmp_relatedId_category t1
lateral view explode(category) t2 as categ ;
select * from tmp_category_for_related;
-- 5) rank the categories by how often they appear
select categ,count(*) ct from tmp_category_for_related group by categ order by ct desc ;
select *,rank() over(order by ct desc ) from(
select categ,count(*) ct from tmp_category_for_related group by categ
) t1 ;
-- 5. Top-10 uploaders by upload count, plus the top-20 of their videos by view count
-- First reading (the simple one): top 20 overall across those 10 uploaders
select * from (
select uploader from t_user order by videos desc limit 10
) t1 left join t_video t2 on t1.uploader = t2.uploader
order by t2.views desc limit 20;
-- Second reading: per uploader, that uploader's top-20 videos by views
select t3.uploader,t3.videoid,t3.views from (
select t2.uploader,t2.videoid,t2.views,rank() over(partition by t2.uploader order by t2.views desc ) rk from (
select uploader,videos from t_user order by videos desc limit 10
) t1 left join t_video t2 on t1.uploader = t2.uploader
) t3 where t3.rk <=20;
-- 6. Top 10 videos by view count within every category
-- NOTE: rank() lets ties share a number, so a category can return more than 10 rows
select * from (
select *,rank()over (partition by t2.categ order by t1.views desc) rk from t_video t1
lateral view explode(category) t2 as categ
) t3 where t3.rk <= 10;