一、字段说明
视频表
字段 | 备注 | 详细描述 |
video_id | 视频唯一id(String) | 11位字符串 |
uploader | 视频上传者(String) | 上传视频的用户名String |
age | 视频年龄(int) | 视频在平台上的整数天 |
category | 视频类别(Array) | 上传视频指定的视频分类 |
length | 视频长度(Int) | 整形数字标识的视频长度 |
views | 观看次数(Int) | 视频被浏览的次数 |
rate | 视频评分(Double) | 满分5分 |
Ratings | 流量(Int) | 视频的流量,整型数字 |
conments | 评论数(Int) | 一个视频的整数评论数 |
related_ids | 相关视频id(Array) | 相关视频的id,最多20个 |
用户表
字段 | 备注 | 详细描述 |
uploader | 视频上传者(String) | 上传视频的用户名String |
videos | 视频上传数量(Int) | 用户上传的视频数量 |
friend | 用户好友数量(Int) | 用户好友数量 |
二、准备工作
1.创建项目数据库
CREATE DATABASE gulivideo;
2.创建外部表,指定数据存储路径
-- 创建video_ori
CREATE EXTERNAL TABLE
video_ori( video_id string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate double,
Ratings int,
conments int,
related_ids array<string> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '&' LOCATION '/gulivideo/video';
-- 创建user_ori
CREATE EXTERNAL TABLE
user_ori( uploader string,
videos int,
friend int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/gulivideo/user';
3.创建内部表,导入数据
-- 内部video_orc表
CREATE TABLE
video_orc( video_id string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate double,
Ratings int,
conments int,
related_ids array<string> ) TBLPROPERTIES("orc.compress"="snappy");
-- 内部user_orc表
CREATE TABLE
user_orc( uploader string,
videos int,
friend int ) TBLPROPERTIES("orc.compress"="snappy");
-- 加载数据
INSERT INTO TABLE video_orc
SELECT * FROM video_ori;
INSERT INTO TABLE user_orc
SELECT * FROM user_ori;
三、需求实现
1.统计视频观看数top10
全局排序取前10
SELECT video_id,viewS FROM video_orc ORDER BY viewS DESC LIMIT 10;
2.统计视频类别热度Top10
定义热度
视频类别热度:分了中包含的视频数量越多热度越高。
1).炸裂视频类别
-- 炸裂视频类别,取出类别和其对应的视频
SELECT cate,video_id
FROM video_orc
LATERAL VIEW EXPLODE(category) tbl as cate;
2).取类别前十
-- 第1步的查询为t1,COUNT统计各类别视频数量,排序取前10
SELECT cate,count(*) cnt
FROM t1
GROUP BY cate
ORDER BY cnt DESC
LIMIT 10;
最终sql
SELECT
cate,
count(*) cnt
FROM
(
SELECT cate,
video_id
FROM
video_orc LATERAL VIEW EXPLODE(category) tbl as cate)t1
GROUP BY
cate
ORDER BY
cnt DESC
LIMIT 10;
3.统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
1).观看数最高的20个视频和他们所属分类
-- t1
SELECT video_id,category,views
FROM video_orc
ORDER BY views DESC LIMIT 20;
2).炸裂分类
-- t2 video_id 就是top20视频的id
SELECT video_id,cate
FROM t1
LATERAL VIEW EXPLODE(category) tbl as cate;
3). count统计分类中包含的视频数
-- count(video_id)
SELECT cate,count(video_id)
FROM t2
GROUP BY cate;
最终sql
SELECT
cate,
count(video_id) cnt
FROM
(
SELECT
video_id,
cate
FROM
(
SELECT
video_id,
category,
views
FROM
video_orc
ORDER BY
views DESC
LIMIT 20)t1 LATERAL VIEW EXPLODE(category) tbl as cate )t2
GROUP BY
cate
ORDER BY
cnt DESC;
3.4.统计视频观看数Top50所关联视频的所属类别,并按照类别热度排序
1).先求出视频观看数Top50的视频和关联视频
-- t1
SELECT video_id,views,related_ids
FROM video_orc
ORDER BY views
LIMIT 50;
2).炸裂关联视频
-- t2
SELECT EXPLODE(related_ids) related_id
FROM t1;
3).求出关联视频的类别
-- t3
SELECT DISTINCT related_id,category
FROM t2
LEFT JOIN video_orc v
ON t2.related_id = v.video_id;
4).炸裂关联视频类别
-- t4
SELECT related_id,cate
FROM t3
LATERAL VIEW EXPLODE(category) tbl as cate;
5).求出总表每个类的类别热度(视频数量)
-- t5
SELECT category,video_id
FROM video_orc;
SELECT cate,COUNT(video_id) hot
FROM (SELECT category,video_id
FROM video_orc)tt1
LATERAL VIEW EXPLODE(category) tbl as cate
GROUP BY cate;
6).关联视频类别 join 每个类的类别热度
-- t4是炸裂开的视频类别,要去重
SELECT DISTINCT t4.cate,hot
FROM t4
JOIN t5
ON t4.cate=t5.cate
ORDER BY hot DESC;
最终sql
SELECT
DISTINCT t4.cate,
hot
FROM
(
SELECT
related_id,
cate
FROM
(
SELECT
DISTINCT related_id,
category
FROM
(
SELECT
EXPLODE(related_ids) related_id
FROM
(
SELECT video_id,
views,
related_ids
FROM
video_orc
ORDER BY
views
LIMIT 50)t1)t2
LEFT JOIN video_orc v ON
t2.related_id = v.video_id)t3 LATERAL VIEW EXPLODE(category) tbl as cate)t4
JOIN (
SELECT
cate,
COUNT(video_id) hot
FROM
(
SELECT
category,
video_id
FROM
video_orc)tt1 LATERAL VIEW EXPLODE(category) tbl as cate
GROUP BY
cate)t5 ON
t4.cate = t5.cate
ORDER BY
hot DESC;
3.5.统计每个类别中的视频热度Top10,以Music为例
1).炸裂类,拼接类和视频id
-- 可存储为中间表,下面的需求还用的到
CREATE TABLE video_cate_tmp
TBLPROPERTIES("orc.compress"="SNAPPY") AS
SELECT video_id,cate
FROM
video_orc
LATERAL VIEW EXPLODE(category) tbl as cate;
2).统计Music分类中视频数量并排序取前10
SELECT cate,count(*) hot
FROM video_cate_tmp
WHERE cate='Music'
GROUP BY cate
ORDER BY
hot DESC
LIMIT 10;
3.6. 统计每个类别中视频流量Top10,以Music为例
利用临时表,根据视频id找到流量ratings,排序取前10
SELECT
t1.video_id,
ratings
FROM
( -- 先取出视频id和对应分类
SELECT cate,
video_id
FROM
video_cate_tmp) t1
JOIN (
SELECT video_id,ratings
FROM
video_orc)t2 ON
t1.video_id = t2.video_id
WHERE
cate = 'Music'
ORDER BY
ratings DESC
LIMIT 10;
3.7.统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频
第一种理解:前10用户上传的视频每人被观看次数前20
1).先统计上传视频最多的用户前10
-- t1
SELECT uploader,videos
FROM user_orc
ORDER BY videos DESC
LIMIT 10;
2).找出这10名用户传的视频
-- t2
SELECT v.uploader,video_id,views
FROM t1 JOIN video_orc v
ON t1.uploader=v.uploader;
3).这10个用户每人播放量的前20
SELECT * FROM(SELECT uploader,views,
rank() over(PARTITION BY uploader ORDER BY views DESC) n
FROM t2)tbl
WHERE tbl.n<=20;
最终sql
SELECT * FROM(SELECT
uploader,
views,
rank() over(PARTITION BY uploader ORDER BY views DESC) n
FROM
(
SELECT
v.uploader,
video_id,
views
FROM
(
SELECT uploader,
videos
FROM
user_orc
ORDER BY
videos DESC
LIMIT 10)t1
JOIN video_orc v ON
t1.uploader = v.uploader)t2)tbl
WHERE tbl.n<=20;
第二种理解:播放次数前20的视频,有哪些是这10个人传的
1).先统计上传视频最多的用户前10
-- t1
SELECT uploader,videos
FROM user_orc
ORDER BY videos DESC
LIMIT 10;
2).找出这10名用户传的视频
-- t2
SELECT v.uploader,video_id
FROM t1 JOIN video_orc v
ON t1.uploader=v.uploader;
3).播放次数前20的视频
-- t3
SELECT video_id
FROM video_orc
ORDER BY views
LIMIT 20;
4).连接video_id
SELECT t3.video_id,t2.uploader
FROM t3 LEFT JOIN t2
ON t3.video_id=t2.video_id;
最终sql
SELECT
t3.video_id,
t2.uploader
FROM
(
SELECT
video_id
FROM
video_orc
ORDER BY
views
LIMIT 20)t3
LEFT JOIN (
SELECT
v.uploader,
video_id
FROM
(
SELECT uploader,
videos
FROM
user_orc
ORDER BY
videos DESC
LIMIT 10)t1
JOIN video_orc v ON
t1.uploader = v.uploader)t2 ON
t3.video_id = t2.video_id;
3.8.统计每个类别视频观看数Top10
1).炸裂类别
-- t1
SELECT
video_id,views,cate
FROM video_orc
LATERAL VIEW EXPLODE(category) tbl as cate;
2).按类别分区,按views排序
-- t2
SELECT cate,views,
RANK() OVER(PARTITION BY cate ORDER BY views DESC) n
FROM t1;
3).取前10
SELECT * FROM t2
WHERE t2.n<=10;
最终sql
SELECT
*
FROM
(
SELECT
cate,
views,
RANK() OVER(PARTITION BY cate ORDER BY views DESC) n
FROM
(
SELECT
video_id,
views,
cate
FROM
video_orc LATERAL VIEW EXPLODE(category) tbl as cate)t1)t2
WHERE
t2.n <= 10;