一、字段说明

视频表

字段

备注

详细描述

video_id

视频唯一id(String)

11位字符串

uploader

视频上传者(String)

上传视频的用户名String

age

视频年龄(int)

视频在平台上的整数天

category

视频类别(Array)

上传视频指定的视频分类

length

视频长度(Int)

整形数字标识的视频长度

views

观看次数(Int)

视频被浏览的次数

rate

视频评分(Double)

满分5分

Ratings

流量(Int)

视频的流量,整型数字

conments

评论数(Int)

一个视频的整数评论数

related_ids

相关视频id(Array)

相关视频的id,最多20个

用户表

字段

备注

详细描述

uploader

视频上传者(String)

上传视频的用户名String

videos

视频上传数量(Int)

用户上传的视频数量

friend

用户好友数量(Int)

用户好友数量

二、准备工作

1.创建项目数据库

CREATE DATABASE gulivideo;

2.创建外部表,指定数据存储路径

-- 创建video_ori
CREATE EXTERNAL TABLE
    video_ori( video_id string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate double,
    Ratings int,
    conments int,
    related_ids array<string> ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 
COLLECTION ITEMS TERMINATED BY '&' LOCATION '/gulivideo/video';
-- 创建user_ori
CREATE EXTERNAL TABLE
    user_ori( uploader string,
    videos int,
    friend int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/gulivideo/user';

3.创建内部表,导入数据

-- 内部video_orc表
CREATE TABLE
    video_orc( video_id string,
    uploader string,
    age int,
    category array<string>,
    length int,
    views int,
    rate double,
    Ratings int,
    conments int,
    related_ids array<string> ) TBLPROPERTIES("orc.compress"="snappy");
-- 内部user_orc表
CREATE TABLE
    user_orc( uploader string,
    videos int,
    friend int ) TBLPROPERTIES("orc.compress"="snappy");
-- 加载数据
INSERT INTO TABLE video_orc 
SELECT * FROM video_ori;

INSERT INTO TABLE user_orc 
SELECT * FROM user_ori;

三、需求实现

1.统计视频观看数top10

全局排序取前10

SELECT video_id,viewS FROM video_orc ORDER BY viewS DESC LIMIT 10;

2.统计视频类别热度Top10

定义热度

视频类别热度:分了中包含的视频数量越多热度越高。

1).炸裂视频类别

-- 炸裂视频类别,取出类别和其对应的视频
SELECT cate,video_id
FROM video_orc
LATERAL VIEW EXPLODE(category) tbl as cate;

2).取类别前十

-- 第1步的查询为t1,COUNT统计各类别视频数量,排序取前10
SELECT cate,count(*) cnt
FROM t1
GROUP BY cate
ORDER BY cnt DESC
LIMIT 10;

最终sql

SELECT
    cate,
    count(*) cnt
FROM
    (
    SELECT cate,
        video_id
    FROM
        video_orc LATERAL VIEW EXPLODE(category) tbl as cate)t1
GROUP BY
    cate
ORDER BY
    cnt DESC
LIMIT 10;

3.统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数

1).观看数最高的20个视频和他们所属分类

-- t1 
SELECT video_id,category,views
FROM video_orc
ORDER BY views DESC LIMIT 20;

2).炸裂分类

-- t2 video_id 就是top20视频的id
SELECT video_id,cate
FROM t1
LATERAL VIEW EXPLODE(category) tbl as cate;

3). count统计分类中包含的视频数

-- count(video_id)
SELECT cate,count(video_id)
FROM t2
GROUP BY cate;

最终sql

SELECT
    cate,
    count(video_id) cnt
FROM
    (
    SELECT
        video_id,
        cate
    FROM
        (
        SELECT
            video_id,
            category,
            views
        FROM
            video_orc
        ORDER BY
            views DESC
        LIMIT 20)t1 LATERAL VIEW EXPLODE(category) tbl as cate )t2
GROUP BY
    cate
ORDER BY
    cnt DESC;

3.4.统计视频观看数Top50所关联视频的所属类别,并按照类别热度排序

1).先求出视频观看数Top50的视频和关联视频

-- t1
SELECT video_id,views,related_ids
FROM video_orc
ORDER BY views
LIMIT 50;

2).炸裂关联视频

-- t2
SELECT EXPLODE(related_ids) related_id
FROM t1;

3).求出关联视频的类别

-- t3
SELECT DISTINCT related_id,category
FROM t2
LEFT JOIN video_orc v
ON t2.related_id = v.video_id;

4).炸裂关联视频类别

-- t4
SELECT related_id,cate
FROM t3
LATERAL VIEW EXPLODE(category) tbl as cate;

5).求出总表每个类的类别热度(视频数量)

-- t5
SELECT category,video_id
FROM video_orc;

SELECT cate,COUNT(video_id) hot
FROM (SELECT category,video_id
    FROM video_orc)tt1
LATERAL VIEW EXPLODE(category) tbl as cate
GROUP BY cate;

6).关联视频类别 join 每个类的类别热度

-- t4是炸裂开的视频类别,要去重
SELECT DISTINCT t4.cate,hot
FROM t4 
JOIN t5
ON t4.cate=t5.cate
ORDER BY hot DESC;

最终sql

SELECT
    DISTINCT t4.cate,
    hot
FROM
    (
    SELECT
        related_id,
        cate
    FROM
        (
        SELECT
            DISTINCT related_id,
            category
        FROM
            (
            SELECT
                EXPLODE(related_ids) related_id
            FROM
                (
                    SELECT video_id,
                    views,
                    related_ids
                FROM
                    video_orc
                ORDER BY
                    views
                LIMIT 50)t1)t2
        LEFT JOIN video_orc v ON
            t2.related_id = v.video_id)t3 LATERAL VIEW EXPLODE(category) tbl as cate)t4
JOIN (
    SELECT
        cate,
        COUNT(video_id) hot
    FROM
        (
        SELECT
            category,
            video_id
        FROM
            video_orc)tt1 LATERAL VIEW EXPLODE(category) tbl as cate
    GROUP BY
        cate)t5 ON
    t4.cate = t5.cate
ORDER BY
    hot DESC;

3.5.统计每个类别中的视频热度Top10,以Music为例

1).炸裂类,拼接类和视频id

-- 可存储为中间表,下面的需求还用的到
CREATE TABLE video_cate_tmp 
TBLPROPERTIES("orc.compress"="SNAPPY") AS
SELECT video_id,cate
FROM 
video_orc
LATERAL VIEW EXPLODE(category) tbl as cate;

2).统计Music分类中视频数量并排序取前10

SELECT cate,count(*) hot
FROM video_cate_tmp
WHERE cate='Music'
GROUP BY cate
ORDER BY 
hot DESC
LIMIT 10;

3.6. 统计每个类别中视频流量Top10,以Music为例

利用临时表,根据视频id找到流量ratings,排序取前10

SELECT
    t1.video_id,
    ratings
FROM
    (   -- 先取出视频id和对应分类
        SELECT cate,
        video_id
    FROM
        video_cate_tmp) t1
JOIN (
        SELECT video_id,ratings
    FROM
        video_orc)t2 ON
    t1.video_id = t2.video_id
WHERE
    cate = 'Music'
ORDER BY
    ratings DESC
LIMIT 10;

3.7.统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频

第一种理解:前10用户上传的视频每人被观看次数前20

1).先统计上传视频最多的用户前10
-- t1
SELECT uploader,videos
FROM user_orc
ORDER BY videos DESC
LIMIT 10;
2).找出这10名用户传的视频
-- t2
SELECT v.uploader,video_id,views
FROM t1 JOIN video_orc v
ON t1.uploader=v.uploader;
3).这10个用户每人播放量的前20
SELECT * FROM(SELECT uploader,views,
rank() over(PARTITION BY uploader ORDER BY views DESC) n
FROM t2)tbl
WHERE tbl.n<=20;
最终sql
SELECT * FROM(SELECT
    uploader,
    views,
    rank() over(PARTITION BY uploader ORDER BY views DESC) n
FROM
    (
    SELECT
        v.uploader,
        video_id,
        views
    FROM
        (
            SELECT uploader,
            videos
        FROM
            user_orc
        ORDER BY
            videos DESC
        LIMIT 10)t1
    JOIN video_orc v ON
        t1.uploader = v.uploader)t2)tbl
WHERE tbl.n<=20;

第二种理解:播放次数前20的视频,有哪些是这10个人传的

1).先统计上传视频最多的用户前10
-- t1
SELECT uploader,videos
FROM user_orc
ORDER BY videos DESC
LIMIT 10;
2).找出这10名用户传的视频
-- t2
SELECT v.uploader,video_id
FROM t1 JOIN video_orc v
ON t1.uploader=v.uploader;
3).播放次数前20的视频
-- t3
SELECT video_id
FROM video_orc
ORDER BY views 
LIMIT 20;
4).连接video_id
SELECT t3.video_id,t2.uploader
FROM t3 LEFT JOIN t2
ON t3.video_id=t2.video_id;
最终sql
SELECT
    t3.video_id,
    t2.uploader
FROM
    (
    SELECT
        video_id
    FROM
        video_orc
    ORDER BY
        views
    LIMIT 20)t3
LEFT JOIN (
    SELECT
        v.uploader,
        video_id
    FROM
        (
            SELECT uploader,
            videos
        FROM
            user_orc
        ORDER BY
            videos DESC
        LIMIT 10)t1
    JOIN video_orc v ON
        t1.uploader = v.uploader)t2 ON
    t3.video_id = t2.video_id;

3.8.统计每个类别视频观看数Top10

1).炸裂类别

-- t1
SELECT 
video_id,views,cate
FROM video_orc
LATERAL VIEW EXPLODE(category) tbl as cate;

2).按类别分区,按views排序

-- t2
SELECT cate,views,
RANK() OVER(PARTITION BY cate ORDER BY views DESC) n
FROM t1;

3).取前10

SELECT * FROM t2
WHERE t2.n<=10;

最终sql

SELECT
    *
FROM
    (
    SELECT
        cate,
        views,
        RANK() OVER(PARTITION BY cate ORDER BY views DESC) n
    FROM
        (
        SELECT
            video_id,
            views,
            cate
        FROM
            video_orc LATERAL VIEW EXPLODE(category) tbl as cate)t1)t2
WHERE
    t2.n <= 10;