目录

一、设计hive数据仓库

1.创建数据仓库

2.创建事实表

3.加载数据到事实表

4.查询数据

5.创建明细表ods_jobdata_detail 用户细化薪资字段的数据

6.加教数据到明细表

7.对薪资进行扁平化处理,.将处理结果存储到临时中间表t_ods_tmp_salary

8.对临时中间表t_ods_tmp_salary中的每一条数据进行泛化处理,将处理结果存储到中间表t_ods_tmp_salary_dist中

9.对福利标签字段内容进行扁平化处理,将处理的结果储存到临时中间t_ods_tmp_company

10.对技能标签字段内容进行扁平化处理,将处理的结果储存到临时中间表t_ods_tmp_kill

11.创建维度表t_ods_company ,t_ods_company,t_ods_kill ,t_ods _city用于存储福利标签.薪资分布..技能标签.城市统计的结果

二、分析数据

1.职位区域分析

2.倒叙查询职位区域的信息

3.职位薪资分析

4.查看维度表t_ods_salary 中的分析结果...使用sort by参数对表虫的count列进行倒叙排序

5.平均值

6.众数

7.中位数

8.各城市平均薪资待遇

9.公司福利分析

10.查询维度表中的分析结果,倒叙,查询前10个 

11.职位技能要求分析

12.查看技能维度表中的分析结果,倒叙,查看前3个


一、设计hive数据仓库

1.创建数据仓库

[root@hadoop1 ~]# systemctl start mariadb
[root@hadoop1 ~]# hive

create database jobdata;

hive> use jobdata; 

2.创建事实表

CREATE TABLE ods_jobdata_origin(
city string COMMENT '城市',
salary array<String>COMMENT '薪资',
company array<String>COMMENT '福利',
kill array<String>COMMENT '技能')
COMMENT '原始职位数据表'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
STORED AS TEXTFILE;

3.加载数据到事实表

LOAD DATA INPATH '/JobData/output/part-r-00000' OVERWRITE INTO TABLE ods_jobdata_origin;

4.查询数据

select * from ods_jobdata_origin;

5.创建明细表ods_jobdata_detail 用户细化薪资字段的数据

create table ods_jobdata_detail(
city string comment '城市',
salary array<String>comment '薪资',
company array<String>comment '福利',
kill array<String>comment '技能',
low_salary int comment '低薪资',
high_salary int comment '高薪资',
avg_salary double comment '平均薪资')
COMMENT '职位数据明细表'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

6.加教数据到明细表

insert overwrite table ods_jobdata_detail
select city,salary,company,kill,salary[0],salary[1],
(salary[0]+salary[1])/2 from ods_jobdata_origin;

7.对薪资进行扁平化处理,.将处理结果存储到临时中间表t_ods_tmp_salary

create table t_ods_tmp_salary as select explode(ojo.salary) from ods_jobdata_origin ojo;

8.对临时中间表t_ods_tmp_salary中的每一条数据进行泛化处理,将处理结果存储到中间表t_ods_tmp_salary_dist中

create table t_ods_tmp_salary_dist as
select case when col>=0 and col<=5 then "0-5"
when col>=6 and col<=10 then "6-10" 
when col>=11 and col<=15 then "11-15" 
when col>=16 and col<=20 then "16-20" 
when col>=21 and col<=25 then "21-25" 
when col>=26 and col<=30 then "26-30" 
when col>=31 and col<=35 then "31-35" 
when col>=36 and col<=40 then "36-40" 
when col>=41 and col<=45 then "41-45" 
when col>=46 and col<=50 then "46-50" 
when col>=51 and col<=55 then "51-55" 
when col>=56 and col<=60 then "56-60" 
when col>=61 and col<=65 then "61-65" 
when col>=66 and col<=70 then "66-70" 
when col>=71 and col<=75 then "71-75" 
when col>=76 and col<=80 then "76-80" 
when col>=81 and col<=85 then "81-85" 
when col>=86 and col<=90 then "86-90" 
when col>=91 and col<=95 then "91-95" 
when col>=96 and col<=100 then "96-100"
when col >= 101 then ">101" end from t_ods_tmp_salary;

9.对福利标签字段内容进行扁平化处理,将处理的结果储存到临时中间t_ods_tmp_company

create table t_ods_tmp_company as select explode (ojo.company) from ods_jobdata_origin ojo;

10.对技能标签字段内容进行扁平化处理,将处理的结果储存到临时中间表t_ods_tmp_kill

create table t_ods_tmp_kill as select explode (ojo.kill) from ods_jobdata_origin ojo;

11.创建维度表t_ods_company ,t_ods_company,t_ods_kill ,t_ods _city用于存储福利标签.薪资分布..技能标签.城市统计的结果

create table t_ods_kill(
every_kill String comment '技能标签',
count int comment '词频')
COMMENT	'技能标签词频统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
create table t_ods_company(
every_company String comment '福利标签',
count int comment '词频')
COMMENT '福利标签词频统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
create table t_ods_salary(
every_partition String comment '薪资分布',
count int comment '聚合统计')
COMMENT '薪资分布聚合统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
create table t_ods_city(
every_city String comment '城市',
count int comment '词频')
COMMENT '城市统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;

二、分析数据

1.职位区域分析

insert overwrite table t_ods_city
select city,count(1) from ods_jobdata_origin group by city;

2.倒叙查询职位区域的信息

select * from t_ods_city sort by count desc;

3.职位薪资分析

//查看表结构

hive>  desc t_ods_tmp_salary_dist;
OK
c0                      string        

Time taken: 0.052 seconds, Fetched: 1 row(s)                

insert overwrite table t_ods_salary
select `c0`,count(1) from t_ods_tmp_salary_dist group by `c0`;

4.查看维度表t_ods_salary 中的分析结果...使用sort by参数对表虫的count列进行倒叙排序

select * from t_ods_salary sort by count desc;

5.平均值

select avg(avg_salary) from ods_jobdata_detail;

6.众数

select avg_salary,count(1) as cnt from ods_jobdata_detail group by avg_salary order by cnt desc limit 1;

7.中位数

select percentile(cast(avg_salary as BIGINT),0.5) from ods_jobdata_detail;

8.各城市平均薪资待遇

select city,count(city),round(avg(avg_salary),2) as cnt from ods_jobdata_detail group by city order by cnt desc;

9.公司福利分析

insert overwrite table t_ods_company select col,count(1) from t_ods_tmp_company group by col;

10.查询维度表中的分析结果,倒叙,查询前10个 

select every_company,count from t_ods_company sort by count desc limit 10;

11.职位技能要求分析

insert overwrite table t_ods_kill select col,count(1) from t_ods_tmp_kill group by col;

12.查看技能维度表中的分析结果,倒叙,查看前3个

select every_kill,count from t_ods_kill sort by count desc limit 3;