一 数据源,将数据添加到 linux 本地,生成静态文件
数据源
用户工资组成表 sal.txt
uid jb jj tc deptno tb_sal
1,2000,3000,1500,1
2,5000,500,1000,2
3,1500,1000,3000,2
4,3000,6000,8000,3
5,1500,2000,1800,1
6,2500,1000,1900,1
bm.txt 部门表 tb_departmen
1,销售
2,技术
3,行政
员工信息表 tb_employee
yg.txt
uid name gender age
1,zs,M,28
2,ww,F,36
3,zl,F,48
4,pp,M,44
5,wb,M,32
6,TQ,F,32
将三个结构化数据导入到本地 /root/hive/company/ 目录下
[root@linux03 companydata]# vi sal.txt --工资表信息
[root@linux03 companydata]# vi department.txt --部门表信息
[root@linux03 companydata]# vi employee.txt --员工表信息
[root@linux03 companydata]# ll --查看文件
-rw-r--r--. 1 root root 19 Sep 3 01:13 department.txt
-rw-r--r--. 1 root root 79 Sep 3 01:12 employee.txt
-rw-r--r--. 1 root root 105 Sep 3 01:14 sal.txt
二 建表 ,将数据加载到表里面 ,然后查询数据加载情况
0: jdbc:hive2://linux03:10000>
删除tb_sal ,重新创建一个tb_sal ,然后将sal.txt文件数据加载到表里面
drop table tb_sal;
create table tb_sal(
uid int,
jb int,
jj int,
tc int,
deptno int)
row format delimited fields terminated by ",";
load data local inpath "/root/hive/companydata/sal.txt" into table tb_sal;
删除tb_employee ,重新创建一个tb_employee ,然后将employee.txt文件数据加载到表里面
drop table tb_employee;
create table tb_employee(
uid int,
name string,
gender string,
age int)
row format delimited fields terminated by ",";
load data local inpath "/root/hive/companydata/employee.txt" into table tb_employee;
删除tb_department ,重新创建一个tb_department ,然后将department.txt文件数据加载到表里面
drop table tb_department;
create table tb_department(
deptno int,
bname string)
row format delimited fields terminated by ",";
load data local inpath "/root/hive/companydata/department.txt" into table tb_department;
查看数据插入情况
0: jdbc:hive2://linux03:10000> select * from tb_sal;
+-------------+------------+------------+------------+----------------+
| tb_sal.uid | tb_sal.jb | tb_sal.jj | tb_sal.tc | tb_sal.deptno |
+-------------+------------+------------+------------+----------------+
| 1 | 2000 | 3000 | 1500 | 1 |
| 2 | 5000 | 500 | 1000 | 2 |
| 3 | 1500 | 1000 | 3000 | 2 |
| 4 | 3000 | 6000 | 8000 | 3 |
| 5 | 1500 | 2000 | 1800 | 1 |
| 6 | 2500 | 1000 | 1900 | 1 |
+-------------+------------+------------+------------+----------------+
0: jdbc:hive2://linux03:10000>select * from tb_employee;
+------------------+-------------------+---------------------+------------------+
| tb_employee.uid | tb_employee.name | tb_employee.gender | tb_employee.age |
+------------------+-------------------+---------------------+------------------+
| 1 | zs | M | 28 |
| 2 | ww | F | 36 |
| 3 | zl | F | 48 |
| 4 | pp | M | 44 |
| 5 | wb | M | 32 |
| 6 | TQ | F | 32 |
+------------------+-------------------+---------------------+------------------+
0: jdbc:hive2://linux03:10000>select * from tb_department;
+-----------------------+----------------------+
| tb_department.deptno | tb_department.bname |
+-----------------------+----------------------+
| 1 | 销售 |
| 2 | 技术部 |
| 3 | 行政部 |
+-----------------------+----------------------+
三 需求的实现
1 需求1: 求出公司中每个员工的姓名 和 三类收入中最高的那种收入的类型
1.1 根据需求 ,将员工表和工资表关联起来
0: jdbc:hive2://linux03:10000>
select
*
from
tb_employee as yg
join
tb_sal as gz
on
yg.uid=gz.uid;
+---------+----------+------------+---------+---------+--------+--------+--------+------------+
| yg.uid | yg.name | yg.gender | yg.age | gz.uid | gz.jb | gz.jj | gz.tc | gz.deptno |
+---------+----------+------------+---------+---------+--------+--------+--------+------------+
| 1 | zs | M | 28 | 1 | 2000 | 3000 | 1500 | 1 |
| 2 | ww | F | 36 | 2 | 5000 | 500 | 1000 | 2 |
| 3 | zl | F | 48 | 3 | 1500 | 1000 | 3000 | 2 |
| 4 | pp | M | 44 | 4 | 3000 | 6000 | 8000 | 3 |
| 5 | wb | M | 32 | 5 | 1500 | 2000 | 1800 | 1 |
| 6 | TQ | F | 32 | 6 | 2500 | 1000 | 1900 | 1 |
+---------+----------+------------+---------+---------+--------+--------+--------+------------+
1.2 然后根据以上表信息,取出想要的字段(yg.name gz.jb | gz.jj | gz.tc )
0: jdbc:hive2://linux03:10000>
select
yg.name,
gz.jb,
gz.jj,
gz.tc
from
tb_employee as yg
join
tb_sal as gz
on
yg.uid=gz.uid;
+----------+--------+--------+--------+
| yg.name | gz.jb | gz.jj | gz.tc | --- 取出员工名字和其对应的工资(三种类型的收入)
+----------+--------+--------+--------+
| zs | 2000 | 3000 | 1500 |
| ww | 5000 | 500 | 1000 |
| zl | 1500 | 1000 | 3000 |
| pp | 3000 | 6000 | 8000 |
| wb | 1500 | 2000 | 1800 |
| TQ | 2500 | 1000 | 1900 |
+----------+--------+--------+--------+
1.3 根据以上这个表,取工资类型中的最大值
0: jdbc:hive2://linux03:10000>
select
yg.name,
greatest(gz.jb,gz.jj,gz.tc) greatest_sal 取出其中最大的值
from
tb_employee as yg
join
tb_sal as gz
on
yg.uid=gz.uid;
+----------+---------------+
| yg.name | greatest_sal | 最大值取出来了 ,但是需要知道是gz的那部分,还需要处理
+----------+---------------+
| zs | 3000 |
| ww | 5000 |
| zl | 3000 |
| pp | 8000 |
| wb | 2000 |
| TQ | 2500 |
+----------+---------------+
1.4 根据以上这个表,将得到的最大值跟工资的三种类型进行匹配 ,得到最大值是哪个类型
0: jdbc:hive2://linux03:10000>
select
yg.name,
greatest(gz.jb,gz.jj,gz.tc) greatest_sal,
case
when gz.jb = greatest_sal then "jb" 系统不认识 greatest_sal 这个别名,需要对这里修改
when gz.jj = greatest_sal then "jj" 将其修改为 : greatest(gz.jb,gz.jj,gz.tc)
when gz.tc = greatest_sal then "tc"
end as hightest_sal
from
tb_employee as yg
join
tb_sal as gz
on
yg.uid=gz.uid;
如果按照以上方式进行查询,会出现以下错误 :
Error: Error while compiling statement: FAILED: SemanticException [Error 10004]: Line 5:13 Invalid table alias or column reference 'greatest_sal': (possible column names are: yg.uid, yg.name, yg.gender, yg.age, gz.uid, gz.jb, gz.jj, gz.tc, gz.deptno)
以上语句在执行时 ,系统不认识 greatest_sal 这个别名字段 ,所有需要进行以下修改
0: jdbc:hive2://linux03:10000>
select
yg.name,
greatest(gz.jb ,gz.jj ,gz.tc) greatest_sal, 取其中一个最大的值 ,并赋别名
case case..when 条件 then ..else ..end
when gz.jb = greatest(gz.jb,gz.jj,gz.tc) then "jb" 当条件符合 ,就(then)..,否则(else)..,end结束
when gz.jj = greatest(gz.jb,gz.jj,gz.tc) then "jj"
when gz.tc = greatest(gz.jb,gz.jj,gz.tc) then "tc"
end as hightest_sal
from
tb_employee as yg
join
tb_sal as gz
on
yg.uid=gz.uid;
+----------+---------------+---------------+
| yg.name | greatest_sal | hightest_sal |
+----------+---------------+---------------+
| zs | 3000 | jj |
| ww | 5000 | jb |
| zl | 3000 | tc |
| pp | 8000 | tc |
| wb | 2000 | jj |
| TQ | 2500 | jb |
+----------+---------------+---------------+
2 需求2:求出公司中每个岗位的薪资总和 --部门表 ,工资表
2.1 第一种实现方法
先将部门表和工资表关联起来
0: jdbc:hive2://linux03:10000>
select
*
from
tb_department bm
join
tb_sal gz
on bm.deptno = gz.deptno;
+------------+-----------+---------+--------+--------+--------+------------+
| bm.deptno | bm.bname | gz.uid | gz.jb | gz.jj | gz.tc | gz.deptno |
+------------+-----------+---------+--------+--------+--------+------------+
| 1 | 销售 | 1 | 2000 | 3000 | 1500 | 1 |
| 2 | 技术部 | 2 | 5000 | 500 | 1000 | 2 |
| 2 | 技术部 | 3 | 1500 | 1000 | 3000 | 2 |
| 3 | 行政部 | 4 | 3000 | 6000 | 8000 | 3 |
| 1 | 销售 | 5 | 1500 | 2000 | 1800 | 1 |
| 1 | 销售 | 6 | 2500 | 1000 | 1900 | 1 |
+------------+-----------+---------+--------+--------+--------+------------+
然后按照部门进行分组 ,再将每个部门的工资的三个类型全部累加起来
0: jdbc:hive2://linux03:10000>
select
bm.bname,
sum(gz.jb + gz.jj + gz.tc) sum_sal
from
tb_department bm
join
tb_sal gz
on bm.deptno = gz.deptno
group by bm.bname;
+-----------+----------+
| bm.bname | sum_sal |
+-----------+----------+
| 技术部 | 12000 |
| 行政部 | 17000 |
| 销售 | 17200 |
+-----------+----------+
2.2 第二种实现方法(with a as....将as后面的查询信息得到的表命名为 a )--这个方法相对较块,比较有优势
先按照部门编号进行分组 ,然后每个部门的三种工资类型进行分别累加
0: jdbc:hive2://linux03:10000>
select
deptno,
sum(jb+jj+tc)
from
tb_sal
group by deptno;
+---------+--------+
| deptno | _c1 |
+---------+--------+
| 1 | 17200 |
| 2 | 12000 |
| 3 | 17000 |
+---------+--------+
然后将以上得到的表数据与部门表进行关联
0: jdbc:hive2://linux03:10000>
with a as(
select
deptno,
sum(jb+jj+tc) sum_sal
from
tb_sal
group by deptno)
select
bm.bname,
a.sum_sal
from
a
join
tb_department bm
on a.deptno=bm.deptno;
+-----------+------------+
| bm.bname | a.sum_sal |
+-----------+------------+
| 销售 | 17200 |
| 技术部 | 12000 |
| 行政部 | 17000 |
+-----------+------------+
3 需求3:求出公司中每个岗位不同性别员工薪资总和 --部门表/员工表/工资表
3.1 第一种实现方法
先找到三张表的关联点 ,将三张表关联起来 ,然后将想要的字段输出来
0: jdbc:hive2://linux03:10000>
select
bm.bname,
yg.name,
yg.gender,
gz.jb,
gz.jj,
gz.tc,
gz.deptno
from
tb_department bm
join
tb_employee yg
join
tb_sal gz
on
bm.deptno=gz.deptno
and
gz.uid=yg.uid;
+-----------+----------+------------+--------+--------+--------+------------+
| bm.bname | yg.name | yg.gender | gz.jb | gz.jj | gz.tc | gz.deptno |
+-----------+----------+------------+--------+--------+--------+------------+
| 销售 | zs | M | 2000 | 3000 | 1500 | 1 |
| 技术部 | ww | F | 5000 | 500 | 1000 | 2 |
| 技术部 | zl | F | 1500 | 1000 | 3000 | 2 |
| 行政部 | pp | M | 3000 | 6000 | 8000 | 3 |
| 销售 | wb | M | 1500 | 2000 | 1800 | 1 |
| 销售 | TQ | F | 2500 | 1000 | 1900 | 1 |
+-----------+----------+------------+--------+--------+--------+------------+
将上表作为数据源 ,求出公司中每个岗位不同性别员工薪资总和
以岗位和性别分组 ,求出工资相加累加
0: jdbc:hive2://linux03:10000>
select
t1.bname,
t1.gender,
sum(t1.jb+t1.jj+t1.tc) sum_sal
from
(
select
bm.bname,
yg.name,
yg.gender,
gz.jb,
gz.jj,
gz.tc,
gz.deptno
from
tb_department bm
join
tb_employee yg
join
tb_sal gz
on
bm.deptno=gz.deptno
and
gz.uid=yg.uid
)t1 将数据源表起别名为 t1 ,
group by t1.bname,t1.gender; 取 t1 表的数据
+-----------+------------+----------+
| t1.bname | t1.gender | sum_sal |
+-----------+------------+----------+
| 技术部 | F | 12000 |
| 行政部 | M | 17000 |
| 销售 | F | 5400 |
| 销售 | M | 11800 |
+-----------+------------+----------+
3.2 第二种实现方法(相对较有优势 ,使用了 with a as... 将as后面的数据命名为a)
三张表关联起来的总表为 a ,a 表作为数据源,将 a 表的 bname 和 gender分组,组内工资累加
0: jdbc:hive2://linux03:10000>
with a as(
select
yg.name,
yg.gender,
gz.jb ,
gz.jj,
gz.tc ,
gz.deptno,
bm.bname
from
tb_employee yg
join
tb_sal gz
join
tb_department bm
on
yg.uid = gz.uid
and
bm.deptno = gz.deptno
)
select
bname ,
gender ,
sum(jb+jj+tc) sum_sal
from
a
group by bname , gender ;
+--------+---------+----------+
| bname | gender | sum_sal |
+--------+---------+----------+
| 技术部 | F | 12000 |
| 行政部 | M | 17000 |
| 销售 | F | 5400 |
| 销售 | M | 11800 |
+--------+---------+---------
4 需求4:求出公司中不同性别、不同年龄阶段(20-30,31-40,41-50)的员工薪资总和 ---员工表 ,工资表
4.1 先将员工表和工资表关联起来并筛选出想要的字段,然后对年龄进行区间的匹配
先将员工表和工资表关联起来
0: jdbc:hive2://linux03:10000>
select
yg.name,
yg.gender,
yg.age,
gz.jb,
gz.jj,
gz.tc
from
tb_employee yg
join
tb_sal gz
on
yg.uid=gz.uid;
+----------+------------+---------+--------+--------+--------+
| yg.name | yg.gender | yg.age | gz.jb | gz.jj | gz.tc |
+----------+------------+---------+--------+--------+--------+
| zs | M | 28 | 2000 | 3000 | 1500 |
| ww | F | 36 | 5000 | 500 | 1000 |
| zl | F | 48 | 1500 | 1000 | 3000 |
| pp | M | 44 | 3000 | 6000 | 8000 |
| wb | M | 32 | 1500 | 2000 | 1800 |
| TQ | F | 32 | 2500 | 1000 | 1900 |
+----------+------------+---------+--------+--------+--------+
然后对年龄进行年龄区间的匹配
0: jdbc:hive2://linux03:10000>
select
yg.name,
yg.gender,
yg.age,
gz.jb,
gz.jj,
gz.tc,
case
when age>=20 and age<30 then "20-30"
when age>=30 and age<40 then "30-40"
when age>=40 and age<50 then "40-50"
end as stage
from
tb_employee yg
join
tb_sal gz
on
yg.uid=gz.uid;
+----------+------------+---------+--------+--------+--------+--------+
| yg.name | yg.gender | yg.age | gz.jb | gz.jj | gz.tc | stage |
+----------+------------+---------+--------+--------+--------+--------+
| zs | M | 28 | 2000 | 3000 | 1500 | 20-30 |
| ww | F | 36 | 5000 | 500 | 1000 | 30-40 |
| zl | F | 48 | 1500 | 1000 | 3000 | 40-50 |
| pp | M | 44 | 3000 | 6000 | 8000 | 40-50 |
| wb | M | 32 | 1500 | 2000 | 1800 | 30-40 |
| TQ | F | 32 | 2500 | 1000 | 1900 | 30-40 |
+----------+------------+---------+--------+--------+--------+--------+
4.2 需求的第一种实现方法(需要按性别和年龄段分组 ,求出工资总和 , with a as....)
0: jdbc:hive2://linux03:10000>
with a as ( 将括号里面的汇总表起上一个别名 a,并以此作为一个数据源
select
yg.name,
yg.gender,
yg.age,
gz.jb,
gz.jj,
gz.tc,
case
when age>=20 and age<30 then "20-30"
when age>=30 and age<40 then "30-40"
when age>=40 and age<50 then "40-50"
end as stage
from
tb_employee yg
join
tb_sal gz
on
yg.uid=gz.uid)
select 在这里以a为数据源,对a表的性别和年龄区域进行分组,然后各自的工资类型进行累加
gender,
stage,
sum(jb+jj+tc) sum_sal
from
a
group by gender,stage;
+---------+--------+----------+
| gender | stage | sum_sal |
+---------+--------+----------+
| F | 30-40 | 11900 |
| F | 40-50 | 5500 |
| M | 20-30 | 6500 |
| M | 30-40 | 5300 |
| M | 40-50 | 17000 |
+---------+--------+----------+
4.3 需求的第二种实现方法(普通的方法)
0: jdbc:hive2://linux03:10000>
select
t1.gender,
t1.stage,
sum(jj+jb+tc) sum_sal 分组后,求出工资总和,并起别名
from
( 以括号里面的这个汇总表为数据源 ,对gender和不同年龄阶段stage进行分组,求出工资总和
select
yg.name,
yg.gender,
yg.age,
gz.jb,
gz.jj,
gz.tc,
case
when age>=20 and age<30 then "20-30"
when age>=30 and age<40 then "30-40"
when age>=40 and age<50 then "40-50"
end as stage
from
tb_employee yg
join
tb_sal gz
on
yg.uid=gz.uid
)t1
group by t1.gender,t1.stage; 按gender 和 stage分组
+------------+-----------+--------+
| t1.gender | t1.stage |sum_sal |
+------------+-----------+--------+
| F | 30-40 | 11900 |
| F | 40-50 | 5500 |
| M | 20-30 | 6500 |
| M | 30-40 | 5300 |
| M | 40-50 | 17000 |
+------------+-----------+--------+