数据库操作
数据库的创建
-- Create a database with all defaults.
CREATE DATABASE python;
-- Create a database with a comment, a custom HDFS location, and custom db properties.
CREATE DATABASE python_db COMMENT 'python_database' LOCATION '/hive2' WITH DBPROPERTIES ('name' = 'python');
数据库的查看和删除
-- Show extended metadata for a database.
DESC DATABASE EXTENDED python;
-- Show the DDL statement the database was created with.
SHOW CREATE DATABASE python_db;
-- Drop an (empty) database.
DROP DATABASE python_db;
-- Force-drop a database even if it still contains tables.
DROP DATABASE python CASCADE;
数据库的修改
-- Alter database metadata.
-- Update (or add) a database property.
ALTER DATABASE python SET DBPROPERTIES ('age' = '18');
-- Point the database's default location at another HDFS path.
ALTER DATABASE python SET LOCATION 'hdfs:///hive2';
-- Change the database owner.
ALTER DATABASE python SET OWNER USER python;
数据表操作
表的创建
-- Basic table creation. The default field delimiter is \001
-- (displayed as ^A in vim, SOH in most editors).
CREATE TABLE IF NOT EXISTS test_tb
(
    id INT COMMENT 'idhao'
);
-- Create a table with an explicit field delimiter.
CREATE TABLE IF NOT EXISTS test2_db
(
    id     INT,
    name   STRING,
    age    INT,
    gender STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
查看表/删除表
-- Inspect tables.
-- Show extended table details.
DESC EXTENDED test2_db;
-- Show table details in a formatted layout.
DESC FORMATTED test2_db;
-- Show the CREATE TABLE statement.
SHOW CREATE TABLE test2_db;
-- Drop the table (removes both the metadata and the data files).
DROP TABLE test2_db;
-- Remove the table's data but keep the table and its directory.
TRUNCATE TABLE test2_db;
修改表
-- Alter table metadata.
-- Update a table property.
ALTER TABLE test2_db SET TBLPROPERTIES ('name' = 'itcast');
-- Rename the table.
ALTER TABLE test2_db RENAME TO test3_tb;
-- Change a column's type (id: int -> string).
ALTER TABLE test3_tb CHANGE id id STRING;
-- Rename the name column to username.
ALTER TABLE test3_tb CHANGE name username STRING;
-- Append a new address column.
ALTER TABLE test3_tb ADD COLUMNS (address STRING);
-- Move the table's storage location to /hive2 under the HDFS root.
ALTER TABLE test3_tb SET LOCATION '/hive2';
内部表和外部表的创建
-- Managed (internal) table: dropping it removes both the metadata and the data files.
CREATE TABLE test4_tb_nei
(
    id   INT,
    name STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- External table: dropping it removes only the metadata; the files at LOCATION are kept.
-- BUG FIX: the original reused the name test4_tb_nei, which collides with the managed
-- table created just above (the second CREATE would fail); renamed to test4_tb_wai.
CREATE EXTERNAL TABLE test4_tb_wai
(
    id   INT,
    name STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/hive2';
分区表
-- Create a partitioned table (the partition column sex is virtual — not stored in the files).
CREATE TABLE test_tb_part
(
    id     INT,
    name   STRING,
    age    INT,
    gender STRING
) PARTITIONED BY (sex STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Static partition load: the partition value is given explicitly.
LOAD DATA LOCAL INPATH '/root/boy.txt' INTO TABLE test_tb_part PARTITION (sex = 'boy');
LOAD DATA LOCAL INPATH '/root/girl.txt' INTO TABLE test_tb_part PARTITION (sex = 'girl');
-- -------------------------------------------
-- Dynamic partitioning: partition values are derived from a select column.
-- (FIX: the separator line above used '#', which is not a valid Hive comment marker.)
-- Create the dynamic-partition target table.
CREATE TABLE test_tb_part_dong
(
    id     INT,
    name   STRING,
    age    INT,
    gender STRING
) PARTITIONED BY (sex STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Allow dynamic partition inserts without any static partition value.
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Dynamic-partition insert: the LAST select column (t.gender) feeds the partition column.
-- BUG FIX: the original read "select t.* t.gender" — the missing comma is a syntax error.
INSERT INTO TABLE test_tb_part_dong PARTITION (sex) SELECT t.*, t.gender FROM test_tb t;
分区表的查看/删除/修改
-- Inspect, drop, and modify partitions.
-- List the table's partitions.
SHOW PARTITIONS test_tb_part;
-- Drop a partition (for a managed table this also removes its data).
ALTER TABLE test_tb_part DROP PARTITION (sex = 'girl');
-- Add a new, empty partition.
ALTER TABLE test_tb_part ADD PARTITION (sex = 'aaa');
-- Rename a partition.
ALTER TABLE test_tb_part PARTITION (sex = 'aaa') RENAME TO PARTITION (sex = 'bbb');
-- Move a partition's storage path.
ALTER TABLE test_tb_part PARTITION (sex = 'bbb') SET LOCATION '/.....';
-- Repair: register partitions that exist on HDFS but are missing from the metastore.
MSCK REPAIR TABLE test_tb_part;
分桶表
-- Bucketed tables.
-- Raw (source) table.
CREATE TABLE IF NOT EXISTS t_usa_covid19
(
    count_date STRING,
    county     STRING,
    state      STRING,
    fips       INT,
    cases      INT,
    deaths     INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Bucketed table: hash(state) into 5 buckets, each bucket sorted by cases desc.
CREATE TABLE IF NOT EXISTS bucket_usa_covid19
(
    count_date STRING,
    county     STRING,
    state      STRING,
    fips       INT,
    cases      INT,
    deaths     INT
) CLUSTERED BY (state) SORTED BY (cases DESC) INTO 5 BUCKETS;
-- Bucketing is applied on write: copy the raw data into the bucketed table.
INSERT INTO bucket_usa_covid19
SELECT *
FROM t_usa_covid19;
数据的导入/写入
-- Loading and writing data.
-- Load a file from the local filesystem (the file is copied).
LOAD DATA LOCAL INPATH '/root/boy.txt' INTO TABLE test_tb_part PARTITION (sex = 'boy');
-- Load a file already on HDFS (the file is moved).
LOAD DATA INPATH '/hive2/a.txt' INTO TABLE test_tb_part PARTITION (sex = 'bbb');
-- Load with OVERWRITE: replaces the partition's existing data.
LOAD DATA INPATH '/hive2/b.txt' OVERWRITE INTO TABLE test_tb_part PARTITION (sex = 'bbb');
-- Insert literal rows with insert into ... values.
INSERT INTO team_player VALUES (11, '石家庄刘', '清新');
-- Bulk-copy rows from one table into another.
-- insert into ... select ... APPENDS to the existing data.
INSERT INTO test_tb
SELECT *
FROM test_tb_new;
-- insert overwrite ... select ... REPLACES the existing data.
INSERT OVERWRITE TABLE test_tb
SELECT *
FROM test_tb_new;
-- Multi-insert: scan the source once, write to several tables.
FROM test_tb_new
INSERT INTO test_tb_name
SELECT name
INSERT INTO test_tb_age
SELECT age;
-- Dynamic-partition insert: the last select column (t.gender) drives the partition value.
INSERT INTO TABLE test_tb_part_D PARTITION (sex) SELECT t.*, t.gender FROM test_tb_f t;
-- Dynamic insert with several partition columns.
-- BUG FIX: the original misspelled the first partition column as "yesr"; corrected to
-- year, and both columns are backticked because year/month are Hive keywords.
INSERT INTO TABLE test_tb_part_D PARTITION (`year`, `month`) SELECT t.*, nian, yue FROM test_tb_f t;
-- -------------------------------------------
-- Write query results to an HDFS directory (default delimiter \001).
INSERT OVERWRITE DIRECTORY '/hive2'
SELECT *
FROM test_tb_new;
-- Write with an explicit field delimiter.
INSERT OVERWRITE DIRECTORY '/hive2' ROW FORMAT DELIMITED FIELDS TERMINATED BY '-'
SELECT *
FROM test_tb_new;
-- Write to a LOCAL directory with an explicit delimiter.
INSERT OVERWRITE LOCAL DIRECTORY '/hive2' ROW FORMAT DELIMITED FIELDS TERMINATED BY '-'
SELECT *
FROM test_tb_new;
DQL操作
-- Create a partitioned table for the DQL examples.
CREATE TABLE t_usa_covid19_p
(
    county STRING,
    fips   INT,
    cases  INT,
    deaths INT
)
PARTITIONED BY (count_date STRING, state STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
-- Allow dynamic partition inserts.
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Fill the partitions dynamically from the raw table.
INSERT INTO t_usa_covid19_p PARTITION (count_date, state)
SELECT county, fips, cases, deaths, count_date, state
FROM t_usa_covid19;
-- FIX: the two notes below were bare text lines ("- ..."), which is not valid SQL comment syntax.
-- Logical evaluation order: FROM -> JOIN ON -> WHERE -> GROUP BY -> HAVING -> ORDER BY
--   -> SELECT list -> LIMIT.
-- WHERE filters rows BEFORE grouping; HAVING filters groups AFTER aggregation.
-- Filter with conditions (FIX: added the missing space before AND).
SELECT * FROM t_usa_covid19_p WHERE state = 'Alabama' AND deaths > 100;
-- De-duplicated values.
SELECT DISTINCT county FROM t_usa_covid19_p;
-- Aggregate per group.
SELECT state, sum(deaths)
FROM t_usa_covid19_p
GROUP BY state;
-- Filter the aggregated groups.
SELECT state, sum(deaths)
FROM t_usa_covid19_p
GROUP BY state
HAVING sum(deaths) > 10000;
-- Sort the result.
SELECT * FROM t_usa_covid19_p WHERE state = 'Alabama' ORDER BY deaths DESC;
-- Pagination: first 5 rows.
SELECT *
FROM t_usa_covid19_p
WHERE state = 'Alabama'
ORDER BY deaths DESC
LIMIT 5;
-- Pagination: skip 3 rows, then return 5.
SELECT *
FROM t_usa_covid19_p
WHERE state = 'Alabama'
ORDER BY deaths DESC
LIMIT 3,5;
高阶查询
-- FIX: the notes in this section were bare "- ..." text and a "#" line — neither is a
-- valid SQL comment marker; rewritten as proper "--" comments.
-- ORDER BY is a global sort: it forces everything through a single reducer.
-- CLUSTER BY with the default single reducer behaves the same as ORDER BY.
-- cluster by query
SELECT * FROM t_usa_covid19_p WHERE state = 'Alabama' CLUSTER BY deaths;
-- With the reducer count raised, the data is split into that many sorted outputs.
SET mapreduce.job.reduces = 2;
SELECT * FROM t_usa_covid19_p WHERE state = 'Alabama' CLUSTER BY deaths;
-- Second way to sort:
-- DISTRIBUTE BY picks the split column; SORT BY picks the per-reducer sort column.
SELECT * FROM t_usa_covid19_p WHERE state = 'Alabama' DISTRIBUTE BY deaths SORT BY cases;
-- Combine two tables with identical schemas; DISTINCT drops duplicate rows after the union.
SELECT *
FROM test_tb_un
UNION DISTINCT
SELECT *
FROM test_tb_f;
CTE语法
-- CTE: a named subquery via WITH ... AS.
WITH tb1 AS (SELECT * FROM test_tb)
SELECT *
FROM tb1;
-- Several CTEs combined in one statement.
WITH tb1 AS (SELECT * FROM test_tb),
     tb2 AS (SELECT * FROM test_tb_un)
SELECT *
FROM tb1
UNION
SELECT *
FROM tb2;
关联查询
-- Three employee tables used by the join examples below.
CREATE TABLE employee
(
    id     INT,
    name   STRING,
    deg    STRING,
    salary INT,
    dept   STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- table2: employee home-address table.
CREATE TABLE employee_address
(
    id     INT,
    hno    STRING,
    street STRING,
    city   STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- table3: employee contact-info table.
CREATE TABLE employee_connection
(
    id    INT,
    phno  STRING,
    email STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
-- Inner join: only ids present in both tables.
SELECT *
FROM employee e1
INNER JOIN employee_address e2 ON e1.id = e2.id;
-- Left join: every employee; address columns are NULL when unmatched.
SELECT *
FROM employee e1
LEFT JOIN employee_address e2 ON e1.id = e2.id;
-- Right join: every address row is kept.
SELECT *
FROM employee e1
RIGHT JOIN employee_address e2 ON e1.id = e2.id;
-- Full outer join: unmatched rows from both sides are kept.
SELECT *
FROM employee e1
FULL JOIN employee_address e2 ON e1.id = e2.id;
-- Left semi join: left-side rows that have a match; only left columns are returned.
SELECT *
FROM employee e1
LEFT SEMI JOIN employee_address e2 ON e1.id = e2.id;
-- Cross join: Cartesian product of the two tables.
SELECT *
FROM employee
CROSS JOIN employee_address;
窗口函数
-- Per-cookieid total pv, attached to every row via a window aggregate.
SELECT cookieid,
       sum(pv) OVER (PARTITION BY cookieid)
FROM website_pv_info;
-- Running pv total per user up to each day (ORDER BY implies the default frame
-- unbounded preceding .. current row).
SELECT cookieid,
       pv,
       sum(pv) OVER (PARTITION BY cookieid ORDER BY createtime)
FROM website_pv_info;
-- Top-3 pv per user; exact handling of ties differs per ranking function.
-- rank(): ties share a number and leave gaps afterwards.
SELECT *,
       rank() OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_num
FROM website_pv_info;
-- dense_rank(): ties share a number, no gaps.
SELECT *,
       dense_rank() OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_num
FROM website_pv_info;
-- row_number(): unique sequential number per row.
SELECT *,
       row_number() OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_num
FROM website_pv_info;
-- CTE keeping only the top 3 per user.
WITH pv1 AS (SELECT *,
                    rank() OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_num
             FROM website_pv_info)
SELECT * FROM pv1 WHERE runk_num <= 3;
-- ntile(3): split each partition into 3 evenly-sized groups.
SELECT *,
       ntile(3) OVER (PARTITION BY cookieid ORDER BY pv DESC) AS rank_num
FROM website_pv_info;
-- Keep the first third (highest-pv days) per user.
WITH tpv2 AS (SELECT *,
                     ntile(3) OVER (PARTITION BY cookieid ORDER BY pv DESC) AS rank_num
              FROM website_pv_info)
SELECT *
FROM tpv2
WHERE rank_num = 1;
指定取值范围
-- Explicit window frames.
-- Current row plus the next 3 rows (4 rows summed in total).
SELECT *,
       sum(pv) OVER (PARTITION BY cookieid ORDER BY createtime
           ROWS BETWEEN CURRENT ROW AND 3 FOLLOWING)
FROM website_pv_info;
-- From 2 rows before through 3 rows after the current row.
SELECT *,
       sum(pv) OVER (PARTITION BY cookieid ORDER BY createtime
           ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
FROM website_pv_info;
分析函数
-- Analytic (offset) functions.
-- lag(): value from the previous row of the window.
SELECT *,
       lag(pv) OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_n
FROM website_pv_info;
-- lead(): value from the next row.
SELECT *,
       lead(pv) OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_n
FROM website_pv_info;
-- first_value(): first value of the window.
SELECT *,
       first_value(pv) OVER (PARTITION BY cookieid ORDER BY pv DESC) AS runk_n
FROM website_pv_info;
-- last_value(): needs the full-partition frame, otherwise the default frame
-- ends at the current row and always returns the current value.
SELECT *,
       last_value(pv) OVER (PARTITION BY cookieid ORDER BY pv DESC
           ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS runk_n
FROM website_pv_info;