hive 普通表变成分区表

转载

mob6454cc696f04 2024-09-19 13:01:44

文章标签 hive 普通表变成分区表分区表数据 hdfs 文章分类 Hive 大数据

1. 相关说明

在Hive中处理数据时，当处理的一张表的数据量过大的时候，每次查询都是遍历整张表，显然对于计算机来说，是负担比较重的。所以我们可不可以针对数据进行分类，查询时只遍历该分类中的数据，这样就能有效的解决问题。所以就会Hive在表的架构下，就会有分区的这个概念，就是为了满足此需求。

分区表的一个分区对应hdfs上的一个目录

分区表包括静态分区表和动态分区表，根据分区会不会自动创建来区分

多级分区表，即创建的时候指定 PARTITIONED BY (event_month string,loc string)，根据顺序，级联创建 event_month=XXX/loc=XXX目录，其他和一级的分区表是一样的

准备测试用的数据：

for i in $(seq 20)
do 
  echo -e  "$(date -d "$RANDOM minute ago"  +%F' '%T'.'%N)"
done|sort -nk1|awk '{print NR"\t"$0}'>/tmp/order_created.txt
cat /tmp/order_created.txt

2. 静态分区表

创建静态分区表，加载数据：

use test1;
drop table test1.order_created_partition purge;
CREATE TABLE order_created_partition (
order_number string,
event_time string
)
PARTITIONED BY (event_month string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY "\t";
-- 建表语句，指定分区字段为event_month，这个字段是伪列
-- 会在数据load到表的这个分区时，在hdfs上创建名为event_month=2017-12的子目录

LOAD DATA LOCAL INPATH "/tmp/order_created.txt" 
OVERWRITE INTO TABLE order_created_partition
PARTITION (event_month='2017-12');
-- 使用 hdfs dfs -cat /user/hive/warehouse/test1.db/order_created_partition/event_month=2017-12/order_created.txt
-- 查看数据文件中并没有event_month这一列

select * from test1.order_created_partition;
-- 分区表全表扫描，不推荐
select * from test1.order_created_partition
 where event_month='2017-12';
-- 使用where子句，过滤分区字段，遍历某个分区
-- 以上两个SQL可以查到列event_month信息
-- 而使用hdfs dfs -cat看不到该列，说明分区表的分区列是伪列
-- 实际上是hdfs中的分区目录的体现

添加分区，load数据：

alter table test1.order_created_partition
  add partition (event_month='2018-01');
LOAD DATA LOCAL INPATH "/tmp/order_created.txt" 
OVERWRITE INTO TABLE order_created_partition
PARTITION (event_month='2018-01');
-- 添加一个分区，将一模一样的数据文件加载到该分区
select * from test1.order_created_partition
 where event_month='2018-01';
-- 查到了该分区的记录

LOAD DATA LOCAL INPATH "/tmp/order_created.txt"
INTO TABLE order_created_partition
PARTITION (event_month='2018-01');
select * from test1.order_created_partition
 where event_month='2018-01';
-- 不使用OVERWRITE参数，会追加数据到分区

LOAD DATA INPATH "/user/hive/warehouse/test1.db/order_created_partition/event_month=2017-12/order_created.txt"
INTO TABLE order_created_partition
PARTITION (event_month='2018-01');
-- 如果从hdfs中加载数据，则原来的路径文件被转移掉

可以看出，所谓的分区，是人为定义的，跟业务数据实际上是不是属于该分区没关系，比如将相同的数据分别插入两个分区中，再比如插入的数据有2017-12月份的和2018-01月份的数据

删除分区：

alter table test1.order_created_partition
  drop partition (event_month='2018-01');
-- 从hdfs中已经看不到event_month=2018-01的分区子目录了

查询装入数据：

insert overwrite table order_created_partition 
 partition(event_month='2017-11')
select order_number,event_time
  from order_created_partition 
 where event_month='2018-02';
-- 使用查询装入数据到分区中，分区可以是当前不存在的
-- 因为查询的是分区表，需要注意伪列和被装入的分区表列的对应关系7

手工创建hdfs目录和文件，添加分区的情况：
静态分区表如果手工创建对应的hdfs目录上传文件，而不使用分区创建命令和load数据到分区的命令，分区表中无法查到该分区信息，需要刷新，这种添加分区的途径是不合法的：

hdfs dfs -mkdir -p /user/hive/warehouse/test1.db/order_created_partition/event_month=2018-02
hdfs dfs -put /tmp/order_created.txt  /user/hive/warehouse/test1.db/order_created_partition/event_month=2018-02
hdfs dfs -ls /user/hive/warehouse/test1.db/order_created_partition/event_month=2018-02
hive
select * from test1.order_created_partition
 where event_month='2018-02';
-- 此时是查不到该分区的
MSCK REPAIR TABLE order_created_partition;
-- 修复表信息之后可以查询
show partitions order_created_partition;
-- 查看该表的所有分区

3. 动态分区表

use test1;
select * from emp;
-- 根据从前实验创建的emp表
-- 将emp表的数据按照部门分组，并将数据加载到其对应的分组中去
create table emp_partition(
empno int, 
ename string, 
job string, 
mgr int, 
hiredate string, 
sal double, 
comm double)
PARTITIONED BY (deptno int)
row format delimited fields terminated by '\t';
-- 根据部门编号分区，原表中的部门编号字段就没有必要创建了
-- 而是由分区表创建的伪列来替代
set hive.exec.dynamic.partition.mode=nonstrict;
-- 设置动态分区模式为非严格模式
insert into table emp_partition partition(deptno)
select empno,ename,job,mgr,hiredate,sal,comm ,deptno from emp;
-- 动态分区表的数据插入语句
-- partition(deptno) 而不是 partition(deptno=XXX)
-- select 子句从原表查出来的列数和列序要和分区表列数和列序保持一致
-- select 子句最后一列要为分区表的分区列
-- 不在需要where子句
-- 设置动态分区模式为非严格模式
-- set hive.exec.dynamic.partition.mode=nonstrict;

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。