hive 创建表和连接 hive建表parquet

转载

mob6454cc6caa80 2023-07-14 14:38:09

文章标签 hive 创建表和连接 hive hdfs 数据 文章分类 Hive 大数据

创建parquet分区的表速度更快

----创建parquet表，并指定压缩格式

create table xyy_temp_data.temp_dwd_b2b_ec_tb_order_detail 
(
 id bigint                             comment '订单明细',                                                       
  order_no string                       comment '订单编号',                                                  
  branch_code string                    comment '区域编码',                                                                                               
  shop_pattern_code string             comment '店铺业务模式编码'
)
COMMENT '销售订单明细表_全量表'
stored as parquet 
 TBLPROPERTIES ('parquet.compression'='SNAPPY');
;TBLPROPERTIES ('parquet.compression'='GZIP');  ----GZIP压缩格式  可以大大降低存储
 
COMMENT '销售订单明细表_全量表'
partitioned by (dt string,erp_code string )
stored as parquet 
TBLPROPERTIES ('parquet.compression'='SNAPPY'); 
CREATE TABLE xyy_test.demo(
  merchant_id string COMMENT '商户id')
stored as parquet
;

-------创建临时表 指定 parquet

create table xyy_test.demo1 stored as parquet
as select * from xyy_test.demo
;

create table test (
id int,
name string,
score string
)partitioned by(dt string)
stored as parquet;

drop table xyy_temp_data.temp_new_so_fact ;
 create table xyy_temp_data.temp_new_so_fact
stored as parquet TBLPROPERTIES ('parquet.compression'='SNAPPY')
 as 
SELECT
    order_no,
    sub_order_no,
    outside_order_code,      -- 外部订单号, 目前只有智鹿b2c, 使用神农订单号
    parent_id,
    status,
    org_id,
    sub_branch_code,
    merchant_oem,          -- 御元集协议药店tag
    order_source           -- 订单终端  0手动添加(管理后台下单-基本为客服操作补单) 1Android 2IOS 3H5 4PC 
FROM xyy_bigdata_dwd.dwd_b2b_ec_subsidy_order_all
WHERE create_time >= '2020-01-01 00:00:00.0' -- 常量, 无需调整
; 
 
CREATE TABLE if not exists xyy_bigdata_dim.order_activity_sponsor_table(
  id int comment '主键', 
  activity_id bigint comment '活动id' , 
  activity_type string comment '活动类型 活动类型 1-原价,2-特惠价格,3-秒杀价格,4-套餐价格,5-一口价价格,6-直降,7-赠品,8-拼团,9-满减,10-满折,11-满赠,12-满减赠,13-返点返券,14-通用券,15-商品券,16-折扣券,17-礼品券,18-新人券,19-叠加券,20-店铺券,22-手动优惠,23-KA公司优惠,24-KA厂商优惠,26-余额抵扣,28-大转盘' , 
  initiator_id bigint comment '活动发起方ID' , 
  name string comment '发起方名称' 
  )
comment  '订单活动发起方汇总表'
partitioned by (dt string )
;

CREATE TABLE  if not exists xyy_app_data.purchase_stock_turnover_days
(
  provice_code string comment '省份编码', 
  drug_code string comment '商品编码' , 
  qualified_stock_amount double comment '期末库存合格库库存金额', 
  storage_total_amt_tax double comment '期初库存总金额含税' , 
  sale_gmv double comment 'gmv' ,  
  kucn_day double comment '库存周转天数' 
  )
comment '采购报表_库存周转天数'
PARTITIONED BY ( dt string)
stored as parquet;

创建 \t 分割符的表

CREATE TABLE `xyy_test.purcase_20200330_data`(
`buyer` string, 
`user_name` string, 
`org_code` string, 
`drug_code` string, 
`qualified_stock_amount` double, 
`storage_total_amt_tax` double, 
`sale_gmv` double, 
`_c7` double)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'

hdfs dfs -ls /data/hive/warehouse/xyy_test.db/purcase_20200330_data 查看当前表的存储路径

----执行结果如下：
< dev@node04-bigdata-prod-bj1:~/liu >$ hdfs dfs -ls /data/hive/warehouse/xyy_test.db/purcase_20200330_data
Found 1 items
-rwxrwxrwx   2 admin hive     459959 2020-03-30 16:16 /data/hive/warehouse/xyy_test.db/purcase_20200330_data/000000_0

hdfs dfs -get /data/hive/warehouse/xyy_test.db/purcase_20200330_data/000000_0 ./

下载到当前路径下   
< dev@node04-bigdata-prod-bj1:~/liu >$  hdfs dfs -get  /data/hive/warehouse/xyy_test.db/purcase_20200330_data/000000_0  ./
< dev@node04-bigdata-prod-bj1:~/liu >$ ls -l
total 56932
-rw-r--r--. 1 dev dev   459959 Mar 30 16:20 000000_0

下载到本地
< dev@node04-bigdata-prod-bj1:~/liu >$ sz 000000_0
rz
 zmodem trl+C ȡ

  100%     449 KB  449 KB/s 00:00:01       0 Errors

--  删除库
drop database if exists db_name;
--  强制删除库
drop database if exists db_name cascade;

--  删除表
drop table if exists employee;

--  清空表
truncate table employee;
--  清空表，第二种方式
insert overwrite table employee select * from employee where 1=0; 

--  删除分区
alter table employee_table drop partition (stat_year_month>='2018-01');

--  按条件删除数据
insert overwrite table employee_table select * from employee_table where id>'180203a15f';
-- 覆盖分区里的数据
insert overwrite table part_test_3 partition(month_id='201805',day_id='20180509') select * from part_test_temp;
---修改表的注释
ALTER TABLE xyy_app_data.user_portrait_ec_sku_coun SET TBLPROPERTIES('comment' = '用户画像es用到的表');

Hive根据表中某个字段动态分区

使用hive储存数据时，需要对做分区，如果从kafka接收数据，将每天的数据保存一个分区（按天分区），保存分区时需要根据某个字段做动态分区，而不是傻傻的将数据写到某一个临时目录最后倒入到某一个分区，这是静态分区。

Hive动态分区步骤如下：

1、建立某一个源表模拟数据源并插入一些数据

create table t_test_p_source (
    id string,
    name string,
    birthday string
) 
row format delimited fields terminated by '\t'
stored as textfile;

insert into t_test_p_source values ('a1', 'zhangsan', '2018-01-01');
insert into t_test_p_source values ('a2', 'lisi', '2018-01-02');
insert into t_test_p_source values ('a3', 'zhangsan', '2018-01-03');
insert into t_test_p_source values ('a4', 'wangwu', '2018-01-04');
insert into t_test_p_source values ('a5', 'sanzang', '2018-01-05');
insert into t_test_p_source values ('a6', 'zhangsan2', '2018-01-01');

2、建立一张分区表（按ds字段分区）

create table t_test_p_target (
    id string,
    name string
)
partitioned by (ds string)
row format delimited fields terminated by '\t'
stored as textfile;

3、向分区表中插入数据

SET hive.exec.dynamic.partition=true;   #是否开启动态分区，默认是false，所以必须要设置成true
SET hive.exec.dynamic.partition.mode=nonstrict;    # 动态分区模式，默认为strict, 表示表中必须一个分区为静态分区，nostrict表示允许所有字段都可以作为动态分区

insert into table t_test_p_target partition (ds) select id, name, birthday as ds from t_test_p_source;



set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;     限制最大分区的个数
set hive.exec.max.dynamic.partitions =1000;      限制最大分区的个数

4、测试是否动态分区了

hive 创建表和连接 hive建表parquet_数据

2018-01-01这个分区只有2条数据，再来看下HDFS上的分区目录

hive 创建表和连接 hive建表parquet_hive_02

至此，hive动态分区已经完成了。

hive从回收站恢复历史数据

恢复分区数据，带分区的表

步骤一、hdfs dfs -ls /user/admin/.Trash/Current/data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv  查找出回收站里的数据
    从/user/dev  或user/admin 一层一层往下找  一般在 .Trash这个里面
步骤二、执行之前的建表语句
CREATE TABLE if not exists   xyy_test.sale4_core_commodity_gmv(
province_name string comment '省份', 
create_time string comment '时间', 
hexin_gmv double comment '核心商品GMV')
comment '省区报表_核心商品GMV'
partitioned by (dt string)
stored as parquet
通过 show create  table xyy_test.sale4_core_commodity_gmv   /data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv  查看地址
步骤三、通过命令查看出 和步骤二 里的路径一致  
< dev@node04-bigdata-prod-bj1:~ >$ hdfs dfs -ls /user/admin/.Trash/Current/data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv
Found 3 items
drwxrwxrwx   - admin hive          0 2020-04-10 17:17 /user/admin/.Trash/Current/data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv/dt=20200301
drwxrwxrwx   - admin hive          0 2020-04-10 17:17 /user/admin/.Trash/Current/data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv/dt=20200302
drwxrwxrwx   - admin hive          0 2020-04-10 17:17 /user/admin/.Trash/Current/data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv/dt=20200303

步骤四、执行下面的命令把文件 从回收站里 移出来  
hdfs dfs -mv /user/admin/.Trash/Current/data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv/*  /data/hive/warehouse/xyy_test.db/sale4_core_commodity_gmv
步骤五、将文件加载到表里 
alter table xyy_test.sale4_core_commodity_gmv add partition (dt = '20200301');  --- 一次性加载一个分区
alter table xyy_app_data.sale6_core_commodity_gmv add if not exists partition (dt = '20200302')       --- 一次性加载一个分区
alter  table xyy_test.sale4_core_commodity_gmv drop partition (dt >='20200301')  --批量删除分区
alter table siebel_member drop if exists
partition(dt='20180401'),
partition(dt='20180402'),
partition(dt='20180403'),
partition(dt='20180404');

--批量加载分区不怎么好用
alter table xyy_test.sale4_core_commodity_gmv  add 
partition(dt='20200301') location '20200301'
partition(dt='20200302') location '20200302'
partition(dt='20200303') location 'dt='20200303';

恢复不带分区数据按照以上步骤，执行到步骤四就行了

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。