hive中set hive中set的运用

转载

mob6454cc6ba5a5 2023-08-10 02:04:39

文章标签 hive中set hive ci 字段 文章分类 Hive 大数据

hive（一）_基础使用

1 设置
2 库

2.1 创建库
2.2 库信息
2.3 操作库

3 表

3.1 创建表
3.2 表信息
3.3 操作表

4 数据
5 查询
6调优
7 其他

1 设置

启动hive前会寻址$HIVE_HOME/.hiverc，并限制性设置
hive -i path指定设置路径

-- 查看设置
set;

-- 更多信息
set -v;

-- 开启显示字段名称
set hive.cli.print.header=true;

-- 提示当前所在数据库
set hive.cli.print.current.db=true;

-- 严格模式(严禁提交对分区表不使用where子句的查询)
set hive.mapred.mode=strict | nostrict;

-- 提高聚合性能，需要更多内存
set hive.map.aggr=true;

-- 尝试使用本地模式（伪分布式需要用）
set hive.exec.mode.local.auto=true;

-- 启动mapjoin优化(不支持right outer join和full outer join)
set hive.auto.convert.join=true;
-- 配置小表的大小
set hive.mapjoin.smalltable.filesize=25000000;

-- 开启动态分区功能
set hive.exec.dynamic.partition = true;
-- 严格模式至少要有一个分区字段是静态
set hive.exec.dynamic.partition.mode = nonstrict;
-- 每个mapper或reducer可以创建的最大动态分区个数
set hive.max.dynamic.partitions.pernode = 100;
-- 一个动态分区语句可以创建最大动态分区个数
set hive.max.dynamic.partitions = +1000;
-- 全局可以创建最大的文件个数
set hive.max.created.files = 1000000;

-- 分桶抽样相关设置
set hive.enforce.bucketing=true;
hive (default)> set mapreduce.job.reduces=-1;

2 库

2.1 创建库

-- 语法
-- 默认路径：/user/hive/warehouse/db_name.db/table_name/partition_name
CREATE DATABASE [IF NOT EXISTS] db_name
　　[COMMENT db_comment]　　　　　　								--关于数据块的描述
　　[LOCATION hdfs_path]　　　　　　　　　　						--数据库在HDFS上的路径
　　[WITH DBPROPERTIES (property_name=property_value[,...])];　	--指定数据块属性

-- 建库
create database testdb;

-- 建立前判断存在
create database if not exists testdb;

-- 加注释
create database testdb comment 'learning';

-- 带属性,通过desc查询到
create database testdb with dbproperties('creator'='hadoop','date'='2018-04-05');

-- 指定位置创建库
create database testdb location '/testdb.db;

2.2 库信息

-- 查看所有库
show databases;

-- 查询正则匹配库
show databases like 't.*';

-- 查看库的详细信息
desc database [extended] testdb;

-- 查询库的创建语句
show create database testdb;

2.3 操作库

-- 使用库
use testdb;

-- 查看正在使用的库
show current_database();

-- 删除库
drop database [if exists] testdb [cascade];

-- 修改库(只能更改属性信息)
alter database testdb set dbproperties('creator'='hadoop','date'='2018-04-05');

3 表

3.1 创建表

-- 语法
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]tb_name				--外部表
　　(col_name data_type [COMMENT col_comment][,...]) 					--字典
　　[COMMENT table_comment] 												--注释
　　[TBLPROPERTIES table_pro]											--表属性																		
　　[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] 	--分区
　　[CLUSTERED BY (col1,col2,...) [SORTED BY (col1 [ASC|DESC],...)] INTO num_buckets BUCKETS] --分桶
　　[ROW FORMAT row_format]  	--行格式
　　[STORED AS file_format]		--存储格式
　　[LOCATION hdfs_path]  		--指定HDFS路径


-- 建表例子
-- json格式里，map和stuct是一样的，都是对象
creata table if not exists testdb.emp(
	 name string comment 'emp name'
	,sal float comment 'emp sal'
	,subordinates array<string> comment 'name of subordinates'
	,deduction map<string,float> comment 'keys are names,values are pct'
	,address struct<street:strubg,city:string,state:string,zip:int> comment 'home address'
)
comment 'table comment'
tblproperties ('creator'='hadoop','created_at'='2019-01-01 10:00:00')
location '/user/hive/warehouse/testdb.db/emp';


-- 手动指定分隔符
creata table if not exists testdb.emp(
	 name string comment 'emp name'
	,sal float comment 'emp sal'
	,subordinates array<string> comment 'name of subordinates'
	,deduction map<string,float> comment 'keys are names,values are pct'
	,address struct<street:strubg,city:string,state:string,zip:int> comment 'home address'
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003'
lines terminated by '\n'  	-- 行分隔仅支持\n
stored as textfile;			-- 默认数据格式


-- 管理表（删除影响物理层）
create table if not exists student(
     sno string
    ,sname string
    ,sage int
    ,ssex string
)
row format delimited fields terminated by ',';


-- 外部表(删除不影响物理层，相当于引用表，使用应该指定路径）
create external table student_ext(
     sno string
    ,sname string
    ,sage int
    ,ssex string
)
row format delimited fields terminated by ','
location '/user/hive/warehouse/testdb.db/student';


-- 复制表结构创建表（仅可以修改location语句）
create external table if not exists testdb.student_ext like testdb.student 
location '/user/hive/warehouse/testdb.db/student';

-- 复制表，携带数据
create table emp2 as select * from emp;


-- 创建分区表（分区表drop table不影响物理层）
create [external] table student_pt(
     sno string
    ,sname string
    ,sage int
    ,ssex string
) 
partitioned by (country string,city string)
row format delimited fields terminated by ','; 

-- 分桶表
create table if not exists stu_buck(
     sno string
    ,sname string
    ,sage int
    ,ssex string
)
clustered by(sno) into 4 buckets
row format delimited fields terminated by ',';

3.2 表信息

-- 显示所有表名
show tables;

-- 正则显示表名
show tables like 's.*';

-- 显示建表语句
show create table tb_name;

-- 显示表结构详细信息
desc [extended | formatted] student;

-- 显示表属性信息
show tblproperties student;

-- 查看所有分区键信息
show partitions student_pt [partition(country='china'[,...])];

-- 查看分区所在路径
desc formatted student_pt partition(country='china',city='gz');

3.3 操作表

-- 删除表
drop table [if exists] tb_name;

-- 清空表
truncate table tb_name;

-- 改表名
alter table old_name rename to new_name;

-- 修改列,名字，类型，注释，位置
alter table tb_name change old_col new_col int 
comment 'col comment'
after other_col;
 
-- 增加字段
alter tb_name add columns(
	 a string comment '111'
	,b string comment '222'
);

-- 替换字段
alter table tb_name replace columns(
	 a string comment '111'
	,b string comment '222'
);

-- 修改表属性（可添加或者修改，无法删除）
alter table tb_name set tblproperties ('nodes'='111');

-- 增加分区
alter table student_pt add [if not exists] 
partition (country='china',city='gz') [location hdfs_path]
partition (country='china',city='sz') [location hdfs_path];

-- 修改分区路径
alter table student_pt partition(country='china',city='gz') set location '/user/hive/partition/country=china/city=gz';

-- 删除分区(内部影响物理层，外部不影响)
alter table student_pt drop if exists partition (country='china',city='sz');

-- 触发钩子执行，（存储文件在hive以外被修改）
alter table tb_name touch partition(country='china',city='gz');

-- 修改存储格式

-- 分区打包
alter table tb_name archive | unarchive partition(country='china',city='gz'); 

-- 禁止删除分区
alter table tb_name partition(country='china',city='gz') enable | disable no_drop;

-- 禁止查询分区
alter table tb_name partition(country='china',city='gz') enable | disable offline;

4 数据

inpath:限制路径下不能包含目录
overwrite:先删除原来数据，没有则覆盖同名文件并增加新文件

-- 装载数据
load data local inpath '${env:HOME}/test_data/STUDENT.csv' into table student;

-- 装载分区数据（如没有分区会自动增加）
load data local inpath '${env:HOME}/test_data/' overwrite into table student_pt partition (country='china',city='gz');

-- insert装载(覆盖|追加)
insert overwrite | into table student_pt 
	partition (country='china',city='gz1') 
	select * from student_pt st where st.country='china' and st.city='gz'

-- insert机构类型
create table if not exists stu2(
    sno string,
    ins array<string>,
    pro map<string,string>,
    score struct<math:int,english:int>
);
INSERT INTO stu2  values(
	's001',
	array('y','s','m'),
	str_to_map('age:18,sex:m'),
	named_struct('math',90,'english',90)
);
INSERT INTO stu2  values(
    's002',
    array('s','m'),
    map('age','18','sex','l'),
    named_struct('math',81,'english',82)
);

-- 多行insert装载
from student_pt st
insert overwrite | into table student_pt 
	partition (country='china',city='gz1') 
	select * where st.country='china' and st.city='gz'
insert overwrite | into table student_pt 
	partition (country='china',city='gz2') 
	select * where st.country='china' and st.city='gz';

-- 动态分区(根据位置匹配分区)
insert overwrite table emp
partition (country,city)
select ...,se.cnty,se.ct from emp_src se;

-- 导出数据
from student_pt st
insert overwrite [local] directory '/output/student_01' 
	select * where st.country='china' and st.city='gz'
insert overwrite directory '/output/student_02' 
	select * where st.country='china' and st.city='gz'
insert overwrite directory '/output/student_03' 
	select * where st.country='china' and st.city='gz';

5 查询

-- 查询
from student select *;

-- 正则匹配列名
select symbol,`price.*` from stocks;

-- 存在链接
select s.ymd s.symbol 
from stocks s left semi join dividends d on s.ymd = d.ymd;
等价于
select s.ymd s.symbol 
from stocks 
where s.ymd in (select d.ymd from dividends d);

-- 全局排序
order by

-- reducer排序
sort by；

-- distribute by
select s.ymd,s.symbol,s.price_close from stocks s
distribute by s.symbol
sort by s.symbbol s.symbol,s.ymd

-- cluster by(distribute和sort一致时升序)
select s.ymd,s.symbol,s.price_close from stocks s
cluster by s.symbol;

-- 类型转换
select name,sal from emp where cast(sal as float) < 2000.0;

避免进行mapreduce
1，select * form emp（本地模式）
2，where条件只是分区字段
join的优化
1，表的大小从左到大依次递增
2，指定驱动表：select /*+ streamtable(s)*/ s.ymd,s.symbol from stocks s join dividends d on s.ymd = d.ymd; 3，指定小表：select /*+ mapjoin(d)*/ s.ymd,s.symbol from stocks s join dividends d on s.ymd = d.ymd;

6调优

执行计划

explan [extended]  select sum(number) from onecol;

limit限制
临时本地模式

set oldjobtracker=${hiveconf:mapred.job.tracker};
set mapred.job.tracker=local;
set mapred.tmp.dir=/home/edwrd/tmp;
select * from people where firstname ='bob';
set mapred.job.tracker=${oldjobtracker};

适时本地模式

set hive.exec.mode.local.auto=true;

并行执行（阶段不依赖时）

set hive.exec.parallel=true;

严格模式
限制分区表不适用分区条件
order by 必须带 limit
join 必须使用on

set hive.mapred.mode=strict;

调整mapper，reducer个数
JVM重用

set mapred.job.reuse.jvm.num.tasks=10;

索引
加快group by速度
限制最大动态分区数
推测执行
单个mapreduce中多个groupby
虚拟列

set guve.exec.rowoffset=true;
select input__file__name,block__offset__inside__file,line 
from test 
where line like '%hive%' LIMIT 2;

7 其他

其它

-- 一次使用命令
hive -e -S 'use testdb; select * from student limit 3' > /tmp/output.txt
hive -e -S 'set' | grep warehouse

-- 执行hql文件，-S过滤多余返回内容
hive -f -S exer.sql  > /tmp/output.txt

-- 执行bash shell
! pwd;

-- 执行dfs命令
dfs -ls /

切割字段显示为多行

SELECT 
        TEST.*
        ,SPLIT(KPI,':')[0] AS KPI_CODE
        ,SPLIT(KPI,':')[1] AS KPI_VALUE
    FROM TEST LATERAL VIEW EXPLODE(SPLIT(KPI_DESC,',')) TBL AS KPI;

80 正则匹配字段
113 分桶抽样查询
udf/udtf 基础

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
for line in sys.stdin:
        name,age=line.strip().split('\t')[1:3]
        print('\t'.join([name,age]))

set hive.exec.mode.local.auto=true;

create table if not exists testdb.student2(
    name string
    ,age string
);

add file ${env:HOME}/script/test.py;

insert overwrite table testdb.student2
select transform(*) using 'test.py' as (name,age) 
from testdb.student;

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。