hive(一)_基础使用
- 1 设置
- 2 库
- 2.1 创建库
- 2.2 库信息
- 2.3 操作库
- 3 表
- 3.1 创建表
- 3.2 表信息
- 3.3 操作表
- 4 数据
- 5 查询
- 6 调优
- 7 其他
1 设置
- 启动hive前会读取$HIVE_HOME/.hiverc,并执行其中的设置
-
hive -i path
指定设置路径
-- show current settings
set;
-- show settings with more detail
set -v;
-- print column headers in query output
set hive.cli.print.header=true;
-- show the current database in the CLI prompt
set hive.cli.print.current.db=true;
-- strict mode (rejects queries on partitioned tables without a partition filter)
-- fixed: valid values are strict | nonstrict (was "nostrict")
set hive.mapred.mode=strict | nonstrict;
-- map-side aggregation: faster group-by at the cost of extra memory
set hive.map.aggr=true;
-- try local mode automatically (needed on pseudo-distributed setups)
set hive.exec.mode.local.auto=true;
-- enable map-join optimization (not applied to right/full outer joins)
set hive.auto.convert.join=true;
-- size threshold in bytes below which a table is "small" for map-join
set hive.mapjoin.smalltable.filesize=25000000;
-- enable dynamic partitioning
set hive.exec.dynamic.partition = true;
-- strict mode requires at least one static partition column; nonstrict lifts that
set hive.exec.dynamic.partition.mode = nonstrict;
-- max dynamic partitions each mapper/reducer may create
set hive.max.dynamic.partitions.pernode = 100;
-- max dynamic partitions one statement may create (fixed: removed stray '+')
set hive.max.dynamic.partitions = 1000;
-- max files that may be created globally
set hive.max.created.files = 1000000;
-- enforce bucketing on insert (needed for bucket sampling)
set hive.enforce.bucketing=true;
-- let Hive decide the reducer count (fixed: removed pasted CLI prompt)
set mapreduce.job.reduces=-1;
2 库
2.1 创建库
-- syntax
-- default path: /user/hive/warehouse/db_name.db/table_name/partition_name
CREATE DATABASE [IF NOT EXISTS] db_name
[COMMENT db_comment] --database description
[LOCATION hdfs_path] --HDFS path of the database
[WITH DBPROPERTIES (property_name=property_value[,...])]; --database properties
-- create a database
create database testdb;
-- create only if it does not already exist
create database if not exists testdb;
-- with a comment
create database testdb comment 'learning';
-- with properties (visible via desc database extended)
create database testdb with dbproperties('creator'='hadoop','date'='2018-04-05');
-- create at an explicit location (fixed: added missing closing quote)
create database testdb location '/testdb.db';
2.2 库信息
-- list all databases
show databases;
-- list databases matching a regex
show databases like 't.*';
-- database details (extended also shows dbproperties)
desc database [extended] testdb;
-- show the CREATE statement of a database
show create database testdb;
2.3 操作库
-- switch to a database
use testdb;
-- show the database currently in use
-- fixed: current_database() is a UDF, so it is queried with select,
-- not "show current_database()" which is not a valid Hive command
select current_database();
-- drop a database (cascade also drops the tables it contains)
drop database [if exists] testdb [cascade];
-- alter a database (only dbproperties can be changed)
alter database testdb set dbproperties('creator'='hadoop','date'='2018-04-05');
3 表
3.1 创建表
-- syntax
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]tb_name --external table
(col_name data_type [COMMENT col_comment][,...]) --columns
[COMMENT table_comment] --table comment
[TBLPROPERTIES table_pro] --table properties
[PARTITIONED BY (col_name data_type [COMMENT col_comment], ...)] --partitions
[CLUSTERED BY (col1,col2,...) [SORTED BY (col1 [ASC|DESC],...)] INTO num_buckets BUCKETS] --buckets
[ROW FORMAT row_format] --row format
[STORED AS file_format] --storage format
[LOCATION hdfs_path] --explicit HDFS path
-- example (fixed: 'creata' -> 'create', 'strubg' -> 'string')
-- in JSON terms, map and struct are both objects
create table if not exists testdb.emp(
name string comment 'emp name'
,sal float comment 'emp sal'
,subordinates array<string> comment 'name of subordinates'
,deduction map<string,float> comment 'keys are names,values are pct'
,address struct<street:string,city:string,state:string,zip:int> comment 'home address'
)
comment 'table comment'
tblproperties ('creator'='hadoop','created_at'='2019-01-01 10:00:00')
location '/user/hive/warehouse/testdb.db/emp';
-- explicit delimiters (fixed: 'creata' -> 'create', 'strubg' -> 'string')
create table if not exists testdb.emp(
name string comment 'emp name'
,sal float comment 'emp sal'
,subordinates array<string> comment 'name of subordinates'
,deduction map<string,float> comment 'keys are names,values are pct'
,address struct<street:string,city:string,state:string,zip:int> comment 'home address'
)
row format delimited
fields terminated by '\001'
collection items terminated by '\002'
map keys terminated by '\003'
lines terminated by '\n' -- '\n' is the only supported line terminator
stored as textfile; -- default storage format
-- managed table (drop also deletes the underlying data)
create table if not exists student(
sno string
,sname string
,sage int
,ssex string
)
row format delimited fields terminated by ',';
-- external table (drop keeps the data; behaves like a reference, so give it an explicit location)
create external table student_ext(
sno string
,sname string
,sage int
,ssex string
)
row format delimited fields terminated by ','
location '/user/hive/warehouse/testdb.db/student';
-- copy a table's schema only (only the location clause may be overridden)
create external table if not exists testdb.student_ext like testdb.student
location '/user/hive/warehouse/testdb.db/student';
-- copy a table together with its data
create table emp2 as select * from emp;
-- partitioned table (for the external variant, drop table keeps the data)
create [external] table student_pt(
sno string
,sname string
,sage int
,ssex string
)
partitioned by (country string,city string)
row format delimited fields terminated by ',';
-- bucketed table
create table if not exists stu_buck(
sno string
,sname string
,sage int
,ssex string
)
clustered by(sno) into 4 buckets
row format delimited fields terminated by ',';
3.2 表信息
-- list all tables
show tables;
-- list tables matching a regex
show tables like 's.*';
-- show the CREATE statement of a table
show create table tb_name;
-- table structure details
desc [extended | formatted] student;
-- table properties
show tblproperties student;
-- list partitions, optionally filtered by a partial partition spec
show partitions student_pt [partition(country='china'[,...])];
-- location and details of one specific partition
desc formatted student_pt partition(country='china',city='gz');
3.3 操作表
-- drop a table
drop table [if exists] tb_name;
-- remove all rows
truncate table tb_name;
-- rename a table
alter table old_name rename to new_name;
-- change a column's name, type, comment and position
alter table tb_name change old_col new_col int
comment 'col comment'
after other_col;
-- add columns (fixed: missing 'table' keyword)
alter table tb_name add columns(
a string comment '111'
,b string comment '222'
);
-- replace all columns
alter table tb_name replace columns(
a string comment '111'
,b string comment '222'
);
-- set table properties (add or modify only; properties cannot be removed)
alter table tb_name set tblproperties ('nodes'='111');
-- add partitions
alter table student_pt add [if not exists]
partition (country='china',city='gz') [location hdfs_path]
partition (country='china',city='sz') [location hdfs_path];
-- move a partition to a new path
alter table student_pt partition(country='china',city='gz') set location '/user/hive/partition/country=china/city=gz';
-- drop a partition (managed tables delete the data, external tables keep it)
alter table student_pt drop if exists partition (country='china',city='sz');
-- fire hooks (useful when files were changed outside of Hive)
alter table tb_name touch partition(country='china',city='gz');
-- change the storage format (fixed: comment was dangling with no statement)
alter table tb_name [partition(country='china',city='gz')] set fileformat sequencefile;
-- (un)archive a partition into/from a HAR file
alter table tb_name archive | unarchive partition(country='china',city='gz');
-- protect a partition from being dropped
alter table tb_name partition(country='china',city='gz') enable | disable no_drop;
-- take a partition offline (queries against it fail)
alter table tb_name partition(country='china',city='gz') enable | disable offline;
4 数据
- inpath:限制路径下不能包含目录
- overwrite:先删除原来数据;没有overwrite则覆盖同名文件并增加新文件
-- load data (the inpath must not contain subdirectories)
load data local inpath '${env:HOME}/test_data/STUDENT.csv' into table student;
-- load into a partition (the partition is created automatically if absent)
load data local inpath '${env:HOME}/test_data/' overwrite into table student_pt partition (country='china',city='gz');
-- load via insert: overwrite replaces, into appends (fixed: added missing ';')
insert overwrite | into table student_pt
partition (country='china',city='gz1')
select * from student_pt st where st.country='china' and st.city='gz';
-- inserting complex types (array / map / struct)
create table if not exists stu2(
sno string,
ins array<string>,
pro map<string,string>,
score struct<math:int,english:int>
);
INSERT INTO stu2 values(
's001',
array('y','s','m'),
str_to_map('age:18,sex:m'),
named_struct('math',90,'english',90)
);
INSERT INTO stu2 values(
's002',
array('s','m'),
map('age','18','sex','l'),
named_struct('math',81,'english',82)
);
-- multi-insert: one source scan feeding several inserts
from student_pt st
insert overwrite | into table student_pt
partition (country='china',city='gz1')
select * where st.country='china' and st.city='gz'
insert overwrite | into table student_pt
partition (country='china',city='gz2')
select * where st.country='china' and st.city='gz';
-- dynamic partitions (partition columns are matched by position)
insert overwrite table emp
partition (country,city)
select ...,se.cnty,se.ct from emp_src se;
-- export data to directories
from student_pt st
insert overwrite [local] directory '/output/student_01'
select * where st.country='china' and st.city='gz'
insert overwrite directory '/output/student_02'
select * where st.country='china' and st.city='gz'
insert overwrite directory '/output/student_03'
select * where st.country='china' and st.city='gz';
5 查询
-- from-first query form
from student select *;
-- regex column selection
select symbol,`price.*` from stocks;
-- left semi join (fixed: missing comma in select list)
select s.ymd, s.symbol
from stocks s left semi join dividends d on s.ymd = d.ymd;
-- equivalent to (fixed: missing comma and missing alias 's')
select s.ymd, s.symbol
from stocks s
where s.ymd in (select d.ymd from dividends d);
-- global ordering
order by
-- per-reducer ordering
sort by;
-- distribute by (fixed: garbled sort-by column list, added ';')
select s.ymd,s.symbol,s.price_close from stocks s
distribute by s.symbol
sort by s.symbol asc, s.ymd asc;
-- cluster by: shorthand when distribute and sort share columns, ascending
select s.ymd,s.symbol,s.price_close from stocks s
cluster by s.symbol;
-- explicit type cast
select name,sal from emp where cast(sal as float) < 2000.0;
- 避免进行mapreduce
1,select * from emp(本地模式)
2,where条件只是分区字段
- join的优化
1,表的大小从左到右依次递增
2,指定驱动表:select /*+ streamtable(s)*/ s.ymd,s.symbol from stocks s join dividends d on s.ymd = d.ymd;
3,指定小表:select /*+ mapjoin(d)*/ s.ymd,s.symbol from stocks s join dividends d on s.ymd = d.ymd;
6 调优
- 执行计划
-- fixed: 'explan' -> 'explain'
explain [extended] select sum(number) from onecol;
- limit限制
- 临时本地模式
set oldjobtracker=${hiveconf:mapred.job.tracker};
set mapred.job.tracker=local;
set mapred.tmp.dir=/home/edwrd/tmp;
select * from people where firstname ='bob';
set mapred.job.tracker=${oldjobtracker};
- 适时本地模式
set hive.exec.mode.local.auto=true;
- 并行执行(阶段不依赖时)
set hive.exec.parallel=true;
- 严格模式
限制分区表不使用分区条件的查询
order by 必须带 limit
join 必须使用on
set hive.mapred.mode=strict;
- 调整mapper,reducer个数
- JVM重用
set mapred.job.reuse.jvm.num.tasks=10;
- 索引
加快group by速度
- 限制最大动态分区数
- 推测执行
- 单个mapreduce中多个groupby
- 虚拟列
-- fixed: 'guve' -> 'hive'
set hive.exec.rowoffset=true;
select input__file__name,block__offset__inside__file,line
from test
where line like '%hive%' LIMIT 2;
7 其他
- 其它
-- one-shot command (fixed: -S must precede -e, otherwise '-S' is parsed as the query string)
hive -S -e 'use testdb; select * from student limit 3' > /tmp/output.txt
hive -S -e 'set' | grep warehouse
-- run an hql file; -S (silent) suppresses extra output (fixed: -S must precede -f)
hive -S -f exer.sql > /tmp/output.txt
-- run a bash shell command from the hive CLI
! pwd;
-- run an hdfs dfs command (fixed: added missing ';')
dfs -ls /;
- 切割字段显示为多行
SELECT
TEST.*
,SPLIT(KPI,':')[0] AS KPI_CODE
,SPLIT(KPI,':')[1] AS KPI_VALUE
FROM TEST LATERAL VIEW EXPLODE(SPLIT(KPI_DESC,',')) TBL AS KPI;
- 80 正则匹配字段
- 113 分桶抽样查询
- udf/udtf 基础
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Streaming script for Hive TRANSFORM: reads tab-separated rows from stdin
# and emits columns 1-2 (presumably sname, sage of student) tab-separated.
# fixed: restored the loop-body indentation lost in the original paste
import sys
for line in sys.stdin:
    name, age = line.strip().split('\t')[1:3]
    print('\t'.join([name, age]))

-- prefer local mode for this small job
set hive.exec.mode.local.auto=true;
create table if not exists testdb.student2(
name string
,age string
);
-- ship the script to the cluster, then pipe rows through it with transform
add file ${env:HOME}/script/test.py;
insert overwrite table testdb.student2
select transform(*) using 'test.py' as (name,age)
from testdb.student;