1. First, we often need to deduplicate data in Hive (the syntax is very similar to Oracle's; the subquery alias t must not be omitted):
SELECT ID, CASE_ID_
FROM (
    SELECT ID, CASE_ID_,
           row_number() OVER (PARTITION BY CASE_ID_ ORDER BY ID) AS row_flg  -- the ORDER BY decides which duplicate is kept
    FROM table
) t
WHERE t.row_flg = 1
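If the table carries a timestamp, ordering the window by it descending keeps the newest duplicate instead of an arbitrary one. A minimal sketch, where UPDATE_TIME_ is a hypothetical column:
SELECT ID, CASE_ID_
FROM (
    SELECT ID, CASE_ID_,
           row_number() OVER (PARTITION BY CASE_ID_ ORDER BY UPDATE_TIME_ DESC) AS row_flg  -- UPDATE_TIME_ is hypothetical
    FROM table
) t
WHERE t.row_flg = 1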
2. To strip the country code from a field that stores phone numbers, you can do this (remember it is \\+86 rather than +86, since + is a regex special character):
SELECT ID, CASE_ID_,
regexp_replace(regexp_replace(regexp_replace(regexp_replace(TO_ACCOUNT_,'\\+86', ''),'\\+85', ''),'-', ''),' ', '') AS TO_ACCOUNT_
FROM table
Of course, a single regex that matches any two-digit country code (plus the dashes and whitespace) is cleaner:
SELECT ID, CASE_ID_,
    regexp_replace(TO_ACCOUNT_, '\\+\\d{2}|[-\\s]', '') AS TO_ACCOUNT_
FROM table
-- Note: the URL-stripping regex mentioned elsewhere is not thorough: it only removes http://ip, and everything after the port number is left behind
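A more thorough pattern is to match the scheme and then every following non-whitespace character, which swallows the port, path and query string as well. This is a hypothetical sketch, not the original regex:
SELECT ID, CASE_ID_,
    regexp_replace(CONTENT, 'https?://\\S+', '') AS CONTENT  -- removes http(s)://host:port/path... up to the next whitespace
FROM table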
3. For a field that may contain \t, spaces and other whitespace characters, filter them out like this:
SELECT ID, CASE_ID_,
regexp_replace(CONTENT, '\\s+', '')
FROM table
4. When two fields are complementary (use one when it looks valid, otherwise fall back to the other), you can do this:
SELECT ID, CASE_ID_,
    CASE WHEN TIME_ LIKE '20___%__%_ %_:%_:%_' THEN TIME_ ELSE TIME1 END AS TIME_
FROM table
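If "complementary" simply means that exactly one of the two columns is NULL, coalesce() (or Hive's nvl()) is simpler than a CASE expression:
SELECT ID, CASE_ID_,
    COALESCE(TIME_, TIME1) AS TIME_  -- returns the first non-NULL argument
FROM table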
-- To splice a column's value into a LIKE condition, use concat:
select * from table where update_time like concat('%',TIME,'%');
5. Tables can be created like this:
-- plain Hive table
CREATE TABLE test(
    APP_ID_ STRING,
    CASE_ID_ STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
NULL DEFINED AS ''
STORED AS TEXTFILE;
-- or
CREATE TABLE test(
APP_ID_ STRING,
CASE_ID_ STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = "\t", "quoteChar" = "'", "escapeChar" = "\\" )
STORED AS TEXTFILE;
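-- Note: OpenCSVSerde exposes every column as STRING, regardless of the declared type.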
-- or create the table from a query (CTAS)
CREATE TABLE test
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' NULL DEFINED AS '' STORED AS TEXTFILE
AS SELECT * FROM other_table;
-- ES-backed table (first add elasticsearch-hadoop-5.5.0.jar to Hive's auxiliary jar path)
CREATE TABLE test(
APP_ID_ STRING,
CASE_ID_ STRING
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
'es.nodes'='node01,node02',
'es.port'='9200',
'es.resource'='index/mapping'
);
-- download the jar from: https://www.elastic.co/downloads/past-releases/elasticsearch-apache-hadoop-5-5-0
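If you prefer not to configure hive.aux.jars.path, the jar can also be registered per session with ADD JAR; the path below is just wherever you placed the jar:
ADD JAR /opt/jars/elasticsearch-hadoop-5.5.0.jar;  -- hypothetical path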
6. When a field's value contains a delimiter such as ";" and we want the content before the delimiter, we can split the string:
SELECT ID, CASE_ID_,
    split('192.168.0.1', '\\.')[0]  -- split returns an array; [0] is the part before the first '.'
FROM table
-- When the split call itself sits inside double quotes, four backslashes are needed, e.g. hive -e ".... split('192.168.0.1','\\\\.') ... ", otherwise the result is NULL
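An alternative that avoids both the array and the regex escaping is substr plus instr; this sketch assumes the delimiter is always present:
SELECT ID, CASE_ID_,
    substr(TO_ACCOUNT_, 1, instr(TO_ACCOUNT_, ';') - 1)  -- everything before the first ';'
FROM table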
7. As noted in 6, when running hive -e "..." or executing Hive SQL from Java code, every \\ must be written as \\\\ or the statement will fail, and quotes inside the SQL are best written as single quotes '.
8. Loading a file from a node's local filesystem into Hive (e.g. a csv placed under /opt/csv on node01):
load data local inpath '/opt/csv/csv_201812261002.csv'
into table schemaName.tableName;
9. Loading a file from HDFS into Hive:
load data inpath 'hdfs://node01:8020/bulkload'
into table test;
-- the following overwrites the table's existing data
load data inpath 'hdfs://node01:8020/bulkload' overwrite
into table test;
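-- Note: LOAD DATA INPATH moves the files from their HDFS location into the table directory, while LOAD DATA LOCAL INPATH copies them.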
10. Exporting from Hive to a node-local path:
insert overwrite local directory '/home/temp'
row format delimited
fields terminated by '\t'
select * from tablename;
11. Exporting from Hive to HDFS:
insert overwrite directory '/home/temp'
row format delimited
fields terminated by '\t'
select * from tablename;
-- You may find that a field shown as NULL in the Hive table is stored as \N in the HDFS file; in that case set the serde properties like this
insert overwrite directory '/home/temp'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\t',
'serialization.format'= '',
'serialization.null.format'=''
) STORED AS TEXTFILE
select * from tablename;
-- or specify ROW FORMAT DELIMITED ... NULL DEFINED AS '' when creating the Hive table
create table tableName(id string, name string) row format delimited fields terminated by '\t' NULL DEFINED AS '' stored as textfile;
-- or modify the table definition directly
alter table tableName set serdeproperties('serialization.null.format' = '');
Appendix: creating the ES index and mapping:
-- you can use postman to create the ES index and mapping (both requests below are PUT)
http://192.168.0.1:9200/indexName
{
"setting":{
"number_of_shards":5,
"number_of_replicas":1
}
}
http://192.168.0.1:9200/indexName/mappingName/_mapping
{
"properties": {
"field1": {
"index": "not_analyzed",
"type": "string"
},
"field2": {
"index": "not_analyzed",
"type": "string"
},
"field3": {
"index": "not_analyzed",
"type": "string"
},
"field4": {
"index": "not_analyzed",
"type": "string"
}
}
}
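-- Note: "type": "string" with "index": "not_analyzed" is ES 2.x mapping syntax; on ES 5.x the equivalent of a not_analyzed string is "type": "keyword".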
-- Inserting Hive table data into ES
-- first create the ES-backed table, then insert the Hive table's data into it; the data lands in ES and the ES mapping is created automatically
insert OVERWRITE table es_table
SELECT * FROM hive_table;
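-- To control the ES document id instead of letting ES auto-generate one, the mapping table's TBLPROPERTIES can additionally set 'es.mapping.id' to a column name, e.g. 'es.mapping.id' = 'ID'.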
12. For details, see this SQL reference: http://trafodion.apache.org/docs/sql_reference/#examples_of_substring (note: this link is Apache Trafodion's SQL reference, not Hive's official documentation)
13. Setting the Hive execution engine: set hive.execution.engine=mr; (other possible values are tez and spark)
Notes:
1. When testing in WHERE that a date is not empty, mind the difference between date is not null and date != null: the latter never matches, because any comparison with NULL evaluates to NULL.
2. When computing a percentage in SELECT, make sure the denominator (the total) cannot be 0.
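A minimal sketch covering both notes (table and column names are hypothetical):
SELECT if(total = 0, NULL, cnt / total) AS pct  -- guard against a zero denominator
FROM (
    SELECT count(CASE_ID_) AS cnt, count(*) AS total
    FROM table
    WHERE create_date is not null  -- not: create_date != null, which never matches
) t;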