1.首先,很多时候我们需要在Hive中对数据去重(语法和Oracle中的去重很类似,但子查询必须起别名,如下面的 t,这个别名不能省略):

-- Deduplicate: keep exactly one row per CASE_ID_.
-- FIX: row_number() without ORDER BY picks an arbitrary row per partition,
-- so results differed between runs; ordering by ID makes the kept row
-- deterministic. The subquery alias (t) is mandatory in Hive.
SELECT ID, CASE_ID_
FROM (
    SELECT
        ID,
        CASE_ID_,
        row_number() OVER (PARTITION BY CASE_ID_ ORDER BY ID) AS row_flg
    FROM table
) t
WHERE t.row_flg = 1

2.对某一个存储手机号码的字段过滤区号,可以这样(注意要写 \\+86 而不是 +86,+ 在正则中属于特殊字符,需要转义):

-- Strip the +86 / +85 country-code prefixes plus any spaces and hyphens.
-- '+' must be escaped as \\+ because it is a regex metacharacter.
SELECT
    ID,
    CASE_ID_,
    regexp_replace(
        regexp_replace(
            regexp_replace(
                regexp_replace(TO_ACCOUNT_, '\\+86', ''),
                '\\+85', ''),
            ' ', ''),
        '-', '') AS TO_ACCOUNT_
FROM table

当然,使用正则去掉会更好:

-- Same cleanup with a single generic pattern for any two-digit country code.
-- FIX: the original pattern '(\\+\\d{2})?' carried a redundant '?' that made
-- the whole group optional, so the regex also matched the empty string at
-- every position; the quantifier is dropped.
SELECT
    ID,
    CASE_ID_,
    regexp_replace(regexp_replace(regexp_replace(TO_ACCOUNT_, '\\+\\d{2}', ''), '-', ''), ' ', '') AS TO_ACCOUNT_
FROM table



-- 注:去除url那个正则不太全面,只能去除 http://ip,端口号之后的内容不能去除

3. 对于某一个可能包含 \t、空格、换行等空白字符的字段,可以这样过滤:

-- Remove all whitespace (tabs, spaces, newlines) from a field.
-- \\s is the regex whitespace class; '+' collapses whole runs in one match.
-- FIX: an explicit alias is added so the computed column keeps its name
-- (otherwise Hive emits an auto-generated name such as _c2).
SELECT ID, CASE_ID_,
 regexp_replace(CONTENT, '\\s+', '') AS CONTENT
 FROM table

4.对于某两个字段是互补的情况可以这样:

-- Coalesce two complementary columns: use TIME_ when TIME matches a
-- 'yyyy-MM-dd HH:mm:ss'-shaped LIKE pattern, otherwise fall back to TIME1.
-- FIX: the original had a trailing comma before FROM (syntax error);
-- the CASE expression also gets an explicit alias now.
SELECT ID, CASE_ID_,
  CASE WHEN TIME like '20___%__%_ %_:%_:%_' THEN TIME_ ELSE TIME1 END AS EVENT_TIME
 FROM table

-- To build a LIKE pattern from another column's value, wrap it with concat:
select * from table where update_time like concat('%',TIME,'%');

5 创建表可以这样:

-- hive表

-- Plain text-file Hive table: tab-delimited, NULLs stored as empty strings.
CREATE TABLE test (
    APP_ID_ STRING,
    CASE_ID_ STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    NULL DEFINED AS ''
STORED AS TEXTFILE;

-- 或者

-- Same table backed by the OpenCSV SerDe, which handles quoting/escaping.
CREATE TABLE test (
    APP_ID_ STRING,
    CASE_ID_ STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
    "separatorChar" = "\t",
    "quoteChar" = "'",
    "escapeChar" = "\\"
)
STORED AS TEXTFILE;

-- 或者
-- CTAS variant: create the table and populate it in a single statement.
CREATE TABLE test
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    NULL DEFINED AS ''
STORED AS TEXTFILE
AS
SELECT * FROM other_table;


-- ES-backed mapping table (first add elasticsearch-hadoop-5.5.0.jar to
-- Hive's auxiliary jar path). Rows inserted into this table are indexed
-- into the ES index/type named by 'es.resource'.
CREATE TABLE test(
 APP_ID_ STRING,
 CASE_ID_ STRING
 )
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler' 
TBLPROPERTIES (
  'es.nodes'='node01,node02', 
  'es.port'='9200', 
  'es.resource'='index/mapping'
 );

-- 下载jar:https://www.elastic.co/downloads/past-releases/elasticsearch-apache-hadoop-5-5-0

6.当某一个字段的值以某个字符(如 ";")分隔,我们想取出 ";" 前的内容时,可以对字符串进行切割:

-- Split a string on a delimiter and keep the part before it.
-- split() returns an array; index [0] selects the leading segment.
-- FIX: the original returned the whole array instead of the first part.
-- '.' must be escaped as '\\.' because split takes a regex pattern.
SELECT ID, CASE_ID_,
 split('192.168.0.1','\\.')[0] AS first_part
 FROM table

-- 当split包含在 "" 之中时 需要加4个\,如 hive -e "....  split('192.168.0.1','\\\\.') ... "  不然得到的值是null

 7.如6所说的,当我们使用 hive -e "..." 并且在java代码中执行hive sql的时候,原本是\\ 的要写成 \\\\ ,不然会报错,而 sql语句中的引号最好用单引号 ' 


8.文件从某一个节点下load到hive(比如csv放到node01节点的/opt/csv目录下):

-- Load a file from the local filesystem of the node running the Hive client
-- into a table (LOCAL = client path, not HDFS; the file is copied in).
-- FIX: added the missing statement terminator.
load data local inpath '/opt/csv/csv_201812261002.csv'
 into table schemaName.tableName;

9.文件从hdfs上到hive:

-- Move the HDFS file into the table's warehouse directory (appends rows).
load data inpath 'hdfs://node01:8020/bulkload'
 into table test;

-- This variant OVERWRITES the table's existing data instead of appending.
load data inpath 'hdfs://node01:8020/bulkload' overwrite
 into table test;

10.hive导出到节点本地路径:

-- Export query results to a local-filesystem directory, tab-delimited.
-- CAUTION: overwrite replaces everything already under the target path.
insert overwrite local directory '/home/temp'
 row format delimited
 fields terminated by '\t'
 select * from tablename;

11.hive导出到hdfs:

-- Export query results to an HDFS directory, tab-delimited.
-- CAUTION: overwrite replaces everything already under the target path.
insert overwrite directory '/home/temp'
 row format delimited
 fields terminated by '\t'
 select * from tablename;


-- NULL column values are written to export files as the literal \N by
-- default; setting serialization.null.format to '' writes them as empty
-- strings instead.
insert overwrite directory '/home/temp'
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\t',
'serialization.format'= '',
'serialization.null.format'=''  
) STORED AS TEXTFILE
select * from tablename;

-- Alternatively, declare the NULL representation when creating the table
-- with ROW FORMAT DELIMITED NULL DEFINED AS '':
create table tableName( id string, name string)row format delimited fields terminated by '\t' NULL DEFINED AS '' stored as textfile;

-- Or change the NULL representation on an existing table directly.
-- FIX: the original read "tableNameset" — the space between the table
-- name and SET was missing, which is a syntax error.
alter table tableName set serdeproperties('serialization.null.format' = '');

 

创建ES的index以及mapping:

-- 可以使用postman来创建es的index和mapping

http://192.168.0.1:9200/indexName
{
   "settings":{
             "number_of_shards":5,
             "number_of_replicas":1
   }
}


http://192.168.0.1:9200/indexName/mappingName/_mapping

{
     "properties": {
           "field1": {
                    "type": "keyword"
           },
           "field2": {
                    "type": "keyword"
           },
           "field3": {
                    "type": "keyword"
           },
           "field4": {
                    "type": "keyword"
           }
     }
}


-- Insert Hive table data into Elasticsearch:
-- create the ES mapping table first, then INSERT the Hive rows into it;
-- the documents are indexed into ES and the mapping is created automatically.
insert OVERWRITE table es_table
SELECT * FROM hive_table;

11. 更多字符串函数用法可参考(注意:此链接是 Apache Trafodion 的 SQL 参考手册,并非 Hive 官方文档;Hive 官方函数文档见 Hive Wiki 的 LanguageManual UDF 页面):http://trafodion.apache.org/docs/sql_reference/#examples_of_substring

12.设置Hive查询引擎,set hive.execution.engine=mr ;

 

1. 在where中判断date不为空的时候要注意 date is not null 和 date != null 的区别。

2. select中统计百分比的时候,要注意分母(总数)不能为0的情况。