Hive学习 

 默认日志地址 /tmp/root(用户名)/hive.log 

 hive -hiveconf hive.root.logger=DEBUG,console   //重启调试。 

 HIVE 基本操作 hive shell  

 新建表 

 line_name,deal_yymmdd,deal_hh,count 

 线路10,20150101,06,714 

 线路10,20150101,07,2008 

 线路10,20150101,08,2652 

 线路10,20150101,09,2691 

 创建内部表 

 CREATE TABLE bus (line_name string,deal_yymmdd string, deal_hh string, count int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' ;  

 CREATE TABLE bus2 (line_name string,deal_yymmdd string, deal_hh string, count int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY ',';  

 创建外部表  对于使用external建表完成后,再drop掉表,表中的数据还在文件系统中,需要LOCATION指定存放数据的文件路径(如果目录存在会删除目录下文件重新建立) 

 COMMENT描述这张表 

 -- External table: dropping it keeps the data files under LOCATION.
 CREATE EXTERNAL TABLE IF NOT EXISTS bus (
     line_name    string,
     deal_yymmdd  string,
     deal_hh      string,
     count        int
 )
 COMMENT 'This is the staging page view table'
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 LOCATION '/opt/bus2';

 #导入数据reduces2.txt到bus表 hdfs数据移动到/user/hive/warehouse/bus OVERWRITE 关键字,则目标表(或者分区)中的内容(如果有)会被删除 

 导入本地文件 

 LOAD DATA LOCAL INPATH '/home/reduces2.txt' OVERWRITE INTO TABLE bus; 

 导入HDFS文件 

 LOAD DATA INPATH '/opt/bus/part-m-00000' OVERWRITE INTO TABLE bus; 

 查看表和数据 

 show tables; 

 #正则匹配表名 

 show tables '*b*'; 

 #查看表数据 

 select * from bus; 

 #查看表结构 

 desc bus;  

 DESCRIBE bus; 

 #增加一个字段 

 ALTER TABLE bus ADD COLUMNS (week String); 

 #重命名表名

 ALTER TABLE bus2 RENAME TO bus3; 

 删除表 

 DROP TABLE bus3; 

 从其他表导入数据 

 Specified key was too long; max key length is 767 bytes 出错解决:一般由 MySQL 元数据库的最大索引长度限制导致。解决方法:1、将元数据库的字符集改为 latin1;2、执行 set global binlog_format='MIXED'

 CREATE TABLE bus4 (line_name string,deal_yymmdd string, deal_hh string, count int,week string) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; 

 INSERT OVERWRITE TABLE bus4 SELECT * FROM bus ; 

 DROP TABLE bus4; 

 创建表并从其他表导入数据 

 CREATE TABLE bus4 AS SELECT * FROM bus; 

 仅复制表结构不导数据 

 CREATE TABLE bus4 LIKE bus; 

 数据导出   杀死JAVA进程kill -9 11110(pid) 

 导出到HDFS上其他目录 

 hadoop fs -cp /user/hive/warehouse/bus  /opt 

 通过Hive导出到本地文件系统 

 INSERT OVERWRITE LOCAL DIRECTORY '/opt/tools/' SELECT * FROM bus; 

 Hive查询HiveQL 

 普通查询:排序,列别名,嵌套子查询 ,列出前多少条 

 -- Nested subquery with a column alias, an outer filter, and LIMIT.
 FROM (
     SELECT deal_yymmdd AS c2, count FROM bus
 ) t
 SELECT t.c2, t.count
 WHERE t.count > 3000
 LIMIT 2;

 排序:ORDER BY 与 SORT BY 均支持 ASC(升序)、DESC(降序)。排序顺序由列类型决定:数字类型的列按数值顺序排序,字符串类型的列按字典顺序排序。二者区别:ORDER BY 是全局排序(所有数据汇总到单个 reducer);SORT BY 只保证每个 reducer 内部有序,不保证全局有序。

 select line_name,deal_yymmdd,deal_hh,count from bus1 ORDER BY deal_yymmdd ASC,deal_hh DESC; 

 连接查询:JOIN 

 -- Equi-join of bus and bus4 on the deal date; show 5 matching rows.
 SELECT
     t1.deal_yymmdd, t1.deal_hh,
     t2.deal_yymmdd, t2.deal_hh
 FROM bus t1
 INNER JOIN bus4 t2
     ON t1.deal_yymmdd = t2.deal_yymmdd
 WHERE t1.count > 3000
 LIMIT 5;

 聚合查询1:count, avg 总数 平均值 

 SELECT count(*), avg(count) FROM bus WHERE deal_yymmdd='20150104'; 

 聚合查询2:count, distinct  总数 不重复的值 

 SELECT count(DISTINCT line_name) FROM bus; 

 聚合查询3:GROUP BY, HAVING  where在分组之前过滤数据having在分组之后过滤数据 ORDER BY ASC(升序),DESC(降序); 

 SELECT avg(count),sum(count) FROM bus GROUP BY line_name; 

 SELECT avg(count),deal_yymmdd,sum(count) FROM bus GROUP BY deal_yymmdd,line_name HAVING sum(count)>40000; 

 Hive视图 Hive视图和数据库视图的概念是一样的 

 CREATE VIEW v_bus AS SELECT deal_hh,count FROM bus where deal_yymmdd='20150104'; 

 select * from v_bus; 

 删除视图 

 DROP VIEW IF EXISTS v_bus; 

 Hive分区表  内部表以line_name 分区 

 -- Managed table partitioned by the line column.
 CREATE TABLE bus1 (
     line_name    string,
     deal_yymmdd  string,
     deal_hh      string,
     count        int,
     week         string
 )
 PARTITIONED BY (line string)
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

 装载 

 LOAD DATA  INPATH '/opt/bus/part-m-00001' OVERWRITE INTO TABLE bus1 PARTITION (line='10'); 

 LOAD DATA  INPATH '/opt/bus/part-m-00000' OVERWRITE INTO TABLE bus1 PARTITION (line='15'); 

 外部表建分区 可以不指定存储地址 

 -- External table partitioned by year and month; LOCATION may be
 -- omitted here and supplied per partition via ALTER TABLE ... ADD PARTITION.
 CREATE EXTERNAL TABLE IF NOT EXISTS bus2 (
     line_name    string,
     deal_yymmdd  string,
     deal_hh      string,
     count        int
 )
 PARTITIONED BY (yy string, mm string)
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';

 增加外部分区 

 ALTER TABLE  bus2 ADD PARTITION(yy='2015',mm='02') location '/opt/bus/2015/02'; 

 ALTER TABLE  bus2 ADD  

 PARTITION(yy='2015',mm='01') location '/opt/bus/2015/01' 

 PARTITION(yy='2015',mm='02') location '/opt/bus/2015/02';  --同时增加多个分区 

 删除分区 

 ALTER TABLE bus2 DROP PARTITION (yy='2015',mm='02'); 

 装载 

 LOAD DATA  INPATH '/opt/bus/part-m-00001' OVERWRITE INTO TABLE bus2 PARTITION (yy='2015',mm='02'); 

 查看分区表 

 SHOW PARTITIONS bus1; 

 查询数据 

 select * from bus2 where yy='2015' and mm='02'; 

 select * from bus1 where deal_yymmdd='20150104'; 

 select * from bus1 where line='15' and deal_yymmdd='20150104'; 

 表重命名 数据所在的位置和分区名并不改变 

 ALTER TABLE bus2 RENAME TO bus3; 

 改变列名、数据类型、注释、列位置或者它们的任意组合 

 ALTER TABLE table_name CHANGE [COLUMN] col_old_name col_new_name column_type 

 [COMMENT col_comment] [FIRST|AFTER column_name] 

 通过查询语句向表中插入数据  --如果没有指定分区位置,默认为/user/hive/warehouse/bus2/yy=2015/mm=01

 INSERT OVERWRITE TABLE bus2  

 PARTITION (yy='2015',mm='01') 

 select line_name,deal_yymmdd,deal_hh,count from bus2 where yy='2015' and mm='02'; 

 导出数据  

 -- Export query results to the HDFS directory /opt/bus2
 -- (OVERWRITE: any existing content of the directory is replaced).
 -- Fix: the original statement was missing its terminating semicolon.
 INSERT OVERWRITE DIRECTORY '/opt/bus2'
 SELECT line_name, deal_yymmdd, deal_hh, count
 FROM bus2
 WHERE yy = '2015' AND mm = '03';

 分成多个reduce的方法DISTRIBUTE BY 

 set mapred.reduce.tasks=2;---可以规定reduce个数,也可以不规定 

 -- DISTRIBUTE BY routes rows with the same sale_id to the same reducer;
 -- SORT BY then orders rows within each reducer.
 SELECT sale_id, amount
 FROM t_order
 DISTRIBUTE BY sale_id
 SORT BY sale_id, amount;



 Hive交互式模式 

     quit,exit:  退出交互式shell 

     reset: 重置配置为默认值 

     set <key>=<value> : 修改特定变量的值(如果变量名拼写错误,不会报错) 

     set :  输出用户覆盖的hive配置变量 

     set -v : 输出所有Hadoop和Hive的配置变量 

     add FILE[S] *, add JAR[S] *, add ARCHIVE[S] * : 添加 一个或多个 file, jar, archives到分布式缓存 

     list FILE[S], list JAR[S], list ARCHIVE[S] : 输出已经添加到分布式缓存的资源。 

     list FILE[S] *, list JAR[S] *,list ARCHIVE[S] * : 检查给定的资源是否添加到分布式缓存 

     delete FILE[S] *,delete JAR[S] *,delete ARCHIVE[S] * : 从分布式缓存删除指定的资源 

     ! <command> :  从Hive shell执行一个shell命令 

     dfs <dfs command> :  从Hive shell执行一个dfs命令 

     <query string> : 执行一个Hive 查询,然后输出结果到标准输出 

     source FILE <filepath>:  在CLI里执行一个hive脚本文件 

 bin/hive 是一个 shell 工具,可以以交互式或批处理方式运行 Hive 查询
  

 Usage: hive [-hiveconf x=y]* [<-i filename>]* [<-f filename>|<-e query-string>] [-S] 

 -i <filename> Initialization Sql from file (executed automatically and silently 

 before any other commands) 

 -e 'quoted query string' Sql from command line 

 -f <filename> Sql from file 

 -S Silent mode in interactive shell where only data is emitted 

 -hiveconf x=y Use this to set hive/hadoop configuration variables. 

 -e and -f cannot be specified together. In the absence of these options, interactive shell is 

 started. 

 However, -i can be used with any other options. Multiple instances of -i can be used to 

 execute multiple init scripts. 

 To see this usage help, run hive -h 
  

 运行一个查询 

 hive -e 'select count(*) from bus1' 

 将查询结果导出到一个文件 

 hive -S -e 'select count(*) from bus1' > a.txt 

 运行一个脚本 

 hive -f /home/my/hive-script.sql