This post loads local Hadoop logs into a Hive data warehouse, then filters out the useful records and transfers them into a MySQL database.
Environment: hive-0.12.0 + Hadoop 1.2.1
1. Log format
2014-04-17 22:53:30,621 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_4856124673669777083 to 127.0.0.1:50010
2014-04-17 22:53:30,621 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_3952951948255939678 to 127.0.0.1:50010
2014-04-17 22:53:30,629 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_-7659295115271387204 to 127.0.0.1:50010
2014-04-17 22:53:30,713 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_1247712269990837159 to 127.0.0.1:50010
2014-04-17 22:53:30,713 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_3041853621376201540 to 127.0.0.1:50010
2014-04-17 22:53:30,713 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_5165085626508844330 to 127.0.0.1:50010
2014-04-17 22:53:30,713 INFO org.apache.hadoop.hdfs.StateChange: BLOCK* addToInvalidates: blk_5058515618805863349 to 127.0.0.1:50010
Each log line follows roughly this format, so we split on spaces to organize it into multiple columns. The trailing free-text message does not split cleanly, so for now it is stored as three columns (information1 through information3).
Table definition:
-- Hive table to store the log records
CREATE TABLE IF NOT EXISTS loginfo11 (
    rdate STRING,
    time ARRAY<STRING>,
    type STRING,
    relateclass STRING,
    information1 STRING,
    information2 STRING,
    information3 STRING
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    COLLECTION ITEMS TERMINATED BY ','
    MAP KEYS TERMINATED BY ':';
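To make the column mapping concrete, here is how the first sample line above lands in the table. Splitting on spaces, and splitting the time field on the comma into array elements:

    rdate        = 2014-04-17
    time         = ["22:53:30", "621"]   (so time[0] is the timestamp, time[1] the milliseconds)
    type         = INFO
    relateclass  = org.apache.hadoop.hdfs.StateChange:
    information1 = BLOCK*
    information2 = addToInvalidates:
    information3 = blk_4856124673669777083

Tokens beyond the seventh column (here "to 127.0.0.1:50010") are simply dropped by Hive, which is why the trailing message is only partially captured.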
2. Code
The getConnect.java class opens and closes the connections to Hive and MySQL:
//package com.my.hivetest;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class getConnect {

    private static Connection conn = null;
    private static Connection conntomysql = null;

    private getConnect() {
    }

    // Open (or reuse) the connection to the Hive server
    public static Connection getHiveConn() throws SQLException {
        if (conn == null) {
            try {
                Class.forName("org.apache.hadoop.hive.jdbc.HiveDriver");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
                System.exit(1);
            }
            conn = DriverManager.getConnection(
                    "jdbc:hive://localhost:50031/default", "", "");
            System.out.println(1111); // debug marker
        }
        return conn;
    }

    // Open (or reuse) the connection to MySQL
    public static Connection getMysqlConn() throws SQLException {
        if (conntomysql == null) {
            try {
                Class.forName("com.mysql.jdbc.Driver");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
                System.exit(1);
            }
            conntomysql = DriverManager.getConnection(
                    "jdbc:mysql://localhost:3306/hadoop?createDatabaseIfNotExist=true&useUnicode=true&characterEncoding=GBK",
                    "root", "123456");
            System.out.println(1111); // debug marker
        }
        return conntomysql;
    }

    public static void closeHive() throws SQLException {
        if (conn != null)
            conn.close();
    }

    public static void closemysql() throws SQLException {
        if (conntomysql != null)
            conntomysql.close();
    }
}
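To check the Hive connection in isolation before running the full pipeline, a small test like the following can help. This is only a sketch; ConnSmokeTest is a hypothetical class name, not part of the original project:

//package com.my.hivetest;

import java.sql.ResultSet;
import java.sql.Statement;

// Hypothetical smoke test: lists the tables visible over the Hive JDBC connection.
public class ConnSmokeTest {
    public static void main(String[] args) throws Exception {
        Statement stmt = getConnect.getHiveConn().createStatement();
        ResultSet rs = stmt.executeQuery("show tables");
        while (rs.next()) {
            System.out.println(rs.getString(1));
        }
        getConnect.closeHive();
    }
}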
The HiveUtil.java class provides methods to create the Hive table, load data, query it by condition, and transfer the results into MySQL:
//package com.my.hivetest;

import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class HiveUtil {
    // Create the Hive table
    public static void createTable(String hiveql) throws SQLException {
        Connection con = getConnect.getHiveConn();
        Statement stmt = con.createStatement();
        ResultSet res = stmt.executeQuery(hiveql);
    }

    // Query the Hive table
    public static ResultSet queryHive(String hiveql) throws SQLException {
        Connection con = getConnect.getHiveConn();
        Statement stmt = con.createStatement();
        ResultSet res = stmt.executeQuery(hiveql);
        return res;
    }

    // Load data into the Hive table
    public static void loadDate(String hiveql) throws SQLException {
        Connection con = getConnect.getHiveConn();
        Statement stmt = con.createStatement();
        ResultSet res = stmt.executeQuery(hiveql);
    }

    // Transfer the query result into MySQL
    public static void hiveTomysql(ResultSet hiveRes) throws SQLException {
        Connection con = getConnect.getMysqlConn();
        Statement stmt = con.createStatement();
        while (hiveRes.next()) {
            String rdate = hiveRes.getString(1);
            String time = hiveRes.getString(2);
            String type = hiveRes.getString(3);
            String relateclass = hiveRes.getString(4);
            // Combine the three message columns into one string
            String information = hiveRes.getString(5) + hiveRes.getString(6) + hiveRes.getString(7);
            System.out.println(rdate + " " + time + " " + type + " " + relateclass + " " + information + " ");
            int i = stmt.executeUpdate(
                    "insert into hadooplog values(0,'" + rdate + "','" + time + "','"
                    + type + "','" + relateclass + "','" + information + "')");
        }
    }
}
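One caveat about hiveTomysql: it builds the INSERT by string concatenation, so a stray quote inside a log message would break the statement. A safer variant is sketched below; it assumes the same six-column hadooplog table and would additionally need import java.sql.PreparedStatement:

// Sketch only: same behavior as hiveTomysql, but with a PreparedStatement
// so the values are escaped by the JDBC driver.
public static void hiveTomysqlSafe(ResultSet hiveRes) throws SQLException {
    Connection con = getConnect.getMysqlConn();
    PreparedStatement ps = con.prepareStatement(
            "insert into hadooplog values(0, ?, ?, ?, ?, ?)");
    while (hiveRes.next()) {
        ps.setString(1, hiveRes.getString(1)); // rdate
        ps.setString(2, hiveRes.getString(2)); // time
        ps.setString(3, hiveRes.getString(3)); // type
        ps.setString(4, hiveRes.getString(4)); // relateclass
        // combine the three message columns, as in the original
        ps.setString(5, hiveRes.getString(5) + hiveRes.getString(6) + hiveRes.getString(7));
        ps.executeUpdate();
    }
}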
The exeHiveQL.java class is the driver; it implements main():
//package com.my.hivetest;

import java.sql.ResultSet;
import java.sql.SQLException;

public class exeHiveQL {
    public static void main(String[] args) throws SQLException {

        if (args.length < 2) {
            System.out.println("Please supply the query conditions: <log level> <date>");
            System.exit(1);
        }

        String type = args[0];
        String date = args[1];

        // Create the table in Hive
        HiveUtil.createTable("create table if not exists loginfo11"
                + " ( rdate String, time ARRAY<string>, type STRING,"
                + " relateclass STRING, information1 STRING, information2 STRING, information3 STRING)"
                + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '"
                + " COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY ':'");
        // Load the Hadoop logs
        HiveUtil.loadDate("load data local inpath '/root/hadoop-1.2.1/logs/*.log.*'"
                + " overwrite into table loginfo11");
        // Query the useful records
        String hiveql = "select rdate,time[0],type,relateclass,information1,information2,information3"
                + " from loginfo11 where type='" + type + "' and rdate='" + date + "' ";
        System.out.println(hiveql + "----test"); // test code: print the query before running it
        ResultSet res1 = HiveUtil.queryHive(hiveql);
        // Transfer the query result into MySQL
        HiveUtil.hiveTomysql(res1);
        // Close the Hive connection
        getConnect.closeHive();
        // Close the MySQL connection
        getConnect.closemysql();
    }
}
Before running, start the Hive server service; the port number 50031 here must match the one used in getConnect.java:
# bin/hive --service hiveserver -p 50031
Then run it from Eclipse, passing the query conditions as program arguments, e.g.:
ERROR 2014-04-14
Before running you also need to add the required jars to the build path:
hive-jdbc-0.12.0.jar
hive-metastore-0.12.0.jar
mysql-connector-java-5.1.10.jar (the MySQL JDBC driver; it can be downloaded from http://dev.mysql.com/downloads/connector/j/)
plus all the jars under hive/lib (I imported them all to save trouble).
One last point: create the database hadoop in MySQL ahead of time (the createDatabaseIfNotExist=true option in the JDBC URL also takes care of this), and create the table hadooplog inside it with a schema matching what the code inserts.
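The post does not give the exact schema, so the following is only one plausible definition, inferred from the INSERT statement in hiveTomysql (a leading 0 for what looks like an auto-increment id, followed by five string columns); the names and sizes are assumptions:

CREATE DATABASE IF NOT EXISTS hadoop;
USE hadoop;

-- Assumed schema matching "insert into hadooplog values(0, rdate, time, type, relateclass, information)"
CREATE TABLE IF NOT EXISTS hadooplog (
    id          INT AUTO_INCREMENT PRIMARY KEY,  -- inserting 0 makes MySQL assign the next id
    rdate       VARCHAR(64),
    time        VARCHAR(64),
    type        VARCHAR(32),
    relateclass VARCHAR(255),
    information VARCHAR(1024)
);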
Source code: https://github.com/y521263/Hadoop_in_Action
References:
Apache Hive