Table of Contents
- 🍑1. Background
- 🍑2. Configuration and Code
- 🍊2.1 Source Code
- 🍊2.2 Hive Configuration
- 🍓2.2.1 hive-env.sh
- 🍓2.2.2 hive-site.xml
- 🍑3. Test Environment and Verification
- 🍊3.1 Creating a Database and Tables via Beeline
- 🍊3.2 Inspecting the Database
- References
🍑1. Background
Our company has built its own metadata management system. Open-source metadata management projects do exist, such as Apache Atlas, which I looked into a while back, but it is fairly complex, and reusing just one piece of its code would likely cost more time than writing our own. This article uses a Hive hook to capture metadata and send it as messages to Kafka, from which the messages are written into a MySQL database (still under discussion: whether the consumer actively calls the Kafka API or Kafka pushes the data).
🍑2. Configuration and Code
🍊2.1 Source Code
The source has been uploaded to Git (click to download). The compiled jar is hive-collec-demo-1.0.0.dev.jar.
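For reference, here is a minimal sketch of the build dependencies such a project needs; the artifact versions are assumptions and should be matched to your cluster (the org.codehaus.jackson classes the hook uses are expected on Hive's classpath at runtime):

<dependencies>
    <dependency>
        <groupId>org.apache.hive</groupId>
        <artifactId>hive-exec</artifactId>
        <version>3.1.3</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.8.1</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.28</version>
    </dependency>
</dependencies>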
- HiveHook
package com.renxiaozhao.collect.hive;

import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.ql.QueryPlan;
import org.apache.hadoop.hive.ql.hooks.Entity;
import org.apache.hadoop.hive.ql.hooks.ExecuteWithHookContext;
import org.apache.hadoop.hive.ql.hooks.HookContext;
import org.apache.hadoop.hive.ql.hooks.HookContext.HookType;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.plan.HiveOperation;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.renxiaozhao.collect.util.HiveKafkaUtil;

public class HiveHook implements ExecuteWithHookContext {
    private static final Logger LOGGER = LoggerFactory.getLogger(HiveHook.class);

    // SQL operation types to monitor. HiveOperation is an enum that
    // wraps Hive's SQL operation types.
    private static final HashSet<String> OPERATION_NAMES = new HashSet<>();

    static {
        // create table
        OPERATION_NAMES.add(HiveOperation.CREATETABLE.getOperationName());
        // alter database properties
        OPERATION_NAMES.add(HiveOperation.ALTERDATABASE.getOperationName());
        // alter database owner
        OPERATION_NAMES.add(HiveOperation.ALTERDATABASE_OWNER.getOperationName());
        // alter table: add columns
        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_ADDCOLS.getOperationName());
        // alter table: storage location
        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_LOCATION.getOperationName());
        // alter table properties
        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_PROPERTIES.getOperationName());
        // rename table
        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_RENAME.getOperationName());
        // rename column
        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_RENAMECOL.getOperationName());
        // replace columns (drop the current columns, then add the new ones)
        OPERATION_NAMES.add(HiveOperation.ALTERTABLE_REPLACECOLS.getOperationName());
        // create database
        OPERATION_NAMES.add(HiveOperation.CREATEDATABASE.getOperationName());
        // drop database
        OPERATION_NAMES.add(HiveOperation.DROPDATABASE.getOperationName());
        // drop table
        OPERATION_NAMES.add(HiveOperation.DROPTABLE.getOperationName());
    }

    @Override
    public void run(HookContext hookContext) throws Exception {
        assert (hookContext.getHookType() == HookType.POST_EXEC_HOOK);
        // the query plan
        QueryPlan plan = hookContext.getQueryPlan();
        // the operation name
        String operationName = plan.getOperationName();
        LOGGER.info("Executed SQL: " + plan.getQueryString());
        LOGGER.info("Operation name: " + operationName);
        if (OPERATION_NAMES.contains(operationName) && !plan.isExplain()) {
            LOGGER.info("Monitored SQL operation");
            Set<ReadEntity> inputs = hookContext.getInputs();
            Set<WriteEntity> outputs = hookContext.getOutputs();
            for (Entity entity : inputs) {
                LOGGER.info("Hook metadata input: " + toJson(entity));
            }
            for (Entity entity : outputs) {
                LOGGER.info("Sending hook metadata to Kafka, output: " + toJson(entity));
                HiveKafkaUtil.kafkaProducer("HIVE_HOOK", toJson(entity));
            }
        } else {
            LOGGER.info("Not a monitored operation, ignoring this hook!");
        }
    }

    private static String toJson(Entity entity) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        // entity types include:
        // DATABASE, TABLE, PARTITION, DUMMYPARTITION, DFS_DIR, LOCAL_DIR, FUNCTION
        switch (entity.getType()) {
            case DATABASE:
                Database db = entity.getDatabase();
                return mapper.writeValueAsString(db);
            case TABLE:
                return mapper.writeValueAsString(entity.getTable().getTTable());
            default:
                // other entity types (PARTITION, DFS_DIR, ...) are not serialized here
                return null;
        }
    }
}
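The hook only reacts to the operations registered in OPERATION_NAMES, so coverage is easy to widen. As a sketch, partition changes could be captured too by adding more HiveOperation values to the static block (which constants you need is project-specific):

// e.g. also watch partition add/drop
OPERATION_NAMES.add(HiveOperation.ALTERTABLE_ADDPARTS.getOperationName());
OPERATION_NAMES.add(HiveOperation.ALTERTABLE_DROPPARTS.getOperationName());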
- HiveKafkaUtil
package com.renxiaozhao.collect.util;

import java.sql.SQLException;
import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HiveKafkaUtil {
    private static final Logger LOGGER = LoggerFactory.getLogger(HiveKafkaUtil.class);

    public static void main(String[] args) {
        kafkaProducer("HIVE_HOOK", "hello2");
    }

    public static void kafkaProducer(String topicName, String value) {
        Properties properties = new Properties();
        properties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.38.10:9092");
        properties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        properties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        properties.put(ProducerConfig.ACKS_CONFIG, "all");
        properties.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 33554432);
        properties.put(ProducerConfig.RETRIES_CONFIG, 0);
        properties.put(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, 300);
        properties.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);
        // KafkaProducer is thread-safe; a single instance can be shared across threads
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(properties);
        ProducerRecord<String, String> record = new ProducerRecord<>(topicName, value);
        kafkaProducer.send(record, new Callback() {
            @Override
            public void onCompletion(RecordMetadata metadata, Exception e) {
                if (e != null) {
                    LOGGER.error(String.format("Exception while sending data to Kafka: %s", e));
                    return;
                }
                LOGGER.info("Sent data to Kafka, topic: " + metadata.topic() + " offset: " + metadata.offset()
                        + " partition: " + metadata.partition());
                LOGGER.info("Reading the data back from Kafka topic: " + metadata.topic());
                kafkaConsumer(metadata.topic());
            }
        });
        // close() flushes pending records, so the callback above still fires
        kafkaProducer.close();
    }

    public static void kafkaConsumer(String topicName) {
        Properties props = new Properties();
        // required
        props.put("bootstrap.servers", "192.168.38.10:9092");
        // required
        props.put("group.id", "atlas");
        // required
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // required
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // start reading from the earliest message
        props.put("auto.offset.reset", "earliest");
        props.put("enable.auto.commit", "true");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        consumer.subscribe(Arrays.asList(topicName));
        try {
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(2));
            for (ConsumerRecord<String, String> record : records) {
                LOGGER.info("KafkaConsumer got " + topicName + " message, value=" + record.value());
                if (!DBMySqlUtil.testColletHiveMeta(record.value())) {
                    LOGGER.info("KafkaConsumer got " + topicName + " message, but inserting into MySQL failed");
                }
            }
        } catch (ClassNotFoundException | SQLException e) {
            LOGGER.error(String.format("kafkaConsumer failed calling DBMySqlUtil: %s", e));
        } finally {
            consumer.close();
        }
    }
}
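Triggering the consumer from inside the producer callback works for a demo, but in practice the consumer would run as its own long-lived process. A minimal sketch of such a loop, assuming the same broker address and a dedicated group id (both environment-specific):

package com.renxiaozhao.collect.util;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class HiveHookConsumerLoop {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.38.10:9092");
        props.put("group.id", "hive-hook-meta");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("auto.offset.reset", "earliest");
        props.put("enable.auto.commit", "true");
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("HIVE_HOOK"));
            // poll until the process is stopped
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(2));
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println("HIVE_HOOK message: " + record.value());
                }
            }
        }
    }
}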
- DBMySqlUtil
package com.renxiaozhao.collect.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * JDBC connection utility class.
 * @author admin
 */
public class DBMySqlUtil {
    private static final Logger LOGGER = LoggerFactory.getLogger(DBMySqlUtil.class);

    public static Connection getConn() throws ClassNotFoundException {
        Connection conn = null;
        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
            // connect to the database
            conn = DriverManager.getConnection("jdbc:mysql://192.168.38.10:3306/dolphinscheduler?characterEncoding=utf8&useSSL=false&serverTimezone=Asia/Shanghai&rewriteBatchedStatements=true",
                    "ds_user", "dolphinscheduler");
        } catch (SQLException e) {
            LOGGER.error("DBMySqlUtil.getConn() exception ---->", e);
        }
        return conn;
    }

    public static boolean testColletHiveMeta(String value) throws SQLException, ClassNotFoundException {
        // Bind the value as a parameter instead of concatenating it into the SQL,
        // avoiding quoting problems and SQL injection. Calling execute(sql) on a
        // PreparedStatement (as the earlier version did) throws an SQLException,
        // and execute() returns false for an INSERT anyway; executeUpdate()
        // returns the number of affected rows instead.
        String sql = "insert into test_hive(meta_value) values(?)";
        try (Connection con = getConn();
             PreparedStatement pst = con.prepareStatement(sql)) {
            pst.setString(1, value);
            return pst.executeUpdate() > 0;
        }
    }

    public void closeConn(Connection conn) {
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            LOGGER.error("DBMySqlUtil.closeConn() exception ---->", e);
        }
    }

    public PreparedStatement getPStmt(Connection conn, String sql) {
        PreparedStatement pstmt = null;
        try {
            pstmt = conn.prepareStatement(sql);
        } catch (SQLException e) {
            LOGGER.error("DBMySqlUtil.getPStmt() exception ---->", e);
        }
        return pstmt;
    }

    public void closePStmt(PreparedStatement stmt) {
        try {
            if (stmt != null) {
                stmt.close();
            }
        } catch (SQLException e) {
            LOGGER.error("DBMySqlUtil.closePStmt() exception ---->", e);
        }
    }

    public void closeRs(ResultSet rs) {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (SQLException e) {
            LOGGER.error("DBMySqlUtil.closeRs() exception ---->", e);
        }
    }

    public ResultSet executeQuery(Connection conn, String sql) {
        ResultSet rs = null;
        try {
            rs = conn.createStatement().executeQuery(sql);
        } catch (SQLException e) {
            LOGGER.error("DBMySqlUtil.executeQuery() exception ---->", e);
        }
        return rs;
    }
}
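The insert above targets a test_hive table whose schema is not shown in the source. A plausible minimal DDL (an assumption, with meta_value holding the hook's JSON payload) would be:

CREATE TABLE test_hive (
    id         INT AUTO_INCREMENT PRIMARY KEY,
    meta_value TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);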
🍊2.2 Hive Configuration
🍓2.2.1 hive-env.sh
Add the HIVE_AUX_JARS_PATH variable so Hive picks up the jar built in 2.1:
export HIVE_AUX_JARS_PATH=/root/collect
- Only two jars are needed in this directory at the moment, staged as sketched below
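A sketch of staging the jars (the kafka-clients jar name and version are assumptions; use whichever jar your build actually resolves):

mkdir -p /root/collect
cp hive-collec-demo-1.0.0.dev.jar kafka-clients-2.8.1.jar /root/collect/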
🍓2.2.2 hive-site.xml
Add the hook configuration:
<property>
    <name>hive.exec.post.hooks</name>
    <value>com.renxiaozhao.collect.hive.HiveHook</value>
</property>
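hive.exec.post.hooks takes a comma-separated list of classes, so this hook can coexist with others. For a quick test without editing hive-site.xml, the property can also be passed per session via the Hive CLI (depending on your authorization settings it may be restricted at runtime):

hive --hiveconf hive.exec.post.hooks=com.renxiaozhao.collect.hive.HiveHook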
🍑3. Test Environment and Verification
Make sure Hadoop, Kafka, and Hive are already deployed and running.
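It can also help to pre-create the topic the hook writes to (optional when topic auto-creation is enabled; the --bootstrap-server flag assumes Kafka 2.2+):

bin/kafka-topics.sh --create --topic HIVE_HOOK --bootstrap-server 192.168.38.10:9092 --partitions 1 --replication-factor 1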
🍊3.1 Creating a Database and Tables via Beeline
[root@host1 bin]# ./beeline -u jdbc:hive2://192.168.38.10:10000 -n root
Connecting to jdbc:hive2://192.168.38.10:10000
Connected to: Apache Hive (version 3.1.3)
Driver: Hive JDBC (version 3.1.3)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 3.1.3 by Apache Hive
0: jdbc:hive2://192.168.38.10:10000> CREATE database hive_test_db;
No rows affected (0.668 seconds)
0: jdbc:hive2://192.168.38.10:10000> CREATE TABLE hive_meta_test(id int,name string);
No rows affected (3.497 seconds)
0: jdbc:hive2://192.168.38.10:10000> CREATE TABLE hive_meta_testB as select * from hive_meta_test;
No rows affected (25.366 seconds)
0: jdbc:hive2://192.168.38.10:10000>
0: jdbc:hive2://192.168.38.10:10000>
🍊3.2 Inspecting the Database
Three statements but four records? Note that the last two records share the same creation time.
Detailed data:
{
  "name": "hive_test_db",
  "description": null,
  "locationUri": "hdfs://192.168.38.10:8020/user/hive/warehouse/hive_test_db.db",
  "parameters": {},
  "privileges": null,
  "ownerName": "root",
  "ownerType": "USER",
  "catalogName": "hive",
  "setCatalogName": true,
  "setParameters": true,
  "setPrivileges": false,
  "setOwnerType": true,
  "parametersSize": 0,
  "setName": true,
  "setDescription": false,
  "setLocationUri": true,
  "setOwnerName": true
}
{
  "name": "hive_test_db",
  "description": null,
  "locationUri": null,
  "parameters": null,
  "privileges": null,
  "ownerName": null,
  "ownerType": null,
  "catalogName": null,
  "setCatalogName": false,
  "setParameters": false,
  "setPrivileges": false,
  "setOwnerType": false,
  "parametersSize": 0,
  "setName": true,
  "setDescription": false,
  "setLocationUri": false,
  "setOwnerName": false
}
{
  "name": "default",
  "description": "Default Hive database",
  "locationUri": "hdfs://192.168.38.10:8020/user/hive/warehouse",
  "parameters": {},
  "privileges": null,
  "ownerName": "public",
  "ownerType": "ROLE",
  "catalogName": "hive",
  "setCatalogName": true,
  "setParameters": true,
  "setPrivileges": false,
  "setOwnerType": true,
  "parametersSize": 0,
  "setName": true,
  "setDescription": true,
  "setLocationUri": true,
  "setOwnerName": true
}
{
  "tableName": "hive_meta_test",
  "dbName": "default",
  "owner": "root",
  "createTime": 1656849289,
  "lastAccessTime": 0,
  "retention": 0,
  "sd": {
    "cols": [],
    "location": null,
    "inputFormat": "org.apache.hadoop.mapred.SequenceFileInputFormat",
    "outputFormat": "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
    "compressed": false,
    "numBuckets": -1,
    "serdeInfo": {
      "name": null,
      "serializationLib": "org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe",
      "parameters": {
        "serialization.format": "1"
      },
      "description": null,
      "serializerClass": null,
      "deserializerClass": null,
      "serdeType": null,
      "setParameters": true,
      "parametersSize": 1,
      "setName": false,
      "setDescription": false,
      "setSerializationLib": true,
      "setSerializerClass": false,
      "setDeserializerClass": false,
      "setSerdeType": false
    },
    "bucketCols": [],
    "sortCols": [],
    "parameters": {},
    "skewedInfo": {
      "skewedColNames": [],
      "skewedColValues": [],
      "skewedColValueLocationMaps": {},
      "setSkewedColNames": true,
      "setSkewedColValues": true,
      "setSkewedColValueLocationMaps": true,
      "skewedColNamesSize": 0,
      "skewedColNamesIterator": [],
      "skewedColValuesSize": 0,
      "skewedColValuesIterator": [],
      "skewedColValueLocationMapsSize": 0
    },
    "storedAsSubDirectories": false,
    "colsSize": 0,
    "setParameters": true,
    "parametersSize": 0,
    "setLocation": false,
    "setInputFormat": true,
    "setCols": true,
    "colsIterator": [],
    "setSkewedInfo": true,
    "bucketColsSize": 0,
    "bucketColsIterator": [],
    "sortColsSize": 0,
    "sortColsIterator": [],
    "setOutputFormat": true,
    "setSerdeInfo": true,
    "setBucketCols": true,
    "setSortCols": true,
    "setCompressed": false,
    "setNumBuckets": true,
    "setStoredAsSubDirectories": false
  },
  "partitionKeys": [],
  "parameters": {},
  "viewOriginalText": null,
  "viewExpandedText": null,
  "tableType": "MANAGED_TABLE",
  "privileges": null,
  "temporary": false,
  "rewriteEnabled": false,
  "creationMetadata": null,
  "catName": null,
  "ownerType": "USER",
  "partitionKeysSize": 0,
  "setCatName": false,
  "setParameters": true,
  "setTableName": true,
  "setDbName": true,
  "setOwner": true,
  "setViewOriginalText": false,
  "setViewExpandedText": false,
  "setTableType": true,
  "setPrivileges": false,
  "setCreationMetadata": false,
  "setOwnerType": true,
  "setCreateTime": true,
  "setLastAccessTime": false,
  "setRetention": false,
  "partitionKeysIterator": [],
  "parametersSize": 0,
  "setTemporary": false,
  "setRewriteEnabled": false,
  "setPartitionKeys": true,
  "setSd": true
}
- Figured it out: creating a database produces one record, while creating a table produces two (one for the database it lives in plus one for the table itself)
- Creating a table in an explicitly specified database likewise produces two records
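To double-check on the MySQL side, the rows can be queried directly (only the meta_value column is assumed here, matching the insert in DBMySqlUtil):

SELECT COUNT(*) FROM test_hive;
SELECT meta_value FROM test_hive;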
References
hive hook应用元数据管理集成部署及测试 (Hive hook for metadata management: integration, deployment, and testing)