Contents
- 1. Project Implementation Plan
- 2. Processing Data with RegexSerDe
- 3. Splitting the Data
- 4. Data Transformation with UDFs
- 5. Data Analysis
- 6. Python-Based Data Preprocessing
1. Project Implementation Plan
After obtaining the source data access.log, the preparation work is as follows:
1. Preprocess the data before loading it into the Hive table, using one of:
>> an MR program
>> regular expressions (the approach recommended in industry)
>> a Python script
2. Split the table: leave the source data unchanged and create sub-tables that match the business requirements.
3. On top of the sub-tables, apply:
3-1. Data file storage format: ORC/Parquet
3-2. Data file compression: Snappy
3-3. Map output: compress intermediate results with Snappy (see the sketch after this list)
3-4. External tables
3-5. Partitioned tables
3-6. UDF-based data processing
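A minimal sketch for item 3-3, assuming standard Hive/Hadoop compression parameters and a cluster with the Snappy codec available; these session settings enable Snappy compression of intermediate map output:
set hive.exec.compress.intermediate=true;
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;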
2. Processing Data with RegexSerDe
Inspect the raw access.log data, then create a Hive table and parse each line with a regular expression; each capture group in the pattern maps, in order, to one column of the table:
CREATE TABLE apache_log (
host STRING,
identity STRING,
user STRING,
time STRING,
request STRING,
status STRING,
size STRING,
referer STRING,
agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "([^ ]*) (-|[^ ]*) (-|[^ ]*) (-|\\[[^\\]]*\\]) (\"[^\"]*\") ([0-9]*) ([0-9]*) (\"[^\"]*\") (\"[^\"]*\")?")
STORED AS TEXTFILE;
Load the data:
load data local inpath '/opt/datas/access.log' into table apache_log;
View the data:
hive (db_hive)> select * from apache_log limit 5;
OK
apache_log.host apache_log.identity apache_log.user apache_log.time apache_log.request apache_log.status apache_log.size apache_log.referer apache_log.agent
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
183.49.46.228 - - [18/Sep/2013:06:49:23 +0000] "-" 400 0 "-" "-"
163.177.71.12 - - [18/Sep/2013:06:49:33 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
163.177.71.12 - - [18/Sep/2013:06:49:36 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
101.226.68.137 - - [18/Sep/2013:06:49:42 +0000] "HEAD / HTTP/1.1" 200 20 "-" "DNSPod-Monitor/1.0"
Time taken: 0.309 seconds, Fetched: 5 row(s)
3. Splitting the Data
Based on the business requirements, we do not need all of the data in the large table; we can extract part of it to form a sub-table.
CREATE TABLE apache_log_comm (
host STRING,
time STRING,
status STRING,
referer STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS ORC tblproperties ("orc.compress"="SNAPPY");
Load the sub-table data:
insert into table apache_log_comm select host,time,status,referer from apache_log;
View the sub-table data:
hive (db_hive)> select * from apache_log_comm limit 10;
OK
apache_log_comm.host apache_log_comm.time apache_log_comm.status apache_log_comm.referer
194.237.142.21 [18/Sep/2013:06:49:18 +0000] 304 "-"
183.49.46.228 [18/Sep/2013:06:49:23 +0000] 400 "-"
163.177.71.12 [18/Sep/2013:06:49:33 +0000] 200 "-"
163.177.71.12 [18/Sep/2013:06:49:36 +0000] 200 "-"
101.226.68.137 [18/Sep/2013:06:49:42 +0000] 200 "-"
101.226.68.137 [18/Sep/2013:06:49:45 +0000] 200 "-"
60.208.6.156 [18/Sep/2013:06:49:48 +0000] 200 "http://cos.name/category/software/packages/"
222.68.172.190 [18/Sep/2013:06:49:57 +0000] 200 "http://www.angularjs.cn/A00n"
222.68.172.190 [18/Sep/2013:06:50:08 +0000] 400 "-"
183.195.232.138 [18/Sep/2013:06:50:16 +0000] 200 "-"
4. Data Transformation with UDFs
The steps above gave us the split sub-table, but the business requirements call for further processing of some of its fields, which is where UDF-based transformation comes in: we need to strip the square brackets from the time field and convert it to a proper timestamp format, and strip the double quotes from the referer field.
Write the DateTransform code:
package com.kfk.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

/**
 * Converts an Apache log timestamp such as 18/Sep/2013:06:49:18 +0000
 * into the compact form yyyyMMddHHmmss.
 *
 * @author : 蔡政洁
 * @email : caizhengjie888@icloud.com
 * @date : 2020/10/29
 * @time : 9:36 AM
 */
public class DateTransform extends UDF {

    // Input format, e.g. 18/Sep/2013:06:49:18 +0000 (parse() ignores the trailing zone offset)
    private final SimpleDateFormat simpleDateFormat =
            new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
    // Output format, e.g. 20130918064918
    private final SimpleDateFormat outSimpleDateFormat =
            new SimpleDateFormat("yyyyMMddHHmmss", Locale.ENGLISH);

    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        String outdata = null;
        try {
            Date date = simpleDateFormat.parse(s.toString());
            outdata = outSimpleDateFormat.format(date);
        } catch (Exception e) {
            e.printStackTrace();
        }
        // Guard against unparseable input: new Text(null) would throw a NullPointerException
        return outdata == null ? null : new Text(outdata);
    }

    public static void main(String[] args) {
        System.out.println(new DateTransform().evaluate(new Text("18/Sep/2013:06:49:18 +0000")));
    }
}
Write the RemoveSquare code:
package com.kfk.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Strips the square brackets that wrap the time field, e.g.
 * [18/Sep/2013:06:49:18 +0000] becomes 18/Sep/2013:06:49:18 +0000.
 *
 * @author : 蔡政洁
 * @email : caizhengjie888@icloud.com
 * @date : 2020/10/29
 * @time : 9:50 AM
 */
public class RemoveSquare extends UDF {

    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        // Remove every [ and ] character
        return new Text(s.toString().replaceAll("\\[|\\]", ""));
    }

    public static void main(String[] args) {
        System.out.println(new RemoveSquare().evaluate(new Text("[18/Sep/2013:06:49:18 +0000]")));
    }
}
Write the RemoveQuotes code:
package com.kfk.hive;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Strips the double quotes that wrap the referer field, e.g. "-" becomes -.
 *
 * @author : 蔡政洁
 * @email : caizhengjie888@icloud.com
 * @date : 2020/10/29
 * @time : 9:50 AM
 */
public class RemoveQuotes extends UDF {

    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }
        // Remove every double quote; replaceAll returns a new string rather than mutating in place
        return new Text(s.toString().replaceAll("\"", ""));
    }

    public static void main(String[] args) {
        System.out.println(new RemoveQuotes().evaluate(new Text("\"-\"")));
    }
}
Package the classes into jars and upload them to the server.
Add the jar files:
add jar /opt/jars/datetransform.jar;
add jar /opt/jars/removequotes.jar;
add jar /opt/jars/removesquare.jar;
Create the functions:
create temporary function kfk_removeSquare as 'com.kfk.hive.RemoveSquare';
create temporary function kfk_removeQuotes as 'com.kfk.hive.RemoveQuotes';
create temporary function kfk_dateTransform as 'com.kfk.hive.DateTransform';
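Before rebuilding the sub-table, a quick sanity check of the new functions (a hedged example using the sample values from the log; it assumes a Hive version that supports SELECT without FROM):
select kfk_dateTransform(kfk_removeSquare('[18/Sep/2013:06:49:18 +0000]')) time, kfk_removeQuotes('"-"') referer;
The expected output is 20130918064918 and -.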
Create the sub-table holding the UDF-processed data:
CREATE TABLE apache_log_opt
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS ORC tblproperties ("orc.compress"="SNAPPY")
AS select host,kfk_datetransform(kfk_removesquare(time)) time,status,kfk_removequotes(referer) referer from apache_log_comm;
View the data:
hive (db_hive)> select * from apache_log_opt limit 10;
OK
apache_log_opt.host apache_log_opt.time apache_log_opt.status apache_log_opt.referer
194.237.142.21 20130918064918 304 -
183.49.46.228 20130918064923 400 -
163.177.71.12 20130918064933 200 -
163.177.71.12 20130918064936 200 -
101.226.68.137 20130918064942 200 -
101.226.68.137 20130918064945 200 -
60.208.6.156 20130918064948 200 http://cos.name/category/software/packages/
222.68.172.190 20130918064957 200 http://www.angularjs.cn/A00n
222.68.172.190 20130918065008 400 -
183.195.232.138 20130918065016 200 -
Time taken: 0.054 seconds, Fetched: 10 row(s)
As the output shows, the data has been transformed.
5. Data Analysis
With the sub-table in place, we can run some simple analyses on the data, for example finding the top 10 referers by count:
select referer,count(1) count from apache_log_opt group by referer order by count desc limit 10;
Query result:
referer count
- 6053
http://blog.fens.me/category/hadoop-action/ 547
http://blog.fens.me/ 377
http://blog.fens.me/wp-admin/post.php?post=2445&action=edit&message=10 360
http://blog.fens.me/r-json-rjson/ 274
http://blog.fens.me/angularjs-webstorm-ide/ 271
http://blog.fens.me/wp-content/themes/silesia/style.css 228
http://blog.fens.me/nodejs-express3/ 198
http://blog.fens.me/hadoop-mahout-roadmap/ 182
http://blog.fens.me/vps-ip-dns/ 176
Time taken: 32.674 seconds, Fetched: 10 row(s)
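Because time is now a plain yyyyMMddHHmmss string, time-based aggregation is also easy. As a further hedged sketch (assuming the format produced above), requests per day can be counted by grouping on the first eight characters of time:
select substring(time,1,8) log_date,count(1) count from apache_log_opt group by substring(time,1,8) order by log_date;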
6. Python-Based Data Preprocessing
The Hive website has a detailed tutorial on Python-based preprocessing: https://cwiki.apache.org/confluence/display/Hive/GettingStarted#app-switcher
First, download the data:
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
Unzip the data:
unzip ml-100k.zip
If unzip is not installed, install it first:
sudo yum install -y unzip zip
Create the data table:
CREATE TABLE u_data (
userid INT,
movieid INT,
rating INT,
unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
Load the data:
LOAD DATA LOCAL INPATH '/opt/datas/u.data'
OVERWRITE INTO TABLE u_data;
View the u_data table:
hive (db_hive)> select * from u_data limit 5;
OK
u_data.userid u_data.movieid u_data.rating u_data.unixtime
196 242 3 881250949
186 302 3 891717742
22 377 1 878887116
244 51 2 880606923
166 346 1 886397596
Create the Python preprocessing script weekday_mapper.py:
import sys
import datetime

# Replace the unix timestamp column with the ISO weekday (1 = Monday ... 7 = Sunday)
for line in sys.stdin:
    line = line.strip()
    userid, movieid, rating, unixtime = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print('\t'.join([userid, movieid, rating, str(weekday)]))
Use the weekday_mapper.py script:
CREATE TABLE u_data_new (
userid INT,
movieid INT,
rating INT,
weekday INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';
add FILE weekday_mapper.py;
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;
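As with the earlier tables, a quick hedged check confirms that the weekday column was produced:
select * from u_data_new limit 5;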
Analyze the data:
select weekday,count(1) count from u_data_new group by weekday order by count desc;
weekday count
5 17964
3 15426
2 14816
4 13774
1 13278
7 12424
6 12318