离线项目(二)数据预处理
文章目录
- 离线项目(二)数据预处理
- 一:nginx埋点
- 二:flume收集信息
- 1.agent1监听
- 2.agent2监听
- 3.agent3收集
- 启动
- 三:对数据进行预处理
- 1.主要目的
- 2.具体处理
- 四:点击流模型pageviews 信息表
- 1.为什么构建点击流模型
- 2.创建ClickStreanBean点击流类
- 3.如何判断是否属于同一个Session
- 4.具体处理
- 5.处理结果
- 五:点击流模型 visit 信息表
- 1.创建VisitBean
- 2.具体的MR处理类
- 3.处理后的结果如下
一:nginx埋点
需要注意的是要对/var/log/nginx/中的文件改权限,改为可读可写可执行
chmod 777 access.log;
chmod 777 error.log;
二:flume收集信息
1.agent1监听
/var/log/nginx/access.log
# 给当前的agent source channel sink起别名 a1代表当前agent的名字
# source的别名
a1.sources = r1
# channel的别名
a1.channels = c1
# sink的别名
a1.sinks = k1
# 配置source的相关信息 数据源的
a1.sources.r1.type = exec
a1.sources.r1.command = tail -100 /var/log/nginx/access.log
a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.key=filename
a1.sources.r1.interceptors.i1.value = access
# 配置channel的相关信息 内存
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity =100
# 配置sink的信息 采用avro协议
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = hadoop03
a1.sinks.k1.port = 45551
# 绑定source channel sink的对应关系
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
2.agent2监听
/var/log/nginx/error.log
# 给当前的agent source channel sink起别名 a1代表当前agent的名字
# source的别名
a1.sources = r2
# channel的别名
a1.channels = c2
# sink的别名
a1.sinks = k2
# 配置source的相关信息 数据源的
a1.sources.r2.type = exec
a1.sources.r2.command = tail -100 /var/log/nginx/error.log
a1.sources.r2.interceptors = i1
a1.sources.r2.interceptors.i1.type = static
a1.sources.r2.interceptors.i1.key=filename
a1.sources.r2.interceptors.i1.value = error
# 配置channel的相关信息 内存
a1.channels.c2.type = memory
a1.channels.c2.capacity = 1000
a1.channels.c2.transactionCapacity =100
# 配置sink的信息 采用avro协议
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = hadoop03
a1.sinks.k2.port = 45551
# 绑定source channel sink的对应关系
a1.sources.r2.channels = c2
a1.sinks.k2.channel = c2
3.agent3收集
从agent1和agent2传来的event
# 给当前的agent source channel sink起别名 a1代表当前agent的名字
# source的别名
a1.sources = r1
# channel的别名
a1.channels = c1
# sink的别名
a1.sinks = k1
# 配置source的相关信息 数据源的
a1.sources.r1.type = avro
# 这里的主机 和avrosink 一致
a1.sources.r1.bind = hadoop03
a1.sources.r1.port = 45551
# 配置拦截器
a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.type = timestamp
a1.sources.r1.interceptors.i2.type = static
a1.sources.r1.interceptors.i2.key=type
a1.sources.r1.interceptors.i2.value = log
# 配置channel的相关信息 内存
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity =100
# 配置sink的信息 控制台打印
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /user/flume_blog/%{type}/%{filename}/%Y-%m-%d/%H
a1.sinks.k1.hdfs.filePrefix = event-
a1.sinks.k1.hdfs.fileSuffix = .log
a1.sinks.k1.hdfs.rollSize = 1024
a1.sinks.k1.hdfs.rollInterval = 10
a1.sinks.k1.hdfs.rollCount = 0
a1.sinks.k1.hdfs.fileType = DataStream
a1.sinks.k1.hdfs.writeFormat = Text
# 绑定source channel sink的对应关系
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
启动
./flume-ng agent --conf conf --conf-file /home/hadoop/apps/apache-flume-1.8.0-bin/conf/log-conf --name a1 -Dflume.root.logger=INFO,console
./flume-ng agent --conf conf --conf-file /home/hadoop/apps/apache-flume-1.8.0-bin/conf/access-conf --name a1 -Dflume.root.logger=INFO,console
./flume-ng agent --conf conf --conf-file /home/hadoop/apps/apache-flume-1.8.0-bin/conf/error-conf --name a1 -Dflume.root.logger=INFO,console
三:对数据进行预处理
1.主要目的
- 过滤“不合规”数据
- 格式转换和规整
- 根据后续的统计需求,过滤分离出各种不同主题的基础数据
2.具体处理
使用MR对数据进行预处理,未经过处理的数据如下:
194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1" 304 0 "-" "Mozilla/4.0 (compatible;)"
简单分析一下:我们按照空格进行切分
194.237.142.21 //ip
-
-
[18/Sep/2013:06:49:18 +0000] //请求时间
"GET //请求方式
/wp-content/uploads/2013/07/rstudio-git3.png //请求的url
HTTP/1.1" //采用的协议
304 //响应码
0 //返回的数据流量
"-" //外链
"Mozilla/4.0 //浏览器信息
(compatible;)"
使用MR进行切分:
创建一个WebLogBean用来作为切分后返回结果容器
package com.aura.cn.bean;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * Hadoop {@link Writable} holding one parsed nginx access-log record.
 *
 * Fields mirror the nginx log variables:
 *   $remote_addr      client IP
 *   $time_local       request time (normalized to yyyy-MM-dd HH:mm:ss)
 *   $request          requested URL
 *   $status           HTTP response code
 *   $body_bytes_sent  bytes sent to the client
 *   $http_referer     referrer
 *   $http_user_agent  browser/user-agent string
 *
 * {@link #toString()} joins all fields with the '\001' separator, which is
 * what downstream jobs split on. write()/readFields() must keep the same
 * field order so serialization round-trips correctly.
 */
public class WebLogBean implements Writable {

    // Marks whether this record passed validation (field name keeps the
    // original spelling for compatibility with existing callers).
    private boolean is_avalible = true;
    private String remote_addr;
    private String time_local;
    private String request;
    private int status;
    private int body_bytes_sent;
    private String http_referer;
    private String http_user_agent;

    /** No-arg constructor required by Hadoop serialization. */
    public WebLogBean() {
        super();
    }

    /** Full constructor used by the log parser. */
    public WebLogBean(boolean is_avalible, String remote_addr, String time_local, String request, int status,
            int body_bytes_sent, String http_referer, String http_user_agent) {
        this.is_avalible = is_avalible;
        this.remote_addr = remote_addr;
        this.time_local = time_local;
        this.request = request;
        this.status = status;
        this.body_bytes_sent = body_bytes_sent;
        this.http_referer = http_referer;
        this.http_user_agent = http_user_agent;
    }

    /** '\001'-separated record, in the same order as write()/readFields(). */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(is_avalible).append('\001')
          .append(remote_addr).append('\001')
          .append(time_local).append('\001')
          .append(request).append('\001')
          .append(status).append('\001')
          .append(body_bytes_sent).append('\001')
          .append(http_referer).append('\001')
          .append(http_user_agent);
        return sb.toString();
    }

    public boolean isIs_avalible() { return is_avalible; }
    public void setIs_avalible(boolean is_avalible) { this.is_avalible = is_avalible; }
    public String getRemote_addr() { return remote_addr; }
    public void setRemote_addr(String remote_addr) { this.remote_addr = remote_addr; }
    public String getTime_local() { return time_local; }
    public void setTime_local(String time_local) { this.time_local = time_local; }
    public String getRequest() { return request; }
    public void setRequest(String request) { this.request = request; }
    public int getStatus() { return status; }
    public void setStatus(int status) { this.status = status; }
    public int getBody_bytes_sent() { return body_bytes_sent; }
    public void setBody_bytes_sent(int body_bytes_sent) { this.body_bytes_sent = body_bytes_sent; }
    public String getHttp_referer() { return http_referer; }
    public void setHttp_referer(String http_referer) { this.http_referer = http_referer; }
    public String getHttp_user_agent() { return http_user_agent; }
    public void setHttp_user_agent(String http_user_agent) { this.http_user_agent = http_user_agent; }

    /** Deserialization — field order must match {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.is_avalible = in.readBoolean();
        this.remote_addr = in.readUTF();
        this.time_local = in.readUTF();
        this.request = in.readUTF();
        this.status = in.readInt();
        this.body_bytes_sent = in.readInt();
        this.http_referer = in.readUTF();
        this.http_user_agent = in.readUTF();
    }

    /** Serialization — field order must match {@link #readFields(DataInput)}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean(is_avalible);
        out.writeUTF(remote_addr);
        out.writeUTF(time_local);
        out.writeUTF(request);
        out.writeInt(status);
        out.writeInt(body_bytes_sent);
        out.writeUTF(http_referer);
        out.writeUTF(http_user_agent);
    }
}
创建一个LogParse类,用来作为切分,这个方法定义了一个parseLog()是主要的切分逻辑
package com.aura.cn.predata;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Locale;
import com.aura.cn.bean.WebLogBean;

/**
 * Parses one raw nginx access-log line into a {@link WebLogBean}.
 *
 * Input format (space separated), e.g.:
 *   194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] "GET /path HTTP/1.1" 304 0 "-" "Mozilla/4.0 ..."
 *
 * Records are marked invalid (is_avalible = false) rather than dropped when:
 * the status code is an error (&gt; 400), the timestamp cannot be parsed, or
 * the request targets a static resource (css/png/jpg/js).
 */
public class LogParse {

    // Log timestamp format, e.g. 18/Sep/2013:06:49:18 (after stripping the "[").
    static SimpleDateFormat sdf1 = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US);
    // Normalized output format.
    static SimpleDateFormat sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    // NOTE(review): SimpleDateFormat is not thread-safe; this is safe only
    // because each MR task parses on a single thread — confirm if reused.

    /**
     * Wraps a single log line in a WebLogBean.
     *
     * @param line one raw access-log line
     * @return the populated bean, or null when the line has fewer than the
     *         12 expected space-separated fields
     * @throws ParseException retained in the signature for source
     *         compatibility; date-parse failures are now handled internally
     */
    public static WebLogBean parseLog(String line) throws ParseException {
        // Split on single spaces; fields 11+ are the (space-containing) user agent.
        String[] datas = line.split(" ");
        if (datas.length < 12) {
            return null; // malformed / truncated line
        }
        String remote_addr = datas[0];

        // datas[3] looks like "[18/Sep/2013:06:49:18" — drop the leading "[".
        // BUGFIX: previously an unparseable timestamp threw ParseException out
        // of this method, so the "not_avalible" branch below was unreachable.
        // Now the record is kept and simply flagged invalid.
        String time_local;
        try {
            time_local = sdf2.format(sdf1.parse(datas[3].substring(1)));
        } catch (ParseException e) {
            time_local = "not_avalible";
        }
        if (time_local == null || time_local.isEmpty()) {
            time_local = "not_avalible";
        }

        String request = datas[6];
        int status = Integer.parseInt(datas[8].trim());
        int body_bytes_sent = Integer.parseInt(datas[9].trim());
        String http_referer = datas[10];

        // Re-join the user-agent fragments (split apart by the space delimiter).
        StringBuffer user_agent = new StringBuffer();
        for (int i = 11; i < datas.length; i++) {
            user_agent.append(datas[i]);
        }

        WebLogBean logBean = new WebLogBean(
                true, remote_addr, time_local, request,
                status, body_bytes_sent,
                http_referer, user_agent.toString());

        // Invalid: error responses such as 404 / 500.
        // NOTE(review): "> 400" deliberately excludes 400 itself — confirm intent.
        if (logBean.getStatus() > 400) {
            logBean.setIs_avalible(false);
        }
        // Invalid: timestamp missing or unparseable.
        if ("not_avalible".equals(logBean.getTime_local())) {
            logBean.setIs_avalible(false);
        }
        // Invalid: static resources (css / png / jpg / js).
        if (logBean.getRequest().endsWith("css") || logBean.getRequest().endsWith("png")
                || logBean.getRequest().endsWith("jpg")
                || logBean.getRequest().endsWith("js")) {
            logBean.setIs_avalible(false);
        }
        return logBean;
    }
}
具体的MR处理:
package com.aura.cn.predata;
import java.io.IOException;
import java.text.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.aura.cn.bean.WebLogBean;

/**
 * Map-only preprocessing job: parses raw nginx access-log lines with
 * {@link LogParse} and writes the resulting WebLogBeans (via toString,
 * '\001'-separated) to HDFS. Records that fail to parse are skipped.
 */
public class WebLogParse {

    /** Mapper: one raw log line in, one parsed WebLogBean out (no key). */
    static class WebLogParseMapper extends Mapper<LongWritable,
            Text, NullWritable, WebLogBean> {
        @Override
        protected void map(LongWritable key, Text value,
                Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            WebLogBean webBean;
            try {
                webBean = LogParse.parseLog(line);
                // parseLog returns null for malformed lines — drop them.
                if (webBean != null) {
                    context.write(NullWritable.get(), webBean);
                }
            } catch (ParseException e) {
                // Unparseable record: log and skip rather than failing the task.
                e.printStackTrace();
            }
        }
    }

    /**
     * Submits the job.
     *
     * @param args optional: args[0] = input path, args[1] = output path;
     *             defaults preserve the original hard-coded locations.
     */
    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bd1906/");

        // Paths are overridable from the command line (backward compatible).
        String input = args.length > 0 ? args[0] : "/user/data/access.log";
        String output = args.length > 1 ? args[1] : "/dianshang/data/pre_out";

        Job job = Job.getInstance(conf);
        job.setJarByClass(WebLogParse.class);
        job.setMapperClass(WebLogParseMapper.class);

        // Map-only job: no reducer, output written straight from the mappers.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(WebLogBean.class);
        job.setNumReduceTasks(0);

        FileInputFormat.addInputPath(job, new Path(input));

        // Delete a stale output directory so reruns don't fail.
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(output);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        // Propagate job success/failure through the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
经过处理之后的数据如下:
false //是否是有效信息
194.237.142.21 //ip
2013-09-18 06:49:18 //时间
/wp-content/uploads/2013/07/rstudio-git3.png //请求url
304 //状态码
0 //流量字节
"-" //外链
"Mozilla/4.0(compatible;)" //浏览器信息
四:点击流模型pageviews 信息表
1.为什么构建点击流模型
因为大量的统计指标从点击流中更容易得出,所以可以在预处理之后直接构建点击流数据:将清洗之后的日志梳理为点击流 pageviews 模型数据。输入数据是清洗过后的结果数据;区分出每一次会话,给每一次 visit(session)增加 session-id(随机 uuid);梳理出每一次会话中所访问的每个页面(请求时间、url、停留时长,以及该页面在这次 session 中的序号);保留 referral_url、body_bytes_sent、useragent。
2.创建ClickStreanBean点击流类
package com.aura.cn.bean;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * Hadoop {@link Writable} for one pageview in the clickstream model:
 * a parsed log record enriched with a session id, the page's ordinal
 * position (step) within its session, and the stay time on the page.
 *
 * {@link #toString()} joins fields with '\001'; write()/readFields()
 * keep the identical field order so serialization round-trips.
 * (Class name keeps the original "Strean" spelling for compatibility.)
 */
public class ClickStreanBean implements Writable {

    private String sessionId;       // session (visit) id, a random UUID
    private String remote_addr;     // client IP
    private String time_local;      // request time
    private String request;         // requested URL
    private int status;             // HTTP status code
    private int body_bytes_sent;    // bytes sent
    private int step;               // ordinal of this page within its session
    private int staytime;           // seconds spent on this page
    private String http_referer;    // referrer
    private String http_user_agent; // browser/user-agent string

    /** '\001'-separated record, same order as write()/readFields(). */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(sessionId).append('\001')
          .append(remote_addr).append('\001')
          .append(time_local).append('\001')
          .append(request).append('\001')
          .append(status).append('\001')
          .append(body_bytes_sent).append('\001')
          .append(step).append('\001')
          .append(staytime).append('\001')
          .append(http_referer).append('\001')
          .append(http_user_agent);
        return sb.toString();
    }

    /** Full constructor used by the clickstream MR jobs. */
    public ClickStreanBean(String sessionId, String remote_addr, String time_local, String request, int status,
            int body_bytes_sent, int step, int staytime, String http_referer, String http_user_agent) {
        super();
        this.sessionId = sessionId;
        this.remote_addr = remote_addr;
        this.time_local = time_local;
        this.request = request;
        this.status = status;
        this.body_bytes_sent = body_bytes_sent;
        this.step = step;
        this.staytime = staytime;
        this.http_referer = http_referer;
        this.http_user_agent = http_user_agent;
    }

    /** No-arg constructor required by Hadoop serialization. */
    public ClickStreanBean() {
        super();
    }

    public String getSessionId() { return sessionId; }
    public void setSessionId(String sessionId) { this.sessionId = sessionId; }
    public String getRemote_addr() { return remote_addr; }
    public void setRemote_addr(String remote_addr) { this.remote_addr = remote_addr; }
    public String getTime_local() { return time_local; }
    public void setTime_local(String time_local) { this.time_local = time_local; }
    public String getRequest() { return request; }
    public void setRequest(String request) { this.request = request; }
    public int getStatus() { return status; }
    public void setStatus(int status) { this.status = status; }
    public int getBody_bytes_sent() { return body_bytes_sent; }
    public void setBody_bytes_sent(int body_bytes_sent) { this.body_bytes_sent = body_bytes_sent; }
    public int getStep() { return step; }
    public void setStep(int step) { this.step = step; }
    public int getStaytime() { return staytime; }
    public void setStaytime(int staytime) { this.staytime = staytime; }
    public String getHttp_referer() { return http_referer; }
    public void setHttp_referer(String http_referer) { this.http_referer = http_referer; }
    public String getHttp_user_agent() { return http_user_agent; }
    public void setHttp_user_agent(String http_user_agent) { this.http_user_agent = http_user_agent; }

    /** Deserialization — field order must match {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.sessionId = in.readUTF();
        this.remote_addr = in.readUTF();
        this.time_local = in.readUTF();
        this.request = in.readUTF();
        this.status = in.readInt();
        this.body_bytes_sent = in.readInt();
        this.step = in.readInt();
        this.staytime = in.readInt();
        this.http_referer = in.readUTF();
        this.http_user_agent = in.readUTF();
    }

    /** Serialization — field order must match {@link #readFields(DataInput)}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(sessionId);
        out.writeUTF(remote_addr);
        out.writeUTF(time_local);
        out.writeUTF(request);
        out.writeInt(status);
        out.writeInt(body_bytes_sent);
        out.writeInt(step);
        out.writeInt(staytime);
        out.writeUTF(http_referer);
        out.writeUTF(http_user_agent);
    }
}
3.如何判断是否属于同一个Session
具体的点击流数据处理(MR):关于如何判断是否属于同一个 session,因为 IP 作为 map 阶段的 key,所以同一个 IP 的数据会被分到一起,通过 context 写到 reduce 阶段处理;在 reduce 里可以自定义条件判断是否属于同一个 session。这里的判断条件是:如果相邻两条数据之间的时间差小于 30 分钟,就属于同一个 session。例如有 A、B、C、D 四条数据,A 和 B、B 和 C 之间的时间差小于 30 分钟,C 和 D 之间的时间差大于 30 分钟,则 A、B、C 属于同一个 session,D 不属于。
4.具体处理
package com.aura.cn.click;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.UUID;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.aura.cn.bean.ClickStreanBean;
import com.aura.cn.bean.WebLogBean;
import com.aura.cn.predata.WebLogParse;

/**
 * Builds the pageviews clickstream model from the preprocessed log output.
 *
 * Map: keys each valid record by client IP so one reducer call sees all of
 * an IP's pageviews. Reduce: sorts those pageviews by time, then splits them
 * into sessions — two consecutive pageviews belong to the same session when
 * they are no more than 30 minutes apart. Each emitted ClickStreanBean gets
 * a session id (random UUID), a step (ordinal within the session) and a stay
 * time (gap to the next pageview; 60s default for a session's last page).
 */
public class ClickStreamPre {

    /** Mapper: parse one '\001'-separated preprocessed record, key by IP. */
    static class ClickStreamPreMapper extends Mapper<LongWritable,
            Text, Text, WebLogBean> {
        // Reused output key/value; safe because context.write serializes them.
        Text mk = new Text();
        WebLogBean mv = new WebLogBean();
        // Read, split, and wrap into a WebLogBean.
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, WebLogBean>.Context context)
                throws IOException, InterruptedException {
            // Input line, e.g.: false\001194.237.142.21\0012013-09-18 06:49:18\001/path\001304\0010\001"-"\001"Mozilla/4.0(compatible;)"
            String[] datas = value.toString().split("\001");
            // Field 0 is the validity flag written by the preprocessing job.
            boolean is_avalible = "false".equals(datas[0]) ? false : true;
            // Only valid records participate in the clickstream model.
            if (is_avalible) {
                mk.set(datas[1]); // key = client IP
                mv.setIs_avalible(is_avalible);
                mv.setRemote_addr(datas[1]);
                mv.setTime_local(datas[2]);
                mv.setRequest(datas[3]);
                mv.setStatus(Integer.parseInt(datas[4]));
                mv.setBody_bytes_sent(Integer.parseInt(datas[5]));
                mv.setHttp_referer(datas[6]);
                mv.setHttp_user_agent(datas[7]);
                context.write(mk, mv);
            }
        }
    }

    /** Reducer: per-IP session splitting and ClickStreanBean emission. */
    static class ClickStreamPreReducer extends Reducer<
            Text, WebLogBean, ClickStreanBean, NullWritable> {
        @Override
        /*
         * Notes on the values iterable:
         * 1) it can only be traversed once;
         * 2) Hadoop reuses a single object for every element, so each value
         *    must be deep-copied before being stored in a collection.
         */
        protected void reduce(Text key, Iterable<WebLogBean> values,
                Context context)
                throws IOException, InterruptedException {
            // Copy every WebLogBean into a fresh object and collect them.
            ArrayList<WebLogBean> list = new ArrayList<WebLogBean>();
            for (WebLogBean v : values) {
                // New object per element (Hadoop reuses 'v' between iterations).
                WebLogBean bean = new WebLogBean();
                // Copy v's properties into bean.
                try {
                    BeanUtils.copyProperties(bean, v);
                    list.add(bean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
            // Sort this IP's pageviews chronologically.
            Collections.sort(list, new Comparator<WebLogBean>() {
                @Override
                public int compare(WebLogBean o1, WebLogBean o2) {
                    // Convert the String timestamps to Dates for comparison.
                    String time1 = o1.getTime_local();
                    String time2 = o2.getTime_local();
                    Date date1 = null;
                    Date date2 = null;
                    try {
                        date1 = toDate(time1);
                        date2 = toDate(time2);
                    } catch (ParseException e) {
                        // NOTE(review): if a timestamp fails to parse here,
                        // date1/date2 stay null and the next line NPEs —
                        // upstream filtering is assumed to prevent this.
                        e.printStackTrace();
                    }
                    return date1.compareTo(date2);
                }
            });
            // Assign session id, step number and stay time per pageview.
            int step = 1;
            UUID sessionId = UUID.randomUUID();
            /*
             * Walk the sorted list. For each pageview we emit its PREDECESSOR
             * once we know the gap to the current one:
             *   sessionid — current session's UUID
             *   step      — ordinal within the session
             *   staytime  — next timestamp minus this one (60s when unknown)
             */
            for (int i = 0; i < list.size(); i++) {
                // The pageview currently being examined.
                WebLogBean web_bean = list.get(i);
                // Special case: the IP has exactly one pageview.
                if (list.size() == 1) {
                    // Emit it as a one-page session with the default stay time.
                    ClickStreanBean cbean = new ClickStreanBean(sessionId.toString(),
                            web_bean.getRemote_addr(),
                            web_bean.getTime_local(),
                            web_bean.getRequest(),
                            web_bean.getStatus(),
                            web_bean.getBody_bytes_sent(),
                            step,
                            60,
                            web_bean.getHttp_referer(),
                            web_bean.getHttp_user_agent());
                    context.write(cbean, NullWritable.get());
                } else {
                    // Multiple pageviews: each element i emits element i-1,
                    // because only then is the gap (= stay time) known.
                    if (i == 0) {
                        // Nothing before the first pageview — emitted at i == 1.
                        continue;
                    } else {
                        // Current pageview and its predecessor.
                        WebLogBean last_bean = list.get(i - 1);
                        // Time gap = current timestamp - previous timestamp.
                        String current_time = web_bean.getTime_local();
                        String last_time = last_bean.getTime_local();
                        try {
                            long diff_time = diffDate(last_time, current_time);
                            // Same session when the gap is at most 30 minutes.
                            if (diff_time <= 30 * 60 * 1000) { // same session
                                // Emit the predecessor with the measured stay time.
                                ClickStreanBean last_click_bean = new ClickStreanBean(
                                        sessionId.toString(),
                                        last_bean.getRemote_addr(),
                                        last_bean.getTime_local(),
                                        last_bean.getRequest(),
                                        last_bean.getStatus(),
                                        last_bean.getBody_bytes_sent(),
                                        step, (int) diff_time / 1000,
                                        last_bean.getHttp_referer(),
                                        last_bean.getHttp_user_agent());
                                step++;
                                context.write(last_click_bean, NullWritable.get());
                            } else { // a new session starts here
                                // Gap > 30 min: the predecessor closes the old
                                // session; emit it with the 60s default stay
                                // time, then start a fresh session id / step.
                                ClickStreanBean end_click_bean = new ClickStreanBean(
                                        sessionId.toString(),
                                        last_bean.getRemote_addr(),
                                        last_bean.getTime_local(),
                                        last_bean.getRequest(),
                                        last_bean.getStatus(),
                                        last_bean.getBody_bytes_sent(),
                                        step, 60,
                                        last_bean.getHttp_referer(),
                                        last_bean.getHttp_user_agent());
                                context.write(end_click_bean, NullWritable.get());
                                sessionId = UUID.randomUUID();
                                step = 1;
                            }
                        } catch (ParseException e) {
                            // NOTE(review): a parse failure silently skips this
                            // pair; the predecessor is then never emitted.
                            e.printStackTrace();
                        }
                    }
                    // The final pageview has no successor, so emit it here
                    // with the default 60s stay time.
                    if (i == (list.size() - 1)) {
                        ClickStreanBean end_bean = new ClickStreanBean(sessionId.toString(),
                                web_bean.getRemote_addr(),
                                web_bean.getTime_local(),
                                web_bean.getRequest(),
                                web_bean.getStatus(),
                                web_bean.getBody_bytes_sent(),
                                step,
                                60,
                                web_bean.getHttp_referer(),
                                web_bean.getHttp_user_agent());
                        context.write(end_bean, NullWritable.get());
                    }
                }
            }
        }

        /** Parses a "yyyy-MM-dd HH:mm:ss" timestamp string into a Date. */
        public Date toDate(String time) throws ParseException {
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            return sdf.parse(time);
        }

        /** Milliseconds between two timestamps: current - last. */
        public long diffDate(String last, String current) throws ParseException {
            Date date1 = toDate(last);
            Date date2 = toDate(current);
            return date2.getTime() - date1.getTime();
        }
    }

    /** Job driver: reads the preprocessed output, writes the pageviews model. */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bd1906/");
        // Configure and launch the job.
        Job job = Job.getInstance(conf);
        job.setJarByClass(ClickStreamPre.class);
        job.setMapperClass(ClickStreamPreMapper.class);
        job.setReducerClass(ClickStreamPreReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(WebLogBean.class);
        // Final output types.
        job.setOutputKeyClass(ClickStreanBean.class);
        job.setOutputValueClass(NullWritable.class);
        // Input: output directory of the preprocessing job.
        FileInputFormat.addInputPath(job, new Path("/dianshang/data/pre_out"));
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path("/dianshang/data/pre_click");
        // Remove a stale output directory so reruns don't fail.
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        // Submit and wait.
        job.waitForCompletion(true);
    }
}
5.处理结果
处理完的部分数据如下:
19a806cd-5377-4aea-bb9b-6283de767fd7
1.202.186.37
2013-09-18 15:39:18
/nodejs-async-windjs/
200
10139
1
0
"http://cnodejs.org/topic/521a30d4bee8d3cb1272ac0f"
"Mozilla/5.0(Macintosh;IntelMacOSX10_8_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/29.0.1547.65Safari/537.36"
可以看到每一条数据都被打上了sessionID,相同的session在一起,图上列出来的被分为了四部分,分别属于四个会话。
五:点击流模型 visit 信息表
我们可以在数据预处理阶段统计出每一个会话的信息,例如会话时间、会话最开始点击的页面和结束时点击的页面、点击了多少个页面等信息,因为这些信息在 HIVE 中统计是有一定难度的。可以加上这些字段: sessionid start-time out-time start-page out-page pagecounts 。
1.创建VisitBean
package com.aura.cn.bean;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * Hadoop {@link Writable} summarizing one visit (session):
 * sessionid, start-time, start-page, out-time, out-page, pagecount,
 * client address and user agent.
 *
 * {@link #toString()} joins fields with '\001'. write()/readFields()
 * use the same field order so serialization round-trips correctly.
 */
public class VisitBean implements Writable {

    private String sessioned;       // session id (field name kept as-is for compatibility)
    private String starttime;       // timestamp of the first pageview
    private String startpage;       // first page requested in the session
    private String outtime;         // timestamp of the last pageview
    private String outpage;         // last page requested in the session
    private int pagecount;          // number of pages viewed in the session
    private String addr;            // client IP
    private String http_user_agent; // browser/user-agent string

    /** Full constructor used by the visit MR job. */
    public VisitBean(String sessioned, String starttime, String startpage, String outtime, String outpage, int pagecount, String addr, String http_user_agent) {
        this.sessioned = sessioned;
        this.starttime = starttime;
        this.startpage = startpage;
        this.outtime = outtime;
        this.outpage = outpage;
        this.pagecount = pagecount;
        this.addr = addr;
        this.http_user_agent = http_user_agent;
    }

    /** No-arg constructor required by Hadoop serialization. */
    public VisitBean() {
    }

    public String getSessioned() {
        return sessioned;
    }
    public void setSessioned(String sessioned) {
        this.sessioned = sessioned;
    }
    public String getStarttime() {
        return starttime;
    }
    public void setStarttime(String starttime) {
        this.starttime = starttime;
    }
    /**
     * BUGFIX: previously returned startpage. These misspelled accessors
     * ("Outtie") are kept for source compatibility but now correctly
     * delegate to the outtime field.
     */
    public String getOuttie() {
        return outtime;
    }
    /** BUGFIX: previously a self-assignment no-op that ignored the argument. */
    public void setOuttie(String outtie) {
        this.outtime = outtie;
    }
    public String getOuttime() {
        return outtime;
    }
    public void setOuttime(String outtime) {
        this.outtime = outtime;
    }
    public String getOutpage() {
        return outpage;
    }
    public void setOutpage(String outpage) {
        this.outpage = outpage;
    }
    public int getPagecount() {
        return pagecount;
    }
    public void setPagecount(int pagecount) {
        this.pagecount = pagecount;
    }
    public String getStartpage() {
        return startpage;
    }
    public void setStartpage(String startpage) {
        this.startpage = startpage;
    }
    public String getAddr() {
        return addr;
    }
    public void setAddr(String addr) {
        this.addr = addr;
    }
    public String getHttp_user_agent() {
        return http_user_agent;
    }
    public void setHttp_user_agent(String http_user_agent) {
        this.http_user_agent = http_user_agent;
    }

    /**
     * Serialization.
     * BUGFIX: the original wrote outpage twice and never wrote outtime,
     * and its order disagreed with readFields(), corrupting any round-trip.
     * Both methods now use the order: sessioned, starttime, startpage,
     * outtime, outpage, pagecount, addr, http_user_agent.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(sessioned);
        out.writeUTF(starttime);
        out.writeUTF(startpage);
        out.writeUTF(outtime);
        out.writeUTF(outpage);
        out.writeInt(pagecount);
        out.writeUTF(addr);
        out.writeUTF(http_user_agent);
    }

    /** Deserialization — field order must match {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.sessioned = in.readUTF();
        this.starttime = in.readUTF();
        this.startpage = in.readUTF();
        this.outtime = in.readUTF();
        this.outpage = in.readUTF();
        this.pagecount = in.readInt();
        this.addr = in.readUTF();
        this.http_user_agent = in.readUTF();
    }

    /** '\001'-separated record, matching the serialization field order. */
    @Override
    public String toString() {
        return sessioned + "\001" + starttime + "\001" + startpage
                + "\001" + outtime + "\001" + outpage + "\001" + pagecount + "\001" + addr
                + "\001" + http_user_agent;
    }
}
2.具体的MR处理类
package com.aura.cn.click;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.aura.cn.bean.ClickStreanBean;
import com.aura.cn.bean.VisitBean;

/**
 * Builds the visit model from the pageviews clickstream output:
 * groups pageviews by session id and emits one VisitBean per session
 * (start/out time and page, page count, client address, user agent).
 */
public class Click_Visit {

    /** Job driver: reads the pageviews model, writes the visit model. */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://bd1906/");
        // Configure and launch the job.
        Job job = Job.getInstance(conf);
        job.setJarByClass(Click_Visit.class);
        job.setMapperClass(Click_VisitMapper.class);
        job.setReducerClass(Click_VisitReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ClickStreanBean.class);
        // Final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input: output directory of the clickstream (pageviews) job.
        FileInputFormat.addInputPath(job, new Path("/dianshang/data/pre_click"));
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path("/dianshang/data/visit");
        // Remove a stale output directory so reruns don't fail.
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        // Submit and wait.
        job.waitForCompletion(true);
    }

    /** Mapper: parse one '\001'-separated pageview, key by session id. */
    static class Click_VisitMapper extends Mapper<LongWritable, Text, Text, ClickStreanBean> {
        // Reused output key; safe because context.write serializes it.
        Text mk = new Text();
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, ClickStreanBean>.Context context)
                throws IOException, InterruptedException {
            // Field order matches ClickStreanBean.toString():
            // sessionId, ip, time, request, status, bytes, step, staytime, referer, agent
            String[] datas = value.toString().split("\001");
            mk.set(datas[0]); // key = session id
            ClickStreanBean mv = new ClickStreanBean(datas[0],
                    datas[1],
                    datas[2],
                    datas[3],
                    Integer.parseInt(datas[4]),
                    Integer.parseInt(datas[5]),
                    Integer.parseInt(datas[6]),
                    Integer.parseInt(datas[7]),
                    datas[8],
                    datas[9]);
            context.write(mk, mv);
        }
    }

    /** Reducer: collapse one session's pageviews into a single VisitBean. */
    static class Click_VisitReducer extends Reducer<Text, ClickStreanBean, Text, VisitBean> {
        // NOTE(review): rk is never assigned, so the output key is always an
        // empty Text — each output line starts with the key/value separator.
        // The commented-out code in the original suggested rk was meant to
        // carry the session summary; confirm downstream expectations before
        // changing the output format.
        Text rk = new Text();
        @Override
        protected void reduce(Text key, Iterable<ClickStreanBean> valus,
                Reducer<Text, ClickStreanBean, Text, VisitBean>.Context context)
                throws IOException, InterruptedException {
            // Deep-copy every value (Hadoop reuses one object per iteration).
            List<ClickStreanBean> list = new ArrayList<ClickStreanBean>();
            for (ClickStreanBean v : valus) {
                ClickStreanBean bean = new ClickStreanBean();
                try {
                    BeanUtils.copyProperties(bean, v);
                    list.add(bean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
            // Order the session's pageviews by their step number.
            Collections.sort(list, new Comparator<ClickStreanBean>() {
                @Override
                public int compare(ClickStreanBean o1, ClickStreanBean o2) {
                    return o1.getStep() - o2.getStep();
                }
            });
            // Session summary: first element gives the entry info, last
            // element gives the exit info and total page count.
            VisitBean visitbean = new VisitBean();
            visitbean.setSessioned(list.get(0).getSessionId());
            visitbean.setStarttime(list.get(0).getTime_local());
            visitbean.setStartpage(list.get(0).getRequest());
            visitbean.setOuttime(list.get(list.size() - 1).getTime_local());
            visitbean.setOutpage(list.get(list.size() - 1).getRequest());
            visitbean.setPagecount(list.get(list.size() - 1).getStep());
            visitbean.setAddr(list.get(0).getRemote_addr());
            visitbean.setHttp_user_agent(list.get(0).getHttp_user_agent());
            context.write(rk, visitbean);
        }
    }
}
3.处理后的结果如下
005992c8-d1aa-4f54-8064-d12f4bd64fdc
2013-09-18 07:09:02
/mongodb-shard/
2013-09-18 07:09:02
/mongodb-shard/
1
180.153.236.195
“Mozilla/5.0(Windows;U;WindowsNT5.1;zh-CN;rv:1.8.0.11)Firefox/1.5.0.11;360Spider”