Requirement:
We have a file in which each record contains several fields (timestamp, phone number, ..., upstream traffic, downstream traffic, etc.), separated by "\t". The data format is shown below. The task is to compute, for every phone number, the totals of its upstream and downstream traffic.
Input:
timestamp phone number ... upstream traffic downstream traffic
Output:
phone number total upstream traffic total downstream traffic total traffic
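For example (the values here are purely illustrative, not taken from the original dataset; the real log has more columns in between), one input record and its aggregated output record might look like:
1363157985066	13726230503	...	2481	24681
13726230503	2481	24681	27162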
Approach:
The framework hands the Mapper one line of the file at a time. We first split the line into a string array, extract the fields we need, and then emit a kv pair <phone number, FlowBean>.
Here we need to encapsulate a bean that represents the upstream and downstream traffic belonging to a phone number.
Since FlowBean is a class we define ourselves, shipping it between different Hadoop nodes requires it to conform to Hadoop's serialization rules. FlowBean therefore implements the Writable interface, so bean objects can be serialized and transferred between nodes of the cluster.
Hadoop implements its own serialization mechanism. Unlike the serialization built into the JDK, Hadoop's mechanism does not persist the object's class and inheritance metadata, which keeps the serialized form compact and improves transfer efficiency.
During deserialization, the framework first uses reflection to invoke the bean's no-arg constructor to create an instance, and then fills in the instance's fields (a round-trip sketch follows the FlowBean class below).
The Reducer receives groups of the form <k, {flowbean, flowbean, flowbean, ...}> from the framework (one key per phone number). It needs to iterate over the values, accumulate the sums, and emit the result.
Also, when the Reducer hands its output to the framework and the framework writes the result file, it calls the bean's toString() method. The default toString(), which returns the class name plus hash code, is not suitable here, so we override toString() on the bean to produce the output format we want.
The code is as follows:
Define four classes in turn: FlowBean, FlowSumMapper, FlowSumReducer, and FlowSumRunner,
then implement each according to the logic described above.
FlowBean
package com.gege.hadoop.mr.flowsum;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {

    private String phoneNB;
    private long up_flow;
    private long d_flow;
    private long s_flow;

    // Deserialization instantiates the bean via reflection, which requires a
    // no-arg constructor, so we define one explicitly.
    public FlowBean() {}

    // Convenience constructor so the object's data can be initialized in one go.
    public FlowBean(String phoneNB, long up_flow, long d_flow) {
        this.phoneNB = phoneNB;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.s_flow = up_flow + d_flow;
    }

    public String getPhoneNB() {
        return phoneNB;
    }

    public void setPhoneNB(String phoneNB) {
        this.phoneNB = phoneNB;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }

    public long getD_flow() {
        return d_flow;
    }

    public void setD_flow(long d_flow) {
        this.d_flow = d_flow;
    }

    public long getS_flow() {
        return s_flow;
    }

    public void setS_flow(long s_flow) {
        this.s_flow = s_flow;
    }

    // Serialize the object's data into the stream.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phoneNB);
        out.writeLong(up_flow);
        out.writeLong(d_flow);
        out.writeLong(s_flow);
    }

    // Deserialize the object's data from the stream.
    // The fields must be read in exactly the order they were written.
    @Override
    public void readFields(DataInput in) throws IOException {
        phoneNB = in.readUTF();
        up_flow = in.readLong();
        d_flow = in.readLong();
        s_flow = in.readLong();
    }

    // The framework calls toString() when writing the result file, so we
    // return the tab-separated output format we want.
    @Override
    public String toString() {
        return "" + up_flow + "\t" + d_flow + "\t" + s_flow;
    }

    // Order beans by total flow, descending; return 0 on ties so the
    // compareTo symmetry contract holds.
    @Override
    public int compareTo(FlowBean o) {
        return -Long.compare(s_flow, o.getS_flow());
    }
}
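To see the serialization mechanism described above end to end, here is a minimal local round-trip sketch (not part of the MapReduce job; the class name and the sample values are only for this demo). It writes a FlowBean to a byte stream and reads it back the same way the framework does: no-arg constructor first, then readFields().

package com.gege.hadoop.mr.flowsum;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

// Local sanity check for FlowBean's Writable implementation.
public class FlowBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        FlowBean original = new FlowBean("13726230503", 100, 200);

        // Serialize: the framework calls write(DataOutput) on the bean.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize: the framework creates an instance via the no-arg
        // constructor, then calls readFields(DataInput) to fill the fields.
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy); // prints "100	200	300" via our toString()
    }
}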
FlowSumMapper
package com.gege.hadoop.mr.flowsum;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * FlowBean is a data type we define ourselves. To travel between Hadoop
 * nodes it must follow Hadoop's serialization mechanism, i.e. implement
 * the corresponding serialization interface.
 */
public class FlowSumMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    // Take one line of the log, split it into fields, extract the ones we
    // need (phone number, upstream traffic, downstream traffic), then wrap
    // them in a kv pair and emit it.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Grab one line of input.
        String line = value.toString();
        // Split it into fields.
        String[] fields = StringUtils.split(line, "\t");
        // Extract the fields we need.
        String phoneNB = fields[1];
        long u_flow = Long.parseLong(fields[7]);
        long d_flow = Long.parseLong(fields[8]);
        // Wrap the data in a kv pair and emit it.
        context.write(new Text(phoneNB), new FlowBean(phoneNB, u_flow, d_flow));
    }
}
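A note on the design: allocating a new Text and FlowBean on every map() call is the simplest way to write this, but it creates two objects per input line. A common optimization, not required for correctness, is to keep a single reusable key object and value object as fields of the Mapper and reset their contents in each call before writing them out.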
FlowSumReducer
package com.gege.hadoop.mr.flowsum;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowSumReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    // The framework calls our reduce method once for each group of the form
    // <1387788654, {flowbean, flowbean, flowbean, flowbean, ...}>.
    // The business logic in reduce is to iterate over the values, accumulate
    // the sums, and emit the result.
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        long up_flow_counter = 0;
        long d_flow_counter = 0;
        for (FlowBean bean : values) {
            up_flow_counter += bean.getUp_flow();
            d_flow_counter += bean.getD_flow();
        }
        context.write(key, new FlowBean(key.toString(), up_flow_counter, d_flow_counter));
    }
}
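Because the aggregation here is a plain sum, which is associative and commutative, and the reducer's input and output types are both <Text, FlowBean> (matching the map output types), this same class can optionally be plugged in as a combiner to pre-aggregate map output locally and cut shuffle traffic. If you want that, add one line to the job setup in the runner; the final output is identical:

job.setCombinerClass(FlowSumReducer.class);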
FlowSumRunner
package com.gege.hadoop.mr.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// The standard idiom for describing and submitting a job.
public class FlowSumRunner extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // Use the configuration injected by ToolRunner so that generic
        // command-line options such as -D key=value take effect.
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowSumRunner.class);

        job.setMapperClass(FlowSumMapper.class);
        job.setReducerClass(FlowSumReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FlowSumRunner(), args);
        System.exit(res);
    }
}
Execution:
To be uploaded.
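In the meantime, a typical way to run the job looks like the following (a sketch only; the jar name and HDFS paths are placeholders to replace with your own). Package the four classes into a jar, then submit it and inspect the reducer output:

hadoop jar flowsum.jar com.gege.hadoop.mr.flowsum.FlowSumRunner /flow/input /flow/output
hadoop fs -cat /flow/output/part-r-00000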