Preface
These exercises run in Hadoop local (standalone) mode.
What you need to do:
1. Copy and paste the code, then tweak it to fit your environment.
2. Run it yourself and check whether your results match.
3. Read through the code once more and look up anything you don't understand.
Preparing the data
Link: https://pan.baidu.com/s/1wFwJMLTDnLCYkgiwemHQ0g
Extraction code: ih8l
Size: about 52 KB
Download data_flow3.1.dat. Each record is one tab-separated line; the phone number is the second field (index 1) and the four traffic values sit at indices 6 through 9, which is exactly how the mapper below reads them.
Requirement 1: compute the sums
For each phone number, compute the sum of its upstream flow, downstream flow, total upstream flow, and total downstream flow.
Not sure which four fields these are? They correspond to the bean fields upFlow, downFlow, upCountFlow, and downCountFlow, i.e. the numeric columns at indices 6 through 9 of each record.
Analysis: use the phone number as the key and the four fields (upstream flow, downstream flow, total upstream flow, total downstream flow) as the value; this (key, value) pair is the map output and the reduce input.
The code is defined as follows.
Step 1: define FlowBean, the custom map output value
package com.czxy.demo04;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Custom map output value. Implementing Writable lets Hadoop serialize the
// bean when it is shuffled from the map phase to the reduce phase.
public class FlowBean implements Writable {
    private Integer upFlow;
    private Integer downFlow;
    private Integer upCountFlow;
    private Integer downCountFlow;

    // Serialization: write the four fields in a fixed order.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(upFlow);
        out.writeInt(downFlow);
        out.writeInt(upCountFlow);
        out.writeInt(downCountFlow);
    }

    // Deserialization: read the fields back in exactly the order they were written.
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readInt();
        this.downFlow = in.readInt();
        this.upCountFlow = in.readInt();
        this.downCountFlow = in.readInt();
    }

    // Hadoop instantiates Writables reflectively, so a no-arg constructor is required.
    public FlowBean() {
    }

    public FlowBean(Integer upFlow, Integer downFlow, Integer upCountFlow, Integer downCountFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.upCountFlow = upCountFlow;
        this.downCountFlow = downCountFlow;
    }

    public Integer getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Integer upFlow) {
        this.upFlow = upFlow;
    }

    public Integer getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Integer downFlow) {
        this.downFlow = downFlow;
    }

    public Integer getUpCountFlow() {
        return upCountFlow;
    }

    public void setUpCountFlow(Integer upCountFlow) {
        this.upCountFlow = upCountFlow;
    }

    public Integer getDownCountFlow() {
        return downCountFlow;
    }

    public void setDownCountFlow(Integer downCountFlow) {
        this.downCountFlow = downCountFlow;
    }

    // toString determines how the bean is rendered by TextOutputFormat.
    @Override
    public String toString() {
        return "FlowBean{" +
                "upFlow=" + upFlow +
                ", downFlow=" + downFlow +
                ", upCountFlow=" + upCountFlow +
                ", downCountFlow=" + downCountFlow +
                '}';
    }
}
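A quick way to sanity-check a Writable is to round-trip it through a byte stream: readFields must consume the fields in exactly the order write produced them. A minimal sketch (FlowBeanRoundTrip is a throwaway test class of mine, not part of the exercise):

package com.czxy.demo04;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean(10, 20, 300, 400);

        // write() emits the four ints in declaration order
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // readFields() must read them back in the same order
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);   // FlowBean{upFlow=10, downFlow=20, upCountFlow=300, downCountFlow=400}
    }
}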
Step 2: define the FlowMapper class
package com.czxy.demo04;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    // One bean reused for every record: context.write serializes it immediately,
    // so this is safe and avoids allocating per line.
    FlowBean flowBean = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String s = value.toString();
        String[] split = s.split("\t");
        // Indices 6-9 hold the four traffic values; index 1 is the phone number.
        flowBean.setUpFlow(Integer.parseInt(split[6]));
        flowBean.setDownFlow(Integer.parseInt(split[7]));
        flowBean.setUpCountFlow(Integer.parseInt(split[8]));
        flowBean.setDownCountFlow(Integer.parseInt(split[9]));
        context.write(new Text(split[1]), flowBean);
    }
}
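Side note: because context.write serializes its arguments on the spot, the key object could be reused across calls just like the bean. A hedged micro-optimization, where phoneKey is my name for a Text field declared on the mapper next to flowBean:

// Text phoneKey = new Text();   // declared once as a mapper field
phoneKey.set(split[1]);
context.write(phoneKey, flowBean);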
Step 3: define the FlowReducer class
package com.czxy.demo04;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FlowReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    private FlowBean flowBean = new FlowBean();

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        // Plain ints avoid repeated autoboxing inside the loop.
        int upFlow = 0;
        int downFlow = 0;
        int upCountFlow = 0;
        int downCountFlow = 0;
        // Sum the four fields over every record that shares this phone number.
        for (FlowBean value : values) {
            upFlow += value.getUpFlow();
            downFlow += value.getDownFlow();
            upCountFlow += value.getUpCountFlow();
            downCountFlow += value.getDownCountFlow();
        }
        flowBean.setUpFlow(upFlow);
        flowBean.setDownFlow(downFlow);
        flowBean.setUpCountFlow(upCountFlow);
        flowBean.setDownCountFlow(downCountFlow);
        context.write(key, flowBean);
    }
}
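Because this reduce is a pure per-key sum, the same class can also act as a combiner, pre-aggregating on the map side to shrink shuffle traffic. This is an optional extra of mine, not part of the original exercise; it works here because the combiner's input and output types (Text, FlowBean) match the map output types:

job.setCombinerClass(FlowReducer.class);   // optional, goes in the driver below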
Step 4: the job driver and main entry point, FlowBeanMain
package com.czxy.demo04;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class FlowBeanMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), FlowBeanMain.class.getSimpleName());
        job.setJarByClass(FlowBeanMain.class);

        // Input: read the raw data file line by line.
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///E:\\cache\\mapReduceTestCache\\data_flow3.1.dat"));

        // Map phase: phone number -> FlowBean.
        job.setMapperClass(FlowMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // Reduce phase: sum the four fields per phone number.
        job.setReducerClass(FlowReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Output: the directory must not exist yet, or the job fails to start.
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///E:\\cache\\mapReduceResultCache\\flowStatistics01"));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Propagate the exit code so failures are visible to the caller.
        System.exit(ToolRunner.run(new Configuration(), new FlowBeanMain(), args));
    }
}
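Local-mode tip: TextOutputFormat refuses to start if the output directory already exists, so reruns fail until you delete it by hand. A minimal convenience sketch you could drop into run() before waitForCompletion (needs an extra import of org.apache.hadoop.fs.FileSystem):

// delete a leftover output directory so the job can be rerun
Path outPath = new Path("file:///E:\\cache\\mapReduceResultCache\\flowStatistics01");
FileSystem fs = FileSystem.get(outPath.toUri(), super.getConf());
if (fs.exists(outPath)) {
    fs.delete(outPath, true);   // true = recursive
}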
Run result:
Recall the requirement: for each phone number, sum the upstream flow, downstream flow, total upstream flow, and total downstream flow.
The result checks out; run the code yourself and compare your output.
Requirement 2: sort by total upstream flow (upCountFlow) in descending order
Analysis: define a custom FlowBean and make it the map output key, with the phone number as the map output value, because MapReduce sorts the map output by key. (Although the idea is to order the data from requirement 1, the driver below reads the raw data file again, so every raw record is sorted on its own; that is why duplicates show up in the result.)
A note on Java's compareTo method
compareTo compares the current object with the method's argument. It returns zero if the two are equal, a negative integer if the current object is less than the argument, and a positive integer if it is greater; only the sign matters, not the magnitude.
For example, with o1.compareTo(o2): a positive return value means the current object (o1, the caller of compareTo) sorts after the argument (o2); a negative return value means it sorts before it.
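A quick plain-Java illustration of the sign convention (CompareToDemo is a throwaway class of mine):

// compareTo semantics: the sign decides the order, not the magnitude
public class CompareToDemo {
    public static void main(String[] args) {
        System.out.println(Integer.valueOf(3).compareTo(5));   // negative: 3 sorts before 5
        System.out.println(Integer.valueOf(5).compareTo(5));   // 0: considered equal
        System.out.println(Integer.valueOf(7).compareTo(5));   // positive: 7 sorts after 5
    }
}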
Step 1: define FlowBean implementing WritableComparable so keys can be compared and sorted
package com.czxy.demo05;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Map output key for the sort job. WritableComparable combines Writable
// (serialization for the shuffle) with Comparable (the sort order).
public class FlowBean implements WritableComparable<FlowBean> {
    private Integer upFlow;
    private Integer downFlow;
    private Integer upCountFlow;
    private Integer downCountFlow;

    public FlowBean() {
    }

    public FlowBean(Integer upFlow, Integer downFlow, Integer upCountFlow, Integer downCountFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.upCountFlow = upCountFlow;
        this.downCountFlow = downCountFlow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(upFlow);
        out.writeInt(downFlow);
        out.writeInt(upCountFlow);
        out.writeInt(downCountFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        upFlow = in.readInt();
        downFlow = in.readInt();
        upCountFlow = in.readInt();
        downCountFlow = in.readInt();
    }

    public Integer getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(Integer upFlow) {
        this.upFlow = upFlow;
    }

    public Integer getDownFlow() {
        return downFlow;
    }

    public void setDownFlow(Integer downFlow) {
        this.downFlow = downFlow;
    }

    public Integer getUpCountFlow() {
        return upCountFlow;
    }

    public void setUpCountFlow(Integer upCountFlow) {
        this.upCountFlow = upCountFlow;
    }

    public Integer getDownCountFlow() {
        return downCountFlow;
    }

    public void setDownCountFlow(Integer downCountFlow) {
        this.downCountFlow = downCountFlow;
    }

    // Tab-separated, so the final output file keeps a clean columnar layout.
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + upCountFlow + "\t" + downCountFlow;
    }

    // Descending order on upCountFlow. This deliberately never returns 0:
    // if two beans compared as equal, the shuffle would merge them into one
    // reduce group and their phone numbers could be collapsed together. It
    // does violate the compareTo contract, which Hadoop's sort tolerates,
    // but do not reuse this pattern with java.util collections.
    @Override
    public int compareTo(FlowBean o) {
        return this.upCountFlow > o.upCountFlow ? -1 : 1;
    }
}
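For reference, a contract-respecting descending comparator is the single line below. Be aware of the trade-off: once equal values compare as 0, all records with the same upCountFlow land in one reduce group, so the reducer must loop over every value in the group (as the version in step 3 does):

return Integer.compare(o.upCountFlow, this.upCountFlow);   // standard descending order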
Step 2: define FlowMapper
package com.czxy.demo05;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
    Text phone = new Text();
    FlowBean flowBean = new FlowBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split("\t");
        // The bean is the map output KEY here; that is what the shuffle sorts on.
        flowBean.setUpFlow(Integer.parseInt(split[6]));
        flowBean.setDownFlow(Integer.parseInt(split[7]));
        flowBean.setUpCountFlow(Integer.parseInt(split[8]));
        flowBean.setDownCountFlow(Integer.parseInt(split[9]));
        // Index 1 is the phone number (index 0 is the record id), the same
        // layout requirement 1 used.
        phone.set(split[1]);
        context.write(flowBean, phone);
    }
}
Step 3: define FlowReducer
package com.czxy.demo05;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FlowReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
    @Override
    protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // With the never-zero compareTo above, every group holds exactly one
        // phone number; looping anyway keeps the reducer correct if the
        // comparator is ever changed to merge equal keys.
        for (Text phone : values) {
            context.write(phone, key);
        }
    }
}
Step 4: the job driver and main entry point
package com.czxy.demo05;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class FlowMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = super.getConf();
        // Force local mode explicitly (also the default when no cluster config is present).
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf, FlowMain.class.getSimpleName());
        job.setJarByClass(FlowMain.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///E:\\cache\\mapReduceTestCache\\data_flow3.1.dat"));

        // Key and value are swapped relative to requirement 1: the bean is the
        // key, so the shuffle sorts on it.
        job.setMapperClass(FlowMapper.class);
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(FlowReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        TextOutputFormat.setOutputPath(job, new Path("file:///E:\\cache\\mapReduceResultCache\\flowStatistics03"));
        job.setOutputFormatClass(TextOutputFormat.class);

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new FlowMain(), args);
        System.exit(run);
    }
}
Run result:
Because the file contains a lot of fake, repeated records (and each raw record is sorted on its own), duplicate rows appear in the output.
Requirement 3: partition by phone number
To partition, you have to set the number of reduce tasks:
job.setNumReduceTasks(n);
Set n to however many partitions your partitioner can return; here that is six (return values 0 through 5).
Building on requirement 1, we now route different phone numbers into separate output files, which requires a custom Partitioner. Here we split by number prefix:
numbers starting with 137 go to one partition file
numbers starting with 138 go to one partition file
numbers starting with 139 go to one partition file
numbers starting with 135 go to one partition file
numbers starting with 136 go to one partition file
everything else goes to a final "other" partition
The custom partitioner:
package com.czxy.demo06;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes each phone number (the map output key) to a reducer by prefix.
// The return value must lie in [0, numReduceTasks).
public class FlowPartition extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        String line = text.toString();
        if (line.startsWith("135")) {
            return 0;
        } else if (line.startsWith("136")) {
            return 1;
        } else if (line.startsWith("137")) {
            return 2;
        } else if (line.startsWith("138")) {
            return 3;
        } else if (line.startsWith("139")) {
            return 4;
        } else {
            // everything else: a sixth partition, so the job needs 6 reduce tasks
            return 5;
        }
    }
}
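For comparison, Hadoop's default HashPartitioner routes keys with the expression below; the class above simply swaps that hash for explicit prefix rules:

// HashPartitioner's logic, for reference
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;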
Add the partition settings to the job:
In FlowBeanMain, right after these two lines
Job job = Job.getInstance(super.getConf(), FlowBeanMain.class.getSimpleName());
job.setJarByClass(FlowBeanMain.class);
add the following (it must be 6 reduce tasks, since the partitioner can return 0 through 5):
job.setPartitionerClass(FlowPartition.class);
job.setNumReduceTasks(6);
so that it becomes:
Job job = Job.getInstance(super.getConf(), FlowBeanMain.class.getSimpleName());
job.setJarByClass(FlowBeanMain.class);
job.setPartitionerClass(FlowPartition.class);
job.setNumReduceTasks(6);
Change the input and output paths to HDFS, then package the jar and run it on the cluster:
TextInputFormat.addInputPath(job, new Path("hdfs://192.168.100.201:8020/partition_flow/"));
TextOutputFormat.setOutputPath(job, new Path("hdfs://192.168.100.201:8020/partition_out"));
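After packaging, the upload-and-run steps look roughly like this (the jar name flow-demo.jar is a placeholder; use whatever your build produces, and the fully qualified name of your demo06 driver class):

hadoop fs -mkdir -p /partition_flow
hadoop fs -put data_flow3.1.dat /partition_flow/
hadoop jar flow-demo.jar com.czxy.demo06.FlowBeanMain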
Run result:
[root@hadoop01 home]# hadoop fs -ls /partition_out/
Found 7 items
-rw-r--r-- 2 root supergroup 0 2019-11-18 18:15 /partition_out/_SUCCESS
-rw-r--r-- 2 root supergroup 174 2019-11-18 18:15 /partition_out/part-r-00000
-rw-r--r-- 2 root supergroup 256 2019-11-18 18:15 /partition_out/part-r-00001
-rw-r--r-- 2 root supergroup 241 2019-11-18 18:15 /partition_out/part-r-00002
-rw-r--r-- 2 root supergroup 157 2019-11-18 18:15 /partition_out/part-r-00003
-rw-r--r-- 2 root supergroup 330 2019-11-18 18:15 /partition_out/part-r-00004
-rw-r--r-- 2 root supergroup 582 2019-11-18 18:15 /partition_out/part-r-00005
[root@hadoop01 home]#
You can see the phone numbers were split into different files by prefix. Let's check the contents of each file against the requirement.
[root@hadoop01 home]# hadoop fs -cat /partition_out/part-r-00000
13502468 FlowBean{upFlow=1254, downFlow=2244, upCountFlow=161370, downCountFlow=2427678}
13560439 FlowBean{upFlow=726, downFlow=528, upCountFlow=44748, downCountFlow=129624}
[root@hadoop01 home]# hadoop fs -cat /partition_out/part-r-00001
13600217 FlowBean{upFlow=396, downFlow=3036, upCountFlow=23760, downCountFlow=4110744}
13602846 FlowBean{upFlow=330, downFlow=264, upCountFlow=42636, downCountFlow=64020}
13660577 FlowBean{upFlow=528, downFlow=198, upCountFlow=153120, downCountFlow=15180}
[root@hadoop01 home]# hadoop fs -cat /partition_out/part-r-00002
13719199 FlowBean{upFlow=88, downFlow=0, upCountFlow=5280, downCountFlow=0}
13726230 FlowBean{upFlow=528, downFlow=594, upCountFlow=54582, downCountFlow=542982}
13760778 FlowBean{upFlow=44, downFlow=44, upCountFlow=2640, downCountFlow=2640}
[root@hadoop01 home]# hadoop fs -cat /partition_out/part-r-00003
13823070 FlowBean{upFlow=132, downFlow=66, upCountFlow=7920, downCountFlow=3960}
13826544 FlowBean{upFlow=88, downFlow=0, upCountFlow=5808, downCountFlow=0}
[root@hadoop01 home]# hadoop fs -cat /partition_out/part-r-00004
13922314 FlowBean{upFlow=264, downFlow=264, upCountFlow=66176, downCountFlow=81840}
13925057 FlowBean{upFlow=1518, downFlow=1386, upCountFlow=243276, downCountFlow=1061346}
13926251 FlowBean{upFlow=88, downFlow=0, upCountFlow=5280, downCountFlow=0}
13926435 FlowBean{upFlow=44, downFlow=88, upCountFlow=2904, downCountFlow=33264}
[root@hadoop01 home]# hadoop fs -cat /partition_out/part-r-00005
13480253 FlowBean{upFlow=66, downFlow=66, upCountFlow=3960, downCountFlow=3960}
15013685 FlowBean{upFlow=616, downFlow=594, upCountFlow=80498, downCountFlow=77836}
15920133 FlowBean{upFlow=440, downFlow=440, upCountFlow=69432, downCountFlow=64592}
15989002 FlowBean{upFlow=66, downFlow=66, upCountFlow=42636, downCountFlow=3960}
18211575 FlowBean{upFlow=330, downFlow=264, upCountFlow=33594, downCountFlow=46332}
18320173 FlowBean{upFlow=462, downFlow=396, upCountFlow=209682, downCountFlow=53064}
84138413 FlowBean{upFlow=440, downFlow=352, upCountFlow=90552, downCountFlow=31504}
[root@hadoop01 home]#
Every prefix landed in its expected file, so the requirement is met. Success.