1. The WordCount program
Case analysis
Requirement: count the number of occurrences of each word across multiple files.
Analysis:
Map stage: turn each line of text into key-value pairs of the form <word, 1>.
Reduce stage: aggregate the group of kv pairs that share the same word by summing all the values.
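As a small illustration (sample data, not taken from the original input files): if one input line is "hello tom hello jim", the map stage emits <hello,1> <tom,1> <hello,1> <jim,1>; after grouping by key, the reduce stage receives <hello,[1,1]> <tom,[1]> <jim,[1]> and outputs <hello,2> <tom,1> <jim,1>.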
Code implementation
Implementation: the Mapper class
package cn.edu360.mr.wc;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
* KEYIN: the type of the key of the data the map task reads — the starting byte offset of the line, a Long
* VALUEIN: the type of the value of the data the map task reads — the content of the line, a String
*
* KEYOUT: the key type of the result kv pairs returned by the user-defined map method; in the wordcount logic this is the word, a String
* VALUEOUT: the value type of the result kv pairs returned by the user-defined map method; in the wordcount logic this is an integer count, an Integer
* However, the data produced by map must be shipped to reduce, which requires serialization and deserialization; the JDK's native serialization produces rather bloated output, which would make data transfer during a MapReduce job inefficient.
* Hadoop therefore designed its own serialization mechanism, and any data type transferred within MapReduce must implement Hadoop's serialization interface.
*
* For the common JDK types Long, String, Integer, Float, etc., Hadoop provides wrapper types that implement its serialization interface: LongWritable, Text, IntWritable, FloatWritable
* @author ThinkPad
*
*/
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// Split the line into words
String line = value.toString();
String[] words = line.split(" ");
for(String word:words){
context.write(new Text(word), new IntWritable(1));
}
}
}
Implementation: the Reducer class
package cn.edu360.mr.wc;
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
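// key: one distinct word; values: all the 1s emitted by the mappers for that word.
// Summing the values gives the total number of occurrences of the word.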
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
int count = 0;
Iterator<IntWritable> iterator = values.iterator();
while(iterator.hasNext()){
IntWritable value = iterator.next();
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
Implementation: the job submitter class
package cn.edu360.mr.wc;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* Client program used to submit a mapreduce job
* What it does:
* 1. Packs the parameters needed for this job run
* 2. Interacts with YARN to get the mapreduce program started and running
* @author ThinkPad
*
*/
public class JobSubmitter {
public static void main(String[] args) throws Exception {
// Set a JVM system property in code so the job object knows which user identity to use when accessing HDFS
System.setProperty("HADOOP_USER_NAME", "root");
Configuration conf = new Configuration();
// 1. Set the default file system the job accesses at runtime
conf.set("fs.defaultFS", "hdfs://hdp-01:9000");
// 2. Set where the job is submitted to run
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "hdp-01");
// 3. If this submitter client is run from a Windows machine, this cross-platform submission parameter is required
conf.set("mapreduce.app-submission.cross-platform","true");
Job job = Job.getInstance(conf);
// 1. Parameter: where the jar for this job is located
job.setJar("d:/wc.jar");
//job.setJarByClass(JobSubmitter.class);
// 2. Parameters: the Mapper and Reducer implementation classes this job will use
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
// 3. Parameters: the key/value types of the data produced by this job's Mapper and Reducer implementations
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path output = new Path("/wordcount/output");
FileSystem fs = FileSystem.get(new URI("hdfs://hdp-01:9000"),conf,"root");
if(fs.exists(output)){
fs.delete(output, true);
}
// 4. Parameters: the input data path for this job and the output path for the final result
FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));
FileOutputFormat.setOutputPath(job, output); // note: the output path must not already exist
// 5. Parameter: the number of reduce tasks to launch
job.setNumReduceTasks(2);
// 6. Submit the job to YARN and wait for completion
boolean res = job.waitForCompletion(true);
System.exit(res?0:-1);
}
}
2. A TopN program on top of WordCount
Case analysis
Requirement: find the top-n most visited websites (global TOPN — only 1 reduce worker may be used).
Approach:
Map stage: parse each record and emit the domain name as the key and 1 as the value.
Reduce stage:
In the reduce() method: sum the group of 1s for each domain, then put <domain, total count> into a TreeMap member variable.
In the cleanup() method: pick the n domains with the highest counts from the TreeMap and output them as the result.
Code implementation
Implementation: the PageCount class
package cn.edu360.mr.page.topn;
public class PageCount implements Comparable<PageCount>{
private String page;
private int count;
public void set(String page, int count) {
this.page = page;
this.count = count;
}
public String getPage() {
return page;
}
public void setPage(String page) {
this.page = page;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
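// Order by count descending; break ties by page name so that two different pages with the same
// count are not considered equal (equal keys would overwrite each other in a TreeMap).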
@Override
public int compareTo(PageCount o) {
return o.getCount()-this.count==0?this.page.compareTo(o.getPage()):o.getCount()-this.count;
}
}
Implementation: the Mapper class
package cn.edu360.mr.page.topn;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class PageTopnMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
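// Each input line is split on spaces; the field at index 1 is taken as the visited domain/page (see the parsing below).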
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ");
context.write(new Text(split[1]), new IntWritable(1));
}
}
Implementation: the Reducer class
package cn.edu360.mr.page.topn;
import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class PageTopnReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
TreeMap<PageCount, Object> treeMap = new TreeMap<>();
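// The TreeMap is a member of the reducer, so it accumulates one <PageCount, null> entry per domain
// across all reduce() calls; PageCount sorts by count descending, and cleanup() (called once at the
// end of the task) emits only the first top-n entries.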
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
PageCount pageCount = new PageCount();
pageCount.set(key.toString(), count);
treeMap.put(pageCount,null);
}
@Override
protected void cleanup(Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
int topn = conf.getInt("top.n", 5);
Set<Entry<PageCount, Object>> entrySet = treeMap.entrySet();
int i= 0;
for (Entry<PageCount, Object> entry : entrySet) {
context.write(new Text(entry.getKey().getPage()), new IntWritable(entry.getKey().getCount()));
i++;
if(i==topn) return;
}
}
}
Implementation: the job submitter class
package cn.edu360.mr.page.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
/**
* Parameters can be picked up by loading the *-site.xml files on the classpath
*/
Configuration conf = new Configuration();
conf.addResource("xx-oo.xml");
/**
* Set parameters directly in code
*/
//conf.setInt("top.n", 3);
//conf.setInt("top.n", Integer.parseInt(args[0]));
/**
* Read parameters from a properties file
*/
/*Properties props = new Properties();
props.load(JobSubmitter.class.getClassLoader().getResourceAsStream("topn.properties"));
conf.setInt("top.n", Integer.parseInt(props.getProperty("top.n")));*/
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(PageTopnMapper.class);
job.setReducerClass(PageTopnReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\output"));
job.waitForCompletion(true);
}
}
3. Custom data types in MapReduce
Case analysis
Requirement: for the given file, compute each user's total upstream traffic, total downstream traffic, and total traffic.
Approach:
Map stage: split each line on tabs, take the phone number as the output key, and pack the traffic fields into a FlowBean object as the output value.
Reduce stage: iterate over all values (FlowBeans) of a group and accumulate them, then output the phone number as the key and a bean holding the totals as the value.
Note: the custom type must implement Hadoop's serialization interface.
A custom data type such as FlowBean must implement Hadoop's Writable interface, i.e. its two methods:
readFields(in) — the deserialization method
write(out) — the serialization method
Code implementation
Implementation: the FlowBean class
package cn.edu360.mr.flow;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
* Purpose of this case: demonstrate how a custom data type implements Hadoop's serialization interface
* 1. The class must keep a no-argument constructor
* 2. The order in which write() outputs the fields' binary data must match the order in which readFields() reads them back
*
* @author ThinkPad
*
*/
public class FlowBean implements Writable {
private int upFlow;
private int dFlow;
private String phone;
private int amountFlow;
public FlowBean(){}
public FlowBean(String phone, int upFlow, int dFlow) {
this.phone = phone;
this.upFlow = upFlow;
this.dFlow = dFlow;
this.amountFlow = upFlow + dFlow;
}
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public int getUpFlow() {
return upFlow;
}
public void setUpFlow(int upFlow) {
this.upFlow = upFlow;
}
public int getdFlow() {
return dFlow;
}
public void setdFlow(int dFlow) {
this.dFlow = dFlow;
}
public int getAmountFlow() {
return amountFlow;
}
public void setAmountFlow(int amountFlow) {
this.amountFlow = amountFlow;
}
/**
* Method the Hadoop framework calls when serializing an object of this class
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeUTF(phone);
out.writeInt(dFlow);
out.writeInt(amountFlow);
}
/**
* Method the Hadoop framework calls when deserializing an object of this class
*/
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readInt();
this.phone = in.readUTF();
this.dFlow = in.readInt();
this.amountFlow = in.readInt();
}
@Override
public String toString() {
return this.phone + ","+this.upFlow +","+ this.dFlow +"," + this.amountFlow;
}
}
Implementation: the Mapper class
package cn.edu360.mr.flow;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
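// Each input line is tab-separated: the field at index 1 is the phone number, and the 3rd-last and
// 2nd-last fields are the upstream and downstream traffic (see the parsing below).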
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
String phone = fields[1];
int upFlow = Integer.parseInt(fields[fields.length-3]);
int dFlow = Integer.parseInt(fields[fields.length-2]);
context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
}
}
Implementation: the Reducer class
package cn.edu360.mr.flow;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
/**
* key: a phone number
* values: the traffic data from all access records produced by that phone number
*
* <135,flowBean1><135,flowBean2><135,flowBean3><135,flowBean4>
*/
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Reducer<Text, FlowBean, Text, FlowBean>.Context context)
throws IOException, InterruptedException {
int upSum = 0;
int dSum = 0;
for(FlowBean value:values){
upSum += value.getUpFlow();
dSum += value.getdFlow();
}
context.write(key, new FlowBean(key.toString(), upSum, dSum));
}
}
Implementation: the job submitter class
package cn.edu360.mr.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
// This case uses the default HashPartitioner and a single reduce task; custom partitioning is covered in the next case.
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\flow\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\flow\\output"));
job.waitForCompletion(true);
}
}
4. Custom Partitioner in MapReduce
Case analysis
Requirement: compute each user's total traffic, and write the results into different output files according to the phone number's home province.
Approach:
Make the map-side worker partition the data by home province instead of by the default hash partitioning.
Implementation: define a custom Partitioner.
Code implementation
Implementation: the FlowBean class
package cn.edu360.mr.flow;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
* Purpose of this case: demonstrate how a custom data type implements Hadoop's serialization interface
* 1. The class must keep a no-argument constructor
* 2. The order in which write() outputs the fields' binary data must match the order in which readFields() reads them back
*
* @author ThinkPad
*
*/
public class FlowBean implements Writable {
private int upFlow;
private int dFlow;
private String phone;
private int amountFlow;
public FlowBean(){}
public FlowBean(String phone, int upFlow, int dFlow) {
this.phone = phone;
this.upFlow = upFlow;
this.dFlow = dFlow;
this.amountFlow = upFlow + dFlow;
}
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public int getUpFlow() {
return upFlow;
}
public void setUpFlow(int upFlow) {
this.upFlow = upFlow;
}
public int getdFlow() {
return dFlow;
}
public void setdFlow(int dFlow) {
this.dFlow = dFlow;
}
public int getAmountFlow() {
return amountFlow;
}
public void setAmountFlow(int amountFlow) {
this.amountFlow = amountFlow;
}
/**
* Method the Hadoop framework calls when serializing an object of this class
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeUTF(phone);
out.writeInt(dFlow);
out.writeInt(amountFlow);
}
/**
* Method the Hadoop framework calls when deserializing an object of this class
*/
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readInt();
this.phone = in.readUTF();
this.dFlow = in.readInt();
this.amountFlow = in.readInt();
}
@Override
public String toString() {
return this.phone + ","+this.upFlow +","+ this.dFlow +"," + this.amountFlow;
}
}
Implementation: the Mapper class
package cn.edu360.mr.flow;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
String phone = fields[1];
int upFlow = Integer.parseInt(fields[fields.length-3]);
int dFlow = Integer.parseInt(fields[fields.length-2]);
context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
}
}
Implementation: the Reducer class
package cn.edu360.mr.flow;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
/**
* key: a phone number
* values: the traffic data from all access records produced by that phone number
*
* <135,flowBean1><135,flowBean2><135,flowBean3><135,flowBean4>
*/
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Reducer<Text, FlowBean, Text, FlowBean>.Context context)
throws IOException, InterruptedException {
int upSum = 0;
int dSum = 0;
for(FlowBean value:values){
upSum += value.getUpFlow();
dSum += value.getdFlow();
}
context.write(key, new FlowBean(key.toString(), upSum, dSum));
}
}
Implementation: the Partitioner class
package cn.edu360.mr.flow;
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* This class is used by the MapTask.
* Through its getPartition method, the MapTask computes which reduce task each kv pair it produces should be sent to.
* @author ThinkPad
*
*/
public class ProvincePartitioner extends Partitioner<Text, FlowBean>{
static HashMap<String,Integer> codeMap = new HashMap<>();
static{
codeMap.put("135", 0);
codeMap.put("136", 1);
codeMap.put("137", 2);
codeMap.put("138", 3);
codeMap.put("139", 4);
}
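// Look up the partition number by the first 3 digits of the phone number; any prefix not in the map goes to partition 5.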
@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
Integer code = codeMap.get(key.toString().substring(0, 3));
return code==null?5:code;
}
}
Implementation: the job submitter class
package cn.edu360.mr.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
// Set which partitioner class the map task uses when partitioning its output (the default is HashPartitioner if not specified)
job.setPartitionerClass(ProvincePartitioner.class);
// Our ProvincePartitioner can produce 6 different partition numbers, so 6 reduce tasks are needed to receive them
job.setNumReduceTasks(6);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\flow\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\flow\\province-output"));
job.waitForCompletion(true);
}
}
5. Custom sorting in MapReduce
Case analysis
Requirement: count the total number of visits to each page in request.dat, and have the output data sorted in descending order of visit count.
Approach:
Key points:
MapReduce has a built-in sorting mechanism:
both the map worker and the reduce worker sort the data by key,
so the final output is always ordered by key.
1. First write an MR program that computes the total visit count of each page.
2. Then write a second MR program:
Map stage: read the result file produced by the first MR job, parse each record into a Java object, PageCount (wrapping a url and its total count), then return that object as the key and null as the value.
Note: this Java object must implement the WritableComparable interface so that the worker can call its compareTo method to sort.
Reduce stage: since the worker has already sorted the received data by PageCount's compareTo method, the reduce method only needs to write the data out; the final result is naturally ordered by total count.
Code implementation
Implementation: the PageCount class
package cn.edu360.mr.page.count.sort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class PageCount implements WritableComparable<PageCount>{
private String page;
private int count;
public void set(String page, int count) {
this.page = page;
this.count = count;
}
public String getPage() {
return page;
}
public void setPage(String page) {
this.page = page;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
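// Order by count descending; break ties by page name so that different pages with the same count
// are not grouped into one key at the reduce side (which would drop pages from the output).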
@Override
public int compareTo(PageCount o) {
return o.getCount()-this.count==0?this.page.compareTo(o.getPage()):o.getCount()-this.count;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.page);
out.writeInt(this.count);
}
@Override
public void readFields(DataInput in) throws IOException {
this.page= in.readUTF();
this.count = in.readInt();
}
@Override
public String toString() {
return this.page + "," + this.count;
}
}
Implementation: the step-1 MapReduce program
package cn.edu360.mr.page.count.sort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class PageCountStep1 {
public static class PageCountStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ");
context.write(new Text(split[1]), new IntWritable(1));
}
}
public static class PageCountStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable>{
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int count = 0;
for (IntWritable v : values) {
count += v.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws Exception {
/**
* Parameters can be picked up by loading the *-site.xml files on the classpath
*/
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PageCountStep1.class);
job.setMapperClass(PageCountStep1Mapper.class);
job.setReducerClass(PageCountStep1Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\countout"));
job.setNumReduceTasks(3);
job.waitForCompletion(true);
}
}
Implementation: the step-2 MapReduce program
package cn.edu360.mr.page.count.sort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class PageCountStep2 {
public static class PageCountStep2Mapper extends Mapper<LongWritable, Text, PageCount, NullWritable>{
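// Step 1 wrote text lines of the form "page<TAB>count"; each line is parsed into a PageCount object
// that is used as the map output key, so the shuffle sorts records by PageCount.compareTo (count descending).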
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, PageCount, NullWritable>.Context context)
throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
PageCount pageCount = new PageCount();
pageCount.set(split[0], Integer.parseInt(split[1]));
context.write(pageCount, NullWritable.get());
}
}
public static class PageCountStep2Reducer extends Reducer<PageCount, NullWritable, PageCount, NullWritable>{
@Override
protected void reduce(PageCount key, Iterable<NullWritable> values,
Reducer<PageCount, NullWritable, PageCount, NullWritable>.Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PageCountStep2.class);
job.setMapperClass(PageCountStep2Mapper.class);
job.setReducerClass(PageCountStep2Reducer.class);
job.setMapOutputKeyClass(PageCount.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(PageCount.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\url\\countout"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\url\\sortout"));
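// A single reduce task produces one globally sorted result file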
job.setNumReduceTasks(1);
job.waitForCompletion(true);
}
}
6. Custom GroupingComparator in MapReduce
Case analysis
Requirement: given order data (orderId, userId, product name, price, quantity), output the top-N records by total amount within each order (this case uses sort control, partition control, and grouping control).
Approach:
Read the data, split the fields, and pack them into a bean that is transferred as the key; within the same order, the key must sort by transaction amount.
Use a custom GroupingComparator to group the data by order id; then, in the reduce method, simply output the first N records of each group.
Code implementation
Implementation: the OrderBean class
package cn.edu360.mr.order.topn.grouping;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import org.apache.hadoop.io.WritableComparable;
public class OrderBean implements WritableComparable<OrderBean>{
private String orderId;
private String userId;
private String pdtName;
private float price;
private int number;
private float amountFee;
public void set(String orderId, String userId, String pdtName, float price, int number) {
this.orderId = orderId;
this.userId = userId;
this.pdtName = pdtName;
this.price = price;
this.number = number;
this.amountFee = price * number;
}
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getUserId() {
return userId;
}
public void setUserId(String userId) {
this.userId = userId;
}
public String getPdtName() {
return pdtName;
}
public void setPdtName(String pdtName) {
this.pdtName = pdtName;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public int getNumber() {
return number;
}
public void setNumber(int number) {
this.number = number;
}
public float getAmountFee() {
return amountFee;
}
public void setAmountFee(float amountFee) {
this.amountFee = amountFee;
}
@Override
public String toString() {
return this.orderId + "," + this.userId + "," + this.pdtName + "," + this.price + "," + this.number + ","
+ this.amountFee;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.userId);
out.writeUTF(this.pdtName);
out.writeFloat(this.price);
out.writeInt(this.number);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.userId = in.readUTF();
this.pdtName = in.readUTF();
this.price = in.readFloat();
this.number = in.readInt();
this.amountFee = this.price * this.number;
}
// Comparison rule: first compare orderId so that records of the same order sort together; within the same order, compare by total amount in descending order
@Override
public int compareTo(OrderBean o) {
return this.orderId.compareTo(o.getOrderId())==0?Float.compare(o.getAmountFee(), this.getAmountFee()):this.orderId.compareTo(o.getOrderId());
}
}
Implementation: the Partitioner class
package cn.edu360.mr.order.topn.grouping;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class OrderIdPartitioner extends Partitioner<OrderBean, NullWritable>{
@Override
public int getPartition(OrderBean key, NullWritable value, int numPartitions) {
// Distribute records by the order's orderId; masking the hash with Integer.MAX_VALUE keeps it non-negative before the modulo
return (key.getOrderId().hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
Implementation: the GroupingComparator class
package cn.edu360.mr.order.topn.grouping;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class OrderIdGroupingComparator extends WritableComparator{
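// Reduce-side grouping comparator: keys that compare equal here (same orderId) are handed to a single
// reduce() call as one group, even though the full OrderBean ordering also involves the amount.
// The 'true' flag below tells WritableComparator to create OrderBean instances for deserializing keys before comparing.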
public OrderIdGroupingComparator() {
super(OrderBean.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
OrderBean o1 = (OrderBean) a;
OrderBean o2 = (OrderBean) b;
return o1.getOrderId().compareTo(o2.getOrderId());
}
}
Implementation: the MapReduce driver (Mapper, Reducer, and main)
package cn.edu360.mr.order.topn.grouping;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class OrderTopn {
public static class OrderTopnMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable>{
OrderBean orderBean = new OrderBean();
NullWritable v = NullWritable.get();
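// The same OrderBean/NullWritable instances are reused across map() calls; this is safe because
// context.write() serializes the key-value pair immediately.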
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, OrderBean, NullWritable>.Context context)
throws IOException, InterruptedException {
String[] fields = value.toString().split(",");
orderBean.set(fields[0], fields[1], fields[2], Float.parseFloat(fields[3]), Integer.parseInt(fields[4]));
context.write(orderBean,v);
}
}
public static class OrderTopnReducer extends Reducer< OrderBean, NullWritable, OrderBean, NullWritable>{
/**
* Although the reduce method receives a single key object, its contents change every time the values
* iterator advances (the framework deserializes the next record into the same key instance)
*/
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values,
Reducer<OrderBean, NullWritable, OrderBean, NullWritable>.Context context)
throws IOException, InterruptedException {
// Emit only the top-n records of this order; n comes from the "order.top.n" parameter set in the driver
int topn = context.getConfiguration().getInt("order.top.n", 3);
int i = 0;
for (NullWritable v : values) {
context.write(key, v);
if (++i == topn) return;
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration(); // by default this only loads core-default.xml and core-site.xml
conf.setInt("order.top.n", 2);
Job job = Job.getInstance(conf);
job.setJarByClass(OrderTopn.class);
job.setMapperClass(OrderTopnMapper.class);
job.setReducerClass(OrderTopnReducer.class);
job.setPartitionerClass(OrderIdPartitioner.class);
job.setGroupingComparatorClass(OrderIdGroupingComparator.class);
job.setNumReduceTasks(2);
job.setMapOutputKeyClass(OrderBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(OrderBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\order\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\order\\out-3"));
job.waitForCompletion(true);
}
}
7. Controlling input and output formats in MapReduce
Case analysis
Requirement: again aggregate the traffic data from case 3, then find the TOPN records with the largest total traffic in the aggregated result.
Step 1
Approach: the aggregation logic is the same as the earlier traffic statistics:
use the phone number as the key and a FlowBean as the value.
Note: the step-1 result file is written as a SequenceFile by specifying SequenceFileOutputFormat. A SequenceFile is a file format defined by Hadoop that stores a large number of serialized key-value object pairs (the file header also records the class names of the key and value types).
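As a side note, a SequenceFile written this way can also be inspected outside of MapReduce. The following is a minimal sketch (not part of the original code; the class name and the part-r-00000 path are assumptions for illustration) that prints every key-value pair stored in one output partition:
package cn.edu360.mr.index.sequence;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
/**
 * Sketch only: dump the contents of one SequenceFile partition written by step 1.
 */
public class SequenceFileDump {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path path = new Path("F:\\mrdata\\index\\out-seq-1\\part-r-00000");
try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
// The key/value classes are recorded in the file header, so they can be instantiated reflectively
Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
while (reader.next(key, value)) {
System.out.println(key + "\t" + value);
}
}
}
}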
Step 2:
Approach: read the step-1 SequenceFile result; this requires setting the job's input format class to the SequenceFileInputFormat component.
With this input component, the map method directly receives a deserialized KEY-VALUE pair (instead of a byte offset and a line of text).
How is the topn computed?
Send all aggregated records to the same reduce-side worker, and define a GroupingComparator that makes that worker treat the FlowBeans of all phone numbers as a single group; reduce() is then called only once and only needs to output the first n records, as sketched below.
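A minimal sketch of that grouping idea (not part of the original code, which below demonstrates the SequenceFile formats with an inverted-index example). It assumes a step-2 map output key class, here called FlowSumBean, that sorts by total flow descending, plus a grouping comparator that treats every key as equal so the single reduce task sees one group:
package cn.edu360.mr.flow.topn;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Sketch only: the step-2 key described above. It carries phone + total flow and sorts by
 * total flow descending, so the shuffle delivers the biggest consumers first.
 */
class FlowSumBean implements WritableComparable<FlowSumBean> {
private String phone;
private int amountFlow;
public FlowSumBean() {}
public void set(String phone, int amountFlow) { this.phone = phone; this.amountFlow = amountFlow; }
@Override
public void write(DataOutput out) throws IOException { out.writeUTF(phone); out.writeInt(amountFlow); }
@Override
public void readFields(DataInput in) throws IOException { phone = in.readUTF(); amountFlow = in.readInt(); }
@Override
public int compareTo(FlowSumBean o) { return Integer.compare(o.amountFlow, this.amountFlow); }
@Override
public String toString() { return phone + "," + amountFlow; }
}
/**
 * Sketch only: a grouping comparator that reports every pair of keys as equal, so the single
 * reduce task sees all records as ONE group; since the keys arrive sorted by total flow
 * descending, reduce() only needs to emit the first n keys of that group.
 */
public class AllInOneGroupingComparator extends WritableComparator {
public AllInOneGroupingComparator() {
super(FlowSumBean.class, true); // create key instances for deserialization before comparing
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
return 0; // all keys compare equal -> one single group
}
}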
Code implementation
Implementation: step 1
package cn.edu360.mr.index.sequence;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
public class IndexStepOne {
public static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
// Produces pairs like <hello-filename, 1>
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Get, from the input split info, the file that the line currently being processed belongs to
FileSplit inputSplit = (FileSplit) context.getInputSplit();
String fileName = inputSplit.getPath().getName();
String[] words = value.toString().split(" ");
for (String w : words) {
// Emit "word-filename" as the key and 1 as the value
context.write(new Text(w + "-" + fileName), new IntWritable(1));
}
}
}
public static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws IOException, InterruptedException {
int count = 0;
for (IntWritable value : values) {
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepOne.class);
job.setMapperClass(IndexStepOneMapper.class);
job.setReducerClass(IndexStepOneReducer.class);
job.setNumReduceTasks(3);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// job.setOutputFormatClass(TextOutputFormat.class); // this is the default output format
job.setOutputFormatClass(SequenceFileOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\index\\input"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out-seq-1"));
job.waitForCompletion(true);
}
}
Implementation: step 2
package cn.edu360.mr.index.sequence;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class IndexStepTwo {
public static class IndexStepTwoMapper extends Mapper<Text, IntWritable, Text, Text> {
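// With SequenceFileInputFormat, the mapper directly receives the stored key (Text "word-filename")
// and value (IntWritable count) from the step-1 output.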
@Override
protected void map(Text key, IntWritable value, Context context) throws IOException, InterruptedException {
String[] split = key.toString().split("-");
context.write(new Text(split[0]), new Text(split[1]+"-->"+value));
}
}
public static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {
// One group of input data: <hello,a.txt-->4> <hello,b.txt-->4> <hello,c.txt-->4>
@Override
protected void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
// StringBuffer is thread-safe and StringBuilder is not; when thread safety is not a concern, StringBuilder is faster
StringBuilder sb = new StringBuilder();
for (Text value : values) {
sb.append(value.toString()).append("\t");
}
context.write(key, new Text(sb.toString()));
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration(); // by default this only loads core-default.xml and core-site.xml
Job job = Job.getInstance(conf);
job.setJarByClass(IndexStepTwo.class);
job.setMapperClass(IndexStepTwoMapper.class);
job.setReducerClass(IndexStepTwoReducer.class);
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// job.setInputFormatClass(TextInputFormat.class); // this is the default input format
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.setInputPaths(job, new Path("F:\\mrdata\\index\\out-seq-1"));
FileOutputFormat.setOutputPath(job, new Path("F:\\mrdata\\index\\out2"));
job.waitForCompletion(true);
}
}