Goal:
Input data:
- sort1 1
- sort2 3
- sort2 88
- sort2 54
- sort1 2
- sort6 22
- sort6 888
- sort6 58
Output data:
- sort1 1,2
- sort2 3,54,88
- sort6 22,58,888
Since this is a secondary sort, the input records above must end up in the same partition; otherwise we cannot achieve the secondary-sort effect we want. The MapReduce framework by default sorts only on the key, so we need a custom sort method that orders the keys ourselves.
First, understand how MapReduce processes data:
(I first saw this explanation on another expert's blog.)
We should start with a solid understanding of the whole flow MapReduce uses to process data; that is the foundation, and without it there is no way to find a path to the solution. Roughly, the flow is: the MapReduce framework first splits the input file via getSplits(), and each split corresponds to one MapTask; each InputSplit is fed into the map() function. The intermediate results are sorted in the circular (spill) buffer, then partitioned, secondary-sorted with the custom comparator (if there is one), and merged; the shuffle then transfers the data to the ReduceTask side. The reduce side also has a buffer, and data is merged and sorted there between memory and disk as well; the data is then grouped by key, and every time a group has been processed, reduce() is called once, finally producing the output. I sketched the rough flow in the figure below:
General approach:
Map-side sorting:
First put the input data into the same partition and sort it. Because MapReduce sorts only on the key, we need a composite key that carries [key,value]; what the mapper emits then has roughly the structure {[key,value],value}:
- {[sort1,1],1}
- {[sort2,3],3}
- {[sort2,88],88}
- {[sort2,54],54}
- {[sort1,2],2}
- {[sort6,22],22}
- {[sort6,888],888}
- {[sort6,58],58}
Friendly reminder: make absolutely sure the map-side sort result is correct; otherwise, no matter how you group, you will not get the result you want (I fell into this pit myself and spent a long time finding the cause).
The map-side result after sorting should look like this:
- sort1:1
- sort1:2
- sort2:3
- sort2:54
- sort2:88
- sort6:22
- sort6:58
- sort6:888
Reduce-side grouping:
As mentioned above, the number of times the reduce() method runs is determined by the number of groups. On the reduce side we group by the first field of the composite key and then process the corresponding values of each group, which yields the result we want; a small standalone sketch of this grouping follows the output below.
- sort1 1,2
- sort2 3,54,88
- sort6 22,58,888
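To make the grouping concrete, here is a small standalone sketch (plain Java, not part of the MapReduce job; the class name is made up) that groups the map-side sorted records by the first field of the composite key, which is exactly what the grouping comparator does on the reduce side:
import java.util.LinkedHashMap;
import java.util.Map;
public class GroupingSketch {
    public static void main(String[] args) {
        // the map-side sorted records: {first field of the composite key, count}
        String[][] sorted = {
                {"sort1", "1"}, {"sort1", "2"},
                {"sort2", "3"}, {"sort2", "54"}, {"sort2", "88"},
                {"sort6", "22"}, {"sort6", "58"}, {"sort6", "888"}
        };
        // records whose first field compares equal fall into the same group
        Map<String, StringBuilder> groups = new LinkedHashMap<String, StringBuilder>();
        for (String[] record : sorted) {
            StringBuilder values = groups.get(record[0]);
            if (values == null) {
                values = new StringBuilder();
                groups.put(record[0], values);
            }
            if (values.length() > 0) {
                values.append(",");
            }
            values.append(record[1]);
        }
        for (Map.Entry<String, StringBuilder> group : groups.entrySet()) {
            System.out.println(group.getKey() + " " + group.getValue()); // e.g. sort2 3,54,88
        }
    }
}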
The concrete code implementation follows:
Custom sort (composite key) class:
package com.secondSortJob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* Created by Administrator on 2018/1/28 0028.
*/
public class SecondKeySort implements WritableComparable<SecondKeySort> {
private Text key = new Text();
private IntWritable count = new IntWritable();
public SecondKeySort(Text key, IntWritable count) {
this.key = key;
this.count = count;
}
public SecondKeySort() {
}
public Text getKey() {
return key;
}
public void setKey(Text key) {
this.key = key;
}
public IntWritable getCount() {
return count;
}
public void setCount(IntWritable count) {
this.count = count;
}
public void write(DataOutput dataOutput) throws IOException {
// serialize the text key first, then the count; readFields() must read them back in the same order
this.key.write(dataOutput);
this.count.write(dataOutput);
}
public void readFields(DataInput dataInput) throws IOException {
this.key.readFields(dataInput);
this.count.readFields(dataInput);
}
public int compareTo(SecondKeySort o) {
// sort by the text part of the composite key first; only when it is equal,
// fall back to the count, which produces the secondary (value) ordering
int cmp = this.key.compareTo(o.getKey());
if (cmp != 0) {
return cmp;
}
return this.count.compareTo(o.getCount());
}
@Override
public String toString() {
return this.key.toString() +"\t" +this.count.toString();
}
}
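As a quick sanity check of the ordering that compareTo() produces, a tiny standalone snippet (SecondKeySortDemo is just an illustrative name; it only assumes the SecondKeySort class above is on the classpath):
package com.secondSortJob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
public class SecondKeySortDemo {
    public static void main(String[] args) {
        SecondKeySort a = new SecondKeySort(new Text("sort2"), new IntWritable(88));
        SecondKeySort b = new SecondKeySort(new Text("sort2"), new IntWritable(3));
        SecondKeySort c = new SecondKeySort(new Text("sort1"), new IntWritable(999));
        System.out.println(a.compareTo(b) > 0); // true: same first field, so 88 sorts after 3
        System.out.println(c.compareTo(a) < 0); // true: "sort1" sorts before "sort2" regardless of the count
    }
}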
Mapper class:
package com.secondSortJob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* Created by Administrator on 2018/1/28 0028.
*/
public class SecondSortMapper extends Mapper<LongWritable ,Text,SecondKeySort,IntWritable> {
private SecondKeySort secondKeySort = new SecondKeySort();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// each input line looks like "sort1\t1": the text key, a tab, then the count
String[] arrs = value.toString().split("\t");
int count = Integer.parseInt(arrs[1].trim());
secondKeySort.setKey(new Text(arrs[0]));
secondKeySort.setCount(new IntWritable(count));
// emit the composite key {[key,count], count}; the count is repeated as the value
context.write(secondKeySort, new IntWritable(count));
}
}
Reducer class:
package com.secondSortJob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* Created by Administrator on 2018/1/28 0028.
*/
public class SecondSortReducer extends Reducer<SecondKeySort,IntWritable,Text,NullWritable> {
@Override
protected void reduce(SecondKeySort key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// one call per group: every record whose composite key shares the same first field
// arrives here, and its values are already in ascending order
StringBuilder str = new StringBuilder(key.getKey().toString()).append(":");
boolean first = true;
for (IntWritable value : values) {
if (!first) {
str.append(",");
}
str.append(value.get());
first = false;
}
context.write(new Text(str.toString()), NullWritable.get());
}
}
Custom partitioner class (it routes the input records with the same first field to the same partition; the number of partitions equals the number of reduce tasks):
package com.secondSortJob;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* Created by Administrator on 2018/1/29 0029.
*/
public class GroupingPartitioner extends Partitioner<SecondKeySort,IntWritable> {
@Override
public int getPartition(SecondKeySort secondKeySort, IntWritable intWritable, int numPartitions) {
// mask off the sign bit so a negative hashCode cannot produce a negative partition index
return (secondKeySort.getKey().toString().hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
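Illustrative only (PartitionDemo is a made-up name, not part of the job): where each sample key would land if the job ran with, say, three reducers. The arithmetic mirrors getPartition() above, and the concrete partition numbers simply follow from String.hashCode():
public class PartitionDemo {
    public static void main(String[] args) {
        int numPartitions = 3;
        for (String key : new String[]{"sort1", "sort2", "sort6"}) {
            // same calculation as GroupingPartitioner.getPartition()
            int partition = (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
            System.out.println(key + " -> partition " + partition);
        }
    }
}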
Grouping comparator class (you can either implement the RawComparator interface, as done here, or extend the WritableComparator class; a sketch of the WritableComparator variant follows the code below):
package com.secondSortJob;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import java.io.IOException;
/**
* Created by Administrator on 2018/1/28 0028.
*/
public class GroupingComparator implements RawComparator<SecondKeySort> {
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
// A serialized SecondKeySort starts with the Text key (a vint length followed by its
// UTF-8 bytes) and ends with the IntWritable count. Compare only the Text portion so
// that keys sharing the first field fall into the same reduce group, whatever the count.
try {
int textLen1 = WritableComparator.readVInt(b1, s1);
int textLen2 = WritableComparator.readVInt(b2, s2);
int textStart1 = s1 + WritableUtils.decodeVIntSize(b1[s1]);
int textStart2 = s2 + WritableUtils.decodeVIntSize(b2[s2]);
return WritableComparator.compareBytes(b1, textStart1, textLen1, b2, textStart2, textLen2);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
public int compare(SecondKeySort o1, SecondKeySort o2) {
return o1.getKey().compareTo(o2.getKey());
}
}
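As mentioned above, extending WritableComparator is the other option. A minimal sketch of that variant (GroupingComparator2 is just an illustrative name and is not wired into the job below):
package com.secondSortJob;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class GroupingComparator2 extends WritableComparator {
    protected GroupingComparator2() {
        // "true" asks WritableComparator to deserialize the keys before calling compare()
        super(SecondKeySort.class, true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // group only by the first (Text) field of the composite key; the count is ignored
        SecondKeySort k1 = (SecondKeySort) a;
        SecondKeySort k2 = (SecondKeySort) b;
        return k1.getKey().compareTo(k2.getKey());
    }
}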
Driver class:
package com.secondSortJob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
/**
* Secondary sort
* Created by Administrator on 2018/1/28 0028.
*/
public class SecondSortJob {
private static Configuration configuration = new Configuration();
private static String master = "hdfs://192.168.8.222";
private static String masterFS = "hdfs://192.168.8.222:9000";
private static FileSystem fs = null ;
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
configuration.set("yarn.resourcemanager.hostname",master);
fs = FileSystem.get(URI.create(masterFS),configuration);
Job job = Job.getInstance(configuration);
String inputPath = "/lsw/secondSort.txt";
String secondSortResult = "/lsw/secondSortResult" ;
// delete any previous output directory, otherwise FileOutputFormat will refuse to run the job
Path resultPath = new Path(masterFS + secondSortResult);
if (fs.exists(resultPath)) {
fs.delete(resultPath, true);
}
job.setMapOutputKeyClass(SecondKeySort.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setJarByClass(SecondSortJob.class);
job.setMapperClass(SecondSortMapper.class);
job.setReducerClass(SecondSortReducer.class);
job.setPartitionerClass(GroupingPartitioner.class);
job.setGroupingComparatorClass(GroupingComparator.class);
// job.setNumReduceTasks(3);
FileInputFormat.addInputPath(job,new Path(masterFS+inputPath));
FileOutputFormat.setOutputPath(job,new Path(masterFS+secondSortResult));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
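To eyeball the result after the job finishes, a minimal sketch that reads the output back from HDFS (the class name is illustrative; it assumes the same HDFS address and output path as the driver above, and part-r-00000 is Hadoop's default name for the first reducer's output file):
package com.secondSortJob;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
public class SecondSortResultCheck {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(URI.create("hdfs://192.168.8.222:9000"), new Configuration());
        Path result = new Path("hdfs://192.168.8.222:9000/lsw/secondSortResult/part-r-00000");
        // fs.open() returns an input stream over the HDFS file
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(result)))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // e.g. sort1:1,2
            }
        }
    }
}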