MapReduce: MapJoin, ReduceJoin, and TopN

 

1 MapReduce Join

  • Joins come in two flavors: Map Join and Reduce Join
  • A MapJoin runs on the Map side; there is no Reduce phase, and therefore no Shuffle
  • A ReduceJoin runs on the Reduce side; a Reduce phase exists, so a Shuffle exists
  • Data skew comes up constantly in big data work: when one key carries far more records than the others, data is distributed unevenly across the reducers, one reduce task runs much slower than the rest, and the whole job is held back by it
  • Data skew happens in the Shuffle stage, so avoiding skew means avoiding the Shuffle, which is exactly what a MapJoin does

 

2 MapJoin

2.1 When to use it

  • Joining one large table with a small table

2.2 Approach

  • Register the small file's path in the distributed cache when setting up the MapReduce job
  • In the Mapper's setup phase, read the cached file into an in-memory collection
  • In the map phase, match each record against that collection

2.3 Requirement

Perform a MapJoin on the dept and emp data

2.4 Data

2.4.1 dept.txt

Fields (tab-separated): deptno, dname, message
10	ACCOUNTING	NEW YORK
20	RESEARCH	DALLAS
30	SALES	CHICAGO
40	OPERATIONS	BOSTON

2.4.2 emp.txt

Fields (tab-separated): empno, ename, deptno
7369	SMITH	20
7499	ALLEN	30
7521	WARD	30
7566	JONES	20
7654	MARTIN	30
7698	BLAKE	30
7782	CLARK	10
7788	SCOTT	20
7839	KING	10
7844	TURNER	30
7876	ADAMS	20
7900	JAMES	30
7902	FORD	20
7934	MILLER	10

2.5 Code

2.5.1 MapJoinDriver Code

package com.xk.bigata.hadoop.mapreduce.join;

import com.xk.bigata.hadoop.mapreduce.domain.DeptWritable;
import com.xk.bigata.hadoop.mapreduce.domain.EmpInfoWritable;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MapJoinDriver {

    public static void main(String[] args) throws Exception {

        String input = "mapreduce-basic/data/join/emp.txt";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the main class
        job.setJarByClass(MapJoinDriver.class);

        // 3 Set the Mapper class (map-only job, so no Reducer)
        job.setMapperClass(MyMapper.class);

        // 4 Set the Map output KEY and VALUE types (must match the Mapper's generics)
        job.setMapOutputKeyClass(EmpInfoWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // Set the number of reduce tasks to 0: a MapJoin has no reduce and therefore no shuffle
        job.setNumReduceTasks(0);

        // Cache the path of the small table (dept)
        job.addCacheFile(new URI("mapreduce-basic/data/join/dept.txt"));

        // 6 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }

    public static class MyMapper extends Mapper<LongWritable, Text, EmpInfoWritable, NullWritable> {

        Map<Integer, DeptWritable> cache = new HashMap<>();

        // Load the cached dept file into an in-memory map
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            String deptPath = context.getCacheFiles()[0].getPath();
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(deptPath)));
            String message;
            // readLine() returns null at EOF; StringUtils.isEmpty treats null as empty, which ends the loop
            while (!StringUtils.isEmpty(message = reader.readLine())) {
                String[] splits = message.split("\t");
                cache.put(Integer.parseInt(splits[0]), new DeptWritable(Integer.parseInt(splits[0]), splits[1], splits[2]));
            }
            reader.close();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            int deptNo = Integer.parseInt(splits[2]);
            // Look up the department in the in-memory cache: this lookup is the join
            DeptWritable deptWritable = cache.get(deptNo);
            // int empNo, String eName, int deptNo, String dName, String dMessage
            EmpInfoWritable empInfoWritable = new EmpInfoWritable(Integer.parseInt(splits[0]), splits[1], deptNo, deptWritable.getdName(), deptWritable.getdMessage());
            context.write(empInfoWritable, NullWritable.get());
        }
    }
}
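
The driver calls FileUtils.deleteFile, a small project utility whose source is not shown here. A minimal sketch of what it presumably does (remove the output directory if it already exists, so the job can be rerun) under that assumption:

package com.xk.bigata.hadoop.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class FileUtils {

    // Remove the output path recursively if it exists, so that
    // waitForCompletion does not fail with "output directory already exists"
    public static void deleteFile(Configuration conf, String output) throws IOException {
        FileSystem fileSystem = FileSystem.get(conf);
        Path path = new Path(output);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
    }
}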

2.5.2 DeptWritable Code

package com.xk.bigata.hadoop.mapreduce.domain;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class DeptWritable implements Writable {

    private int deptNo;

    private String dName;

    private String dMessage;

    public DeptWritable() {
    }

    @Override
    public String toString() {
        return deptNo + "\t" + dName + "\t" + dMessage;
    }

    public DeptWritable(Integer deptNo, String dName, String dMessage) {
        this.deptNo = deptNo;
        this.dName = dName;
        this.dMessage = dMessage;
    }

    public int getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(int deptNo) {
        this.deptNo = deptNo;
    }

    public String getdName() {
        return dName;
    }

    public void setdName(String dName) {
        this.dName = dName;
    }

    public String getdMessage() {
        return dMessage;
    }

    public void setdMessage(String dMessage) {
        this.dMessage = dMessage;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(deptNo);
        out.writeUTF(dName);
        out.writeUTF(dMessage);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.deptNo = in.readInt();
        this.dName = in.readUTF();
        this.dMessage = in.readUTF();
    }
}
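
One thing worth noting about Writable: readFields must read the fields in exactly the order write wrote them. A quick round-trip check (a hypothetical snippet, not part of the project) confirms the serialization:

package com.xk.bigata.hadoop.mapreduce.domain;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class DeptWritableRoundTrip {

    public static void main(String[] args) throws IOException {
        DeptWritable before = new DeptWritable(10, "ACCOUNTING", "NEW YORK");

        // Serialize with write()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));

        // Deserialize with readFields(): the fields come back in the same order
        DeptWritable after = new DeptWritable();
        after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(after); // 10	ACCOUNTING	NEW YORK
    }
}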

2.5.3 EmpWritable Code

package com.xk.bigata.hadoop.mapreduce.domain;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class EmpWritable implements Writable {

    private int empNo;

    private String eName;

    private int deptNo;

    @Override
    public String toString() {
        return empNo + "\t" + eName + "\t" + deptNo;
    }

    public EmpWritable() {
    }

    public EmpWritable(int empNo, String eName, int deptNo) {
        this.empNo = empNo;
        this.eName = eName;
        this.deptNo = deptNo;
    }

    public int getEmpNo() {
        return empNo;
    }

    public void setEmpNo(int empNo) {
        this.empNo = empNo;
    }

    public String geteName() {
        return eName;
    }

    public void seteName(String eName) {
        this.eName = eName;
    }

    public int getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(int deptNo) {
        this.deptNo = deptNo;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(empNo);
        out.writeUTF(eName);
        out.writeInt(deptNo);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.empNo = in.readInt();
        this.eName = in.readUTF();
        this.deptNo = in.readInt();
    }
}

2.5.4 EmpInfoWritable Code

package com.xk.bigata.hadoop.mapreduce.domain;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class EmpInfoWritable implements Writable {

    private int empNo;

    private String eName;

    private int deptNo;

    private String dName;

    private String dMessage;

    @Override
    public String toString() {
        return empNo + "\t" + eName + "\t" +
                deptNo + "\t" + dName + "\t" +
                dMessage;
    }

    public EmpInfoWritable(int empNo, String eName, int deptNo, String dName, String dMessage) {
        this.empNo = empNo;
        this.eName = eName;
        this.deptNo = deptNo;
        this.dName = dName;
        this.dMessage = dMessage;
    }

    public EmpInfoWritable() {
    }

    public int getEmpNo() {
        return empNo;
    }

    public void setEmpNo(int empNo) {
        this.empNo = empNo;
    }

    public String geteName() {
        return eName;
    }

    public void seteName(String eName) {
        this.eName = eName;
    }

    public int getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(int deptNo) {
        this.deptNo = deptNo;
    }

    public String getdName() {
        return dName;
    }

    public void setdName(String dName) {
        this.dName = dName;
    }

    public String getdMessage() {
        return dMessage;
    }

    public void setdMessage(String dMessage) {
        this.dMessage = dMessage;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(empNo);
        out.writeUTF(eName);
        out.writeInt(deptNo);
        out.writeUTF(dName);
        out.writeUTF(dMessage);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.empNo = in.readInt();
        this.eName = in.readUTF();
        this.deptNo = in.readInt();
        this.dName = in.readUTF();
        this.dMessage = in.readUTF();
    }
}

2.6 Result

7369	SMITH	20	RESEARCH	DALLAS
7499	ALLEN	30	SALES	CHICAGO
7521	WARD	30	SALES	CHICAGO
7566	JONES	20	RESEARCH	DALLAS
7654	MARTIN	30	SALES	CHICAGO
7698	BLAKE	30	SALES	CHICAGO
7782	CLARK	10	ACCOUNTING	NEW YORK
7788	SCOTT	20	RESEARCH	DALLAS
7839	KING	10	ACCOUNTING	NEW YORK
7844	TURNER	30	SALES	CHICAGO
7876	ADAMS	20	RESEARCH	DALLAS
7900	JAMES	30	SALES	CHICAGO
7902	FORD	20	RESEARCH	DALLAS
7934	MILLER	10	ACCOUNTING	NEW YORK


3 Reduce Join

3.1 When to use it

  • Joining one large table with another large table

3.2 Approach

  • Run both files through the map phase (they sit under the same input directory, so each Mapper reads one or the other)
  • Tag every record with a flag marking whether it came from emp or from dept
  • Perform the actual join on the Reduce side, keyed by deptno

3.3 Requirement

Perform a ReduceJoin on the dept and emp data

3.4 Data

3.4.1 dept.txt

Fields (tab-separated): deptno, dname, message
10	ACCOUNTING	NEW YORK
20	RESEARCH	DALLAS
30	SALES	CHICAGO
40	OPERATIONS	BOSTON

3.4.2 emp.txt

Fields (tab-separated): empno, ename, deptno
7369	SMITH	20
7499	ALLEN	30
7521	WARD	30
7566	JONES	20
7654	MARTIN	30
7698	BLAKE	30
7782	CLARK	10
7788	SCOTT	20
7839	KING	10
7844	TURNER	30
7876	ADAMS	20
7900	JAMES	30
7902	FORD	20
7934	MILLER	10

3.5 Code

3.5.1 ReduceJoinDriver Code

package com.xk.bigata.hadoop.mapreduce.join;

import com.xk.bigata.hadoop.mapreduce.domain.EmpInfoReduceJoinWritable;
import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReduceJoinDriver {

    public static void main(String[] args) throws Exception {

        String input = "mapreduce-basic/data/join";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the main class
        job.setJarByClass(ReduceJoinDriver.class);

        // 3 Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);

        // 4 Set the Map output KEY and VALUE types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(EmpInfoReduceJoinWritable.class);

        // 5 Set the Reduce output KEY and VALUE types
        job.setOutputKeyClass(EmpInfoReduceJoinWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, EmpInfoReduceJoinWritable> {

        private String fileName;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Work out which input file this split comes from
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            fileName = fileSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            if (fileName.contains("dept")) {
                // 进入的数据是dept
                String[] spits = value.toString().split("\t");
                EmpInfoReduceJoinWritable empInfoReduceJoinWritable = new EmpInfoReduceJoinWritable();
                empInfoReduceJoinWritable.setDeptNo(Integer.parseInt(spits[0]));
                empInfoReduceJoinWritable.setdName(spits[1]);
                empInfoReduceJoinWritable.setdMessage(spits[2]);
                empInfoReduceJoinWritable.setEmpNo(0);
                empInfoReduceJoinWritable.setFlg("0");
                empInfoReduceJoinWritable.seteName("");
                context.write(new Text(spits[0]), empInfoReduceJoinWritable);
            } else if (fileName.contains("emp")) {
                // 进来的数据是emp
                String[] splits = value.toString().split("\t");
                EmpInfoReduceJoinWritable empInfoReduceJoinWritable = new EmpInfoReduceJoinWritable();
                empInfoReduceJoinWritable.setEmpNo(Integer.parseInt(splits[0]));
                empInfoReduceJoinWritable.seteName(splits[1]);
                empInfoReduceJoinWritable.setDeptNo(Integer.parseInt(splits[2]));
                empInfoReduceJoinWritable.setFlg("1");
                empInfoReduceJoinWritable.setdMessage("");
                empInfoReduceJoinWritable.setdName("");
                context.write(new Text(splits[2]), empInfoReduceJoinWritable);
            }
        }
    }

    public static class MyReduce extends Reducer<Text, EmpInfoReduceJoinWritable, EmpInfoReduceJoinWritable, NullWritable> {
        // All emp and dept records that share the same deptno (the key) arrive together in values
        @Override
        protected void reduce(Text key, Iterable<EmpInfoReduceJoinWritable> values, Context context) throws IOException, InterruptedException {
            List<EmpInfoReduceJoinWritable> emps = new ArrayList<>();
            String dName = null;
            String dMessage = null;
            for (EmpInfoReduceJoinWritable emp : values) {
                if (emp.getFlg().equals("1")) {
                    // Record came from emp: copy it, because Hadoop reuses the value object across iterations
                    EmpInfoReduceJoinWritable empInfoReduceJoinWritable = new EmpInfoReduceJoinWritable();
                    empInfoReduceJoinWritable.setEmpNo(emp.getEmpNo());
                    empInfoReduceJoinWritable.seteName(emp.geteName());
                    empInfoReduceJoinWritable.setDeptNo(emp.getDeptNo());
                    emps.add(empInfoReduceJoinWritable);
                } else {
                    // Record came from dept: remember its name and location
                    dName = emp.getdName();
                    dMessage = emp.getdMessage();
                }
            }

            // Attach the dept fields to every buffered emp record and emit the joined rows
            for (EmpInfoReduceJoinWritable bean : emps) {
                bean.setdName(dName);
                bean.setdMessage(dMessage);
                context.write(bean, NullWritable.get());
            }

        }
    }

}

3.5.2 EmpInfoReduceJoinWritable Code

package com.xk.bigata.hadoop.mapreduce.domain;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class EmpInfoReduceJoinWritable implements Writable {

    private int empNo;

    private String eName;

    private int deptNo;

    private String dName;

    private String dMessage;

    private String flg;

    @Override
    public String toString() {
        return empNo + "\t" + eName + "\t" +
                deptNo + "\t" + dName + "\t" +
                dMessage;
    }

    public EmpInfoReduceJoinWritable(int empNo, String eName, int deptNo, String dName, String dMessage, String flg) {
        this.empNo = empNo;
        this.eName = eName;
        this.deptNo = deptNo;
        this.dName = dName;
        this.dMessage = dMessage;
        this.flg = flg;
    }

    public EmpInfoReduceJoinWritable() {
    }

    public int getEmpNo() {
        return empNo;
    }

    public void setEmpNo(int empNo) {
        this.empNo = empNo;
    }

    public String geteName() {
        return eName;
    }

    public void seteName(String eName) {
        this.eName = eName;
    }

    public int getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(int deptNo) {
        this.deptNo = deptNo;
    }

    public String getdName() {
        return dName;
    }

    public void setdName(String dName) {
        this.dName = dName;
    }

    public String getdMessage() {
        return dMessage;
    }

    public void setdMessage(String dMessage) {
        this.dMessage = dMessage;
    }

    public String getFlg() {
        return flg;
    }

    public void setFlg(String flg) {
        this.flg = flg;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(empNo);
        out.writeUTF(eName);
        out.writeInt(deptNo);
        out.writeUTF(dName);
        out.writeUTF(dMessage);
        out.writeUTF(flg);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.empNo = in.readInt();
        this.eName = in.readUTF();
        this.deptNo = in.readInt();
        this.dName = in.readUTF();
        this.dMessage = in.readUTF();
        this.flg = in.readUTF();
    }
}

3.6 Result

7934	MILLER	10	ACCOUNTING	NEW YORK
7839	KING	10	ACCOUNTING	NEW YORK
7782	CLARK	10	ACCOUNTING	NEW YORK
7876	ADAMS	20	RESEARCH	DALLAS
7788	SCOTT	20	RESEARCH	DALLAS
7369	SMITH	20	RESEARCH	DALLAS
7566	JONES	20	RESEARCH	DALLAS
7902	FORD	20	RESEARCH	DALLAS
7844	TURNER	30	SALES	CHICAGO
7499	ALLEN	30	SALES	CHICAGO
7698	BLAKE	30	SALES	CHICAGO
7654	MARTIN	30	SALES	CHICAGO
7521	WARD	30	SALES	CHICAGO
7900	JAMES	30	SALES	CHICAGO
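
Note that department 40 (OPERATIONS) appears in neither result: no employee references it, and both joins only emit employee records, so each behaves like an inner join on deptno.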


4 TopN

  • Grouped TopN means grouping first, then taking the N records with the largest values of some field within each group

4.1 Requirement

Find the top two salaries in each department of the emp data.
The first field is the employee number,
the second field is the name,
the sixth field is the salary (the seventh field, comm, may be empty),
and the last field is the department number.

4.2 Data

7369	SMITH	CLERK	7902	1980-12-17	800.00		20
7499	ALLEN	SALESMAN	7698	1981-2-20	1600.00	300.00	30
7521	WARD	SALESMAN	7698	1981-2-22	1250.00	500.00	30
7566	JONES	MANAGER	7839	1981-4-2	2975.00		20
7654	MARTIN	SALESMAN	7698	1981-9-28	1250.00	1400.00	30
7698	BLAKE	MANAGER	7839	1981-5-1	2850.00		30
7782	CLARK	MANAGER	7839	1981-6-9	2450.00		10
7788	SCOTT	ANALYST	7566	1987-4-19	3000.00		20
7839	KING	PRESIDENT		1981-11-17	5000.00		10
7844	TURNER	SALESMAN	7698	1981-9-8	1500.00	0.00	30
7876	ADAMS	CLERK	7788	1987-5-23	1100.00		20
7900	JAMES	CLERK	7698	1981-12-3	950.00		30
7902	FORD	ANALYST	7566	1981-12-3	3000.00		20
7934	MILLER	CLERK	7782	1982-1-23	1300.00		10

4.3 Code

4.3.1 TopNEmpWritable Code

package com.xk.bigata.hadoop.mapreduce.top;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TopNEmpWritable implements Writable {

    private int empNo;

    private String eName;

    private int deptNo;

    private double wages;

    @Override
    public String toString() {
        return empNo + "\t" + eName + "\t" + deptNo + "\t" + wages;
    }

    public TopNEmpWritable() {
    }

    public TopNEmpWritable(int empNo, String eName, int deptNo, double wages) {
        this.empNo = empNo;
        this.eName = eName;
        this.deptNo = deptNo;
        this.wages = wages;
    }

    public int getEmpNo() {
        return empNo;
    }

    public void setEmpNo(int empNo) {
        this.empNo = empNo;
    }

    public String geteName() {
        return eName;
    }

    public void seteName(String eName) {
        this.eName = eName;
    }

    public int getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(int deptNo) {
        this.deptNo = deptNo;
    }

    public double getWages() {
        return wages;
    }

    public void setWages(double wages) {
        this.wages = wages;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(empNo);
        out.writeUTF(eName);
        out.writeInt(deptNo);
        out.writeDouble(wages);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.empNo = in.readInt();
        this.eName = in.readUTF();
        this.deptNo = in.readInt();
        this.wages = in.readDouble();
    }
}

4.3.2 GroupTopDriver Code

package com.xk.bigata.hadoop.mapreduce.top;

import com.xk.bigata.hadoop.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class GroupTopDriver {

    public static void main(String[] args) throws Exception {

        String input = "mapreduce-basic/data/emp.data";
        String output = "mapreduce-basic/out";

        // 1 Create the MapReduce job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Delete the output path if it already exists
        FileUtils.deleteFile(job.getConfiguration(), output);

        // 2 Set the main class
        job.setJarByClass(GroupTopDriver.class);

        // 3 Set the Mapper and Reducer classes
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 4 Set the Map output KEY and VALUE types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TopNEmpWritable.class);

        // 5 Set the Reduce output KEY and VALUE types
        job.setOutputKeyClass(TopNEmpWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 7 Submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, TopNEmpWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] splits = value.toString().split("\t");
            // Fields: 0 empno, 1 ename, 5 sal, 7 deptno (index 6, comm, may be empty)
            TopNEmpWritable topNEmpWritable = new TopNEmpWritable(Integer.parseInt(splits[0]),
                    splits[1],
                    Integer.parseInt(splits[7]),
                    Double.parseDouble(splits[5]));
            context.write(new Text(splits[7]), topNEmpWritable);
        }
    }

    public static class MyReducer extends Reducer<Text, TopNEmpWritable, TopNEmpWritable, NullWritable> {
        int topN = 2;

        @Override
        protected void reduce(Text key, Iterable<TopNEmpWritable> values, Context context) throws IOException, InterruptedException {
            // Values arrive in no particular order, so buffer and sort them by salary first.
            // Copy each record: Hadoop reuses the value object across iterations.
            List<TopNEmpWritable> emps = new ArrayList<>();
            for (TopNEmpWritable value : values) {
                emps.add(new TopNEmpWritable(value.getEmpNo(), value.geteName(), value.getDeptNo(), value.getWages()));
            }
            emps.sort(Comparator.comparingDouble(TopNEmpWritable::getWages).reversed());
            // Emit the topN highest-paid employees of this department
            for (int i = 0; i < Math.min(topN, emps.size()); i++) {
                context.write(emps.get(i), NullWritable.get());
            }
        }
    }
}
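
Sorting the whole group, as the reducer above does, is fine when each department is small. For very large groups, a bounded min-heap of size topN avoids buffering and sorting every record; a sketch of that variant (the class name TopNHeapSketch is hypothetical):

package com.xk.bigata.hadoop.mapreduce.top;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class TopNHeapSketch {

    // Keep only the n highest salaries seen so far in a min-heap of size n
    public static List<TopNEmpWritable> topN(Iterable<TopNEmpWritable> values, int n) {
        PriorityQueue<TopNEmpWritable> heap =
                new PriorityQueue<>(Comparator.comparingDouble(TopNEmpWritable::getWages));
        for (TopNEmpWritable value : values) {
            // Copy the record: Hadoop reuses the value object across iterations
            heap.offer(new TopNEmpWritable(value.getEmpNo(), value.geteName(),
                    value.getDeptNo(), value.getWages()));
            if (heap.size() > n) {
                heap.poll(); // evict the current minimum, keeping the n largest
            }
        }
        // Drain the heap and sort descending for readable output
        List<TopNEmpWritable> result = new ArrayList<>(heap);
        result.sort(Comparator.comparingDouble(TopNEmpWritable::getWages).reversed());
        return result;
    }
}

The reduce method would then simply write each entry of the returned list, so memory per group stays proportional to n rather than to the group size.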

4.4 Result

7839	KING	10	5000.0
7782	CLARK	10	2450.0
7788	SCOTT	20	3000.0
7902	FORD	20	3000.0
7698	BLAKE	30	2850.0
7499	ALLEN	30	1600.0
