computer,huangxiaoming,85,86,41,75,93,42,85
computer,xuzheng,54,52,86,91,42
computer,huangbo,85,42,96,38
english,zhaobenshan,54,52,86,91,42,85,75
english,liuyifei,85,41,75,21,85,96,14
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
algorithm,huangdatou,48,58,67,86,15,33,85
algorithm,huangzitao,85,86,41,75,93,42,85,75

Data description:

  • The number of fields per line is not fixed. The first field is the course name; there are four courses in total: computer, math, english, algorithm.
  • The second field is the student name; every field after that is the score of one exam (see the parsing sketch below).
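
A minimal standalone sketch of the parsing logic (not part of the original jobs; the class name RecordParseDemo is made up): field 0 is the course, field 1 the student, and everything after that is a score.

public class RecordParseDemo {
	public static void main(String[] args) {
		String line = "computer,huangxiaoming,85,86,41,75,93,42,85";
		String[] datas = line.split(",");
		int sum = 0;
		for (int i = 2; i < datas.length; i++) {
			sum += Integer.parseInt(datas[i].trim());
		}
		// multiply first so the division is floating point: 507 / 7 = 72.43 (rounded)
		double avg = sum * 1.0 / (datas.length - 2);
		System.out.printf("%s\t%s\t%.2f%n", datas[0], datas[1], avg);
	}
}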

1. Count the number of students taking each course and the course average score

package com.hadoop.mapreduce.shuffle;

/**
 * Count the number of students taking each course (i.e. the number of exam
 * takers) and the course average score
 */
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Test1 {
	/*
	 * Map side:
	 * key:   course name          Text
	 * value: per-student average  DoubleWritable
	 */
	static class MyMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
		Text mk = new Text();
		DoubleWritable mv = new DoubleWritable();

		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			String[] datas = value.toString().split(",");
			int count = 0;
			double avg = 0;
			int sum = 0;
			// Scores start at the third field
			for (int i = 2; i < datas.length; i++) {
				sum += Integer.parseInt(datas[i].trim());
				count++;
			}
			// Multiply before dividing so the division is done in floating point;
			// `sum / count * 1.0` would truncate to an integer first
			avg = sum * 1.0 / count;
			mk.set(datas[0]);
			mv.set(avg);
			context.write(mk, mv);
		}

	}
	/*
	 * Reduce side:
	 * key:   course name                     Text
	 * value: student count + course average  Text
	 */
	static class MyReducer extends Reducer<Text, DoubleWritable, Text, Text> {
		Text rv = new Text();

		@Override
		protected void reduce(Text key, Iterable<DoubleWritable> value,
				Reducer<Text, DoubleWritable, Text, Text>.Context context) throws IOException, InterruptedException {
			double sum = 0;
			int count = 0;
			double avg = 0;
			// Sum all the per-student averages
			for (DoubleWritable v : value) {
				// v.get() unwraps the DoubleWritable to a plain Java double
				sum += v.get();
				count++;
			}
			avg = sum / count; // sum is already a double, so this divides in floating point
			rv.set(count + "\t" + String.format("%.2f", avg));
			context.write(key, rv);
		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Test1.class);

		job.setMapperClass(MyMapper.class);

		job.setReducerClass(MyReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(DoubleWritable.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		Path inpath = new Path("F:\\test\\test1.txt");
		FileInputFormat.addInputPath(job, inpath);

		Path outpath = new Path("F:\\test\\testout\\");
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);

		job.waitForCompletion(true);
	}

}

Result:

algorithm	6	71.60
computer	10	69.80
english	9	66.23
math	7	72.88
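
As a sanity check, the algorithm row can be reproduced by hand. The mapper emits one per-student average per record (six algorithm records), and the reducer then averages those six values, so this is an average of averages, not an average over all raw scores:

(62.14 + 74.43 + 82.29 + 82.00 + 56.00 + 72.75) / 6 ≈ 71.60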

2. Compute each student's average score for each course, then write the results to a separate output file per course

package com.hadoop.mapreduce.shuffle;

/**
 * Compute each student's average score for each course, then write the
 * results to a separate output file per course
 */
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Test2 {
	/*
	 * This problem mainly exercises Hadoop's Partitioner class.
	 * Map side:
	 * key:   course name           Text
	 * value: name + average score  Text
	 */
	static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
		Text mk = new Text();
		Text mv = new Text();

		@Override
		protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			// Same parsing as in problem 1
			String[] datas = value.toString().split(",");
			double avg = 0;
			int sum = 0;
			int count = 0;
			for (int i = 2; i < datas.length; i++) {
				sum += Integer.parseInt(datas[i].trim());
				count++;
			}
			// Multiply first so the division is done in floating point
			avg = sum * 1.0 / count;
			mk.set(datas[0]);
			// Format to two decimals; the original bare String.format call
			// discarded its result (and "%,2f" is not a valid format string)
			mv.set(datas[1] + "\t" + String.format("%.2f", avg));
			context.write(mk, mv);
		}

	}

	static class MyReducer extends Reducer<Text, Text, Text, Text> {
		// Nothing else is required here; just write every value through
		@Override
		protected void reduce(Text key, Iterable<Text> value, Reducer<Text, Text, Text, Text>.Context context)
				throws IOException, InterruptedException {
			for (Text v : value) {
				context.write(key, v);
			}
		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);
		
		// Register the custom partitioner and set the number of reduce tasks
		// (not map tasks) to match the number of partitions: one per course
		job.setPartitionerClass(MyPartitioner.class);
		job.setNumReduceTasks(4);

		job.setJarByClass(Test2.class);

		job.setMapperClass(MyMapper.class);

		job.setReducerClass(MyReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		Path inpath = new Path("F:\\test\\test1.txt");
		FileInputFormat.addInputPath(job, inpath);

		Path outpath = new Path("F:\\test\\testout\\");
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);

		job.waitForCompletion(true);
	}

}

// A custom partitioner extending Partitioner; the map output types are Text, Text
class MyPartitioner extends Partitioner<Text, Text> {
	// A lookup map keeps the course-to-partition assignment explicit
	static Map<String, Integer> map = new HashMap<String, Integer>();
	// The keys are the course names, so they can be compared directly against
	// the toString value of the map output key.
	// Partition numbers must start at 0 (output files are numbered from 0);
	// otherwise you risk an "Illegal partition for ..." exception.
	static {
		map.put("algorithm", 0);
		map.put("computer", 1);
		map.put("english", 2);
		map.put("math", 3);
	}

	// Override getPartition
	@Override
	public int getPartition(Text key, Text value, int numPartitions) {
		// Look the course name up in the assignment map
		String subject = key.toString();
		Integer id = map.get(subject);
		// The id is the partition number: a record with id n goes to partition n
		return id;
	}

}
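
The lookup above assumes every course name is present in the map and that the job runs with exactly four reducers. A slightly hardened variant (a sketch, not the code used above; the class name SafePartitioner is made up) falls back to partition 0 for unknown courses and wraps the id when fewer reducers are configured:

class SafePartitioner extends Partitioner<Text, Text> {
	@Override
	public int getPartition(Text key, Text value, int numPartitions) {
		Integer id = MyPartitioner.map.get(key.toString());
		if (id == null) {
			// unknown course: send it to partition 0 instead of throwing an NPE
			return 0;
		}
		// wrap around if the job was configured with fewer reducers than courses
		return id % numPartitions;
	}
}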

Result:

algorithm	huangzitao	72.75
algorithm	huangdatou	56.00
algorithm	liuyifei	62.14
algorithm	huanglei	74.43
algorithm	huangjiaju	82.29
algorithm	liutao	82.00
computer	huangzitao	72.43
computer	huanglei	74.43
computer	liutao	83.00
computer	liuyifei	62.14
computer	liujialing	64.11
computer	huangdatou	56.00
computer	huangjiaju	83.20
computer	huangbo	65.25
computer	xuzheng	65.00
computer	huangxiaoming	72.43
english	huangbo	55.00
english	liujialing	62.14
english	huangdatou	56.00
english	liuyifei	74.43
english	zhaobenshan	69.29
english	huangxiaoming	72.43
english	zhouqi	64.18
english	liuyifei	59.57
english	huanglei	83.00
math	huangjiaju	82.29
math	xuzheng	69.29
math	huangxiaoming	83.00
math	liujialing	72.75
math	liutao	56.00
math	huanglei	74.43
math	wangbaoqiang	72.43

3. For each course, find the two students with the highest average score

package com.hadoop.mapreduce.shuffle;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Composite key: course, student name, and average score. It implements
 * WritableComparable so it can serve as a map output key and be sorted.
 */
public class CourseBean implements WritableComparable<CourseBean> {
	// Fields follow the required output
	private String subject;
	private String name;
	private double avg;

	public CourseBean() {
		super();
	}

	public CourseBean(String subject, String name, double avg) {
		super();
		this.subject = subject;
		this.name = name;
		this.avg = avg;
	}

	public String getSubject() {
		return subject;
	}

	public void setSubject(String subject) {
		this.subject = subject;
	}

	public String getName() {
		return name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public double getAvg() {
		return avg;
	}

	// Be sure to override toString, otherwise the default Object
	// representation (class name + hash) is printed
	@Override
	public String toString() {
		return subject + "\t" + name + "\t" + String.format("%.2f", avg);
	}

	public void setAvg(double avg) {
		this.avg = avg;
	}

	// Sort by course name ascending, then by average score descending
	@Override
	public int compareTo(CourseBean o) {
		int temp = this.subject.compareTo(o.subject);
		if (temp == 0)
			// Double.compare avoids the original (int) cast of the difference,
			// which truncated fractional gaps (e.g. 83.2 vs 83.0) to "equal"
			temp = Double.compare(o.avg, this.avg);
		return temp;
	}

	// Serialization: required for every custom Writable
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(subject);
		out.writeUTF(name);
		out.writeDouble(avg);
	}

	// Deserialization: fields must be read back in the same order they were written
	@Override
	public void readFields(DataInput in) throws IOException {
		subject = in.readUTF();
		name = in.readUTF();
		avg = in.readDouble();
	}

}
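
A minimal round-trip check of the Writable contract (a sketch, not part of the original post; the class name CourseBeanRoundTrip is made up, and it assumes it sits in the same package as CourseBean). write() serializes the bean into bytes, and readFields() must read them back in exactly the same order:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class CourseBeanRoundTrip {
	public static void main(String[] args) throws IOException {
		CourseBean in = new CourseBean("math", "huangxiaoming", 83.0);

		// serialize: write() -> byte[]
		ByteArrayOutputStream bytes = new ByteArrayOutputStream();
		in.write(new DataOutputStream(bytes));

		// deserialize: byte[] -> readFields()
		CourseBean out = new CourseBean();
		out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

		System.out.println(out); // math	huangxiaoming	83.00
	}
}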
package com.hadoop.mapreduce.shuffle;

/**
 * For each course, find the two students with the highest average score
 */
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
 * For this problem we sort by course, then by average score within the same
 * course, and finally take the top two records of each group. This requires
 * a custom key class and a grouping comparator.
 */
public class Test3 {
	/*
	 * Map side:
	 * key:   a custom CourseBean carrying all the output fields
	 * value: a NullWritable is enough
	 */
	static class MyMapper extends Mapper<LongWritable, Text, CourseBean, NullWritable> {

		@Override
		protected void map(LongWritable key, Text value,
				Mapper<LongWritable, Text, CourseBean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			// Same parsing as in the previous two problems
			String[] datas = value.toString().split(",");
			double avg = 0;
			int sum = 0;
			int count = 0;
			for (int i = 2; i < datas.length; i++) {
				sum += Integer.parseInt(datas[i].trim());
				count++;
			}
			// Multiply first so the division is done in floating point
			avg = sum * 1.0 / count;
			CourseBean mk = new CourseBean(datas[0], datas[1].trim(), avg);
			context.write(mk, NullWritable.get());
		}

	}

	static class MyReducer extends Reducer<CourseBean, NullWritable, CourseBean, NullWritable> {

		@Override
		protected void reduce(CourseBean key, Iterable<NullWritable> value,
				Reducer<CourseBean, NullWritable, CourseBean, NullWritable>.Context context)
				throws IOException, InterruptedException {
			int count = 0;
			// Within a group the records arrive sorted by average (descending),
			// and the key object is updated to the current record as we iterate,
			// so writing it emits the current student
			for (NullWritable v : value) {
				context.write(key, v);
				count++;
				// Only the top two are needed, so stop after two records
				if (count == 2)
					break;
			}
			}

		}

	}

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(Test3.class);

		job.setMapperClass(MyMapper.class);

		job.setReducerClass(MyReducer.class);

		job.setMapOutputKeyClass(CourseBean.class);
		job.setMapOutputValueClass(NullWritable.class);


		job.setOutputKeyClass(CourseBean.class);
		job.setOutputValueClass(NullWritable.class);
		// Register the grouping comparator: records with the same course name
		// are grouped into a single reduce call
		job.setGroupingComparatorClass(SubjectScoreGroupComparator.class);

	
		Path inpath = new Path("F:\\test\\test1.txt");
		FileInputFormat.addInputPath(job, inpath);

		Path outpath = new Path("F:\\test\\testout\\");
		FileSystem fs = FileSystem.get(conf);
		if (fs.exists(outpath)) {
			fs.delete(outpath, true);
		}
		FileOutputFormat.setOutputPath(job, outpath);


		job.waitForCompletion(true);
	}

	public static class SubjectScoreGroupComparator extends WritableComparator {
		// Looking at the WritableComparator source shows the parent constructor
		// must be called with createInstances = true, so that key instances are
		// created and a NullPointerException is avoided
		public SubjectScoreGroupComparator() {
			super(CourseBean.class, true);
		}

		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			CourseBean cb1 = (CourseBean) a;
			CourseBean cb2 = (CourseBean) b;
			// Group only by course name; student name and average are ignored
			return cb1.getSubject().compareTo(cb2.getSubject());
		}
	}

}
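
To see what the sort and grouping comparators achieve together, here is a plain-Java sketch (not part of the original post; TopTwoDemo is a made-up name, and it assumes it sits in the same package as CourseBean). compareTo orders records by course and then by descending average, so grouping by course name means the first two records of each group are exactly the top two students:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class TopTwoDemo {
	public static void main(String[] args) {
		List<CourseBean> beans = new ArrayList<CourseBean>(Arrays.asList(
				new CourseBean("math", "huangxiaoming", 83.0),
				new CourseBean("math", "liutao", 56.0),
				new CourseBean("math", "huangjiaju", 82.29)));

		Collections.sort(beans); // uses CourseBean.compareTo: course asc, average desc

		String currentSubject = null;
		int count = 0;
		for (CourseBean b : beans) {
			if (!b.getSubject().equals(currentSubject)) {
				// a new "group", playing the role of the grouping comparator
				currentSubject = b.getSubject();
				count = 0;
			}
			if (++count <= 2) {
				System.out.println(b); // top two of each course
			}
		}
	}
}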

Result:

algorithm	huangjiaju	82.29
algorithm	liutao	82.00
computer	huangjiaju	83.20
computer	liutao	83.00
english	huanglei	83.00
english	liuyifei	74.43
math	huangxiaoming	83.00
math	huangjiaju	82.29