Problem statement and data:

computer,huangxiaoming,85,86,41,75,93,42,85
computer,xuzheng,54,52,86,91,42
computer,huangbo,85,42,96,38
english,zhaobenshan,54,52,86,91,42,85,75
english,liuyifei,85,41,75,21,85,96,14
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
algorithm,huangdatou,48,58,67,86,15,33,85
algorithm,huangzitao,85,86,41,75,93,42,85,75

I. Data description

The number of fields per record is not fixed:
the first field is the course name (four courses in total: computer, math, english, algorithm),
the second field is the student name, and the remaining fields are the scores of the individual exams.

II. Statistics requirements:
1. For each course, count how many students took it and compute the course average.

2. For each course, compute every student's average score and write the results into separate output files, one file per course, sorted by average score from high to low, with the score kept to one decimal place.

3. For each course, find the student with the highest average score and output the course, name, and average.

Analysis: 1. For the course average, the mapper first computes each student's average for that course, and the reducer then computes the overall average from those per-student values.
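A hand-worked example from the data above: huangbo's computer record (85,42,96,38) averages to 261 / 4 = 65.25, so the mapper emits (computer, 65.25); the reducer then averages the per-student values of all computer records. Note that this defines the course average as the mean of the per-student averages, not the mean of all raw scores.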

/**
 * @author: lpj   
 * @date: 2018-03-16 19:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 *
 */
public class StudentScore3MR {
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");// load the cluster configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");// run against the cluster as the hadoop user
		FileSystem fs = FileSystem.get(conf);// defaults to the local file system
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(StudentScore3MR.class);
		job.setMapperClass(StudentScore3MR_Mapper.class);
		job.setReducerClass(StudentScore3MR_Reducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
//		
//		String inputpath = args[0];
//		String outpath = args[1];
		
		Path inputPath = new Path("d:/a/homework6.txt");
		Path outputPath = new Path("d:/a/homework6");
		if (fs.exists(outputPath)) {
			fs.delete(outputPath, true);// delete the output directory if it already exists
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	//1. Count the number of students who took each course and the course average
	public static class StudentScore3MR_Mapper extends Mapper<LongWritable, Text, Text, Text>{
		Text kout = new Text();
		Text valueout = new Text();
		@Override
		protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
			//algorithm,huangzitao,85,86,41,75,93,42,85,75
			String [] reads = value.toString().trim().split(",");
			String kk = reads[0];
			int sum = 0;
			int count = 0;
			double avg = 0;
			for(int i = 2; i < reads.length; i++){
				sum += Integer.parseInt(reads[i]);
				count++;
			}
			avg = 1.0 * sum / count;
			String vv = avg + "";
			kout.set(kk);
			valueout.set(vv);
			context.write(kout, valueout);
		}
	}
	public static class StudentScore3MR_Reducer extends Reducer<Text, Text, Text, Text>{
		Text kout = new Text();
		Text valueout = new Text();
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)throws IOException, InterruptedException {
			double sum = 0;
			int count = 0;
			double avg = 0;
			for(Text text : values){
				sum += Double.parseDouble(text.toString());
				count ++;
			}
			avg = sum / count;
			String vv = count + "\t" + avg;
			valueout.set(vv);
			context.write(key, valueout);
		}
		
	}

}

Result:

algorithm	6	71.60119047619047
computer	10	69.79896825396825
english	9	66.22655122655122
math	7	72.88265306122449

2. To store the output in separate result files, the number of reduce tasks must be set with setNumReduceTasks and records routed to them with a custom Partitioner; to sort by average score, the record is wrapped in a bean whose WritableComparable implementation defines the sort order, keeping one decimal place in the output.
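As a quick sanity check, computed by hand from the data above: huangjiaju's computer average is 416 / 5 = 83.2 and liutao's is 664 / 8 = 83.0, the two highest in that course, so the computer result file should start with lines of the form:

computer	huangjiaju	83.2
computer	liutao	83.0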

Entity class definition:

/**
 * @author: lpj   
 * @date: 2018-03-14 21:46:02
 * @Description:
 */
package lpj.reduceWorkbean;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.DecimalFormat;

import org.apache.hadoop.io.WritableComparable;

/**
 *
 */
public class Student implements WritableComparable<Student>{
	private String name;
	private double score;
	private String course;
	

	public String getName() {
		return name;
	}
	public void setName(String name) {
		this.name = name;
	}
	public double getScore() {
		return score;
	}
	public void setScore(double score) {
		this.score = score;
	}
	public String getCourse() {
		return course;
	}
	public void setCourse(String course) {
		this.course = course;
	}
	
	@Override
	public String toString() {
		// "0.0" always keeps exactly one decimal place ("#.#" would drop a trailing zero, e.g. 83.0 -> "83")
		DecimalFormat fs = new DecimalFormat("0.0");
		return course + "\t" + name + "\t" + fs.format(score);
	}

	public Student() {
		
	}


	public Student(String name, double score, String course) {
		super();
		this.name = name;
		this.score = score;
		this.course = course;
	}
	@Override
	public int compareTo(Student o) {
		int diff = this.course.compareTo(o.course);
		if (diff == 0) {
			// same course: order by average score, highest first;
			// Double.compare avoids losing differences smaller than 1 when casting to int
			return Double.compare(o.score, this.score);
		} else {
			return diff > 0 ? 1 : -1;
		}
	}
	/* (non-Javadoc)
	 * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		name = in.readUTF();
		score = in.readDouble();
		course = in.readUTF();
	}
	/* (non-Javadoc)
	 * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(name);
		out.writeDouble(score);
		out.writeUTF(course);
	}
	

}
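
A minimal local sanity check of this ordering (not part of the original job; the class name StudentSortCheck and the score values are illustrative, and the Hadoop jars plus the Student class are assumed to be on the classpath): sorting a small list with Collections.sort should give ascending course order and, within a course, descending average score.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import lpj.reduceWorkbean.Student;

public class StudentSortCheck {
	public static void main(String[] args) {
		List<Student> list = new ArrayList<Student>();
		list.add(new Student("huangbo", 65.25, "computer"));
		list.add(new Student("huangjiaju", 83.2, "computer"));
		list.add(new Student("liuyifei", 62.1, "algorithm"));
		Collections.sort(list);// uses Student.compareTo
		for (Student s : list) {
			System.out.println(s);// prints course \t name \t average
		}
		// expected order: algorithm/liuyifei, then computer/huangjiaju (83.2), then computer/huangbo (65.25)
	}
}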

Partitioner definition:

/**
 * @author: lpj   
 * @date: 2018-03-16 22:13:24
 * @Description:
 */
package lpj.reduceWorkbean;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 *
 */
public class MyPatitioner extends Partitioner<Student, NullWritable>{

	/* (non-Javadoc)
	 * @see org.apache.hadoop.mapreduce.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
	 */
	@Override
	public int getPartition(Student key, NullWritable value, int numPartitions) {
		// route each course to its own reduce task, and therefore its own output file
		String course = key.getCourse();
		if ("math".equals(course)) {
			return 0;
		} else if ("english".equals(course)) {
			return 1;
		} else if ("computer".equals(course)) {
			return 2;
		} else {
			return 3;
		}
	}

}
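
Note that the partition indices 0-3 returned here have to be matched by job.setNumReduceTasks(4) in the driver below; with fewer reduce tasks configured, any index outside the valid range makes the map tasks fail with an "Illegal partition" error.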

Main program:

/**
 * @author: lpj   
 * @date: 2018-03-16 19:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import lpj.reduceWorkbean.MyPatitioner;
import lpj.reduceWorkbean.Student;
/**
 *
 */
public class StudentScore3_2MR2 {
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");// load the cluster configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");// run against the cluster as the hadoop user
		FileSystem fs = FileSystem.get(conf);// defaults to the local file system
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(StudentScore3_2MR2.class);
		job.setMapperClass(StudentScore3MR_Mapper.class);
		job.setReducerClass(StudentScore3MR_Reducer.class);
		
		job.setMapOutputKeyClass(Student.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Student.class);
		job.setOutputValueClass(NullWritable.class);
		
		job.setPartitionerClass(MyPatitioner.class);// set the custom partitioner
		job.setNumReduceTasks(4);// one reduce task (and output file) per course
//		
//		String inputpath = args[0];
//		String outpath = args[1];
		
		Path inputPath = new Path("d:/a/homework6.txt");
		Path outputPath = new Path("d:/a/homework6_2");
		if (fs.exists(outputPath)) {
			fs.delete(outputPath, true);// delete the output directory if it already exists
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	//2. Compute each student's average per course; one result file per course, sorted by average descending, one decimal place
	public static class StudentScore3MR_Mapper extends Mapper<LongWritable, Text, Student, NullWritable>{
		Text kout = new Text();
		Text valueout = new Text();
		Student stu = new Student();
		@Override
		protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
			//algorithm,huangzitao,85,86,41,75,93,42,85,75
			String [] reads = value.toString().trim().split(",");
			String kk = reads[0];
			int sum = 0;
			int count = 0;
			double avg = 0;
			for(int i = 2; i < reads.length; i++){
				sum += Integer.parseInt(reads[i]);
				count++;
			}
			avg = 1.0 * sum / count;
		
			stu.setCourse(kk);
			stu.setName(reads[1]);
			stu.setScore(avg);


			context.write(stu, NullWritable.get());
		}
	}
	public static class StudentScore3MR_Reducer extends Reducer< Student, NullWritable,  Student, NullWritable>{
		Text kout = new Text();
		Text valueout = new Text();
		@Override
		protected void reduce(Student key, Iterable<NullWritable> values, Context context)throws IOException, InterruptedException {
			// Two students in the same course can have identical averages, and the default
			// grouping (the key's compareTo) would then merge them into a single group.
			// Iterating over the values updates the key for each record, so every student is written.
			for (NullWritable nv : values) {
				context.write(key, NullWritable.get());
			}
		}
		
	}

}



3. This task involves both sorting and grouping. Grouping is done with a WritableComparator that specifies the grouping fields. Note the relationship between the grouping fields and the sort fields: the grouping fields must be a leading prefix of the sort fields.

For example, if the sort key is a, b, c, d, e, then the grouping key can only be one of the following:

a  /  a,b  /  a,b,c  /  a,b,c,d  /  a,b,c,d,e  -- no skipping fields.

In other words, the sort key must contain at least as many fields as the grouping key and must include the grouping fields as its prefix.
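
In this task the sort key defined by Student.compareTo is (course, average score descending) and the grouping key defined below is course alone, the leading sort field, so the rule is satisfied: within each reduce group the records arrive sorted by average score from high to low.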


Using the grouping comparator component:



The entity class is the same as in task 2.

Grouping class code:

/**
 * @author: lpj   
 * @date: 2018-03-16 22:36:55
 * @Description:
 */
package lpj.reduceWorkbean;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 *
 */
public class MyGroup extends WritableComparator{
	

	public MyGroup() {
		super(Student.class, true);// register the key class and let the comparator create key instances
	}

	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		
		Student s1 = (Student)a;
		Student s2 = (Student)b;
		
		return s1.getCourse().compareTo(s2.getCourse());// group records by course only
	}
	
	

}



Main class code:




/**
 * @author: lpj   
 * @date: 2018-03-16 19:16:47
 * @Description:
 */
package lpj.reduceWork;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import lpj.reduceWorkbean.MyGroup;
import lpj.reduceWorkbean.MyPatitioner;
import lpj.reduceWorkbean.Student;
/**
 *
 */
public class StudentScore3_3MR3 {
	
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
//		conf.addResource("hdfs-site.xml");// load the cluster configuration file
//		System.setProperty("HADOOP_USER_NAME", "hadoop");// run against the cluster as the hadoop user
		FileSystem fs = FileSystem.get(conf);// defaults to the local file system
		
		Job job = Job.getInstance(conf);
		job.setJarByClass(StudentScore3_3MR3.class);
		job.setMapperClass(StudentScore3MR_Mapper.class);
		job.setReducerClass(StudentScore3MR_Reducer.class);
		
		job.setMapOutputKeyClass(Student.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Student.class);
		job.setOutputValueClass(NullWritable.class);
		job.setGroupingComparatorClass(MyGroup.class);// set the grouping comparator
		
		Path inputPath = new Path("d:/a/homework6.txt");
		Path outputPath = new Path("d:/a/homework6_3");
		if (fs.exists(outputPath)) {
			fs.delete(outputPath, true);// delete the output directory if it already exists
		}
		
		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		boolean isdone = job.waitForCompletion(true);
		System.exit(isdone ? 0 : 1);
	}
	//3. Find, for each course, the student with the highest average: course, name, and average
	public static class StudentScore3MR_Mapper extends Mapper<LongWritable, Text, Student, NullWritable>{
		Text kout = new Text();
		Text valueout = new Text();
		Student stu = new Student();
		@Override
		protected void map(LongWritable key, Text value,Context context)throws IOException, InterruptedException {
			//algorithm,huangzitao,85,86,41,75,93,42,85,75
			String [] reads = value.toString().trim().split(",");
			String kk = reads[0];
			int sum = 0;
			int count = 0;
			double avg = 0;
			for(int i = 2; i < reads.length; i++){
				sum += Integer.parseInt(reads[i]);
				count++;
			}
			avg = 1.0 * sum / count;
		
			stu.setCourse(kk);
			stu.setName(reads[1]);
			stu.setScore(avg);
			context.write(stu, NullWritable.get());
		}
	}
	public static class StudentScore3MR_Reducer extends Reducer< Student, NullWritable,  Student, NullWritable>{
		Text kout = new Text();
		Text valueout = new Text();
		@Override
		protected void reduce(Student key, Iterable<NullWritable> values, Context context)throws IOException, InterruptedException {
			// With MyGroup, one reduce call covers an entire course; the records arrive sorted by
			// average score descending (Student.compareTo), so the first key in the group is the
			// top student of that course and is the only one written out.
			context.write(key, NullWritable.get());
		}
		
	}

}