题目及数据:
computer,huangxiaoming,85,86,41,75,93,42,85
computer,xuzheng,54,52,86,91,42
computer,huangbo,85,42,96,38
english,zhaobenshan,54,52,86,91,42,85,75
english,liuyifei,85,41,75,21,85,96,14
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
algorithm,huangdatou,48,58,67,86,15,33,85
algorithm,huangzitao,85,86,41,75,93,42,85,75
一、数据解释
数据字段个数不固定:
第一个是课程名称,总共四个课程,computer,math,english,algorithm,
第二个是学生姓名,后面是每次考试的分数
二、统计需求:
1、统计每门课程的参考人数和课程平均分
2、统计每门课程参考学生的平均分,并且按课程存入不同的结果文件,要求一门课程一个结果文件,并且按平均分从高到低排序,分数保留一位小数
3、求出每门课程参考学生平均分最高的学生的信息:课程,姓名和平均分
题目解析:1、课程平均分需要在map中先计算每个人的课程平均成绩,然后在reduce中求出整体的平均成绩
/**
* @author: lpj
* @date: 2018年3月16日 下午7:16:47
* @Description:
*/
package lpj.reduceWork;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
*
*/
public class StudentScore3MR {
    /**
     * Requirement 1: for each course, report the number of participating
     * students and the course average score.
     *
     * Pipeline: the mapper emits (course, per-student average); the reducer
     * counts those students and averages their averages. Output line format:
     * course \t studentCount \t courseAverage
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
//        conf.addResource("hdfs-site.xml");                 // load an explicit config file
//        System.setProperty("HADOOP_USER_NAME", "hadoop");  // run against the cluster
        FileSystem fs = FileSystem.get(conf); // defaults to the local file system
        Job job = Job.getInstance(conf);
        job.setJarByClass(StudentScore3MR.class);
        job.setMapperClass(StudentScore3MR_Mapper.class);
        job.setReducerClass(StudentScore3MR_Reducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
//        String inputpath = args[0];
//        String outpath = args[1];
        Path inputPath = new Path("d:/a/homework6.txt");
        Path outputPath = new Path("d:/a/homework6");
        // BUG FIX: the original tested fs.exists(inputPath) before deleting the
        // OUTPUT directory. Guard on the path that is actually being deleted,
        // otherwise a stale output dir makes the job fail when input is missing
        // and the check never protects anything.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        boolean isdone = job.waitForCompletion(true);
        System.exit(isdone ? 0 : 1);
    }

    /**
     * Parses one CSV line "course,name,score1,score2,..." and emits
     * (course, thisStudentAverage). Field count is variable; scores start at
     * index 2.
     */
    public static class StudentScore3MR_Mapper extends Mapper<LongWritable, Text, Text, Text> {
        Text kout = new Text();
        Text valueout = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // e.g. algorithm,huangzitao,85,86,41,75,93,42,85,75
            String[] reads = value.toString().trim().split(",");
            String kk = reads[0];
            int sum = 0;
            int count = 0;
            for (int i = 2; i < reads.length; i++) {
                sum += Integer.parseInt(reads[i]);
                count++;
            }
            double avg = 1.0 * sum / count;
            kout.set(kk);
            valueout.set(Double.toString(avg));
            context.write(kout, valueout);
        }
    }

    /**
     * Receives all per-student averages for one course; outputs the student
     * count and the mean of those averages (the course average).
     */
    public static class StudentScore3MR_Reducer extends Reducer<Text, Text, Text, Text> {
        Text valueout = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            double sum = 0;
            int count = 0;
            for (Text text : values) {
                sum += Double.parseDouble(text.toString());
                count++;
            }
            double avg = sum / count;
            valueout.set(count + "\t" + avg);
            context.write(key, valueout);
        }
    }
}
结果:
algorithm 6 71.60119047619047
computer 10 69.79896825396825
english 9 66.22655122655122
math 7 72.88265306122449
2、输出结果存储到不同的结果文件中,需要指定setNumReduceTasks,分区规则通过使用partitioner进行分区设定,平均成绩需要进行排序,可以使用封装对象的方式,通过实现WritableComparable接口进行设置排序规则
实体类定义:
/**
* @author: lpj
* @date: 2018年3月14日 下午9:46:02
* @Description:
*/
package lpj.day2.homeworkbean;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.DecimalFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import org.apache.hadoop.io.WritableComparable;
/**
*
*/
public class Student implements WritableComparable<Student> {
    // Per-course record: one student's average score in one course.
    private String name;
    private double score;   // average score, compared in descending order
    private String course;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getScore() {
        return score;
    }

    public void setScore(double score) {
        this.score = score;
    }

    public String getCourse() {
        return course;
    }

    public void setCourse(String course) {
        this.course = course;
    }

    /**
     * Output line format: course \t name \t score.
     * Pattern "0.0" always keeps exactly one decimal place (85.0 -> "85.0"),
     * as the assignment requires; "#.#" would print 85.0 as "85".
     */
    @Override
    public String toString() {
        DecimalFormat fs = new DecimalFormat("0.0");
        return course + "\t" + name + "\t" + fs.format(score);
    }

    public Student() {
    }

    public Student(String name, double score, String course) {
        super();
        this.name = name;
        this.score = score;
        this.course = course;
    }

    /**
     * Sort order: course ascending, then score descending within a course.
     * BUG FIX: the original used (int)(o.score - this.score), which truncates
     * any gap smaller than 1.0 to 0 (85.5 vs 85.2 compared equal), producing
     * a wrong sort. Double.compare is exact and overflow-free.
     */
    @Override
    public int compareTo(Student o) {
        int diff = this.course.compareTo(o.course);
        if (diff != 0) {
            return diff;
        }
        return Double.compare(o.score, this.score); // descending by score
    }

    /** Deserialization order must mirror write(): name, score, course. */
    @Override
    public void readFields(DataInput in) throws IOException {
        name = in.readUTF();
        score = in.readDouble();
        course = in.readUTF();
    }

    /** Serialization order must mirror readFields(): name, score, course. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeDouble(score);
        out.writeUTF(course);
    }
}
分区器定义:
/**
* @author: lpj
* @date: 2018年3月16日 下午10:13:24
* @Description:
*/
package lpj.reduceWorkbean;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
*
*/
public class MyPatitioner extends Partitioner<Student, NullWritable> {
    /**
     * Routes each record to a reducer by course so every course lands in its
     * own output file (pair with job.setNumReduceTasks(4)).
     *
     * Mapping: math -> 0, english -> 1, computer -> 2, anything else
     * (algorithm) -> 3 — same mapping as before.
     */
    @Override
    public int getPartition(Student key, NullWritable value, int numPartitions) {
        // Read the course field directly. The original matched on
        // key.toString().startsWith(...), which silently breaks if
        // Student.toString() ever changes its format.
        String course = key.getCourse();
        if ("math".equals(course)) {
            return 0;
        } else if ("english".equals(course)) {
            return 1;
        } else if ("computer".equals(course)) {
            return 2;
        } else {
            return 3;
        }
    }
}
主体程序:
/**
* @author: lpj
* @date: 2018年3月16日 下午7:16:47
* @Description:
*/
package lpj.reduceWork;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import lpj.reduceWorkbean.MyPatitioner;
import lpj.reduceWorkbean.Student;
/**
*
*/
public class StudentScore3_2MR2 {
    /**
     * Requirement 2: per-student course averages, one output file per course,
     * sorted by average descending, one decimal place.
     *
     * Sorting comes from Student.compareTo (course asc, score desc); the
     * one-file-per-course split comes from MyPatitioner + 4 reduce tasks.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
//        conf.addResource("hdfs-site.xml");                 // load an explicit config file
//        System.setProperty("HADOOP_USER_NAME", "hadoop");  // run against the cluster
        FileSystem fs = FileSystem.get(conf); // defaults to the local file system
        Job job = Job.getInstance(conf);
        job.setJarByClass(StudentScore3_2MR2.class);
        job.setMapperClass(StudentScore3MR_Mapper.class);
        job.setReducerClass(StudentScore3MR_Reducer.class);
        job.setMapOutputKeyClass(Student.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Student.class);
        job.setOutputValueClass(NullWritable.class);
        job.setPartitionerClass(MyPatitioner.class); // one partition per course
        job.setNumReduceTasks(4);                    // one reducer -> one result file each
//        String inputpath = args[0];
//        String outpath = args[1];
        Path inputPath = new Path("d:/a/homework6.txt");
        Path outputPath = new Path("d:/a/homework6_2");
        // BUG FIX: guard on the OUTPUT path (the one being deleted), not the
        // input path as the original did.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        boolean isdone = job.waitForCompletion(true);
        System.exit(isdone ? 0 : 1);
    }

    /**
     * Parses "course,name,score1,..." and emits a Student key holding the
     * course, the name, and that student's average; value is NullWritable.
     */
    public static class StudentScore3MR_Mapper extends Mapper<LongWritable, Text, Student, NullWritable> {
        // Reused across calls; safe because the framework serializes the key
        // before the next map() invocation.
        Student stu = new Student();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // e.g. algorithm,huangzitao,85,86,41,75,93,42,85,75
            String[] reads = value.toString().trim().split(",");
            int sum = 0;
            int count = 0;
            for (int i = 2; i < reads.length; i++) {
                sum += Integer.parseInt(reads[i]);
                count++;
            }
            stu.setCourse(reads[0]);
            stu.setName(reads[1]);
            stu.setScore(1.0 * sum / count);
            context.write(stu, NullWritable.get());
        }
    }

    /**
     * Identity reducer: keys arrive already sorted (course asc, score desc),
     * so writing them through preserves the required order in each file.
     */
    public static class StudentScore3MR_Reducer extends Reducer<Student, NullWritable, Student, NullWritable> {
        @Override
        protected void reduce(Student key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
}
3、题目涉及排序以及分组,分组使用WritableComparator,进行分组字段设置。其中需要注意的是分组字段与排序字段的关系:分组字段一定是排序字段中的前几个
举例:排序规则:a,b,c,d,e。那么分组规则就只能是以下情况中的任意一种:
a / a,b / a,b,c / a,b,c,d / a,b,c,d,e 不能跳跃
排序字段一定大于等于分组字段,并且包含分组字段
使用分组组件进行:
实体类如题2
分组类代码:
/**
* @author: lpj
* @date: 2018年3月16日 下午10:36:55
* @Description:
*/
package lpj.reduceWorkbean;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
*
*/
public class MyGroup extends WritableComparator {
    /**
     * Grouping comparator: two Student keys belong to the same reduce group
     * when their courses are equal. Combined with the full sort (course asc,
     * score desc), the first key seen in each group is that course's
     * highest-scoring student.
     */
    public MyGroup() {
        // true -> let the parent create Student instances for comparison
        super(Student.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Student left = (Student) a;
        Student right = (Student) b;
        // Group on the course field only; score and name are ignored here.
        return left.getCourse().compareTo(right.getCourse());
    }
}
主体类代码;
/**
* @author: lpj
* @date: 2018年3月16日 下午7:16:47
* @Description:
*/
package lpj.reduceWork;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import lpj.reduceWorkbean.MyGroup;
import lpj.reduceWorkbean.MyPatitioner;
import lpj.reduceWorkbean.Student;
/**
*
*/
public class StudentScore3_3MR3 {
    /**
     * Requirement 3: the top-average student per course (course, name,
     * average).
     *
     * Keys sort by course asc then score desc; MyGroup groups a whole course
     * into one reduce call, whose key at entry is therefore the course
     * maximum. The reducer writes only that first key.
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
//        conf.addResource("hdfs-site.xml");                 // load an explicit config file
//        System.setProperty("HADOOP_USER_NAME", "hadoop");  // run against the cluster
        FileSystem fs = FileSystem.get(conf); // defaults to the local file system
        Job job = Job.getInstance(conf);
        job.setJarByClass(StudentScore3_3MR3.class);
        job.setMapperClass(StudentScore3MR_Mapper.class);
        job.setReducerClass(StudentScore3MR_Reducer.class);
        job.setMapOutputKeyClass(Student.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Student.class);
        job.setOutputValueClass(NullWritable.class);
        job.setGroupingComparatorClass(MyGroup.class); // group by course only
        Path inputPath = new Path("d:/a/homework6.txt");
        Path outputPath = new Path("d:/a/homework6_3");
        // BUG FIX: guard on the OUTPUT path (the one being deleted), not the
        // input path as the original did.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        boolean isdone = job.waitForCompletion(true);
        System.exit(isdone ? 0 : 1);
    }

    /**
     * Parses "course,name,score1,..." and emits a Student key holding the
     * course, the name, and that student's average; value is NullWritable.
     */
    public static class StudentScore3MR_Mapper extends Mapper<LongWritable, Text, Student, NullWritable> {
        // Reused across calls; safe because the framework serializes the key
        // before the next map() invocation.
        Student stu = new Student();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // e.g. algorithm,huangzitao,85,86,41,75,93,42,85,75
            String[] reads = value.toString().trim().split(",");
            int sum = 0;
            int count = 0;
            for (int i = 2; i < reads.length; i++) {
                sum += Integer.parseInt(reads[i]);
                count++;
            }
            stu.setCourse(reads[0]);
            stu.setName(reads[1]);
            stu.setScore(1.0 * sum / count);
            context.write(stu, NullWritable.get());
        }
    }

    /**
     * Writes only the group's first (i.e. highest-average) key without
     * iterating the values, yielding exactly one line per course.
     */
    public static class StudentScore3MR_Reducer extends Reducer<Student, NullWritable, Student, NullWritable> {
        @Override
        protected void reduce(Student key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
}