computer,huangxiaoming,85,86,41,75,93,42,85
computer,xuzheng,54,52,86,91,42
computer,huangbo,85,42,96,38
english,zhaobenshan,54,52,86,91,42,85,75
english,liuyifei,85,41,75,21,85,96,14
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,liuyifei,76,95,86,74,68,74,48
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
algorithm,huangdatou,48,58,67,86,15,33,85
algorithm,huangzitao,85,86,41,75,93,42,85,75
数据解释:
- 数据字段个数不固定:第一个是课程名称,总共四个课程,computer,math,english,algorithm
- 第二个是学生姓名,后面是每次考试的分数
1.统计每门课程的参考人数和课程平均分
package com.hadoop.mapreduce.shuffle;
/**
* 统计每门课程的参考人数(参加考试人数)和课程平均分
*/
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Test1 {
/*
* map端:
* key:课程名 Text
* value:平均分 DoubleWritable
*/
static class MyMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
Text mk = new Text();
DoubleWritable mv = new DoubleWritable();
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, DoubleWritable>.Context context)
throws IOException, InterruptedException {
String[] datas = value.toString().split(",");
int count = 0;
double avg = 0;
int sum = 0;
// 数据中从第三个字段开始才是分数
for (int i = 2; i < datas.length; i++) {
sum += Integer.parseInt(datas[i].trim());
count++;
}
avg = sum / count * 1.0;
mk.set(datas[0]);
mv.set(avg);
context.write(mk, mv);
}
}
/*
* reduce端:
* key:课程名 Text
* value:平均分 次数 Text
*/
static class MyReducer extends Reducer<Text, DoubleWritable, Text, Text> {
Text rv = new Text();
@Override
protected void reduce(Text key, Iterable<DoubleWritable> value,
Reducer<Text, DoubleWritable, Text, Text>.Context context) throws IOException, InterruptedException {
double sum = 0;
int count = 0;
double avg = 0;
//算所有平均分的和
for (DoubleWritable v : value) {
// v.get()直接获取到java中对应数据类型的值
sum += v.get();
count++;
}
avg = sum / count * 1.0;
rv.set(count + "\t" + String.format("%.2f", avg));
context.write(key, rv);
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Test1.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
Path inpath = new Path("F:\\test\\test1.txt");
FileInputFormat.addInputPath(job, inpath);
Path outpath = new Path("F:\\test\\testout\\");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
}
结果:
algorithm 6 71.33
computer 10 69.60
english 9 66.00
math 7 72.57
2.统计每一个学生的每一门课程的平均分,在对结果按照科目 输出到不同的文件
package com.hadoop.mapreduce.shuffle;
/**
* 统计每一个学生的每一门课程的平均分,在对结果按照科目 输出到不同的文件
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Test2 {
/*这一问主要考察hadoop中的Partitioner类
* map端:
* key 课程名 Text
* value 姓名 平均分 Text
*/
static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
Text mk = new Text();
Text mv = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
//与第一问一样的代码
String[] datas = value.toString().split(",");
double avg = 0;
int sum = 0;
int count = 0;
for (int i = 2; i < datas.length; i++) {
sum += Integer.parseInt(datas[i].trim());
count++;
}
avg = sum / count * 1.0;
String.format("%,2f", avg);
mk.set(datas[0]);
mv.set(datas[1] + "\t" + avg);
context.write(mk, mv);
}
}
static class MyReducer extends Reducer<Text, Text, Text, Text> {
// 没有其他要求直接输出就行
@Override
protected void reduce(Text key, Iterable<Text> value, Reducer<Text, Text, Text, Text>.Context context)
throws IOException, InterruptedException {
for (Text v : value) {
context.write(key, v);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//这里要加载一下自定义分区类,设置我们需要的maptask数
job.setPartitionerClass(MyPartitioner.class);
job.setNumReduceTasks(4);
job.setJarByClass(Test2.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
Path inpath = new Path("F:\\test\\test1.txt");
FileInputFormat.addInputPath(job, inpath);
Path outpath = new Path("F:\\test\\testout\\");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
}
//定义一个自定义分区类来继承Partitioner,我们map里面输出的类型为Text,Text
class MyPartitioner extends Partitioner<Text, Text> {
// 定义一个map集合可以让思路显得很清晰
static Map<String, Integer> map = new HashMap<String, Integer>();
// 这里key中直接写了分组的类型,方便与key的toString值进行比较
// 因为输出的文件名是从0开始的,所以分区算法一般从0开始分,避免Illegal partition for algorithm异常
static {
map.put("algorithm", 0);
map.put("computer", 1);
map.put("english", 2);
map.put("math", 3);
}
@Override
// 重写getPartition方法
public int getPartition(Text key, Text value, int numPartitions) {
// 与map中的key比较
String subject = key.toString();
// 返回value值
Integer id = map.get(subject);
// 直接返回id,进行对应的分区,id是多少就在哪一个分区
return id;
}
}
结果:
algorithm huangzitao 72.0
algorithm huangdatou 56.0
algorithm liuyifei 62.0
algorithm huanglei 74.0
algorithm huangjiaju 82.0
algorithm liutao 82.0
computer huangzitao 72.0
computer huanglei 74.0
computer liutao 83.0
computer liuyifei 62.0
computer liujialing 64.0
computer huangdatou 56.0
computer huangjiaju 83.0
computer huangbo 65.0
computer xuzheng 65.0
computer huangxiaoming 72.0
english huangbo 55.0
english liujialing 62.0
english huangdatou 56.0
english liuyifei 74.0
english zhaobenshan 69.0
english huangxiaoming 72.0
english zhouqi 64.0
english liuyifei 59.0
english huanglei 83.0
math huangjiaju 82.0
math xuzheng 69.0
math huangxiaoming 83.0
math liujialing 72.0
math liutao 56.0
math huanglei 74.0
math wangbaoqiang 72.0
3.统计每一门课程平均分最高的前两个学生
package com.hadoop.mapreduce.shuffle;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
*/
public class CourseBean implements WritableComparable<CourseBean> {
// 根据需求来定义
private String subject;
private String name;
private double avg;
public CourseBean() {
super();
}
public CourseBean(String subject, String name, double avg) {
super();
this.subject = subject;
this.name = name;
this.avg = avg;
}
public String getSubject() {
return subject;
}
public void setSubject(String subject) {
this.subject = subject;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public double getAvg() {
return avg;
}
// 一定要冲写toString方法不然输出的全是地址
@Override
public String toString() {
return subject + "\t" + name + "\t" + avg;
}
public void setAvg(double avg) {
this.avg = avg;
}
// 先按课程的字典顺序升序排,再按平均分排
@Override
public int compareTo(CourseBean o) {
int temp = -o.subject.compareTo(this.subject);
if (temp == 0)
temp = (int) (o.avg - this.avg);
return temp;
}
// 序列化,自定义类的必须进行的操作
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(subject);
out.writeUTF(name);
out.writeDouble(avg);
}
// 反序列化,自定义类的必须进行的操作
@Override
public void readFields(DataInput in) throws IOException {
subject = in.readUTF();
name = in.readUTF();
avg = in.readDouble();
}
}
package com.hadoop.mapreduce.shuffle;
/**
* 统计每一门课程平均分最高的前两个学生
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
* 这一题首先我们要按课程排序,然后相同的课程按平均分排序,最后取一个
* 要使用到自定义类与分组
*/
public class Test3 {
/*
* map端:
* key 自定义 ScoreBean 包含输出的所有信息
* value 给一个 NullWritable即可
*/
static class MyMapper extends Mapper<LongWritable, Text, CourseBean, NullWritable> {
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, CourseBean, NullWritable>.Context context)
throws IOException, InterruptedException {
// 与上面两问代码一样
String[] datas = value.toString().split(",");
double avg = 0;
int sum = 0;
int count = 0;
for (int i = 2; i < datas.length; i++) {
sum += Integer.parseInt(datas[i].trim());
count++;
}
avg = sum / count * 1.0;
String.format("%,2f", avg);
CourseBean mk = new CourseBean(datas[0], datas[1].trim(), avg);
context.write(mk, NullWritable.get());
}
}
static class MyReducer extends Reducer<CourseBean, NullWritable, CourseBean, NullWritable> {
@Override
protected void reduce(CourseBean key, Iterable<NullWritable> value,
Reducer<CourseBean, NullWritable, CourseBean, NullWritable>.Context context)
throws IOException, InterruptedException {
int count = 0;
for (NullWritable v : value) {
context.write(key, v);
count++;
// 因为只要两个,这里我们等于2了就结束循环
if (count == 2)
break;
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Test3.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
job.setMapOutputKeyClass(CourseBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(CourseBean.class);
job.setOutputValueClass(NullWritable.class);
//加载分组函数
job.setGroupingComparatorClass(SubjectScoreGroupComparator.class);
Path inpath = new Path("F:\\test\\test1.txt");
FileInputFormat.addInputPath(job, inpath);
Path outpath = new Path("F:\\test\\testout\\");
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outpath)) {
fs.delete(outpath, true);
}
FileOutputFormat.setOutputPath(job, outpath);
job.waitForCompletion(true);
}
public static class SubjectScoreGroupComparator extends WritableComparator {
//跟一下WritableComparator的源码我们可以发现这里必须给true了才能创建对象
SubjectScoreGroupComparator() {
//防止空指针异常
super(CourseBean.class, true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
CourseBean cb1 = (CourseBean) a;
CourseBean cb2 = (CourseBean) b;
int result = cb1.getSubject().compareTo(cb2.getSubject());
return result;
}
}
}
结果:
algorithm huangjiaju 82.0
algorithm liutao 82.0
computer huangjiaju 83.0
computer liutao 83.0
english huanglei 83.0
english liuyifei 74.0
math huangxiaoming 83.0
math huangjiaju 82.0