package cn.sjq.bigdata.inverted.index;
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.Test;
/**
 * Builds an inverted index with MapReduce: for every word found in the input files it reports
 * how often the word occurs in each file. Each output record is the word (padded to a fixed
 * column width) followed by comma-separated {@code "file count"} pairs, for example:
 *
 * <pre>
 * hello    a.txt 2,b.txt 1,c.txt 4
 * tom      a.txt 5,b.txt 3
 * </pre>
 *
 * <p>Data flow:
 * <ul>
 *   <li>Mapper emits {@code <word, "fileName:1">} for every word occurrence.</li>
 *   <li>Combiner sums the counts per file and emits {@code <word, "fileName:n">}.</li>
 *   <li>Reducer sums the counts per file again and formats the final line.</li>
 * </ul>
 *
 * <p>The Combiner deliberately keeps both the key and the value format of the map output.
 * Hadoop treats a combiner as an optional optimisation that may run zero, one or several
 * times, so the Reducer must produce the same result in every case; it therefore performs the
 * same per-file summation as the Combiner.
 *
 * <p>The Mapper, Combiner and Reducer are implemented as static nested classes.
 *
 * @author songjq
 */
public class InvertedIndexCaseOne {

    /** Separates the file name from the count inside a posting value, e.g. {@code a.txt:2}. */
    private static final char POSTING_SEPARATOR = ':';

    /**
     * Sums the counts of all {@code "fileName:count"} postings, grouped by file name.
     *
     * <p>The last separator in a posting is used as the split point because the count never
     * contains the separator, whereas a file name might.
     *
     * @param postings values in the form {@code fileName:count}
     * @return total count per file name, ordered by file name so the output is deterministic
     * @throws IOException if a posting has no separator or its count is not a number
     */
    private static Map<String, Long> sumPerFile(Iterable<Text> postings) throws IOException {
        Map<String, Long> totals = new TreeMap<String, Long>();
        for (Text posting : postings) {
            // Convert immediately; the value is not retained beyond this iteration.
            String entry = posting.toString();
            int separatorIndex = entry.lastIndexOf(POSTING_SEPARATOR);
            if (separatorIndex < 0) {
                throw new IOException("Malformed posting, expected fileName:count but got: " + entry);
            }
            String fileName = entry.substring(0, separatorIndex);
            long count;
            try {
                count = Long.parseLong(entry.substring(separatorIndex + 1));
            } catch (NumberFormatException e) {
                throw new IOException("Malformed count in posting: " + entry, e);
            }
            Long previous = totals.get(fileName);
            totals.put(fileName, previous == null ? count : previous + count);
        }
        return totals;
    }

    /**
     * Map phase.
     *
     * <ul>
     *   <li>k1: input key, {@link LongWritable} offset of the line in the input</li>
     *   <li>v1: input value, {@link Text} holding one line of input</li>
     *   <li>k2: output key, the word, e.g. {@code hello}</li>
     *   <li>v2: output value, a posting {@code fileName:1}, e.g. {@code a.txt:1}</li>
     * </ul>
     *
     * @author songjq
     */
    static class InvertedIndexCaseOneMapper extends Mapper<LongWritable, Text, Text, Text> {
        private final Text tkey = new Text();
        private final Text tvalue = new Text();

        @Override
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            // Name of the file the current split belongs to.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();
            // The posting is identical for every word of this line, so build it once.
            tvalue.set(fileName + POSTING_SEPARATOR + "1");

            // Tokenise on any run of whitespace so repeated spaces and tabs do not yield
            // empty "words".
            for (String word : v1.toString().split("\\s+")) {
                if (word.isEmpty()) {
                    // Produced by leading whitespace or an empty line.
                    continue;
                }
                tkey.set(word);
                context.write(tkey, tvalue);
            }
        }
    }

    /**
     * Combine phase: partial, map-side summation.
     *
     * <p>Input and output share the same shape, {@code <word, "fileName:count">}; only the
     * number of records shrinks. Example:
     *
     * <pre>
     * in : hello -> a.txt:1, a.txt:1, b.txt:1
     * out: hello -> a.txt:2, b.txt:1
     * </pre>
     *
     * Because the shape is unchanged, running this class zero, one or many times gives the same
     * final result.
     *
     * @author songjq
     */
    static class InvertedIndexCaseOneCombiner extends Reducer<Text, Text, Text, Text> {
        private final Text outValue = new Text();

        @Override
        protected void reduce(Text k31, Iterable<Text> v31, Context ctx) throws IOException, InterruptedException {
            // The key is passed through untouched; only the postings are merged.
            for (Map.Entry<String, Long> entry : sumPerFile(v31).entrySet()) {
                outValue.set(entry.getKey() + POSTING_SEPARATOR + entry.getValue());
                ctx.write(k31, outValue);
            }
        }
    }

    /**
     * Reduce phase: merges all postings of a word and writes the final line.
     *
     * <pre>
     * in : hello -> a.txt:2, b.txt:1
     * out: hello -> a.txt 2,b.txt 1
     * </pre>
     *
     * @author songjq
     */
    static class InvertedIndexCaseOneReducer extends Reducer<Text, Text, Text, Text> {

        /** Width of the word column in the output. */
        private static final int WORD_COLUMN_WIDTH = 20;

        /**
         * Writes the header line. {@code setup} runs once per reduce task, so the header
         * appears once in each reducer's output.
         *
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        @Override
        protected void setup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            context.write(new Text(formatStr("Word", 20)), new Text("Frequency statistics [eg:(a.txt 2,b.txt 1)]"));
        }

        @Override
        protected void reduce(Text k3, Iterable<Text> v3, Context ctx) throws IOException, InterruptedException {
            // Sum per file again: the postings may or may not have passed through the Combiner.
            StringBuilder result = new StringBuilder();
            for (Map.Entry<String, Long> entry : sumPerFile(v3).entrySet()) {
                if (result.length() > 0) {
                    result.append(',');
                }
                result.append(entry.getKey()).append(' ').append(entry.getValue());
            }

            // Pad short words for column alignment, but never truncate: cutting a long word
            // would lose information in the index.
            String word = k3.toString();
            String wordColumn = word.length() < WORD_COLUMN_WIDTH ? formatStr(word, WORD_COLUMN_WIDTH) : word;
            ctx.write(new Text(wordColumn), new Text(result.toString()));
        }

        /**
         * Fits a string to an exact width: right-pads with spaces when shorter, truncates when
         * longer. The width is measured in {@code char}s for both padding and truncation.
         *
         * @param str the string to fit; {@code null} is treated as the empty string
         * @param length the target width in characters
         * @return a string of exactly {@code length} characters
         */
        public static String formatStr(String str, int length) {
            if (str == null) {
                str = "";
            }
            int strLen = str.length();
            if (strLen >= length) {
                return str.substring(0, length);
            }
            StringBuilder padded = new StringBuilder(length);
            padded.append(str);
            for (int i = strLen; i < length; i++) {
                padded.append(' ');
            }
            return padded.toString();
        }
    }

    /**
     * Configures and submits the job, then waits for it to finish.
     *
     * @throws IOException if the job cannot be configured or submitted
     * @throws InterruptedException if the wait for completion is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws IllegalStateException if the job finishes unsuccessfully
     */
    @Test
    public void InvertedIndexCaseOneJob() throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(InvertedIndexCaseOne.class);

        job.setMapperClass(InvertedIndexCaseOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Optional map-side pre-aggregation; the result is the same with or without it.
        job.setCombinerClass(InvertedIndexCaseOneCombiner.class);

        job.setReducerClass(InvertedIndexCaseOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("D:\\test\\InvertedIndex\\srcdata"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\test\\InvertedIndex\\output2"));

        // Surface a failed job as a failed test instead of silently passing.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Inverted index job did not complete successfully");
        }
    }
}
执行结果:
Word Frequency statistics [eg:(a.txt 2,b.txt 1)]
Are d.txt 1
China b.txt 1
Do e.txt 1
Hello c.txt 1,a.txt 1
I e.txt 1,b.txt 1
Java c.txt 2
We a.txt 1
You d.txt 1
a c.txt 1
are d.txt 1,a.txt 1
boys d.txt 1
china e.txt 2
come e.txt 1
country b.txt 2
friend a.txt 1
from e.txt 1
good d.txt 1,c.txt 1,a.txt 1
greatest b.txt 1
in b.txt 1
is b.txt 1,c.txt 1
language c.txt 1
love b.txt 1
my b.txt 1
ok d.txt 1
the b.txt 2
to e.txt 1
want e.txt 1
word a.txt 1
world b.txt 1
you d.txt 1,e.txt 1