package cn.sjq.bigdata.inverted.index;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.junit.Test;

/**
 * Uses MapReduce to count, across several input files, how often each word
 * occurs in each file. The output format is:
 *   hello (a.txt 2,b.txt 1,c.txt 4)
 *   tom   (a.txt 5,b.txt 3)
 * Approach: an inverted-index algorithm combined with a MapReduce Combiner.
 * When inserting a Combiner, take care not to change the job's original logic
 * or the key/value types flowing from Mapper to Reducer.
 *
 * The Mapper, Reducer and Combiner in this case are all implemented as static
 * nested classes.
 * @author songjq
 */

public class InvertedIndexCaseOne {

    /**
     * Mapper phase
     * k1: input key, LongWritable - byte offset of the line being read
     * v1: input value, Text - one line of input
     * k2: output key, Text - formatted as <hello:a.txt>, <hello:b.txt>
     * v2: output value, Text - formatted as <1>, <1>
     * @author songjq
     */
    static class InvertedIndexCaseOneMapper extends Mapper<LongWritable, Text, Text, Text> {
        private Text tkey = new Text();
        private Text tvalue = new Text();

        @Override
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            // Read one line of input
            String line = v1.toString();
            // Tokenize: split on whitespace ("\\s+" also tolerates runs of spaces)
            String[] words = line.trim().split("\\s+");
            // Get the name of the file this input split belongs to
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();
            // Emit one <word:fileName, 1> pair per word to the shuffle
            for (String word : words) {
                if (word.isEmpty()) {
                    continue;
                }
                tkey.set(word + ":" + fileName);
                tvalue.set("1");
                context.write(tkey, tvalue);
            }
        }
    }
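    /*
     * Illustration (hypothetical input, not taken from the actual test data):
     * if a.txt contained the line "Hello Hello Java", the mapper above would emit
     *   ("Hello:a.txt", "1"), ("Hello:a.txt", "1"), ("Java:a.txt", "1")
     */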

    /**
     * Combiner phase
     * A Combiner is a special kind of Reducer, so it extends Reducer. Its role
     * here is to partially sum the Mapper output before it is sent on to the
     * Reducer phase.
     * Input from the Mapper side:
     *   <k2>          <v2>
     *   <hello:a.txt> <"1","1">
     *   <hello:b.txt> <"1">
     * Output after the Combiner:
     *   <k3>    <v3>
     *   <hello> <a.txt:"2">
     *   <hello> <b.txt:"1">
     * @author songjq
     */
    static class InvertedIndexCaseOneCombiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text k31, Iterable<Text> v31, Context ctx) throws IOException, InterruptedException {
            int total = 0;
            for (Text val : v31) {
                // Sum this word's occurrences within one file
                total += Integer.parseInt(val.toString());
            }
            // k31 has the form hello:a.txt; split it into word and file name
            String[] split = k31.toString().split(":");
            String word = split[0];
            String fileName = split[1];
            // Emit k3:<hello> v3:<a.txt:"2">
            ctx.write(new Text(word), new Text(fileName + ":" + total));
        }
    }
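    /*
     * Caveat (alternative sketch, not part of the original case): Hadoop may
     * apply a Combiner zero, one, or several times per key, so strictly a
     * Combiner should emit exactly the key/value format it consumes. The
     * Combiner above rewrites the key from "word:file" to "word", which would
     * break if it were ever re-applied to its own output. A format-preserving
     * variant looks like this; note the Reducer would then receive "word:file"
     * keys and have to do the splitting and per-word grouping itself.
     */
    static class FormatPreservingCombiner extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context ctx) throws IOException, InterruptedException {
            int total = 0;
            for (Text val : values) {
                total += Integer.parseInt(val.toString());
            }
            // The key stays "word:file" and the value stays a plain count,
            // so running this Combiner again over its own output is safe.
            ctx.write(key, new Text(String.valueOf(total)));
        }
    }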

    /**
     * Reducer phase
     * The Reducer processes the Combiner output, which arrives as:
     *   <k3>    <v3>
     *   <hello> <a.txt:"2",b.txt:"1">
     * and writes the final result as:
     *   <k4>    <v4>
     *   <hello> <(a.txt 2,b.txt 1)>
     * @author songjq
     */
    static class InvertedIndexCaseOneReducer extends Reducer<Text, Text, Text, Text> {
        /*
         * setup() is called only once per task, so the header line is written here.
         * (non-Javadoc)
         * @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
         */
        @Override
        protected void setup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            context.write(new Text(formatStr("Word", 20)), new Text("Frequency statistics [eg:(a.txt 2,b.txt 1)]"));
        }

        @Override
        protected void reduce(Text k3, Iterable<Text> v3, Context ctx) throws IOException, InterruptedException {
            // Accumulates the per-file counts for this word
            StringBuilder result = new StringBuilder();
            for (Text val : v3) {
                // Each <v3> value looks like <a.txt:"2">
                String[] split = val.toString().split(":");
                String fileName = split[0];
                String count = split[1];
                result.append(fileName).append(" ").append(count).append(",");
            }
            // Write <k4,v4> to HDFS; the trailing comma is dropped, so the final
            // line format is: hello    a.txt 2,b.txt 1
            ctx.write(new Text(formatStr(k3.toString(), 20)),
                    new Text(result.deleteCharAt(result.length() - 1).toString()));
        }

        /**
         * Pads a string with spaces to a fixed byte length, or truncates it
         * if it is longer.
         * @param str    the string to pad
         * @param length the target length in bytes
         * @return the padded or truncated string
         */
        public static String formatStr(String str, int length) {
            if (str == null) {
                str = "";
            }
            int strLen = str.getBytes().length;
            if (strLen == length) {
                return str;
            } else if (strLen < length) {
                StringBuilder padded = new StringBuilder(str);
                for (int i = strLen; i < length; i++) {
                    padded.append(' ');
                }
                return padded.toString();
            } else {
                return str.substring(0, length);
            }
        }
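        /*
         * Alternative (sketch): for plain ASCII words the same left-justified
         * padding can be had in one line with a negative format width, e.g.
         * String.format("%-20s", str). Unlike formatStr above, this pads by
         * character count rather than byte count and never truncates.
         */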

    }

    /**
     * Builds and submits the job.
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    @Test
    public void InvertedIndexCaseOneJob() throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(InvertedIndexCaseOne.class);
        job.setMapperClass(InvertedIndexCaseOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Register the Combiner class
        job.setCombinerClass(InvertedIndexCaseOneCombiner.class);
        job.setReducerClass(InvertedIndexCaseOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\test\\InvertedIndex\\srcdata"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\test\\InvertedIndex\\output2"));
        job.waitForCompletion(true);
    }
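    /*
     * Sketch (not part of the original case): to run the same job outside
     * JUnit, e.g. packaged as a jar on a cluster, a main() that takes the
     * input and output paths as arguments avoids the hard-coded Windows
     * paths used in the test method above.
     */
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(InvertedIndexCaseOne.class);
        job.setMapperClass(InvertedIndexCaseOneMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setCombinerClass(InvertedIndexCaseOneCombiner.class);
        job.setReducerClass(InvertedIndexCaseOneReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }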

}

Execution result:

Word                Frequency statistics [eg:(a.txt 2,b.txt 1)]
Are                 d.txt 1
China               b.txt 1
Do                  e.txt 1
Hello               c.txt 1,a.txt 1
I                   e.txt 1,b.txt 1
Java                c.txt 2
We                  a.txt 1
You                 d.txt 1
a                   c.txt 1
are                 d.txt 1,a.txt 1
boys                d.txt 1
china               e.txt 2
come                e.txt 1
country             b.txt 2
friend              a.txt 1
from                e.txt 1
good                d.txt 1,c.txt 1,a.txt 1
greatest            b.txt 1
in                  b.txt 1
is                  b.txt 1,c.txt 1
language            c.txt 1
love                b.txt 1
my                  b.txt 1
ok                  d.txt 1
the                 b.txt 2
to                  e.txt 1
want                e.txt 1
word                a.txt 1
world               b.txt 1
you                 d.txt 1,e.txt 1