2018-08-03 期 MapReduce倒排索引编程案例1（Combiner方式）

原创

JackmaSong 2018-08-03 09:04:14 ©著作权

文章标签 MapReduce 倒排索引 Combiner 文章分类 Hadoop 大数据

©著作权归作者所有：来自51CTO博客作者JackmaSong的原创作品，请联系作者获取转载授权，否则将追究法律责任

package cn.sjq.bigdata.inverted.index;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.Test;

/**

* 利用MapReduce实现输入多个文件中单词在每个文件中出现的次数，输出格式如下：

* hello (a.txt 2,b.txt 1,c.txt 4)

* tom (a.txt 5,b.txt 3)

* 实现方法：采用倒排索引算法并结合MapReduce Combiner实现

* 中间添加Combiner需要注意不能改变原有实现逻辑及改变Mapper到Reducer的数据类型

* 本案例中所有的Mapper、Reducer、Job均采用匿名内部类实现

* @author songjq

public class InvertedIndexCaseOne {

/**

* Mapper阶段

* k1：输入key LongWritable 读入数据偏移量

* v1：输入value Text 读入的一行数据

* k2：输出key Text 格式为<hello:a.txt>,<hello:b.txt>

* v2：输出value Text 格式为<1>,<1>

* @author songjq

static class InvertedIndexCaseOneMapper extends Mapper<LongWritable, Text, Text, Text> {

private Text tkey = new Text();

private Text tvalue = new Text();

@Override

protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {

//读入数据

String line = v1.toString();

//分词,安装空格切分

String[] words = line.split(" ");

//获取输入文件名称

FileSplit inputSplit = (FileSplit) context.getInputSplit();

String fileName = inputSplit.getPath().getName();

//将数据通过context传输到Reducer

for(String word:words) {

tkey.set(word+":"+fileName);

tvalue.set("1");

context.write(tkey, tvalue);

}

/**

* Combiner阶段

* 定义Combiner类

* 由于Combiner是一个特殊的Reducer，因此需要继承Reducer

* 其作用就是对Mapper端输入的数据进行部分求和，并发送到Reducer阶段处理

* Mapper端输入的数据格式如下：

* <k2> <v2>

* <hello:a.txt <"1","1">

* <hello:b.txt <"1">

* 通过Combiner处理后，最终输出到Reducer的数据格式如下

* <k3> <v3>

* <hello> <a.txt:"2">

* <hello> <b.txt:"1">

* @author songjq

static class InvertedIndexCaseOneCombiner extends Reducer<Text, Text, Text, Text> {

@Override

protected void reduce(Text k31, Iterable<Text> v31, Context ctx) throws IOException, InterruptedException {

int total = 0;

for(Text val:v31) {

//单词在每个文件中出现次数统计

total+=Integer.parseInt(val.toString());

}

//k3处理,格式hello:a.txt

String[] split = k31.toString().split(":");

String word = split[0];

String fileName = split[1];

//输出 k3:<hello> v3:<a.txt:"2",b.txt:"1">

ctx.write(new Text(word), new Text(fileName+":"+total));

}

/**

* Reducer阶段

* Reducer阶段主要对Combiner阶段输出的数据进行处理

* Combiner阶段输出数据格式如下：

* <k3> <v3>

* <hello> <a.txt:"2",b.txt:"1">

* 通过Reducer处理后，最终输出数据格式如下：

* <k4> <v4>

* <hello> <(a.txt 2,b.txt 1)>

* @author songjq

static class InvertedIndexCaseOneReducer extends Reducer<Text, Text, Text, Text> {

* 由于setup方法只会被调用一次，因此可以在这里输出文件头

* (non-Javadoc)

* @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)

@Override

protected void setup(Reducer<Text, Text, Text, Text>.Context context) throws IOException, InterruptedException {

context.write(new Text(formatStr("Word", 20)), new Text("Frequency statistics [eg:(a.txt 2,b.txt 1)]"));

}

@Override

protected void reduce(Text k3, Iterable<Text> v3, Context ctx) throws IOException, InterruptedException {

//定义存放输出结果的对象result

StringBuffer result = new StringBuffer();

for(Text val:v3) {

//<v3>数据<a.txt:"2">

String[] split = val.toString().split(":");

String fileName = split[0];

String count = split[1];

result.append(fileName).append(" ").append(count).append(",");

}

//将<k4,v4>写入HDFS

//最终输出到文件的数据格式 hello (a.txt 2,b.txt 1)

ctx.write(new Text(formatStr(k3.toString(), 20)), new Text(result.deleteCharAt(result.length()-1).toString()));

}

/**

* 字符串填充空格

* @param str

* @param length

* @return

public static String formatStr(String str, int length) {

if (str == null) {

str = "";

}

int strLen = str.getBytes().length;

if (strLen == length) {

return str;

} else if (strLen < length) {

int temp = length - strLen;

String tem = "";

for (int i = 0; i < temp; i++) {

tem = tem + " ";

}

return str + tem;

} else {

return str.substring(0, length);

}

/**

* 提交job

* @throws IOException

* @throws InterruptedException

* @throws ClassNotFoundException

@Test

public void InvertedIndexCaseOneJob() throws IOException, ClassNotFoundException, InterruptedException {

Job job = Job.getInstance(new Configuration());

job.setJarByClass(InvertedIndexCaseOne.class);

job.setMapperClass(InvertedIndexCaseOneMapper.class);

job.setMapOutputKeyClass(Text.class);

job.setMapOutputValueClass(Text.class);

// 设置Combiner Class类

job.setCombinerClass(InvertedIndexCaseOneCombiner.class);

job.setReducerClass(InvertedIndexCaseOneReducer.class);

job.setOutputKeyClass(Text.class);

job.setOutputValueClass(Text.class);

FileInputFormat.setInputPaths(job, new Path("D:\\test\\InvertedIndex\\srcdata"));

FileOutputFormat.setOutputPath(job, new Path("D:\\test\\InvertedIndex\\output2"));

job.waitForCompletion(true);

}

执行结果

Word Frequency statistics [eg:(a.txt 2,b.txt 1)]

Are d.txt 1

China b.txt 1

Do e.txt 1

Hello c.txt 1,a.txt 1

I e.txt 1,b.txt 1

Java c.txt 2

We a.txt 1

You d.txt 1

a c.txt 1

are d.txt 1,a.txt 1

boys d.txt 1

china e.txt 2

come e.txt 1

country b.txt 2

friend a.txt 1

from e.txt 1

good d.txt 1,c.txt 1,a.txt 1

greatest b.txt 1

in b.txt 1

is b.txt 1,c.txt 1

language c.txt 1

love b.txt 1

my b.txt 1

ok d.txt 1

the b.txt 2

to e.txt 1

want e.txt 1

word a.txt 1

world b.txt 1

you d.txt 1,e.txt 1

上一篇：2018-08-02 期 MapReduce实现多表查询自连接

下一篇：2018-08-04 期 MapReduce倒排索引编程案例2（jobControll方式）

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯