倒排索引(Inverted Index):也常被称为反向索引、置入档案或反向档案,是一种索引方法,被用来存储在全文搜索下某个单词在一个文档或者一组文档中的存储位置的映射。它是文档检索系统中最常用的数据结构。
案例:
两份数据
mapreduce-4-1.txt
huangbo love xuzheng
huangxiaoming love baby huangxiaoming love yangmi
liangchaowei love liujialing
huangxiaoming xuzheng huangbo wangbaoqiang
mapreduce-4-2.txt
hello huangbo
hello xuzheng
hello huangxiaoming
要求:
- 创建倒排索引,统计每个关键词在每个文档中从第几个偏移量开始、出现了多少次
- 例如:
- huangxiaoming关键词的格式:
huangxiaoming mapreduce-4-1.txt:2,2; mapreduce-4-1.txt:4,1; mapreduce-4-2.txt:3,1
- 格式说明:关键词 文件名:行偏移量(该行行首的字节偏移),该行中出现的次数
package com.hadoop.mapreduce.Test4;
/**
* 倒排索引
*/
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Builds an inverted index over a directory of text files.
 *
 * <p>Output, one line per word:
 * {@code word  file:lineOffset,countInLine;file:lineOffset,countInLine;...}
 * where {@code lineOffset} is the byte offset of the line within the file.
 */
public class InvertedIndex {

    /**
     * Mapper: for every distinct word on a line, emits
     * key = word, value = {@code "fileName:lineOffset,countInLine"}.
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        private String fileName = null;
        private final Text mk = new Text();
        private final Text mv = new Text();

        /** Resolves the input file name from the split, once per task. */
        @Override
        protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) context.getInputSplit();
            fileName = split.getPath().getName();
        }

        /**
         * @param key   byte offset of this line within the input file
         * @param value the line text
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Split on runs of whitespace: the original split(" ") produced
            // empty-string tokens for repeated spaces / blank lines, which
            // then got indexed as bogus "" keys.
            String[] words = value.toString().trim().split("\\s+");

            // Per-line word frequencies.
            Map<String, Integer> counts = new HashMap<String, Integer>();
            for (String w : words) {
                if (w.isEmpty()) {
                    continue; // blank line -> nothing to index
                }
                Integer c = counts.get(w);
                counts.put(w, c == null ? 1 : c + 1);
            }

            // One record per distinct word on this line.
            for (Map.Entry<String, Integer> e : counts.entrySet()) {
                mk.set(e.getKey());
                mv.set(fileName + ":" + key.get() + "," + e.getValue());
                context.write(mk, mv);
            }
        }
    }

    /** Reducer: joins all postings for one word with {@code ';'}. */
    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        private final Text rv = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Join with ';' between entries. This avoids the original
            // substring(0, length - 1) trick, which would throw on an empty
            // accumulation.
            StringBuilder sb = new StringBuilder();
            for (Text v : values) {
                if (sb.length() > 0) {
                    sb.append(";");
                }
                sb.append(v.toString());
            }
            rv.set(sb.toString());
            context.write(key, rv);
        }
    }

    /**
     * Runs the job. Input and output paths may be supplied as
     * {@code args[0]} / {@code args[1]}; the original hard-coded defaults
     * are kept for backward compatibility.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(InvertedIndex.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        Path inpath = new Path(args.length > 0 ? args[0] : "F:\\test\\InvertedIndex");
        Path outpath = new Path(args.length > 1 ? args[1] : "F:\\test\\testout\\");
        FileInputFormat.addInputPath(job, inpath);

        // Delete a pre-existing output directory so the job can rerun.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        // Propagate job success/failure to the process exit code
        // (the original discarded the boolean result).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
结果:
baby mapreduce-4-1.txt:22,1
hello mapreduce-4-2.txt:30,1;mapreduce-4-2.txt:15,1;mapreduce-4-2.txt:0,1
huangbo mapreduce-4-2.txt:0,1;mapreduce-4-1.txt:103,1;mapreduce-4-1.txt:0,1
huangxiaoming mapreduce-4-1.txt:103,1;mapreduce-4-1.txt:22,2;mapreduce-4-2.txt:30,1
liangchaowei mapreduce-4-1.txt:73,1
liujialing mapreduce-4-1.txt:73,1
love mapreduce-4-1.txt:73,1;mapreduce-4-1.txt:0,1;mapreduce-4-1.txt:22,2
wangbaoqiang mapreduce-4-1.txt:103,1
xuzheng mapreduce-4-1.txt:0,1;mapreduce-4-1.txt:103,1;mapreduce-4-2.txt:15,1
yangmi mapreduce-4-1.txt:22,1