1 Problem statement
2 Methods and code
- 2.1 Basic inverted index
- 2.1.1 Approach
- 2.1.2 Code
- 2.2 Inverted index with a custom partitioner
- 2.2.1 Approach
- 2.2.2 Code
- 2.3 Inverted index with gap compression
- 2.3.1 Approach
- 2.3.2 Code
- 2.4 Inverted index with binary compression
- 2.4.1 Approach
- 2.4.2 Code
3 Experimental results
1 Problem statement
For an introduction to inverted indexes, see this blog post. This article builds an inverted index with MapReduce and then compresses it in several ways.
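As a quick illustration of the target data structure (a toy in-memory sketch, not the MapReduce job; the two documents and all words below are made up for illustration), an inverted index maps each word to the list of (docID, count) postings for the documents that contain it:
import java.util.*;

// A minimal in-memory sketch of an inverted index: two hypothetical one-line "documents"
// are indexed into word -> list of (docID, count) postings.
public class ToyInvertedIndex {
    public static void main(String[] args) {
        String[] docs = {"the wand chose the wizard", "the wizard raised the wand"}; // docIDs 1 and 2
        Map<String, List<int[]>> index = new TreeMap<>();
        for (int docID = 1; docID <= docs.length; docID++) {
            Map<String, Integer> counts = new HashMap<>();
            for (String word : docs[docID - 1].split(" ")) {
                counts.put(word, counts.getOrDefault(word, 0) + 1);
            }
            for (Map.Entry<String, Integer> e : counts.entrySet()) {
                index.computeIfAbsent(e.getKey(), k -> new ArrayList<>()).add(new int[]{docID, e.getValue()});
            }
        }
        // e.g. "the" -> (1, 2) (2, 2)   and   "wand" -> (1, 1) (2, 1)
        for (Map.Entry<String, List<int[]>> e : index.entrySet()) {
            StringBuilder sb = new StringBuilder(e.getKey() + " -> ");
            for (int[] p : e.getValue()) sb.append("(").append(p[0]).append(", ").append(p[1]).append(") ");
            System.out.println(sb);
        }
    }
}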
2 Methods and code
2.1 Basic inverted index
2.1.1 Approach
- The Mapper of the basic inverted index extracts every word that appears in a document together with its occurrence count; it emits the word as the key and a PairOfInts(docID, count) as the value.
- The Reducer sorts each word's list of PairOfInts(docID, count) by docID and then writes it out (see the sketch after this list).
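A minimal sketch of that reduce-side sort (the docIDs and counts are made up, and plain int arrays stand in for PairOfInts so the snippet runs without the Cloud9 jar):
import java.util.*;

// Postings for one word arrive in arbitrary docID order and are sorted by docID before output.
public class SortPostingsSketch {
    public static void main(String[] args) {
        List<int[]> postings = new ArrayList<>(Arrays.asList(
                new int[]{3, 7}, new int[]{1, 2}, new int[]{2, 5})); // hypothetical (docID, count) pairs
        postings.sort(Comparator.comparingInt((int[] p) -> p[0])); // same ordering MyReducer's comparator imposes
        for (int[] p : postings) System.out.printf("(%d, %d) ", p[0], p[1]); // prints (1, 2) (2, 5) (3, 7)
    }
}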
2.1.2 Code
Note: the code depends on the edu.umd.cloud9 jar (the Cloud9 library).
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
// The map output key is a single word, so no custom partitioner is needed
public class InvertedIndexBasic {
public static class MyMapper extends Mapper<LongWritable, Text, Text, PairOfInts> {
@Override
protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
// Extract the document ID from the input file name
String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
// Tokenize: keep letters only, replacing every other character with a space
StringTokenizer docTokenizer = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));
// Count how many times each word occurs in this document
Map<String, Integer> word_count_map = new HashMap<String, Integer>();
while (docTokenizer.hasMoreTokens()) {
String word = docTokenizer.nextToken();
word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
}
// Emit (KEY, VALUE): KEY is the word, VALUE is PairOfInts(docID, count)
Text KEY = new Text();
PairOfInts VALUE = new PairOfInts();
for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
KEY.set(entry.getKey());
VALUE.set((int) docID, entry.getValue());
context.write(KEY, VALUE);
}
}
}
public static class MyReducer extends Reducer<Text, PairOfInts, Text, ArrayListWritable<PairOfInts>> {
// Comparator used to sort each posting list by docID
private static Comparator<PairOfInts> cmp = new Comparator<PairOfInts>() {
@Override
public int compare(PairOfInts o1, PairOfInts o2) {
return Integer.compare(o1.getKey(), o2.getKey());
}
};
@Override
protected void reduce(Text key, Iterable<PairOfInts> values, Context context) throws IOException, InterruptedException {
ArrayList<PairOfInts> pair_list = new ArrayList<PairOfInts>();
for (PairOfInts pair : values) {
pair_list.add(new PairOfInts(pair.getKey(), pair.getValue())); // the values iterator reuses the same Writable object, so copy it
}
// Sort the postings by docID
pair_list.sort(cmp);
// Emit the sorted posting list
context.write(key, new ArrayListWritable<>(pair_list));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(InvertedIndexBasic.class);
job.setMapperClass(InvertedIndexBasic.MyMapper.class);
job.setReducerClass(InvertedIndexBasic.MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayListWritable.class);
job.setMapOutputValueClass(PairOfInts.class);
String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexBasic"; // output path
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
fs.delete(new Path(outputPath), true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
System.out.println("InvertedIndexBasic job finished, total running time: " + runtime + " s");
}
}
2.2 Inverted index with a custom partitioner
2.2.1 Approach
- Because the basic approach has to sort inside reduce, this variant exploits the sorting that the shuffle already performs so that no extra sort is needed in the reduce phase.
- The Mapper again extracts every word in a document and its occurrence count; it emits PairOfStringInt(word, docID) as the key and the count as the value.
- A custom Partitioner assigns keys to reducers by the hash of the word only, so all postings of a word arrive at the same reducer, already sorted by docID (see the sketch after this list).
- The Reducer emits the word as the key and the list of (docID, count) pairs as the value.
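A minimal sketch (with a hypothetical word and a hypothetical four-reducer job) of why the partition decision must depend on the word alone: hashing the whole (word, docID) key could scatter a word's postings across reducers, while hashing just the word keeps them together:
// Mirrors MyPartitioner's (str.hashCode() & Integer.MAX_VALUE) % numPartitions, applied to the word only.
public class PartitionByWordSketch {
    static int partitionForWord(String word, int numPartitions) {
        return (word.hashCode() & Integer.MAX_VALUE) % numPartitions; // non-negative bucket in [0, numPartitions)
    }
    public static void main(String[] args) {
        int numPartitions = 4; // hypothetical reducer count
        // ("wand", 1) and ("wand", 7) must meet on one reducer, so only "wand" is hashed:
        System.out.println(partitionForWord("wand", numPartitions));   // same bucket
        System.out.println(partitionForWord("wand", numPartitions));   // same bucket
        System.out.println(partitionForWord("wizard", numPartitions)); // possibly a different bucket
    }
}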
2.2.2 Code
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
public class InvertedIndexPartition {
public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
@Override
protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
// Extract the document ID from the input file name
String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
// Tokenize: keep letters only, replacing every other character with a space
StringTokenizer docTokenizer = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));
// Count how many times each word occurs in this document
Map<String, Integer> word_count_map = new HashMap<String, Integer>();
while (docTokenizer.hasMoreTokens()) {
String word = docTokenizer.nextToken();
word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
}
// Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
PairOfStringInt KEY = new PairOfStringInt();
IntWritable VALUE = new IntWritable();
for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
KEY.set(entry.getKey(), (int) docID);
VALUE.set(entry.getValue());
context.write(KEY, VALUE);
}
}
}
public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
@Override
public int getPartition(PairOfStringInt pairOfStringInt, IntWritable intWritable, int numPartitions) {
// Partition by the word only; toString() of the pair looks like "(word, docID)"
String str = pairOfStringInt.toString().split(",", 2)[0].substring(1);
return (str.hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, ArrayListWritable<PairOfInts>> {
private String pre_word = null; // tracks the word whose postings are currently being collected
private String temp_word = null;
private ArrayListWritable<PairOfInts> pair_list_writable = new ArrayListWritable<PairOfInts>();
@Override
protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
temp_word = key.getKey();
// If the word changed, the previous word's posting list is complete: emit it
if (!temp_word.equals(pre_word)) {
if (pre_word != null) {
context.write(new Text(pre_word), pair_list_writable);
pre_word = temp_word;
pair_list_writable.clear();
} else { // first key seen: just initialize
pre_word = temp_word;
}
}
for (IntWritable val : values) {
pair_list_writable.add(new PairOfInts(key.getValue(), val.get()));
}
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// Emit the posting list of the last word (guard against a reducer that received no input)
if (pre_word != null) {
context.write(new Text(pre_word), pair_list_writable);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(InvertedIndexPartition.class);
job.setMapperClass(InvertedIndexPartition.MyMapper.class);
job.setPartitionerClass(InvertedIndexPartition.MyPartitioner.class);
job.setReducerClass(InvertedIndexPartition.MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayListWritable.class);
job.setMapOutputKeyClass(PairOfStringInt.class);
job.setMapOutputValueClass(IntWritable.class);
String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexPartition"; // output path
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
fs.delete(new Path(outputPath), true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
System.out.println("InvertedIndexPartition job finished, total running time: " + runtime + " s");
}
}
2.3 Inverted index with gap compression
2.3.1 Approach
- Building on the approach in 2.2, instead of storing every docID we store only the gap between a document's ID and the previous document's ID in the same posting list, which shrinks the stored numbers and thus the index to some extent (a worked example follows below).
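A worked example of the gap idea with made-up docIDs: the posting list for a word that appears in documents [3, 7, 12] is stored as gaps [3, 4, 5], and the original IDs are recovered by a running sum:
import java.util.*;

// Gap-encode a sorted list of docIDs and decode it back with a prefix sum.
// The docIDs are hypothetical; the per-document counts are omitted for brevity.
public class GapEncodingSketch {
    static List<Integer> toGaps(List<Integer> docIDs) {
        List<Integer> gaps = new ArrayList<>();
        int last = 0; // same "lastDocID = 0" base the reducer uses for each new word
        for (int id : docIDs) {
            gaps.add(id - last);
            last = id;
        }
        return gaps;
    }
    static List<Integer> fromGaps(List<Integer> gaps) {
        List<Integer> docIDs = new ArrayList<>();
        int running = 0;
        for (int g : gaps) {
            running += g;
            docIDs.add(running);
        }
        return docIDs;
    }
    public static void main(String[] args) {
        List<Integer> docIDs = Arrays.asList(3, 7, 12);
        System.out.println(toGaps(docIDs));           // [3, 4, 5]
        System.out.println(fromGaps(toGaps(docIDs))); // [3, 7, 12]
    }
}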
2.3.2 Code
import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
public class InvertedIndexGap {
public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
@Override
protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
// Extract the document ID from the input file name
String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
// Tokenize: keep letters only, replacing every other character with a space
StringTokenizer docTokenizer = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));
// Count how many times each word occurs in this document
Map<String, Integer> word_count_map = new HashMap<String, Integer>();
while (docTokenizer.hasMoreTokens()) {
String word = docTokenizer.nextToken();
word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
}
// Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
PairOfStringInt KEY = new PairOfStringInt();
IntWritable VALUE = new IntWritable();
for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
KEY.set(entry.getKey(), (int) docID);
VALUE.set(entry.getValue());
context.write(KEY, VALUE);
}
}
}
public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
@Override
public int getPartition(PairOfStringInt pairOfStringInt, IntWritable intWritable, int numPartitions) {
// Partition by the word only; toString() of the pair looks like "(word, docID)"
String str = pairOfStringInt.toString().split(",", 2)[0].substring(1);
return (str.hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, ArrayListWritable<PairOfInts>> {
private final ArrayList<PairOfInts> pair_list = new ArrayList<PairOfInts>();
private String pre_word = null; // tracks the word whose postings are currently being collected
private String temp_word = null;
private int lastDocID = 0;
private int tempDocID;
private int docGap;
@Override
protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
temp_word = key.getKey();
tempDocID = key.getValue();
// If the word changed, the previous word's posting list is complete: emit it and reset the gap base
if (!temp_word.equals(pre_word)) {
if (pre_word != null) {
context.write(new Text(pre_word), new ArrayListWritable<PairOfInts>(pair_list));
pair_list.clear();
}
pre_word = temp_word;
lastDocID = 0;
}
// Store the gap from the previous docID instead of the absolute docID
docGap = tempDocID - lastDocID;
lastDocID = tempDocID;
for (IntWritable val : values) {
pair_list.add(new PairOfInts(docGap, val.get()));
}
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// Emit the posting list of the last word (guard against a reducer that received no input)
if (pre_word != null) {
context.write(new Text(pre_word), new ArrayListWritable<PairOfInts>(pair_list));
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(InvertedIndexGap.class);
job.setMapperClass(InvertedIndexGap.MyMapper.class);
job.setPartitionerClass(InvertedIndexGap.MyPartitioner.class);
job.setReducerClass(InvertedIndexGap.MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ArrayListWritable.class);
job.setMapOutputKeyClass(PairOfStringInt.class);
job.setMapOutputValueClass(IntWritable.class);
String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexGap"; // output path
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
fs.delete(new Path(outputPath), true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
System.out.println("InvertedIndexGap job finished, total running time: " + runtime + " s");
}
}
2.4 Inverted index with binary compression
2.4.1 Approach
- Building on 2.3, each word's (gap, count) posting list is serialized into a byte array with variable-length integers (WritableUtils.writeVInt) and written out in binary via SequenceFileOutputFormat, which compresses the index further (a decoding sketch follows below).
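A minimal sketch of this byte-level encoding and of how one might decode a posting list read back from the binary output (the gaps and counts are made up; the reducer's output value is a BytesWritable holding exactly such bytes):
import org.apache.hadoop.io.WritableUtils;
import java.io.*;

// Encode (gap, count) pairs as variable-length ints and decode them back.
// Small integers such as docID gaps take 1 byte instead of 4, which is where the compression comes from.
public class VIntPostingSketch {
    public static void main(String[] args) throws IOException {
        int[][] postings = {{3, 7}, {4, 2}, {5, 1}}; // hypothetical (gap, count) pairs
        // Encode, as the reducer below does with WritableUtils.writeVInt
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        for (int[] p : postings) {
            WritableUtils.writeVInt(out, p[0]);
            WritableUtils.writeVInt(out, p[1]);
        }
        out.flush();
        byte[] encoded = bytes.toByteArray();
        System.out.println("encoded length = " + encoded.length + " bytes"); // 6 bytes here vs. 24 with raw ints
        // Decode: read vints in pairs and rebuild docIDs with a running sum over the gaps
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(encoded));
        int docID = 0;
        while (in.available() > 0) {
            docID += WritableUtils.readVInt(in);
            int count = WritableUtils.readVInt(in);
            System.out.println("(docID=" + docID + ", count=" + count + ")");
        }
    }
}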
2.4.2 Code
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
public class InvertedIndexCompress {
public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
@Override
protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
// Extract the document ID from the input file name
String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
// Tokenize: keep letters only, replacing every other character with a space
StringTokenizer docTokenizer = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));
// Count how many times each word occurs in this document
Map<String, Integer> word_count_map = new HashMap<String, Integer>();
while (docTokenizer.hasMoreTokens()) {
String word = docTokenizer.nextToken();
word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
}
// Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
PairOfStringInt KEY = new PairOfStringInt();
IntWritable VALUE = new IntWritable();
for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
KEY.set(entry.getKey(), (int) docID);
VALUE.set(entry.getValue());
context.write(KEY, VALUE);
}
}
}
public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
@Override
public int getPartition(PairOfStringInt pairOfStringInt, IntWritable intWritable, int numPartitions) {
// Partition by the word only; toString() of the pair looks like "(word, docID)"
String str = pairOfStringInt.toString().split(",", 2)[0].substring(1);
return (str.hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, BytesWritable> {
private final static ByteArrayOutputStream postingByteStream = new ByteArrayOutputStream();
private final static DataOutputStream outStream = new DataOutputStream(postingByteStream);
private String pre_word = null; // tracks the word whose postings are currently being collected
private String temp_word = null;
private int lastDocID = 0;
private int tempDocID;
private int docGap;
@Override
protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
temp_word = key.getKey();
tempDocID = key.getValue();
// If the word changed, the previous word's byte-encoded posting list is complete: emit it and reset the gap base
if (!temp_word.equals(pre_word)) {
if (pre_word != null) {
outStream.flush();
postingByteStream.flush();
context.write(new Text(pre_word), new BytesWritable(postingByteStream.toByteArray()));
postingByteStream.reset(); // discard the previous word's bytes
}
pre_word = temp_word;
lastDocID = 0;
}
// Store the gap from the previous docID instead of the absolute docID
docGap = tempDocID - lastDocID;
lastDocID = tempDocID;
for (IntWritable val : values) {
WritableUtils.writeVInt(outStream, docGap); // write the docID gap as a variable-length int
WritableUtils.writeVInt(outStream, val.get()); // write the term frequency as a variable-length int
}
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
// Emit the byte-encoded posting list of the last word (guard against a reducer that received no input)
if (pre_word != null) {
outStream.flush();
postingByteStream.flush();
context.write(new Text(pre_word), new BytesWritable(postingByteStream.toByteArray()));
}
outStream.close();
postingByteStream.close();
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(InvertedIndexCompress.class);
job.setMapperClass(InvertedIndexCompress.MyMapper.class);
job.setPartitionerClass(InvertedIndexCompress.MyPartitioner.class);
job.setReducerClass(InvertedIndexCompress.MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setMapOutputKeyClass(PairOfStringInt.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class); // binary (SequenceFile) output
String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexCompress"; // output path
FileInputFormat.setInputPaths(job, new Path(inputPath));
FileOutputFormat.setOutputPath(job, new Path(outputPath));
FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
fs.delete(new Path(outputPath), true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
System.out.println("InvertedIndexCompress job finished, total running time: " + runtime + " s");
}
}
3 Experimental results
Running time and index storage size are shown in the table below: the partitioner-based method cuts the time spent sorting in reduce, while gap encoding combined with binary (VInt) compression reduces the space the index occupies.
| Method | Basic | Partition | Gap | Compress |
| --- | --- | --- | --- | --- |
| Running time | 11 s | 10 s | 10 s | 9 s |
| Storage size | 3.3 MB | 3.3 MB | 23.9 MB | 1.2 MB |