Contents

1 Problem Description

2 Methods and Code

  • 2.1 Basic Inverted Index
    • 2.1.1 Method
    • 2.1.2 Code
  • 2.2 Inverted Index with a Custom Partitioner
    • 2.2.1 Method
    • 2.2.2 Code
  • 2.3 Inverted Index with Gap Compression
    • 2.3.1 Method
    • 2.3.2 Code
  • 2.4 Inverted Index with Binary Compression
    • 2.4.1 Method
    • 2.4.2 Code

3 Experimental Results

1 Problem Description

For an introduction to inverted indexes, see this blog post. This article uses MapReduce to build an inverted index and then to compress it.
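
As a quick illustration with hypothetical data, the index maps every word to a posting list of (docID, count) pairs:

    harry -> [(1, 3), (2, 5), (4, 1)]
    wand  -> [(2, 2), (3, 7)]

The four variants below build this structure with MapReduce and then compress the posting lists step by step.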

2 Methods and Code

2.1 Basic Inverted Index

2.1.1 Method

  • The basic inverted index's Mapper first extracts every word that appears in each document together with its occurrence count; the output key is the word and the value is PairOfInts(docID, count).
  • The Reducer sorts each word's PairOfInts(docID, count) entries by docID and then emits them (a worked example follows this list).
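
As a worked (hypothetical) example for this variant: if document 3 contains "harry ron harry", the Mapper emits (harry, PairOfInts(3, 2)) and (ron, PairOfInts(3, 1)); the Reducer for the key harry then gathers the pairs from all documents, sorts them by docID, and writes something like harry -> [(1, 7), (3, 2), (4, 5)].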

2.1.2 Code

Note: the code depends on the edu.umd.cloud9 jar.

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

// The map output key is a single word, so no custom partitioner is needed
public class InvertedIndexBasic {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, PairOfInts> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is the word, VALUE is PairOfInts(docID, count)
            Text KEY = new Text();
            PairOfInts VALUE = new PairOfInts();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey());
                VALUE.set((int) docID, entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, PairOfInts, Text, ArrayListWritable<PairOfInts>> {
        // Comparator used to sort each posting list by docID
        private static Comparator<PairOfInts> cmp = new Comparator<PairOfInts>() {
            @Override
            public int compare(PairOfInts o1, PairOfInts o2) {
                return Integer.compare(o1.getKey(), o2.getKey());
            }
        };

        @Override
        protected void reduce(Text key, Iterable<PairOfInts> values, Context context) throws IOException, InterruptedException {
            ArrayList<PairOfInts> pair_list = new ArrayList<PairOfInts>();
            for (PairOfInts pair : values) {
                pair_list.add(new PairOfInts(pair.getKey(), pair.getValue()));  // the iterator reuses the value object, so make a copy
            }
            // Sort by docID
            pair_list.sort(cmp);
            // Emit the sorted posting list
            context.write(key, new ArrayListWritable<>(pair_list));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexBasic.class);
        job.setMapperClass(InvertedIndexBasic.MyMapper.class);
        job.setReducerClass(InvertedIndexBasic.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        job.setMapOutputValueClass(PairOfInts.class);

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexBasic"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexBasic job finished, total runtime: " + runtime + " s");
    }
}
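
With the default TextOutputFormat, each output line is the word followed by the string form of its posting list, roughly of the form shown below (docIDs and counts are hypothetical):

    harry   [(1, 120), (2, 85), (3, 97)]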

2.2 Inverted Index with a Custom Partitioner

2.2.1 Method

  • The basic method has to sort inside the reduce phase; this variant instead relies on the sorting the shuffle already performs, so the reducer no longer needs to sort.
  • The Mapper again extracts every word in each document together with its count; the output key is PairOfStringInt(word, docID) and the value is the count.
  • A custom Partitioner distributes keys across reducers by the hash of the word, so all keys for the same word end up on the same machine.
  • The Reducer emits the word as the output key and the list of (docID, count) pairs as the value (see the note on key ordering after this list).
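
Because PairOfStringInt's natural ordering compares the word first and the docID second, the shuffle delivers a reducer's keys as ("harry", 1), ("harry", 2), ..., ("ron", 1), ... (words and docIDs here are hypothetical). Each (word, docID) pair is a separate reduce() call, which is why the Reducer below carries pre_word across calls, accumulates the current word's postings, and flushes the final word in cleanup().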

2.2.2 Code

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

public class InvertedIndexPartition {
    public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
            PairOfStringInt KEY = new PairOfStringInt();
            IntWritable VALUE = new IntWritable();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey(), (int) docID);
                VALUE.set(entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
        @Override
        public int getPartition(PairOfStringInt key, IntWritable value, int numPartitions) {
            // Partition by the word only, so every (word, docID) key of the same word reaches the same reducer
            return (key.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, ArrayListWritable<PairOfInts>> {
        private String pre_word = null; // the word whose postings are currently being collected
        private String temp_word = null;
        private ArrayListWritable<PairOfInts> pair_list_writable = new ArrayListWritable<PairOfInts>();

        @Override
        protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            temp_word = key.getKey();
            // If the word changed, the previous word's posting list is complete: emit it
            if (!temp_word.equals(pre_word)) {
                if (pre_word != null) {
                    context.write(new Text(pre_word), pair_list_writable);
                    pre_word = temp_word;
                    pair_list_writable.clear();
                } else { // first key seen; just initialize
                    pre_word = temp_word;
                }
            }

            for (IntWritable val : values) {
                pair_list_writable.add(new PairOfInts(key.getValue(), val.get()));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the posting list of the last word
            context.write(new Text(pre_word), pair_list_writable);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexPartition.class);
        job.setMapperClass(InvertedIndexPartition.MyMapper.class);
        job.setPartitionerClass(InvertedIndexPartition.MyPartitioner.class);
        job.setReducerClass(InvertedIndexPartition.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        job.setMapOutputKeyClass(PairOfStringInt.class);
        job.setMapOutputValueClass(IntWritable.class);

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexPartition"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexPartition job finished, total runtime: " + runtime + " s");
    }
}
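
A practical note on this variant: with a single reduce task (the Hadoop default) the custom partitioner has no observable effect, since every key lands in partition 0 anyway. Comparing it against the basic version is therefore only meaningful when several reducers are configured, e.g. by calling job.setNumReduceTasks(4) before submission (4 is just an illustrative value).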

2.3 Inverted Index with Gap Compression

2.3.1 Method

  • Building on the method of 2.2, there is no need to store every docID: only the gap between a document's ID and the previous document's ID is kept, which compresses the index to some extent (a small sketch follows).
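
As a small sketch of the idea (separate from the MapReduce job; class name and docIDs are illustrative): a sorted posting list with docIDs 3, 5, 9 is stored as the gaps 3, 2, 4, and a reader recovers the docIDs with a running sum.

public class GapDemo {
    public static void main(String[] args) {
        int[] docIDs = {3, 5, 9};           // hypothetical sorted docIDs for one word
        int[] gaps = new int[docIDs.length];
        int last = 0;
        for (int i = 0; i < docIDs.length; i++) {
            gaps[i] = docIDs[i] - last;     // store the difference to the previous docID
            last = docIDs[i];
        }
        // Decoding is the cumulative sum of the gaps: prints 3, 5, 9
        int doc = 0;
        for (int gap : gaps) {
            doc += gap;
            System.out.println(doc);
        }
    }
}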

2.3.2 Code

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class InvertedIndexGap {
    public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
            PairOfStringInt KEY = new PairOfStringInt();
            IntWritable VALUE = new IntWritable();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey(), (int) docID);
                VALUE.set(entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
        @Override
        public int getPartition(PairOfStringInt key, IntWritable value, int numPartitions) {
            // Partition by the word only, so every (word, docID) key of the same word reaches the same reducer
            return (key.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, ArrayListWritable<PairOfInts>> {
        private final static ArrayList<PairOfInts> pair_list = new ArrayList<PairOfInts>();

        private String pre_word = null; // the word whose postings are currently being collected
        private String temp_word = null;
        private int lastDocID = 0;
        private int tempDocID;
        private int docGap;


        @Override
        protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            temp_word = key.getKey();
            tempDocID = key.getValue();

            // If the word changed, the previous word's posting list is complete: emit it and reset the gap base
            if (!temp_word.equals(pre_word)) {
                if (pre_word != null) {
                    context.write(new Text(pre_word), new ArrayListWritable<PairOfInts>(pair_list));
                    pair_list.clear();
                }
                pre_word = temp_word;
                lastDocID = 0;
            }

            // Store the gap to the previous docID of the same word rather than the docID itself
            docGap = tempDocID - lastDocID;
            lastDocID = tempDocID;

            for (IntWritable val : values) {
                pair_list.add(new PairOfInts(docGap, val.get()));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the posting list of the last word
            context.write(new Text(pre_word), new ArrayListWritable<PairOfInts>(pair_list));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexGap.class);
        job.setMapperClass(InvertedIndexGap.MyMapper.class);
        job.setPartitionerClass(InvertedIndexGap.MyPartitioner.class);
        job.setReducerClass(InvertedIndexGap.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        job.setMapOutputKeyClass(PairOfStringInt.class);
        job.setMapOutputValueClass(IntWritable.class);

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexGap"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexGap job finished, total runtime: " + runtime + " s");
    }
}

2.4 Inverted Index with Binary Compression

2.4.1 Method

  • Building on 2.3, the postings are carried as a byte stream and written out in binary form, which compresses the index further (a decoding sketch follows).
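
For reference, here is a minimal sketch of how one might decode a single word's value written by the reducer below, assuming the SequenceFile value has been read back as a BytesWritable (the PostingsDecoder class and decode method are illustrative names, not part of the job):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableUtils;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;

public class PostingsDecoder {
    // Turns the VInt-encoded (gap, count) stream back into absolute (docID, count) pairs
    public static void decode(BytesWritable value) throws IOException {
        DataInputStream in = new DataInputStream(
                new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
        int docID = 0;
        while (in.available() > 0) {
            docID += WritableUtils.readVInt(in);    // gap -> absolute docID
            int count = WritableUtils.readVInt(in); // term frequency
            System.out.println(docID + ":" + count);
        }
    }
}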

2.4.2 Code

import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class InvertedIndexCompress {
    public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
            PairOfStringInt KEY = new PairOfStringInt();
            IntWritable VALUE = new IntWritable();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey(), (int) docID);
                VALUE.set(entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
        @Override
        public int getPartition(PairOfStringInt key, IntWritable value, int numPartitions) {
            // Partition by the word only, so every (word, docID) key of the same word reaches the same reducer
            return (key.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, BytesWritable> {
        private final static ByteArrayOutputStream postingByteStream = new ByteArrayOutputStream();
        private final static DataOutputStream outStream = new DataOutputStream(postingByteStream);

        private String pre_word = null; // the word whose postings are currently being collected
        private String temp_word = null;
        private int lastDocID = 0;
        private int tempDocID;
        private int docGap;

        @Override
        protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            temp_word = key.getKey();
            tempDocID = key.getValue();

            // If the word changed, the previous word's postings are complete: emit them and reset the gap base
            if (!temp_word.equals(pre_word)) {
                if (pre_word != null) {
                    outStream.flush();
                    postingByteStream.flush();
                    context.write(new Text(pre_word), new BytesWritable(postingByteStream.toByteArray()));
                    postingByteStream.reset(); // discard the previous word's bytes
                }
                pre_word = temp_word;
                lastDocID = 0;
            }

            // Store the gap to the previous docID of the same word rather than the docID itself
            docGap = tempDocID - lastDocID;
            lastDocID = tempDocID;

            for (IntWritable val : values) {
                WritableUtils.writeVInt(outStream, docGap); // write the docID gap as a variable-length int
                WritableUtils.writeVInt(outStream, val.get()); // write the term frequency
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the postings of the last word
            outStream.flush();
            postingByteStream.flush();
            context.write(new Text(pre_word), new BytesWritable(postingByteStream.toByteArray()));
            postingByteStream.close();
            outStream.close();
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexCompress.class);
        job.setMapperClass(InvertedIndexCompress.MyMapper.class);
        job.setPartitionerClass(InvertedIndexCompress.MyPartitioner.class);
        job.setReducerClass(InvertedIndexCompress.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setMapOutputKeyClass(PairOfStringInt.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class); // binary output via SequenceFile

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexCompress"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexCompress job finished, total runtime: " + runtime + " s");
    }
}

3 Experimental Results

Runtime and index storage size are shown in the table below. The partitioner-based method reduces the time spent sorting, while gap encoding combined with binary compression reduces the space the index occupies.

Algorithm     Basic     Partition     Gap        Compress
Runtime       11 s      10 s          10 s       9 s
Index size    3.3 MB    3.3 MB        23.9 MB    1.2 MB
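
A rough accounting of the Compress result: WritableUtils.writeVInt stores any value between -112 and 127 in a single byte, so a typical (gap, count) entry costs only a couple of bytes, whereas the text output of the other variants spends several characters per number plus brackets, commas, and separators.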