Contents

1 Problem Description

2 Methods and Code

  • 2.1 Basic Inverted Index
    • 2.1.1 Method
    • 2.1.2 Code
  • 2.2 Inverted Index with a Custom Partitioner
    • 2.2.1 Method
    • 2.2.2 Code
  • 2.3 Inverted Index with Gap Compression
    • 2.3.1 Method
    • 2.3.2 Code
  • 2.4 Inverted Index with Binary Compression
    • 2.4.1 Method
    • 2.4.2 Code

3 Experimental Results

1 Problem Description

For an introduction to inverted indexes, see this blog post. This article uses MapReduce to build an inverted index and then to compress it.
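
As a quick illustration with hypothetical data, the index maps every word to a posting list of (docID, count) pairs:

    harry -> [(1, 3), (2, 5), (4, 1)]
    wand  -> [(2, 2), (3, 7)]

The four variants below build this structure with MapReduce and then compress the posting lists step by step.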

2 Methods and Code

2.1 Basic Inverted Index

2.1.1 Method

  • The basic inverted index's Mapper first extracts every word that appears in each document together with its occurrence count; the output key is the word and the value is PairOfInts(docID, count).
  • The Reducer sorts each word's PairOfInts(docID, count) entries by docID and then emits them (a worked example follows this list).
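
As a worked (hypothetical) example for this variant: if document 3 contains "harry ron harry", the Mapper emits (harry, PairOfInts(3, 2)) and (ron, PairOfInts(3, 1)); the Reducer for the key harry then gathers the pairs from all documents, sorts them by docID, and writes something like harry -> [(1, 7), (3, 2), (4, 5)].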

2.1.2 Code

Note: the code depends on the edu.umd.cloud9 jar.

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

// The map output key is a single word, so no custom partitioner is needed
public class InvertedIndexBasic {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, PairOfInts> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is the word, VALUE is PairOfInts(docID, count)
            Text KEY = new Text();
            PairOfInts VALUE = new PairOfInts();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey());
                VALUE.set((int) docID, entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, PairOfInts, Text, ArrayListWritable<PairOfInts>> {
        // Comparator used to sort each posting list by docID
        private static Comparator<PairOfInts> cmp = new Comparator<PairOfInts>() {
            @Override
            public int compare(PairOfInts o1, PairOfInts o2) {
                return Integer.compare(o1.getKey(), o2.getKey());
            }
        };

        @Override
        protected void reduce(Text key, Iterable<PairOfInts> values, Context context) throws IOException, InterruptedException {
            ArrayList<PairOfInts> pair_list = new ArrayList<PairOfInts>();
            for (PairOfInts pair : values) {
                pair_list.add(new PairOfInts(pair.getKey(), pair.getValue()));  // the iterator reuses the value object, so make a copy
            }
            // Sort by docID
            pair_list.sort(cmp);
            // Emit the sorted posting list
            context.write(key, new ArrayListWritable<>(pair_list));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexBasic.class);
        job.setMapperClass(InvertedIndexBasic.MyMapper.class);
        job.setReducerClass(InvertedIndexBasic.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        job.setMapOutputValueClass(PairOfInts.class);

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexBasic"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexBasic job finished, total runtime: " + runtime + " s");
    }
}
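
With the default TextOutputFormat, each output line is the word followed by the string form of its posting list, roughly of the form shown below (docIDs and counts are hypothetical):

    harry   [(1, 120), (2, 85), (3, 97)]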

2.2 Inverted Index with a Custom Partitioner

2.2.1 Method

  • The basic method has to sort inside the reduce phase; this variant instead relies on the sorting the shuffle already performs, so the reducer no longer needs to sort.
  • The Mapper again extracts every word in each document together with its count; the output key is PairOfStringInt(word, docID) and the value is the count.
  • A custom Partitioner distributes keys across reducers by the hash of the word, so all keys for the same word end up on the same machine.
  • The Reducer emits the word as the output key and the list of (docID, count) pairs as the value (see the note on key ordering after this list).
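
Because PairOfStringInt's natural ordering compares the word first and the docID second, the shuffle delivers a reducer's keys as ("harry", 1), ("harry", 2), ..., ("ron", 1), ... (words and docIDs here are hypothetical). Each (word, docID) pair is a separate reduce() call, which is why the Reducer below carries pre_word across calls, accumulates the current word's postings, and flushes the final word in cleanup().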

2.2.2 Code

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

public class InvertedIndexPartition {
    public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
            PairOfStringInt KEY = new PairOfStringInt();
            IntWritable VALUE = new IntWritable();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey(), (int) docID);
                VALUE.set(entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
        @Override
        public int getPartition(PairOfStringInt key, IntWritable value, int numPartitions) {
            // Partition by the word only, so every (word, docID) key of the same word reaches the same reducer
            return (key.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, ArrayListWritable<PairOfInts>> {
        private String pre_word = null; // the word whose postings are currently being collected
        private String temp_word = null;
        private ArrayListWritable<PairOfInts> pair_list_writable = new ArrayListWritable<PairOfInts>();

        @Override
        protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            temp_word = key.getKey();
            // If the word changed, the previous word's posting list is complete: emit it
            if (!temp_word.equals(pre_word)) {
                if (pre_word != null) {
                    context.write(new Text(pre_word), pair_list_writable);
                    pre_word = temp_word;
                    pair_list_writable.clear();
                } else { // first key seen; just initialize
                    pre_word = temp_word;
                }
            }

            for (IntWritable val : values) {
                pair_list_writable.add(new PairOfInts(key.getValue(), val.get()));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the posting list of the last word
            context.write(new Text(pre_word), pair_list_writable);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexPartition.class);
        job.setMapperClass(InvertedIndexPartition.MyMapper.class);
        job.setPartitionerClass(InvertedIndexPartition.MyPartitioner.class);
        job.setReducerClass(InvertedIndexPartition.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        job.setMapOutputKeyClass(PairOfStringInt.class);
        job.setMapOutputValueClass(IntWritable.class);

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexPartition"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexPartition job finished, total runtime: " + runtime + " s");
    }
}
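
A practical note on this variant: with a single reduce task (the Hadoop default) the custom partitioner has no observable effect, since every key lands in partition 0 anyway. Comparing it against the basic version is therefore only meaningful when several reducers are configured, e.g. by calling job.setNumReduceTasks(4) before submission (4 is just an illustrative value).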

2.3 Inverted Index with Gap Compression

2.3.1 Method

  • Building on the method of 2.2, there is no need to store every docID: only the gap between a document's ID and the previous document's ID is kept, which compresses the index to some extent (a small sketch follows).
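
As a small sketch of the idea (separate from the MapReduce job; class name and docIDs are illustrative): a sorted posting list with docIDs 3, 5, 9 is stored as the gaps 3, 2, 4, and a reader recovers the docIDs with a running sum.

public class GapDemo {
    public static void main(String[] args) {
        int[] docIDs = {3, 5, 9};           // hypothetical sorted docIDs for one word
        int[] gaps = new int[docIDs.length];
        int last = 0;
        for (int i = 0; i < docIDs.length; i++) {
            gaps[i] = docIDs[i] - last;     // store the difference to the previous docID
            last = docIDs[i];
        }
        // Decoding is the cumulative sum of the gaps: prints 3, 5, 9
        int doc = 0;
        for (int gap : gaps) {
            doc += gap;
            System.out.println(doc);
        }
    }
}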

2.3.2 Code

import edu.umd.cloud9.io.array.ArrayListWritable;
import edu.umd.cloud9.io.pair.PairOfInts;
import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class InvertedIndexGap {
    public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
            PairOfStringInt KEY = new PairOfStringInt();
            IntWritable VALUE = new IntWritable();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey(), (int) docID);
                VALUE.set(entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
        @Override
        public int getPartition(PairOfStringInt key, IntWritable value, int numPartitions) {
            // Partition by the word only, so every (word, docID) key of the same word reaches the same reducer
            return (key.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, ArrayListWritable<PairOfInts>> {
        private final static ArrayList<PairOfInts> pair_list = new ArrayList<PairOfInts>();

        private String pre_word = null; // the word whose postings are currently being collected
        private String temp_word = null;
        private int lastDocID = 0;
        private int tempDocID;
        private int docGap;


        @Override
        protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            temp_word = key.getKey();
            tempDocID = key.getValue();

            // If the word changed, the previous word's posting list is complete: emit it and reset the gap base
            if (!temp_word.equals(pre_word)) {
                if (pre_word != null) {
                    context.write(new Text(pre_word), new ArrayListWritable<PairOfInts>(pair_list));
                    pair_list.clear();
                }
                pre_word = temp_word;
                lastDocID = 0;
            }

            // Store the gap to the previous docID of the same word rather than the docID itself
            docGap = tempDocID - lastDocID;
            lastDocID = tempDocID;

            for (IntWritable val : values) {
                pair_list.add(new PairOfInts(docGap, val.get()));
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the posting list of the last word
            context.write(new Text(pre_word), new ArrayListWritable<PairOfInts>(pair_list));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexGap.class);
        job.setMapperClass(InvertedIndexGap.MyMapper.class);
        job.setPartitionerClass(InvertedIndexGap.MyPartitioner.class);
        job.setReducerClass(InvertedIndexGap.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ArrayListWritable.class);
        job.setMapOutputKeyClass(PairOfStringInt.class);
        job.setMapOutputValueClass(IntWritable.class);

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexGap"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexGap job finished, total runtime: " + runtime + " s");
    }
}

2.4 Inverted Index with Binary Compression

2.4.1 Method

  • Building on 2.3, the postings are carried as a byte stream and written out in binary form, which compresses the index further (a decoding sketch follows).
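
For reference, here is a minimal sketch of how one might decode a single word's value written by the reducer below, assuming the SequenceFile value has been read back as a BytesWritable (the PostingsDecoder class and decode method are illustrative names, not part of the job):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableUtils;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;

public class PostingsDecoder {
    // Turns the VInt-encoded (gap, count) stream back into absolute (docID, count) pairs
    public static void decode(BytesWritable value) throws IOException {
        DataInputStream in = new DataInputStream(
                new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
        int docID = 0;
        while (in.available() > 0) {
            docID += WritableUtils.readVInt(in);    // gap -> absolute docID
            int count = WritableUtils.readVInt(in); // term frequency
            System.out.println(docID + ":" + count);
        }
    }
}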

2.4.2 Code

import edu.umd.cloud9.io.pair.PairOfStringInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

public class InvertedIndexCompress {
    public static class MyMapper extends Mapper<LongWritable, Text, PairOfStringInt, IntWritable> {
        @Override
        protected void map(LongWritable key, Text doc, Context context) throws IOException, InterruptedException {
            // Extract the document ID from the file name
            String filename = ((FileSplit) context.getInputSplit()).getPath().toString();
            int docID = Integer.valueOf(filename.split("HarryPotterPart|\\.")[1]);
            // Tokenize after replacing non-letter characters with spaces
            StringTokenizer docTokenier = new StringTokenizer(doc.toString().replaceAll("[^a-z A-Z]", " "));

            // Count each word's occurrences in the document
            Map<String, Integer> word_count_map = new HashMap<String, Integer>();
            String word = new String();
            while (docTokenier.hasMoreTokens()) {
                word = docTokenier.nextToken();
                word_count_map.put(word, 1 + word_count_map.getOrDefault(word, 0));
            }

            // Emit (KEY, VALUE): KEY is PairOfStringInt(word, docID), VALUE is the count
            PairOfStringInt KEY = new PairOfStringInt();
            IntWritable VALUE = new IntWritable();
            for (Map.Entry<String, Integer> entry : word_count_map.entrySet()) {
                KEY.set(entry.getKey(), (int) docID);
                VALUE.set(entry.getValue());
                context.write(KEY, VALUE);
            }
        }
    }

    public static class MyPartitioner extends Partitioner<PairOfStringInt, IntWritable> {
        @Override
        public int getPartition(PairOfStringInt key, IntWritable value, int numPartitions) {
            // Partition by the word only, so every (word, docID) key of the same word reaches the same reducer
            return (key.getKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static class MyReducer extends Reducer<PairOfStringInt, IntWritable, Text, BytesWritable> {
        private final static ByteArrayOutputStream postingByteStream = new ByteArrayOutputStream();
        private final static DataOutputStream outStream = new DataOutputStream(postingByteStream);

        private String pre_word = null; // the word whose postings are currently being collected
        private String temp_word = null;
        private int lastDocID = 0;
        private int tempDocID;
        private int docGap;

        @Override
        protected void reduce(PairOfStringInt key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            temp_word = key.getKey();
            tempDocID = key.getValue();

            // If the word changed, the previous word's postings are complete: emit them and reset the gap base
            if (!temp_word.equals(pre_word)) {
                if (pre_word != null) {
                    outStream.flush();
                    postingByteStream.flush();
                    context.write(new Text(pre_word), new BytesWritable(postingByteStream.toByteArray()));
                    postingByteStream.reset(); // discard the previous word's bytes
                }
                pre_word = temp_word;
                lastDocID = 0;
            }

            // Store the gap to the previous docID of the same word rather than the docID itself
            docGap = tempDocID - lastDocID;
            lastDocID = tempDocID;

            for (IntWritable val : values) {
                WritableUtils.writeVInt(outStream, docGap); // write the docID gap as a variable-length int
                WritableUtils.writeVInt(outStream, val.get()); // write the term frequency
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // Emit the postings of the last word
            outStream.flush();
            postingByteStream.flush();
            context.write(new Text(pre_word), new BytesWritable(postingByteStream.toByteArray()));
            postingByteStream.close();
            outStream.close();
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(InvertedIndexCompress.class);
        job.setMapperClass(InvertedIndexCompress.MyMapper.class);
        job.setPartitionerClass(InvertedIndexCompress.MyPartitioner.class);
        job.setReducerClass(InvertedIndexCompress.MyReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setMapOutputKeyClass(PairOfStringInt.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class); // binary output via SequenceFile

        String inputPath = "hdfs://master:9000/homework/HW3/input/HarryPotter_new"; // input path
        String outputPath = "hdfs://master:9000/homework/HW3/output/InvertedIndexCompress"; // output path
        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        FileSystem fs = FileSystem.get(new URI("hdfs://master:9000"), new Configuration());
        fs.delete(new Path(outputPath), true);
        long startTime = System.currentTimeMillis();
        job.waitForCompletion(true);
        double runtime = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("InvertedIndexCompress job finished, total runtime: " + runtime + " s");
    }
}

3 Experimental Results

Runtime and index storage size are shown in the table below. The partitioner-based method reduces the time spent sorting, while gap encoding combined with binary compression reduces the space the index occupies.

Algorithm     Basic     Partition     Gap        Compress
Runtime       11 s      10 s          10 s       9 s
Index size    3.3 MB    3.3 MB        23.9 MB    1.2 MB
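
A rough accounting of the Compress result: WritableUtils.writeVInt stores any value between -112 and 127 in a single byte, so a typical (gap, count) entry costs only a couple of bytes, whereas the text output of the other variants spends several characters per number plus brackets, commas, and separators.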