MapReduce实战之倒排索引案例（多job串联）

原创

年轻即出发 2022-11-11 10:53:21 博主文章分类：Hadoop ©著作权

文章标签 倒排索引案例（多job串联） apache hadoop mapreduce 文章分类 运维

©著作权归作者所有：来自51CTO博客作者年轻即出发的原创作品，请联系作者获取转载授权，否则将追究法律责任

0）需求：有大量的文本（文档、网页），需要建立搜索索引

输入数据（a.txt、b.txt、c.txt 三个文件的内容，依次如下）：

atguigu pingping
atguigu ss
atguigu ss

atguigu pingping
atguigu pingping
pingping ss

atguigu ss
atguigu pingping

MapReduce实战之倒排索引案例（多job串联）_hadoop

（1）第一次预期输出结果

atguigu--a.txt	3
atguigu--b.txt	2
atguigu--c.txt	2
pingping--a.txt	1
pingping--b.txt	3
pingping--c.txt	1
ss--a.txt	2
ss--b.txt	1
ss--c.txt	1

（2）第二次预期输出结果

atguigu	c.txt-->2	b.txt-->2	a.txt-->3
pingping	c.txt-->1	b.txt-->3	a.txt-->1
ss	c.txt-->1	b.txt-->1	a.txt-->2

1）第一次处理

（1）第一次处理，编写OneIndexMapper

package com.atguigu.mapreduce.index; import java.io.IOException; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.FileSplit; public class OneIndexMapper extends Mapper<LongWritable, Text, Text , IntWritable>{ String name; Text k = new Text(); IntWritable v = new IntWritable(); @Override protected void setup(Context context) throws IOException, InterruptedException { // 获取文件名称 FileSplit split = (FileSplit) context.getInputSplit(); name = split.getPath().getName(); } @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // 1 获取1行 String line = value.toString(); // 2 切割 String[] fields = line.split(" "); for (String word : fields) { // 3 拼接 k.set(word+"--"+name); v.set(1); // 4 写出 context.write(k, v); } } }

package com.atguigu.mapreduce.index;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 
/**
 * First-pass mapper of the inverted-index example.
 *
 * <p>For every word of every input line it emits the pair
 * {@code ("<word>--<fileName>", 1)}, where {@code fileName} is the name of the
 * file backing the current input split. The reducer of the first job sums these
 * ones to obtain the number of occurrences of each word per file.
 */
public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Name of the file the current split belongs to; assigned once in {@link #setup}. */
    String name;

    /** Reused output key, overwritten for every emitted word. */
    Text k = new Text();

    /** Reused output value; every emitted record counts as a single occurrence. */
    IntWritable v = new IntWritable(1);

    /**
     * Captures the name of the file behind the current input split.
     *
     * @param context task context providing the input split
     * @throws IOException          declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // The cast assumes a file-based input format (the driver uses FileInputFormat).
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    /**
     * Splits one input line into words and emits {@code ("<word>--<fileName>", 1)}
     * for each of them.
     *
     * @param key     offset key supplied by the input format (unused)
     * @param value   one line of input text
     * @param context task context used to emit the pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();

        // Split on runs of any whitespace rather than on a single space: blank
        // lines and repeated or leading spaces would otherwise produce empty
        // "words" (keys like "--a.txt"), and a tab kept inside a word would
        // corrupt the tab-separated key/value output read by the second job.
        for (String word : line.split("\\s+")) {
            if (word.isEmpty()) {
                // Produced by a blank line or by leading whitespace.
                continue;
            }
            k.set(word + "--" + name);
            context.write(k, v);
        }
    }
}

（2）第一次处理，编写OneIndexReducer

package com.atguigu.mapreduce.index; import java.io.IOException; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable>{ @Override protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int count = 0; // 1 累加求和 for(IntWritable value: values){ count +=value.get(); } // 2 写出 context.write(key, new IntWritable(count)); } }

package com.atguigu.mapreduce.index;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
 
/**
 * First-pass reducer of the inverted-index example.
 *
 * <p>Receives all counts emitted for one {@code "<word>--<fileName>"} key and
 * writes the key together with the sum of those counts.
 */
public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up every value grouped under {@code key} and emits {@code (key, sum)}.
     *
     * @param key     the grouped key
     * @param values  the partial counts collected for that key
     * @param context task context used to emit the result
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {

        // Accumulate the partial counts.
        int total = 0;
        for (IntWritable partial : values) {
            total = total + partial.get();
        }

        // Emit the key with its total.
        IntWritable sum = new IntWritable(total);
        context.write(key, sum);
    }
}

（3）第一次处理，编写OneIndexDriver

package com.atguigu.mapreduce.index; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class OneIndexDriver { public static void main(String[] args) throws Exception { args = new String[] { "e:/input/inputoneindex", "e:/output5" }; Configuration conf = new Configuration(); Job job = Job.getInstance(conf); job.setJarByClass(OneIndexDriver.class); job.setMapperClass(OneIndexMapper.class); job.setReducerClass(OneIndexReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.waitForCompletion(true); } }

package com.atguigu.mapreduce.index;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
/**
 * Driver of the first job of the inverted-index example: wires
 * {@code OneIndexMapper} and {@code OneIndexReducer} together and runs the job.
 */
public class OneIndexDriver {

    /**
     * Configures and runs the first job, then exits with status 0 if the job
     * succeeded and 1 otherwise.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the
     *             output path; when fewer than two arguments are given the
     *             local sample paths of the tutorial are used instead
     * @throws Exception if the job cannot be configured, submitted or awaited
     */
    public static void main(String[] args) throws Exception {

        // Only fall back to the hard-coded sample paths when the caller did not
        // supply any, so that paths given on the command line are honoured.
        if (args == null || args.length < 2) {
            args = new String[] { "e:/input/inputoneindex", "e:/output5" };
        }

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Key/value types of the final job output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Propagate the job outcome through the exit status, as TwoIndexDriver
        // does, so that a failed job is not reported as success.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

（4）查看第一次输出结果

atguigu--a.txt	3
atguigu--b.txt	2
atguigu--c.txt	2
pingping--a.txt	1
pingping--b.txt	3
pingping--c.txt	1
ss--a.txt	2
ss--b.txt	1
ss--c.txt	1

2）第二次处理（输入为第一次处理的输出结果：示例代码中第一次处理输出到 e:/output5，第二次处理从 e:/input/inputtwoindex 读取，运行前需先把第一次的输出文件放到该目录）

（1）第二次处理，编写TwoIndexMapper

package com.atguigu.mapreduce.index; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text>{ Text k = new Text(); Text v = new Text(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // 1 获取1行数据 String line = value.toString(); // 2用“--”切割 String[] fields = line.split("--"); k.set(fields[0]); v.set(fields[1]); // 3 输出数据 context.write(k, v); } }

package com.atguigu.mapreduce.index;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
 
/**
 * Second-pass mapper of the inverted-index example.
 *
 * <p>Each input line is expected to be a record written by the first job, i.e.
 * {@code "<word>--<fileName>\t<count>"}. The line is cut at the {@code "--"}
 * separator: the part before it becomes the output key and the part after it
 * (file name, tab, count) becomes the output value.
 */
public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Separator placed between word and file name by the first job. */
    private static final String SEPARATOR = "--";

    /** Reused output key. */
    Text k = new Text();

    /** Reused output value. */
    Text v = new Text();

    /**
     * Splits one first-job record into {@code (word, "<fileName>\t<count>")}.
     *
     * @param key     offset key supplied by the input format (unused)
     * @param value   one line of first-job output
     * @param context task context used to emit the pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {

        String line = value.toString();

        // Cut at the LAST separator: the first job appends "--<fileName>" to the
        // word, so a word that itself contains "--" stays intact. This assumes
        // file names do not contain "--".
        int sep = line.lastIndexOf(SEPARATOR);
        if (sep < 0) {
            // Blank or malformed line without a separator: nothing to index.
            // Indexing fields[1] here used to throw ArrayIndexOutOfBoundsException.
            return;
        }

        k.set(line.substring(0, sep));
        v.set(line.substring(sep + SEPARATOR.length()));

        context.write(k, v);
    }
}

（2）第二次处理，编写TwoIndexReducer

package com.atguigu.mapreduce.index; import java.io.IOException; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> { @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { // atguigu a.txt 3 // atguigu b.txt 2 // atguigu c.txt 2 // atguigu c.txt-->2 b.txt-->2 a.txt-->3 StringBuilder sb = new StringBuilder(); // 1 拼接 for (Text value : values) { sb.append(value.toString().replace("\t", "-->") + "\t"); } // 2 写出 context.write(key, new Text(sb.toString())); } }

package com.atguigu.mapreduce.index;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Second-pass reducer of the inverted-index example.
 *
 * <p>Concatenates all values grouped under one key into a single line. In every
 * value each tab is replaced by {@code "-->"}, and a tab is appended after each
 * value (including the last one).
 *
 * <p>Example, given the values produced by {@code TwoIndexMapper}:
 * <pre>
 *   atguigu  a.txt&lt;TAB&gt;3
 *   atguigu  b.txt&lt;TAB&gt;2
 *   atguigu  c.txt&lt;TAB&gt;2
 * becomes
 *   atguigu  c.txt--&gt;2  b.txt--&gt;2  a.txt--&gt;3
 * </pre>
 */
public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Joins the values of {@code key} and emits them as one record.
     *
     * @param key     the grouped key
     * @param values  the values collected for that key
     * @param context task context used to emit the joined record
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {

        StringBuilder joined = new StringBuilder();

        // Rewrite each value and append it followed by a tab.
        for (Text posting : values) {
            String entry = posting.toString().replace("\t", "-->");
            joined.append(entry).append("\t");
        }

        // Emit the key with the concatenated entries.
        Text postings = new Text(joined.toString());
        context.write(key, postings);
    }
}

（3）第二次处理，编写TwoIndexDriver

package com.atguigu.mapreduce.index; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class TwoIndexDriver { public static void main(String[] args) throws Exception { args = new String[] { "e:/input/inputtwoindex", "e:/output6" }; Configuration config = new Configuration(); Job job = Job.getInstance(config); job.setJarByClass(TwoIndexDriver.class); job.setMapperClass(TwoIndexMapper.class); job.setReducerClass(TwoIndexReducer.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); boolean result = job.waitForCompletion(true); System.exit(result?0:1); } }

package com.atguigu.mapreduce.index;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
/**
 * Driver of the second job of the inverted-index example: wires
 * {@code TwoIndexMapper} and {@code TwoIndexReducer} together and runs the job.
 */
public class TwoIndexDriver {

    /**
     * Configures and runs the second job, then exits with status 0 if the job
     * succeeded and 1 otherwise.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the
     *             output path; when fewer than two arguments are given the
     *             local sample paths of the tutorial are used instead
     * @throws Exception if the job cannot be configured, submitted or awaited
     */
    public static void main(String[] args) throws Exception {

        // Only fall back to the hard-coded sample paths when the caller did not
        // supply any, so that paths given on the command line are honoured.
        if (args == null || args.length < 2) {
            args = new String[] { "e:/input/inputtwoindex", "e:/output6" };
        }

        Configuration config = new Configuration();
        Job job = Job.getInstance(config);

        job.setJarByClass(TwoIndexDriver.class);
        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        // Key/value types emitted by the mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Key/value types of the final job output.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Report the job outcome through the process exit status.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}