## ✌✌✌ As the ancients said, a good memory is no match for a worn pen, and a journey of a thousand li begins beneath one's feet. A thousand lines of code a day is a must, and a daily summary is worth writing down. Aiming for a big-name company: stay full of hope and you will be unstoppable, haha!!! ✌✌✌

Hadoop Learning: Implementing an Inverted Index with MapReduce

I. ✌ Problem Requirements

File 1: a.txt

(contents of a.txt shown as an image in the original post)

File 2: b.txt

(contents of b.txt shown as an image in the original post)

File 3: c.txt

(contents of c.txt shown as an image in the original post)


Final output format:

(shown as an image in the original post; per the Reduce class below, each output line has the form `word	a.txt-->N	b.txt-->N	c.txt-->N`)

II. ✌ Implementation Approach

> In the map phase, first obtain the name of the file the current split belongs to.
> Then, in the map method, emit each word as the key and the file name with "1" appended as the value.
> In the reduce phase, iterate over the values for each key, work out which file each value came from, and accumulate a count per file.
> Finally, write the result out in the required format (an illustrative data flow is sketched below).
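
To make the key/value flow concrete, here is an illustrative sketch. The words come from the comments in the code below; the exact counts depend on the input files, which are not reproduced here:

```
Map output (one pair per word occurrence, value = file name + "1"):
  (apple, "a.txt1")  (apple, "a.txt1")  (apple, "b.txt1")  (pear, "b.txt1")

Reduce input for key "apple": ["a.txt1", "a.txt1", "b.txt1"]
Reduce output for "apple":    apple    a.txt-->2    b.txt-->1    c.txt-->0
```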

III. ✌ Code Implementation

1. ✌ Map class

```java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class Map extends Mapper<LongWritable, Text, Text, Text> {

    String name;

    // Get the name of the file this split comes from (setup runs once per split)
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        FileSplit inputSplit = (FileSplit) context.getInputSplit();

        name = inputSplit.getPath().getName();

    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();

        String[] words = line.split(" ");

        // Emit one pair per word, e.g. (apple, "a.txt1"), (pear, "b.txt1")
        for (String word : words) {
            context.write(new Text(word), new Text(name + 1));
        }

    }

}
```
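
One small caveat, not from the original post: `line.split(" ")` only splits on single spaces, so tabs or repeated spaces would produce empty tokens. If the input is not guaranteed to be single-space separated, a hedged tweak to the split in `map()` could be:

```java
// Hypothetical variant of the split in map() above: "\\s+" tolerates
// tabs and runs of spaces; trim() avoids a leading empty token.
String[] words = line.trim().split("\\s+");
```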

2. ✌ Reduce class

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class Reduce extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        int a_sum = 0, b_sum = 0, c_sum = 0;

        // For each key, count how many values came from each file
        for (Text value : values) {

            if (value.toString().contains("a.txt")) {
                a_sum += 1;
            } else if (value.toString().contains("b.txt")) {
                b_sum += 1;
            } else {
                c_sum += 1;
            }
        }

        // Output format: apple	a.txt-->3	b.txt-->1	c.txt-->2
        context.write(key, new Text("a.txt-->" + a_sum + "\t" + "b.txt-->" + b_sum + "\t" + "c.txt-->" + c_sum + "\t"));

    }

}
```
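
The Reduce class above hardcodes the three file names, which is fine for this exercise but does not generalize. As a sketch of one possible generalization (not part of the original post; the class name and details are hypothetical), the per-file counts can be kept in a HashMap so any number of input files works. The Driver would need `job.setReducerClass(GenericReduce.class)` if this were swapped in:

```java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.StringJoiner;

// Hypothetical alternative reducer: tallies values per file name without
// hardcoding a.txt/b.txt/c.txt. Assumes each value is the file name with
// a trailing "1" appended, as emitted by the Map class above.
public class GenericReduce extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        HashMap<String, Integer> counts = new HashMap<>();

        for (Text value : values) {
            String v = value.toString();
            // Drop the trailing "1" to recover the plain file name
            String fileName = v.substring(0, v.length() - 1);
            counts.merge(fileName, 1, Integer::sum);
        }

        // Build "a.txt-->N<TAB>b.txt-->N..." for every file actually seen
        StringJoiner joiner = new StringJoiner("\t");
        counts.forEach((file, count) -> joiner.add(file + "-->" + count));

        context.write(key, new Text(joiner.toString()));
    }
}
```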

3. ✌ Driver class

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.BasicConfigurator;

import java.io.IOException;

public class Driver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Input and output paths (hardcoded for a local run)
        args = new String[]{"D:/input/inputword", "D:/output"};

        // Basic log4j configuration so log output is printed
        BasicConfigurator.configure();

        // Create the configuration
        Configuration conf = new Configuration();

        // Get the Job instance
        Job job = Job.getInstance(conf);

        // Wire up the Driver, Map and Reduce classes
        job.setJarByClass(Driver.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }

}
```
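
A practical note on re-running the job: Hadoop refuses to start a job whose output directory already exists. A minimal sketch, assuming the same `args` and `conf` as above, that could be dropped into `main()` before `FileOutputFormat.setOutputPath` is called:

```java
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch: delete a leftover D:/output from a previous run so that
// waitForCompletion() does not fail with FileAlreadyExistsException.
Path outPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outPath)) {
    fs.delete(outPath, true);   // true = delete recursively
}
```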