MapReduce项目实战（一）

原创

wx62be9d88ce294 2022-07-02 00:08:10 博主文章分类：大数据 ©著作权

文章标签 mapreduce hadoop hdfs apache 文章分类 虚拟化云计算

©著作权归作者所有：来自51CTO博客作者wx62be9d88ce294的原创作品，请联系作者获取转载授权，否则将追究法律责任

1.文件源

部分数据
MapReduce项目实战（一）_mapreduce

2.创建FirstJob

package com.demo03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Driver for the first MapReduce job of the Weibo example.
 *
 * <p>The job counts how many times each word occurs in each individual Weibo post and,
 * in the same pass, counts the total number of posts.
 *
 * <p>Input is read from {@code /weibo/Input}; output is written to {@code /weibo/Output},
 * which is deleted first if it already exists. The process exits with status 1 when the
 * job fails or cannot be submitted, so calling scripts can detect the failure.
 */
public class FirstJob {
    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Configuration conf = new Configuration(true);
        conf.set("mapreduce.framework.name","local");

        boolean succeeded = false;
        try {
            Job job = Job.getInstance(conf, "weibo1");
            job.setJarByClass(FirstJob.class);

            // Mapper settings: the map output is either (word_id, 1) or ("count", 1).
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setMapperClass(FirstMapper.class);

            // Partitioning: 4 reduce tasks, routed by FirstPartitioner
            // (which sends the "count" key to partition 3, the last of the four).
            job.setNumReduceTasks(4);
            job.setPartitionerClass(FirstPartitioner.class);

            // FirstReducer only sums its input values, so it doubles as the combiner.
            job.setCombinerClass(FirstReducer.class);

            // Reducer settings.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setReducerClass(FirstReducer.class);

            // Input path.
            FileInputFormat.addInputPath(job,new Path("/weibo/Input"));

            // Output path: remove the result of any previous run before registering it.
            FileSystem fs = FileSystem.get(conf);
            Path outputPath = new Path("/weibo/Output");
            if(fs.exists(outputPath)){
                fs.delete(outputPath,true);
            }
            FileOutputFormat.setOutputPath(job,outputPath);

            // waitForCompletion reports whether the job succeeded; keep the result
            // instead of discarding it.
            succeeded = job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Surface failures through the exit status instead of always exiting with 0.
        if (!succeeded) {
            System.exit(1);
        }
    }
}

3.创建FirstMapper

package com.demo03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;
import java.io.StringReader;

/**
 * Mapper of the first Weibo job.
 *
 * <p>Each input line is expected to look like {@code <weiboId>\t<content>}. The content is
 * tokenized with the IK segmenter; for every token the mapper emits {@code (token_weiboId, 1)},
 * and once per valid line it emits {@code ("count", 1)} so the posts themselves can be counted.
 */
public class FirstMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
    /** Reused output key of the form {@code word_weiboId}. */
    private final Text wordIdKey = new Text();
    /** Constant value 1 written with every key. */
    private final IntWritable one = new IntWritable(1);
    /** Key under which one record per processed post is emitted. */
    private final Text postCountKey = new Text("count");

    /**
     * Tokenizes one Weibo line and emits the per-word and per-post records.
     *
     * @param key     input key supplied by the framework (not used here)
     * @param value   one input line, e.g. {@code 3823890210294392<TAB>post text}
     * @param context sink for the emitted key/value pairs
     * @throws IOException          if tokenizing or writing fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().trim().split("\t");

        // A usable line needs at least an id column and a content column.
        if (fields.length < 2) {
            System.err.println(value.toString()+"----数据不合法---");
            return;
        }

        String weiboId = fields[0].trim();
        String text = fields[1].trim();

        // Tokenize the content with the IK segmenter (second constructor argument is true).
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), true);
        for (Lexeme token = segmenter.next(); token != null; token = segmenter.next()) {
            // word_weiboId -> 1
            wordIdKey.set(token.getLexemeText() + "_" + weiboId);
            context.write(wordIdKey, one);
        }

        // One line is one post: emit ("count", 1) for it.
        context.write(postCountKey, one);
    }
}

4.创建FirstPartitioner

package com.demo03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

/**
 * Partitioner that reserves the last partition for the special {@code "count"} key
 * (the per-post counter) and spreads all other keys over the remaining partitions by hash.
 *
 * <p>The reserved partition is derived from {@code numReduceTasks} rather than hard-coded,
 * so the returned index is always within {@code [0, numReduceTasks)} whatever reducer count
 * the job is configured with. With 4 reduce tasks the {@code "count"} key lands in partition 3.
 */
public class FirstPartitioner extends HashPartitioner<Text,IntWritable> {
    /**
     * Chooses the partition for a map output record.
     *
     * @param key            map output key: either {@code "count"} or a {@code word_id} key
     * @param value          map output value (only forwarded to the hash partitioner)
     * @param numReduceTasks total number of partitions
     * @return {@code numReduceTasks - 1} for the {@code "count"} key; otherwise the hash
     *         partition computed over the first {@code numReduceTasks - 1} partitions;
     *         {@code 0} when there is at most one partition
     */
    @Override
    public int getPartition(Text key, IntWritable value, int numReduceTasks) {
        // With a single partition there is nothing to reserve, and delegating with
        // numReduceTasks - 1 == 0 would make the hash partitioner take a modulo by zero.
        if (numReduceTasks <= 1) {
            return 0;
        }

        // The last partition is dedicated to the post counter.
        int countPartition = numReduceTasks - 1;
        if ("count".equals(key.toString())) {
            return countPartition;
        }

        // Every other key: hash modulo (numReduceTasks - 1), i.e. never the reserved partition.
        return super.getPartition(key, value, countPartition);
    }
}

5.创建FirstReducer

package com.demo03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Sums the integer values received for a key and emits {@code (key, sum)}.
 *
 * <p>Example input for one key: {@code 今天_3823890210294392 -> 1, 1}, which yields
 * {@code 今天_3823890210294392 -> 2}.
 */
public class FirstReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    /** Reused holder for the emitted sum. */
    private final IntWritable total = new IntWritable();

    /**
     * Adds up all values of {@code key} and writes the result.
     *
     * @param key     the key whose values are summed
     * @param values  the integer values grouped under {@code key}
     * @param context sink for the {@code (key, sum)} pair
     * @throws IOException          if writing the result fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int running = 0;
        for (IntWritable count : values) {
            running = running + count.get();
        }

        total.set(running);
        context.write(key, total);
    }
}