MapReduce
一种分布式计算模型,解决海量数据的计算问题,MapReduce将计算过程抽象成两个函数

Map(映射):对一些独立元素(拆分后的小块)组成的列表的每一个元素进行指定的操作,可以高度并行。
Reduce(化简):对一个列表的元素进行合并
input -> map -> reduce -> output
数据流通格式<kay,value>

eg:
原始数据                ->    map input                 map     map output(reduce input)            shuffle                         reduce          reduce output
example example         ->    <0,example example>       ->      <example,1> <example,1>             ->      <example,list(1,1,1)>       ->          <example,3>
helo wrold example      ->    <16,helo wrold example>   ->      <hello,1> <wrold,1> <example,1>     ->      <hello,list(1)> <...>       ->          <hello,1> <wrold,1>

MapReduce底层执行流程

一.Input

InputFormat
读取数据
转换成<key,value>
FileInputFormat
TextInputFormat 文本初始化,一行变成一个KY对,用偏移量作为Key、

二.Map

ModuleMapper类继承Mapper类
执行map(KEYIN,VALUEIN,KETOUT,VALUEOUT),
    默认情况下
    KEYIN:LongWritable
    KEYVALUE:TEXT

三.shuffle(洗牌)

map,output<key,value>
a)先存在内存中
b)合并combiner[可选] ->  <hadoop,1> <hadoop,1> =>>  <hadoop,2>
c)spill,溢写到磁盘中,存储成很多小文件,过程如下
    1.分区Partition(数量跟Reduce数量一致)
    2.在分区内进行排序sort
d)合并,Merge  ->大文件(Map Task任务运行的机器的本地磁盘中)
e)排序sort
f)压缩[可选]

四.reduce

reduce Task会到Map Task运行的机器上COPY要处理的数据
a)合并merge
b)排序
c)分组Group(相同的key的value放在一起)
ModuleReduceper类继承Reduce类
执行reduce(KEYIN,VALUEIN,KETOUT,VALUEOUT)
    map的输出类型就是reduce的输入类型,中间的shuffle只是进行合并分组排序,不会改变数据类型

五.output

OutputFormat
写数据
FileOutputFormat
TextInputFormat 每个KeyValue对输出一行,key和value之间使用分隔符\t,默认调用key和value的toString方法

mapreduce执行map不动_大数据

代码如下:

package com.cenzhongman.mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//继承Configured类,从而继承了该类的getConf();line 81
//实现Tool方法,实现run方法 line79
//通过Toolrunner工具类的run方法实现,setConf(),达到conf传递的效果
public class WordCount extends Configured implements Tool {
    // 1.Map class
    public static class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text mapOutputKey = new Text();
        private final static IntWritable mapOutputValue = new IntWritable(1);

        @Override
        public void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // line value
            String lineValue = value.toString();

            // split
            // String[] strs = lineValue.split(" ");
            StringTokenizer stringTokennizer = new StringTokenizer(lineValue);

            // iterator
            while (stringTokennizer.hasMoreTokens()) {
                // get word value
                String wordValue = stringTokennizer.nextToken();

                // set value
                mapOutputKey.set(wordValue);

                // output
                context.write(mapOutputKey, mapOutputValue);
            }
        }
        
        @Override
        public void cleanup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // nothing
            // 在执行map之前会执行该函数,可用于JDBC等
            // Reduce同理,不再重复
        }

        @Override
        public void setup(Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // nothing
            // 在执行map之后会执行该函数,可用于JDBC断开等
        }

    }

    // 2.Reduce class
    public static class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable reduceOutputValue = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // sum tmp
            int sum = 0;
            // iterator
            for (IntWritable value : values) {
                // total
                sum += value.get();
            }
            // set value
            reduceOutputValue.set(sum);

            // output
            context.write(key, reduceOutputValue);

        }

    }

    // 3.driver
    public int run(String[] args) throws Exception {
        // 1.get configuration
        Configuration conf = getConf();

        // 2.create Job
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        // run jar
        job.setJarByClass(this.getClass());

        // 3.set job
        // input -> map -> reduce -> output
        // 3.1 input from type
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // 3.2 map
        job.setMapperClass(WordcountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //****************shuffle配置***********************
        //1)Partition分区
//      job.setPartitionerClass(cls);
        //2)sort排序
//      job.setSortComparatorClass(cls);
        //combiner[可选]Map中的合并
//      job.setCombinerClass(cls);
        //Group分组
//      job.setGroupingComparatorClass(cls);
        //压缩设置在配置文件中设置,也可以在conf对象中设置
        
        //****************shuffle配置***********************

        // 3.3 reduce
        job.setReducerClass(WordcountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 3.4 output
        Path outPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outPath);

        // 4 submit job
        boolean isSuccess = job.waitForCompletion(true);

            //set reduce number[可选,优化方式之一,默认值为1]配置文件mapreduce.job.reduces
            job.setNumReduceTasks(2);

        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        //set compress设置压缩方式,可以从官方文件和源码中得到-----可选,优化方式之一
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");

        // int status = new WordCount().run(args);
        int status = ToolRunner.run(conf, new WordCount(), args);
        System.exit(status);
    }
}