07Mapreduce实例——二次排序

实验原理

在Map阶段,使用job.setInputFormatClass定义的InputFormat将输入的数据集分割成小数据块splits,同时InputFormat提供一个RecordReader的实现。本实验中使用的是TextInputFormat,它提供的RecordReader会将每行文本的字节偏移量作为key,这一行的文本作为value。这就是自定义Map的输入是<LongWritable, Text>的原因。然后调用自定义Map的map方法,将一个个<LongWritable, Text>键值对输入给Map的map方法。注意输出应该符合自定义Map中定义的输出<IntPair, IntWritable>。最终是生成一个List<IntPair, IntWritable>。在map阶段的最后,会先调用job.setPartitionerClass设置的分区类对这个List进行分区,每个分区映射到一个reducer。每个分区内又调用job.setSortComparatorClass设置的key比较函数类排序。可以看到,这本身就是一个二次排序。如果没有通过job.setSortComparatorClass设置key比较函数类,则使用key实现的compareTo方法进行排序。在本实验中,就使用了IntPair实现的compareTo方法。

在Reduce阶段,reducer接收到所有映射到这个reducer的map输出后,也会调用job.setSortComparatorClass设置的key比较函数类对所有数据对排序。然后开始构造一个key对应的value迭代器。这时就要用到分组,使用job.setGroupingComparatorClass设置的分组函数类。只要这个比较器比较的两个key相同,它们就属于同一个组,它们的value放在同一个value迭代器中,而这个迭代器的key使用属于同一个组的所有key中的第一个key。最后就是进入Reducer的reduce方法,reduce方法的输入是所有的(key和它的value迭代器)。同样注意输入与输出的类型必须与自定义的Reducer中声明的一致。

实验步骤

1.建一个文本文档goods_visit2,字段之间用一个空格分隔开(不要替换成逗号),数据如下

goods_visit2表
goods_id click_num
1010037 100
1010102 100
1010152 97
1010178 96
1010280 104
1010320 103
1010510 104
1010603 96

1010637 97

虚拟机中启动Hadoop

2.本地新建/data/mapreduce8目录。

       mkdir -p /data/mapreduce8

3.将表上传到虚拟机中

4.上传并解压hadoop2lib文件

5.在HDFS上新建/mymapreduce8/in目录,然后将Linux本地/data/mapreduce8目录下的goods_visit2文件导入到HDFS的/mymapreduce8/in目录中。

         hadoop fs -mkdir -p /mymapreduce8/in  

         hadoop fs -put /data/mapreduce8/goods_visit2 /mymapreduce8/in  

6.IDEA中编写Java代码

package mapreduce7;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.io.WritableComparable;

import org.apache.hadoop.io.WritableComparator;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SecondarySort

{


    /**
     * Composite key made of two ints, used as the map-output key of this job.
     *
     * <p>Natural ordering (see {@link #compareTo(IntPair)}): {@code first}
     * descending, then {@code second} ascending. Serialization writes
     * {@code first} followed by {@code second} with {@code writeInt}.
     */
    public static class IntPair implements WritableComparable<IntPair>
    {
        int first;
        int second;

        /**
         * Overwrites both components of this pair.
         *
         * @param left  new value of {@code first}
         * @param right new value of {@code second}
         */
        public void set(int left, int right)
        {
            this.first = left;
            this.second = right;
        }

        /** @return the {@code first} component */
        public int getFirst()
        {
            return this.first;
        }

        /** @return the {@code second} component */
        public int getSecond()
        {
            return this.second;
        }

        /**
         * Restores this pair from {@code in}; reads {@code first}, then {@code second}.
         *
         * @throws IOException if reading from {@code in} fails
         */
        @Override
        public void readFields(DataInput in) throws IOException
        {
            this.first = in.readInt();
            this.second = in.readInt();
        }

        /**
         * Serializes this pair to {@code out}; writes {@code first}, then {@code second}.
         *
         * @throws IOException if writing to {@code out} fails
         */
        @Override
        public void write(DataOutput out) throws IOException
        {
            out.writeInt(this.first);
            out.writeInt(this.second);
        }

        /**
         * Orders pairs by {@code first} descending, breaking ties by
         * {@code second} ascending.
         *
         * @return 1, -1 or 0 as this pair sorts after, before or equal to {@code o}
         */
        @Override
        public int compareTo(IntPair o)
        {
            if (this.first != o.first)
            {
                // Operands are swapped on purpose: the larger `first` sorts earlier.
                return Integer.compare(o.first, this.first);
            }
            return Integer.compare(this.second, o.second);
        }

        /** Hash derived from both components, consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode()
        {
            return 157 * this.first + this.second;
        }

        /** Two pairs are equal when both components match. */
        @Override
        public boolean equals(Object right)
        {
            if (this == right)
            {
                return true;
            }
            // instanceof is false for null, so no separate null check is needed.
            if (!(right instanceof IntPair))
            {
                return false;
            }
            IntPair other = (IntPair) right;
            return this.first == other.first && this.second == other.second;
        }
    }


    /**
     * Partitions map output by the {@code first} component of the key only, so
     * every record sharing the same {@code first} is sent to the same partition.
     */
    public static class FirstPartitioner extends Partitioner<IntPair, IntWritable>
    {
        /**
         * Computes the partition index for a record.
         *
         * @param key           composite key; only {@code getFirst()} is used
         * @param value         ignored
         * @param numPartitions number of partitions to spread keys over
         * @return an index in {@code [0, numPartitions)}
         */
        @Override
        public int getPartition(IntPair key, IntWritable value,int numPartitions)
        {
            int spread = Math.abs(key.getFirst() * 127);
            // The product can wrap to Integer.MIN_VALUE, for which Math.abs still
            // returns a negative number; that would yield a negative partition.
            // Masking the sign bit maps that single case to 0 and leaves every
            // non-negative value untouched.
            return (spread & Integer.MAX_VALUE) % numPartitions;
        }
    }

    /**
     * Comparator registered via {@code job.setGroupingComparatorClass}: it looks
     * only at the {@code first} component of an {@link IntPair}, so keys that
     * share the same {@code first} compare as equal.
     */
    public static class GroupingComparator extends WritableComparator
    {
        /** Registers {@link IntPair} as the key class handled by this comparator. */
        protected GroupingComparator()
        {
            super(IntPair.class, true);
        }

        /**
         * Compares two keys by their {@code first} component, ascending.
         *
         * @param w1 left key, must be an {@link IntPair}
         * @param w2 right key, must be an {@link IntPair}
         * @return -1, 0 or 1
         */
        @Override
        public int compare(WritableComparable w1, WritableComparable w2)
        {
            int lhs = ((IntPair) w1).getFirst();
            int rhs = ((IntPair) w2).getFirst();
            return Integer.compare(lhs, rhs);
        }
    }

    /**
     * Mapper that turns each whitespace-separated input line
     * {@code "<left> <right>"} into the pair
     * {@code (IntPair(right, left), IntWritable(left))}.
     */
    public static class Map extends Mapper<LongWritable, Text, IntPair, IntWritable>
    {
        // Reused across calls to avoid allocating a new key/value per record.
        private final IntPair intkey = new IntPair();
        private final IntWritable intvalue = new IntWritable();

        /**
         * Parses one input line and emits at most one record.
         *
         * <p>Blank lines emit nothing. A line with a single token is emitted with
         * {@code right} defaulting to 0. A line whose first or second token is not
         * a valid int (for example a column-header row) is skipped and counted
         * under the {@code SecondarySort / MALFORMED_LINES} counter instead of
         * failing the task with a {@link NumberFormatException}.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of text
         * @param context sink for the emitted record and the counter
         * @throws IOException          if writing the record fails
         * @throws InterruptedException if writing the record is interrupted
         */
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            if (!tokenizer.hasMoreTokens())
            {
                return;
            }

            int left;
            int right = 0;
            try
            {
                left = Integer.parseInt(tokenizer.nextToken());
                if (tokenizer.hasMoreTokens())
                {
                    right = Integer.parseInt(tokenizer.nextToken());
                }
            }
            catch (NumberFormatException e)
            {
                // Skip the bad line but keep it visible in the job counters.
                context.getCounter("SecondarySort", "MALFORMED_LINES").increment(1);
                return;
            }

            // The second column becomes the primary sort field of the key.
            intkey.set(right, left);
            intvalue.set(left);
            context.write(intkey, intvalue);
        }
    }


    /**
     * Reducer that, for each group, writes a separator line followed by one
     * {@code (first, value)} record per value in the group.
     */
    public static class Reduce extends Reducer<IntPair, IntWritable, Text, IntWritable>
    {
        private static final Text SEPARATOR = new Text("------------------------------------------------");

        // Reused output key holding the textual form of the group's `first`.
        private final Text left = new Text();

        /**
         * Emits the separator, then every value of the group keyed by the
         * {@code first} component read from {@code key} before iteration starts.
         *
         * @param key     key of the group
         * @param values  values belonging to the group
         * @param context sink for the output records
         * @throws IOException          if writing a record fails
         * @throws InterruptedException if writing a record is interrupted
         */
        public void reduce(IntPair key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException
        {
            String groupLabel = Integer.toString(key.getFirst());

            context.write(SEPARATOR, null);
            left.set(groupLabel);
            // Also echoes the group label to standard output.
            System.out.println(left);

            for (IntWritable groupedValue : values)
            {
                context.write(left, groupedValue);
            }
        }
    }

    /**
     * Configures and runs the secondary-sort job, then exits the JVM with
     * status 0 on success and 1 on failure.
     *
     * <p>No sort comparator is set, so keys are ordered by
     * {@code IntPair.compareTo}; grouping uses {@code GroupingComparator}.
     *
     * @param args optional {@code <input path> <output path>}; when fewer than
     *             two arguments are given, the built-in HDFS paths are used
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException
    {
        // Fall back to the tutorial's fixed paths unless both are supplied.
        boolean pathsGiven = args != null && args.length >= 2;
        String[] otherArgs = new String[2];
        otherArgs[0] = pathsGiven ? args[0] : "hdfs://192.168.149.10:9000/mymapreduce8/in/goods_visit2";
        otherArgs[1] = pathsGiven ? args[1] : "hdfs://192.168.149.10:9000/mymapreduce8/out";

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job job = Job.getInstance(conf, "secondarysort");
        job.setJarByClass(SecondarySort.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setPartitionerClass(FirstPartitioner.class);
        job.setGroupingComparatorClass(GroupingComparator.class);

        job.setMapOutputKeyClass(IntPair.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}

7.将解压后的hadoop2lib目录中的jar包,拷贝到项目的hadoop2lib目录下。

8.拷贝log4j.properties文件

9.运行结果

mapreduce二次排序实例 mapreduce二次排序原理_apache

 

mapreduce二次排序实例 mapreduce二次排序原理_apache_02

 

 

 

mapreduce二次排序实例 mapreduce二次排序原理_hadoop_03

 

 PS:本次表不用把空格替换成逗号,数据之间使用一个空格隔开