MapReduce文件切割算法 mapreduce partitioner

转载

mob6454cc6cee7e 2024-04-12 22:24:07

文章标签 MapReduce文件切割算法 Text mapreduce apache 文章分类 架构后端开发

1. Partition 分区个数、ReduceTask并行度、分区器

MapReduce文件切割算法 mapreduce partitioner_Text

点击查看分区器源码

/*
 分区器使用流程
 1. Driver中 指定分区个数 和分区器实现类
        分区个数(ReduceTask个数) : job.setNumReduceTasks(n) 或 mapreduce.job.reduces=n
        分区器实现类 : job.setPartitionerClass(CustomPartitioner.class) 或 mapreduce.job.partitioner.class
 2. 判断使用哪个分区器(根据分区个数)
        分区个数 > 1 : 使用 job.setPartitionerClass(CustomPartitioner.class) 或 mapreduce.job.partitioner.class 指定的分区器(未指定时默认为 HashPartitioner)
               = 1 :  创建Partitioner实现类(匿名内部类)  1个分区且分区编号为0
 3. 使用分区器(map输出时)
        根据 key 获取当前 key,value 所在的分区编号
*/

/************设置分区个数(reduceTask个数 不指定时为1)**********************************************/

  // 指定 reduceTask 个数
  //    1. 驱动类中 指定job.setNumReduceTasks(n)
  //    2. 配置文件中 指定 mapreduce.job.reduces=n
  // 不指定时 将使用默认值 1
  
  Job类
  // 获取 reduceTask 个数
  int numReduces = conf.getNumReduceTasks();

  /**
   * Sets the number of reduce tasks for this job by delegating to the job configuration.
   *
   * @param tasks the number of reduce tasks
   * @throws IllegalStateException declared by this method; presumably raised by
   *         ensureState when the job is not in the DEFINE state (TODO confirm)
   */
  public void setNumReduceTasks(int tasks) throws IllegalStateException {
    // The state check runs before the configuration is modified.
    ensureState(JobState.DEFINE);
    conf.setNumReduceTasks(tasks);
  }

  JobConf类
  // Stores the reduce-task count under the JobContext.NUM_REDUCES key.
  public void setNumReduceTasks(int n) { setInt(JobContext.NUM_REDUCES, n); }
  // Reads the reduce-task count from the JobContext.NUM_REDUCES key;
  // 1 is passed as the default for when the key has not been set.
  public int getNumReduceTasks() { return getInt(JobContext.NUM_REDUCES, 1); }
  public static final String NUM_REDUCES = "mapreduce.job.reduces";


/************根据分区个数,判断 使用哪个分区器**********************************************/

  Job类
  public static final String PARTITIONER_CLASS_ATTR = "mapreduce.job.partitioner.class";

  /**
   * Registers the partitioner implementation for this job under PARTITIONER_CLASS_ATTR.
   *
   * @param cls the Partitioner subclass to use
   * @throws IllegalStateException declared by this method; presumably raised by
   *         ensureState when the job is not in the DEFINE state (TODO confirm)
   */
  public void setPartitionerClass(Class<? extends Partitioner> cls
                                ) throws IllegalStateException {
  // The state check runs before the configuration is modified.
  ensureState(JobState.DEFINE);
  conf.setClass(PARTITIONER_CLASS_ATTR, cls, 
                Partitioner.class);
  }

  /**
   * Looks up the partitioner class configured for this job.
   * The value is read from PARTITIONER_CLASS_ATTR, which is set either through the
   * mapreduce.job.partitioner.class property or through
   * job.setPartitionerClass(CustomPartitioner.class).
   *
   * @return the configured partitioner class, or HashPartitioner.class when none is set
   * @throws ClassNotFoundException declared by the signature
   */
  public Class<? extends Partitioner<?,?>> getPartitionerClass()
     throws ClassNotFoundException {
    final Class<?> configured =
        conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
    return (Class<? extends Partitioner<?,?>>) configured;
  }


  MapTask的内部类 
  // 根据 分区个数,判断使用 哪种分区器
  private class NewOutputCollector<K,V>
    extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {

    /**
     * Creates the sorting collector and picks the partitioner from the number of
     * reduce tasks reported by the job context.
     *
     * With more than one partition, the class returned by
     * jobContext.getPartitionerClass() is instantiated reflectively. Otherwise an
     * anonymous partitioner is used that always answers partitions - 1, which is 0
     * when there is exactly one partition.
     *
     * @throws IOException declared by the signature
     * @throws ClassNotFoundException declared by the signature
     */
    NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
                       JobConf job,
                       TaskUmbilicalProtocol umbilical,
                       TaskReporter reporter
                       ) throws IOException, ClassNotFoundException {
      collector = createSortingCollector(job, reporter);
      partitions = jobContext.getNumReduceTasks();
      if (partitions <= 1) {
        // Single-partition case: the configured partitioner class is not consulted.
        partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
          @Override
          public int getPartition(K key, V value, int numPartitions) {
            return partitions - 1;
          }
        };
      } else {
        // Multi-partition case: instantiate the partitioner class configured on the job.
        partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
          ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
      }
    }


/****************使用分区器******************************************/

    MapTask 
    /**
     * Passes one map output record to the collector together with the partition
     * number that the partitioner assigns to it.
     */
    @Override
    public void write(K key, V value) throws IOException, InterruptedException {
      // Ask the partitioner which partition this key/value pair belongs to.
      final int partition = partitioner.getPartition(key, value, partitions);
      collector.collect(key, value, partition);
    }


/****************分区器******************************************/
// Partitioner: decides which partition each record belongs to.
// Each concrete subclass implements one partitioning rule.
public abstract class Partitioner<KEY, VALUE> {
  
  /**
   * Returns the partition number of a record, derived from its key.
   * The number of partitions equals the number of reduce tasks.
   *
   * @param key the key to be partitioned
   * @param value the entry value
   * @param numPartitions the total number of partitions
   * @return the partition number for this key; must be >= 0 and < numPartitions
   */
  public abstract int getPartition(KEY key, VALUE value, int numPartitions);
  
}

// Hash-based Partitioner implementation.
// Rule: (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks = partition number of the record.
public class HashPartitioner<K, V> extends Partitioner<K, V> {

  public int getPartition(K key, V value,
                          int numReduceTasks) {
    // Masking with Integer.MAX_VALUE clears the sign bit, so the remainder is never negative.
    final int nonNegativeHash = key.hashCode() & Integer.MAX_VALUE;
    return nonNegativeHash % numReduceTasks;
  }

}

点击查看自定义分区器

/*
 分区器的作用
     1. 将 输出数据按照指定条件 输出到不同的文件中(分区)

 使用 自定义分区器
     1. 创建 自定义分区器 
          1. 继承 Partitioner
          2. 重写 getPartition方法
     2. 指定 job使用分区器
          job.setPartitionerClass(classOf[ThreeNationPartitioner[Text, IntWritable]])
     3. 指定 分区个数(ReduceTask个数)
          job.setNumReduceTasks(3)  // 大于1时,步骤2中指定的分区器才能生效
 
  注意事项 
     1. reduceTask个数 与 分区编号 的关系 
           即job.setNumReduceTasks(n) 与 x = getPartition(key: Text, value: IntWritable, numPartitions: Int) 的关系

          (1)如果ReduceTask的数量> getPartition的结果数，则会多产生几个空的输出文件part-r-000xx;
          (2)如果1<ReduceTask的数量<getPartition的结果数，则有一部分分区数据无处安放，会抛出 Exception(Illegal partition);
          (3)如果ReduceTask的数量=1，则不管MapTask端输出多少个分区文件，最终结果都交给这一个 ReduceTask，最终也就只会产生一个结果文件 part-r-00000;
          (4)分区号必须从零开始，逐一累加;
     总结
        getPartition的分区编号 in [0,ReduceTask -1]
           示例 ReduceTask = 5  分区编号 in (0,1,2,3,4)
                               分区编号 in (0,1,2)  ....  会生成5个分区文件,只是3、4号分区为空文件
                               分区编号 >=5 报错 .... java.lang.Exception: java.io.IOException: Illegal partition for 张飞 (3) (此报错信息来自 ReduceTask = 3、getPartition 返回 3 的运行,括号内为非法的分区编号)
*/

/*******************自定义分区案例*************************************************************************/

package com.dxm.mapreduce.customPartitionPk

import java.lang

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{IntWritable, LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
import org.apache.hadoop.mapreduce.{Job, Mapper, Partitioner, Reducer}

/**
 * Word-count mapper: emits `(token, 1)` for every non-empty, space-separated
 * token of an input line.
 */
class WCMapper extends Mapper[LongWritable, Text, Text, IntWritable] {
  // Output holders are created once and reused for every emitted pair.
  private val text = new Text
  private val intWritable = new IntWritable(1)

  /**
   * Called once per input record.
   *
   * @param key     input key; not used by this mapper
   * @param value   one line of input text
   * @param context receives the emitted `(token, 1)` pairs
   */
  override def map(key: LongWritable, value: Text,
                   context: Mapper[LongWritable, Text, Text, IntWritable]#Context): Unit = {
    // Splitting on a single space produces empty tokens for blank lines and for
    // leading or repeated spaces; they are dropped so "" is never counted as a word.
    value.toString.split(" ").iterator.filter(_.nonEmpty).foreach { token =>
      text.set(token)
      context.write(text, intWritable)
    }
  }
}

/**
 * Word-count reducer: adds up the counts received for a key and writes the total.
 */
class WCReducer extends Reducer[Text, IntWritable, Text, IntWritable] {

  // Reused output holder for the per-key total.
  private val intWritable = new IntWritable

  /**
   * Called once per key with all of its values, e.g. 张飞 <1,1,1,1,1>.
   */
  override def reduce(key: Text, values: lang.Iterable[IntWritable], context: Reducer[Text, IntWritable, Text, IntWritable]#Context) = {
    // 1. Sum the counts by walking the Java iterator directly.
    val counts = values.iterator()
    var total: Int = 0
    while (counts.hasNext) {
      total += counts.next().get
    }

    // 2. Emit the key together with its total.
    intWritable.set(total)
    context.write(key, intWritable)
  }
}


// 自定义 Partition
/*
*  需求:
*    分区个数 : 3
*    分区逻辑 :
*              0 - 魏国人
*              1 - 蜀国人
*              2 - 吴国人
*
* */
/**
 * Routes each name to the partition of its kingdom:
 * 0 = Wei (魏), 1 = Shu (蜀), 2 = Wu (吴).
 *
 * The type parameters are the map output key and value types. They are named
 * `K` and `V` so they do not shadow the imported `Text` and `IntWritable` classes;
 * `ThreeNationPartitioner[Text, IntWritable]` is still a valid instantiation.
 *
 * The returned partition number is always >= 0 and < `numPartitions`.
 */
class ThreeNationPartitioner[K, V] extends Partitioner[K, V] {

  // Name -> kingdom partition. Built once per partitioner instance, not per record.
  private val partitionByName: Map[String, Int] = Map(
    "曹操" -> 0, "曹仁" -> 0, "曹植" -> 0,
    "张飞" -> 1, "刘备" -> 1, "关羽" -> 1,
    "孙权" -> 2, "张昭" -> 2, "周瑜" -> 2
  )

  /**
   * @param key           map output key; its `toString` is the name to look up
   * @param value         map output value; not used
   * @param numPartitions number of partitions (reduce tasks) of the job
   * @return the kingdom partition for known names, a hash-derived partition for
   *         unknown names; in both cases reduced modulo `numPartitions`
   */
  override def getPartition(key: K, value: V, numPartitions: Int): Int = {
    val name = key.toString
    // Unknown names fall back to a non-negative hash instead of a fixed out-of-range
    // number, which made the job fail with "Illegal partition".
    val preferred = partitionByName.getOrElse(name, name.hashCode & Int.MaxValue)
    // The modulo keeps the result in range even when the job runs with fewer
    // than three reduce tasks.
    preferred % numPartitions
  }
}

/**
 * Driver: configures and submits the word-count job that uses the custom partitioner.
 */
object CustomPartitionDriver {
  def main(args: Array[String]): Unit = {
    // 1. Create the job from a fresh Configuration.
    val configuration = new Configuration
    val job: Job = Job.getInstance(configuration)

    // 2. Register the jar containing this driver, and name the job.
    job.setJarByClass(this.getClass)
    job.setJobName("scala mr")

    // 3. Register the Mapper and Reducer classes.
    job.setMapperClass(classOf[WCMapper])
    job.setReducerClass(classOf[WCReducer])

    // 4. Key/value types emitted by the Mapper.
    job.setMapOutputKeyClass(classOf[Text])
    job.setMapOutputValueClass(classOf[IntWritable])

    // 5. Key/value types of the final output.
    job.setOutputKeyClass(classOf[Text])
    job.setOutputValueClass(classOf[IntWritable])

    // 6. Input and output paths.
    FileInputFormat.setInputPaths(job, "src/main/data/input/1.txt")
    FileOutputFormat.setOutputPath(job, new Path("src/main/data/output"))

    // 7. Number of partitions (reduce tasks).
    //    The partitioner registered in step 8 is only used when this is greater than 1;
    //    with exactly 1, a built-in single-partition partitioner (partition 0) is used.
    job.setNumReduceTasks(3)

    // 8. Partitioner implementation.
    job.setPartitionerClass(classOf[ThreeNationPartitioner[Text, IntWritable]])

    // 9. Submit the job and turn its outcome into the process exit code.
    val succeeded: Boolean = job.waitForCompletion(true)
    val exitCode = if (succeeded) 0 else 1
    System.exit(exitCode)
  }
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。