mapreduce 编程环境 mapreduce编程规范

转载

mob64ca1415f0ab 2024-04-22 07:16:55

文章标签 mapreduce 编程环境 hadoop 大数据 Text apache 文章分类 架构后端开发

一、Mapper部分

二、Reducer部分

三、Driver部分

四、wordcount演示实例

4.1需求：

4.2测试数据：

4.3代码实现

4.3.1 pom.xml⽂件的配置

4.3.2 定义⼀个mapper内部类

4.3.3 定义⼀个reducer内部类

4.3.4 定义⼀个Driver类

用户编写的程序分为 3 个部分：Mapper、Reducer、Driver（提交 MapReduce 程序的客户端）

一、Mapper部分

1. ⾃定义类，继承 Mapper 类型

2. 定义 K1,V1,K2,V2 的泛型（ K1,V1 是 Mapper 的输⼊数据类型， K2,V2 是 Mapper 的输出数据类型）

3. 重写 map ⽅法（处理逻辑）

参考下图：

mapreduce 编程环境 mapreduce编程规范_apache

注意：每一个输入的 KV 对（K1,V1）都会调用一次 map 方法。

二、Reducer部分

1. ⾃定义类，继承 Reducer 类型

2. 定义 K2,V2,K3,V3 的泛型（ K2,V2 是 Reducer 的输⼊数据类型， K3,V3 是 Reducer的输出数据类型）

3. 重写 reduce ⽅法的处理逻辑

参考下图：

mapreduce 编程环境 mapreduce编程规范_mapreduce 编程环境_02

注意：输入数据默认按 key 分组，每一组（同一个 key 及其所有 value）都会调用一次 reduce 方法。

三、Driver部分

整个程序需要⼀个Driver 来进⾏提交，提交的是⼀个描述了各种必要信息的 job 对象，如下

1. 获取 Job 对象

2. 指定驱动类

3. 设置 Mapper 和 Reducer 类型

4. 设置 Mapper 的输出 K2 、 V2 的类型（如果类型和 K3,V3 相同 , 可省略）

5. 设置 Reducer 的输出 K3 、 V3 的类型

6. 设置 Reduce 的个数（默认为 1 ）

7. 设置 Mapper 的输⼊数据的路径

8. 设置 Reducer 的输出数据的路径

9. 提交作业

参考下图：

mapreduce 编程环境 mapreduce编程规范_apache_03

四、wordcount演示实例

4.1需求：

统计⽂档中的单词数量

4.2测试数据：

a.txt

hello qianfeng hello 1999 hello beijing hello

world hello hello java good

b.txt

hello xisanqi hello bingbing

hello chenchen hello

ACMilan hello china

c.txt

hello hadoop hello java hello storm hello spark hello redis

hello zookeeper

hello hive hello hbase hello flume

4.3代码实现

首先注意：我们要有一个主类 WordCount，在主类里面有两个静态内部类 WordCountMapper、WordCountReducer，以及一个 main 方法。

下面是 WordCount 类的框架：

/**
 * Skeleton of the map stage, meant to be nested inside the WordCount main class.
 *
 * <p>Type parameters are {@code <K1, V1, K2, V2>} =
 * {@code <LongWritable, Text, Text, IntWritable>}.
 */
public static class WordCountMapper
        extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Intentionally empty in this skeleton.
    }
}

/**
 * Skeleton of the reduce stage, meant to be nested inside the WordCount main class.
 *
 * <p>Type parameters are {@code <K2, V2, K3, V3>} =
 * {@code <Text, IntWritable, Text, IntWritable>}. V3 is {@code IntWritable} so that this
 * declaration agrees with the job's configured output value class
 * ({@code job.setOutputValueClass(IntWritable.class)}).
 */
public static class WordCountReducer
        extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Intentionally empty in this skeleton.
    }
}

/**
 * Job entry point.
 *
 * <p>Note: WordCount itself can serve as the main class, so the main class's
 * responsibilities reduce to this {@code main} method.
 */
public static void main(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException {
    // Intentionally empty in this skeleton.
}

4.3.1 pom.xml⽂件的配置

<!-- Hadoop dependencies for the WordCount example; all three artifacts are pinned to the same version (2.7.6). -->
<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.7.6</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.6</version>
    </dependency>

     <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.7.6</version>
    </dependency>
</dependencies>

4.3.2 定义⼀个mapper内部类

/**
* @Description 写⼀个wordcount程序的mapper类型
*
* 读取块⽂件时，K1是⾏偏移量，使⽤LongWritable类型
* v1是⾏记录， 使⽤Text类型
* 经过map函数处理后，
* k2是单词 使⽤Text类型
* v2是1 使⽤IntWritable类型
*
*/
public static class WordCountMapper extends Mapper<LongWritable,Text,Text, IntWritable> {
    /**
     * 重写Mapper类⾥提供的map⽅法
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException,InterruptedException {
    //map⽅法的key就是k1, ⾏偏移量不需要，因此不需要处理。只需要处理value，因为value就是v1，⾏记录
    //⼀对k1,v1就会调⽤⼀次map函数，因此map⽅法执⾏的次数和⾏记录数有关系
    //1: 将value的类型转为java的String类型
    String line = value.toString();
    //2: 使⽤空格对⾏记录进⾏切分成字符串数组
    String[] words = line.split(" ");
    //3: 遍历数组
    for (String word : words) {
    //要将word类型转为Text类型 ，当成k2 IntWritable类型的1作为value
    Text k2 = new Text(word);
    IntWritable v2 = new IntWritable(1);
    //4 将k2,v2,作为输出数据写出去，写到shuffle流程中的缓存区
    context.write(k2,v2);
    }
}

4.3.3 定义⼀个reducer内部类

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
 * Reducer of the word-count example.
 *
 * <p>Its input is the map-stage output after it has been grouped by key, with the values
 * of each key presented as an iterable. K2 is the word ({@code Text}) and V2 the list of
 * counts ({@code IntWritable}). For the output, K3 is the same word ({@code Text}) and V3
 * is the accumulated count ({@code IntWritable}).
 */
public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Adds up all counts of one word and writes the total.
     *
     * <p>Example: the group {@code <"hello", <1,1,1>>} is written out as
     * {@code <"hello", 3>}.
     *
     * @param key     K2, the word; written out unchanged as K3
     * @param values  the V2 counts collected for {@code key}
     * @param context used to write the (K3, V3) pair
     * @throws IOException          propagated from {@code context.write}
     * @throws InterruptedException propagated from {@code context.write}
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        // Unwrap each IntWritable to an int and accumulate it.
        for (IntWritable count : values) {
            total += count.get();
        }
        // Wrap the total back into an IntWritable and write it together with the key.
        context.write(key, new IntWritable(total));
    }
}

4.3.4 定义⼀个Driver类

定义⼀个主类，⽤来描述job并提交job

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver of the word-count example: describes the job and submits it.
 *
 * <p>The process exits with status 0 when the job succeeds, 1 when it fails, and 2 when
 * the command-line arguments are missing.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @throws IOException            propagated from job setup or submission
 * @throws ClassNotFoundException propagated from {@code waitForCompletion}
 * @throws InterruptedException   propagated from {@code waitForCompletion}
 */
public static void main(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException when the
    // input/output paths are not supplied.
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(2);
    }

    // 1: obtain the configuration.
    Configuration conf = new Configuration();
    // 2: obtain the job object.
    Job job = Job.getInstance(conf);
    // Set the driver class. The main class is named WordCount; Java class names are
    // case-sensitive, so the class literal must match that name exactly.
    job.setJarByClass(WordCount.class);

    // 3: set the mapper and reducer classes.
    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);

    // 4: the map output types (K2, V2) are not set explicitly because they are the same
    //    as the reduce output types (K3, V3) configured below.

    // 5: set the reduce output types K3 and V3.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // 6: number of reduce tasks; the default is 1, this job asks for 2.
    job.setNumReduceTasks(2);

    // 7: set the input path and the output path of the job.
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // 8: submit the job, wait for it to finish, and map the result to the exit status.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：XXE SYSTEM关键词被过滤关键字过滤如何实现

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯