hadoop统计单词长度 hadoop单词统计代码详解

转载

autohost 2023-07-21 14:49:52

文章标签 hadoop统计单词长度 hadoop mapreduce java Text 文章分类 Hadoop 大数据

学习目标：

掌握MapReduce单词统计原理

学习内容：

文字描述

读取的数据为

hello,word,kafka,mapreduce,hadoop
hello,word,kafka,mapreduce,hadoop
hello,word,kafka,mapreduce,hadoop
hello,word,kafka,mapreduce,hadoop
hello,word,kafka,mapreduce,hadoop

1.首先利用InputFormat抽象类的子类TextOutputFormat从文件中读取数据

TextOutputFormat，会一行一行的读取数据
读后的数据是一个一个的键值对形式

<k1,v1>
<第1行的偏移量,第1行>

<0,“hello,word,kafka,mapreduce,hadoop”>

<第2行的偏移量,第2行>，假设第二行的偏移量为23

<23,“hello,word,kafka,mapreduce,hadoop”>

。。。

2.map阶段将每个<k1,v1>进行处理，map阶段需要编写java程序，实现处理的逻辑

map阶段会把<k1,v1> ==> <k2,v2>

<k2,v2>
<hello,1>
<word,1>
<kafka,1>
<mapreduce,1>
<hadoop,1>
<hello,1>
。。。

代码逻辑

public static class MyMapper extends Mapper<LongWritable,Text, Text,LongWritable>{
        //重写map方法
        /*
        把<k1,v1> --> <k2,v2>
        <0,hadoop,mapReduce,flink,spark> --> <hadoop,1>
                                              <mapReduce,1>
                                              <flink,1>
                                              <spark,1>
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //获取一行的文本数据，并分割
            String[] splits = value.toString().split(",");
            //遍历分割后的数据
            for (String split : splits) {
                //将k2,v2写入上下文
                context.write(new Text(split),new LongWritable(1));
            }
        }
    }

3.shuffle阶段，会把<k2,v2> ==>新<k2,v2>，

shuffle阶段有四个阶段：分区、排序、规约、分组，这里不对shuflle阶段编写逻辑，使用默认逻辑（默认逻辑不需要编写代码），默认逻辑处理后的<k2,v2>

<k2,v2>
<hello,<1,1,1,1>>
<word,<1,1,1,1>>
。。。

把相同key的value保存到同一集合<1,1,1,1>

4.reduce阶段，把<k2,v2>转换为<k3,v3>，reduce阶段也是需要编写代码把每个键值的values集合<1,1,1,1>变为4，key不变

<k3,v3>
<hello,4>
<word,4>
。。。
代码逻辑

public static class MyReducer extends Reducer<Text,LongWritable,Text,LongWritable>{
        //重写reduce方法

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //遍历value集合将1进行相加  <1,1,1>  --> <3>
            long count = 0;
            for (LongWritable value : values) {
                //注意value是LongWritable类型，需要调用get方法获取long类型
                count += value.get();
            }
            //写入上下文环境
            context.write(key, new LongWritable(count));
        }
    }

5.输出为文本文件，需要OutputFormat的子类TextOutputFormat
完整代码（含注释）

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/*需要统计的单词：
hadoop,mapReduce,flink,spark
hello,word,zookeeper,kafka
hadoop,mapReduce,flink,spark
hello,word,zookeeper,kafka
hadoop,mapReduce,flink,spark
hello,word,zookeeper,kafka
hadoop,mapReduce,flink,spark
hello,word,zookeeper,kafka
 */


public class WordCount {


    //map处理逻辑
    /**
     * Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
     *     KEYIN： k1数据类型
     *     VALUEIN：v1数据类型
     *
     *     KEYOUT： k2数据类型
     *     VALUEOUT：v2数据类型
     */
    /*k1是这一行对应开头的偏移量，v1是这一行的数据
        k1              v1
        0      hadoop,mapReduce,flink,spark
        20     hello,word,zookeeper,kafka
        .
        .
        .
    ---------------------------------------------------
        k2              v2
        hadoop          1
        mapReduce       1
        flink           1
        spark           1
        .
        .
        .
        hadoop          1
        mapReduce       1
        .
        .
        .

     */
    public static class MyMapper extends Mapper<LongWritable,Text, Text,LongWritable>{
        //重写map方法
        /*
        把<k1,v1> --> <k2,v2>
        <0,hadoop,mapReduce,flink,spark> --> <hadoop,1>
                                              <mapReduce,1>
                                              <flink,1>
                                              <spark,1>
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //获取一行的文本数据，并分割
            String[] splits = value.toString().split(",");
            //遍历分割后的数据
            for (String split : splits) {
                //将k2,v2写入上下文
                context.write(new Text(split),new LongWritable(1));
            }
        }
    }

    //shuffle处理逻辑
    /**
     * shuffle处理逻辑一般不写，但是会有默认的处理
     */
    /*
        <k2,v2>  --> 新的<k2,v2>
        <hadoop,1> --> <hadoop,<1,1,1>>  集合中1的数量对应hadoop出现的次数
     */


    //reduce处理逻辑
    /**
     * Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
     *     KEYIN：k2数据类型
     *     VALUEIN：v2数据类型
     *
     *     KEYOUT：k3数据类型
     *     VALUEOUT：v3数据类型
     */
    /*
        <k2,v2>  -->  <k3,v3>
        <hadoop,<1,1,1>>  --> <hadoop,3>
     */
    public static class MyReducer extends Reducer<Text,LongWritable,Text,LongWritable>{
        //重写reduce方法

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            //遍历value集合将1进行相加  <1,1,1>  --> <3>
            long count = 0;
            for (LongWritable value : values) {
                //注意value是LongWritable类型，需要调用get方法获取long类型
                count += value.get();
            }
            //写入上下文环境
            context.write(key, new LongWritable(count));
        }
    }

    //主方法
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        //创建配置环境
        Configuration configuration = new Configuration();
        //创建job任务对象
        Job job = Job.getInstance(configuration, "WordCount");
            //如果打包运行出错，则需要加该配置
            job.setJarByClass(WordCount.class);

        //配置job对象（八个步骤）
        //1.指定文件读取方式和读取路径
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, new Path("/wordcount"));

        //2.指定map阶段的处理方式和输出的数据类型
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        //3. 4. 5. 6. 是shuffle阶段的分区、排序、规约、分组，使用默认方式。

        //7.指定reduce阶段的处理方式和输出的数据类型
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        //8.指定输出文件的方式和输出路径
        job.setOutputFormatClass(TextOutputFormat.class);
        //判断输出路径是否存在，如果存在则删除,
        //获取FileSystem
        FileSystem fileSystem = FileSystem.get(new Configuration());
        Path path = new Path("/wordcount_out");
        //判断路径是否存在，如果存在则删除
        if (fileSystem.exists(path)){
            fileSystem.delete(path, true);
        }

        TextOutputFormat.setOutputPath(job, path);

        //等待任务结束
        job.waitForCompletion(true);
    }
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。