1. Main application (WCApp)
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Main application: configures and submits the WordCount job.
 */
public class WCApp {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();

        job.setJarByClass(WCMapper.class);   // locate the jar that contains this class
        job.setJobName("wc");                // job name

        FileInputFormat.addInputPath(job, new Path(args[0]));    // input directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // output directory (must not exist yet)

        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);        // submit the job and wait for it to finish
    }
}
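
WCReducer simply sums integer counts, an associative and commutative operation, so it could optionally also be registered as a combiner to pre-aggregate on the map side and shrink the shuffle. A sketch of the optional extra line in WCApp.main (an assumption for illustration; the original driver does not set a combiner):

        job.setMapperClass(WCMapper.class);
        job.setCombinerClass(WCReducer.class);   // hypothetical addition: map-side partial sums
        job.setReducerClass(WCReducer.class);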
  2. Mapper class
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Type parameters:
    //   LongWritable : input key    (byte offset of the line in the file)
    //   Text         : input value  (one line of text)
    //   Text         : output key   (a word)
    //   IntWritable  : output value (the count 1 for each occurrence)

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String str = value.toString();
        String[] arr = str.split(" ");   // split the line on spaces
        for (String s : arr) {
            context.write(new Text(s), new IntWritable(1));   // emit (word, 1) for every word
        }
    }
}
  3. Reducer class
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        Iterator<IntWritable> it = values.iterator();
        int count = 0;
        while (it.hasNext()) {
            count = count + it.next().get();   // sum the 1s emitted for this word
        }
        context.write(key, new IntWritable(count));   // emit (word, total count)
    }
}

The Context object is used to pass intermediate results between the processing stages: the (key, value) pairs a Mapper writes to its Context are shuffled, grouped by key, and handed to the Reducer.
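
Context offers more than write(): it also exposes user-defined counters (and the job Configuration). Below is a minimal sketch, a hypothetical WCMapperWithCounter that is not part of the original project, which increments a counter per input line and reuses the output writables instead of allocating new objects for every record:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapperWithCounter extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.getCounter("wc", "input.lines").increment(1);   // user-defined counter, visible in the job counters
        for (String s : value.toString().split(" ")) {
            word.set(s);
            context.write(word, one);   // same intermediate (word, 1) pairs as WCMapper
        }
    }
}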

  4. Packaging with Ant
    Place build.xml in the project root directory, at the same level as the src directory. Running ant from that directory executes the default finish target, which compiles, packages, and copies the jar to E:/tmp.
<?xml version="1.0" encoding="UTF-8"?>
<project name="mywordcount" basedir="." default="finish">

    <!-- Clean and recreate the build directories -->
    <target name="prepare">
        <delete dir="${basedir}/build" />
        <delete file="${basedir}/lib/mywordcount.jar" />
        <mkdir dir="${basedir}/build/classes" />
    </target>

    <!-- Classpath with the Hadoop jars -->
    <path id="hadooplib">
        <fileset dir="E:/hadoop-2.9.2/_libs">
            <include name="*.jar" />
        </fileset>
    </path>

    <!-- Compile the sources -->
    <target name="compile" depends="prepare">
        <javac srcdir="${basedir}/src" destdir="${basedir}/build/classes" encoding="utf-8" includeantruntime="on">
            <classpath refid="hadooplib" />
        </javac>
        <!-- Hadoop client configuration files go onto the classpath inside the jar -->
        <copy file="${basedir}/conf/core-site.xml" todir="${basedir}/build/classes" />
        <copy file="${basedir}/conf/mapred-site.xml" todir="${basedir}/build/classes" />
        <copy file="${basedir}/conf/yarn-site.xml" todir="${basedir}/build/classes" />
    </target>

    <!-- Build the jar -->
    <target name="package" depends="compile">
        <jar destfile="${basedir}/lib/mywordcount.jar" basedir="${basedir}/build/classes">
            <manifest>
                <attribute name="Main-Class" value="com.bee.wc.WCApp" />
            </manifest>
        </jar>
    </target>

    <target name="finish" depends="package">
        <copy file="${basedir}/lib/mywordcount.jar" todir="E:/tmp" />
        <echo>------ Packaging complete ------</echo>
    </target>
</project>
  5. Hadoop configuration files packaged into the jar
  • core-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://node01:8020</value>
    </property>
    <property>
        <name>mapreduce.app-submission.cross-platform</name>
        <value>true</value>
    </property>
</configuration>
  • mapred-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
  • yarn-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>node01</value>
    </property>
</configuration>
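
These *-site.xml files are copied into build/classes so they travel inside the jar on the classpath; Hadoop's client-side Configuration loads core-site.xml from the classpath automatically (the MapReduce/YARN client classes load mapred-site.xml and yarn-site.xml the same way), which is why WCApp never references them explicitly. A small sketch to check this, using a hypothetical helper class that is not part of the project:

import org.apache.hadoop.conf.Configuration;

public class ShowConf {
    public static void main(String[] args) {
        // Configuration picks up core-default.xml and core-site.xml from the classpath.
        Configuration conf = new Configuration();
        System.out.println(conf.get("fs.defaultFS"));                            // expected: hdfs://node01:8020
        System.out.println(conf.get("mapreduce.app-submission.cross-platform")); // expected: true
    }
}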
  6. Submitting the job to the Hadoop cluster (run from the NameNode host)
# List the contents of the jar
jar -tvf mywordcount.jar

# If the jar's manifest specifies a Main-Class, the class name can be omitted on the command line
hadoop jar mywordcount.jar /data/input /data/output

# If the manifest does not specify a Main-Class (entry class), the class name must be given on the command line
hadoop jar mywordcount.jar com.bee.wc.WCApp /data/input /data/output

Note: the directory /data/output must not exist before the jar is run!
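
Because FileOutputFormat refuses to run against an existing output directory, the driver can optionally delete a stale one before submitting. A hedged sketch of what could be added to WCApp.main before setOutputPath (an assumption, not part of the original code; it needs an extra import of org.apache.hadoop.fs.FileSystem):

        // ...inside WCApp.main, replacing the original setOutputPath call:
        Path out = new Path(args[1]);
        FileSystem fs = FileSystem.get(job.getConfiguration());   // HDFS handle from the job configuration
        if (fs.exists(out)) {
            fs.delete(out, true);   // recursive delete of the old output directory
        }
        FileOutputFormat.setOutputPath(job, out);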

Upload the input file

[bee@node01 bigdataprj]$ hdfs dfs -put speech.txt /data/input
[bee@node01 bigdataprj]$ hdfs dfs -lsr /
lsr: DEPRECATED: Please use 'ls -R'

After the job has finished

[bee@node01 bigdataprj]$ hdfs dfs -lsr /
lsr: DEPRECATED: Please use 'ls -R'