1. Main application (WCApp)
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Main application: configures and submits the WordCount job.
 */
public class WCApp {

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();

        job.setJarByClass(WCMapper.class);   // locate the jar that contains this class
        job.setJobName("wc");                // job name

        FileInputFormat.addInputPath(job, new Path(args[0]));    // input directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));  // output directory (must not exist yet)

        job.setMapperClass(WCMapper.class);
        job.setReducerClass(WCReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);        // submit the job and wait for it to finish
    }
}
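
WCReducer simply sums integer counts, an associative and commutative operation, so it could optionally also be registered as a combiner to pre-aggregate on the map side and shrink the shuffle. A sketch of the optional extra line in WCApp.main (an assumption for illustration; the original driver does not set a combiner):

        job.setMapperClass(WCMapper.class);
        job.setCombinerClass(WCReducer.class);   // hypothetical addition: map-side partial sums
        job.setReducerClass(WCReducer.class);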
  2. Mapper class
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Type parameters:
    //   LongWritable : input key    (byte offset of the line in the file)
    //   Text         : input value  (one line of text)
    //   Text         : output key   (a word)
    //   IntWritable  : output value (the count 1 for each occurrence)

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String str = value.toString();
        String[] arr = str.split(" ");   // split the line on spaces
        for (String s : arr) {
            context.write(new Text(s), new IntWritable(1));   // emit (word, 1) for every word
        }
    }
}
  3. Reducer class
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        Iterator<IntWritable> it = values.iterator();
        int count = 0;
        while (it.hasNext()) {
            count = count + it.next().get();   // sum the 1s emitted for this word
        }
        context.write(key, new IntWritable(count));   // emit (word, total count)
    }
}

The Context object is used to pass intermediate results between the processing stages: the (key, value) pairs a Mapper writes to its Context are shuffled, grouped by key, and handed to the Reducer.
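
Context offers more than write(): it also exposes user-defined counters (and the job Configuration). Below is a minimal sketch, a hypothetical WCMapperWithCounter that is not part of the original project, which increments a counter per input line and reuses the output writables instead of allocating new objects for every record:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WCMapperWithCounter extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text word = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.getCounter("wc", "input.lines").increment(1);   // user-defined counter, visible in the job counters
        for (String s : value.toString().split(" ")) {
            word.set(s);
            context.write(word, one);   // same intermediate (word, 1) pairs as WCMapper
        }
    }
}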

  4. Packaging with Ant
    Place build.xml in the project root directory, at the same level as the src directory. Running ant from that directory executes the default finish target, which compiles, packages, and copies the jar to E:/tmp.
<?xml version="1.0" encoding="UTF-8"?>
<project name="mywordcount" basedir="." default="finish">

    <!-- Clean and recreate the build directories -->
    <target name="prepare">
        <delete dir="${basedir}/build" />
        <delete file="${basedir}/lib/mywordcount.jar" />
        <mkdir dir="${basedir}/build/classes" />
    </target>

    <!-- Classpath with the Hadoop jars -->
    <path id="hadooplib">
        <fileset dir="E:/hadoop-2.9.2/_libs">
            <include name="*.jar" />
        </fileset>
    </path>

    <!-- Compile the sources -->
    <target name="compile" depends="prepare">
        <javac srcdir="${basedir}/src" destdir="${basedir}/build/classes" encoding="utf-8" includeantruntime="on">
            <classpath refid="hadooplib" />
        </javac>
        <!-- Hadoop client configuration files go onto the classpath inside the jar -->
        <copy file="${basedir}/conf/core-site.xml" todir="${basedir}/build/classes" />
        <copy file="${basedir}/conf/mapred-site.xml" todir="${basedir}/build/classes" />
        <copy file="${basedir}/conf/yarn-site.xml" todir="${basedir}/build/classes" />
    </target>

    <!-- Build the jar -->
    <target name="package" depends="compile">
        <jar destfile="${basedir}/lib/mywordcount.jar" basedir="${basedir}/build/classes">
            <manifest>
                <attribute name="Main-Class" value="com.bee.wc.WCApp" />
            </manifest>
        </jar>
    </target>

    <target name="finish" depends="package">
        <copy file="${basedir}/lib/mywordcount.jar" todir="E:/tmp" />
        <echo>------ Packaging complete ------</echo>
    </target>
</project>
  5. Hadoop configuration files packaged into the jar
  • core-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://node01:8020</value>
    </property>
    <property>
        <name>mapreduce.app-submission.cross-platform</name>
        <value>true</value>
    </property>
</configuration>
  • mapred-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
  • yarn-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>node01</value>
    </property>
</configuration>
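
These *-site.xml files are copied into build/classes so they travel inside the jar on the classpath; Hadoop's client-side Configuration loads core-site.xml from the classpath automatically (the MapReduce/YARN client classes load mapred-site.xml and yarn-site.xml the same way), which is why WCApp never references them explicitly. A small sketch to check this, using a hypothetical helper class that is not part of the project:

import org.apache.hadoop.conf.Configuration;

public class ShowConf {
    public static void main(String[] args) {
        // Configuration picks up core-default.xml and core-site.xml from the classpath.
        Configuration conf = new Configuration();
        System.out.println(conf.get("fs.defaultFS"));                            // expected: hdfs://node01:8020
        System.out.println(conf.get("mapreduce.app-submission.cross-platform")); // expected: true
    }
}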
  6. Submitting the job to the Hadoop cluster (run from the NameNode host)
# List the contents of the jar
jar -tvf mywordcount.jar

# If the jar's manifest specifies a Main-Class, the class name can be omitted on the command line
hadoop jar mywordcount.jar /data/input /data/output

# If the manifest does not specify a Main-Class (entry class), the class name must be given on the command line
hadoop jar mywordcount.jar com.bee.wc.WCApp /data/input /data/output

Note: the directory /data/output must not exist before the jar is run!
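
Because FileOutputFormat refuses to run against an existing output directory, the driver can optionally delete a stale one before submitting. A hedged sketch of what could be added to WCApp.main before setOutputPath (an assumption, not part of the original code; it needs an extra import of org.apache.hadoop.fs.FileSystem):

        // ...inside WCApp.main, replacing the original setOutputPath call:
        Path out = new Path(args[1]);
        FileSystem fs = FileSystem.get(job.getConfiguration());   // HDFS handle from the job configuration
        if (fs.exists(out)) {
            fs.delete(out, true);   // recursive delete of the old output directory
        }
        FileOutputFormat.setOutputPath(job, out);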

Upload the input file

[bee@node01 bigdataprj]$ hdfs dfs -put speech.txt /data/input
[bee@node01 bigdataprj]$ hdfs dfs -lsr /
lsr: DEPRECATED: Please use 'ls -R'

After the job has finished

[bee@node01 bigdataprj]$ hdfs dfs -lsr /
lsr: DEPRECATED: Please use 'ls -R'