源代码程序
1. import java.io.IOException;
2.
3. import java.util.StringTokenizer;
4.
5. import org.apache.hadoop.conf.Configuration;
6.
7. import org.apache.hadoop.fs.Path;
8.
9. import org.apache.hadoop.io.IntWritable;
10.
11. import org.apache.hadoop.io.Text;
12.
13. import org.apache.hadoop.mapreduce.Job;
14.
15. import org.apache.hadoop.mapreduce.Mapper;
16.
17. import org.apache.hadoop.mapreduce.Reducer;
18.
19. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
20.
21. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
22.
23. import org.apache.hadoop.util.GenericOptionsParser;
24.
25. public class WordCount {
26.
27. public static class TokenizerMapper
28.
29. extends Mapper<Object, Text, Text, IntWritable>{
30.
31. private final static IntWritable one = new IntWritable(1);
32.
33. private Text word = new Text();
34.
35.
36.
37. public void map(Object key, Text value, Context context)
38.
39. throws IOException, InterruptedException {
40.
41. new StringTokenizer(value.toString());
42.
43. while (itr.hasMoreTokens()) {
44.
45. word.set(itr.nextToken());
46.
47. context.write(word, one);
48.
49. }
50.
51. }
52.
53. }
54.
55. public static class IntSumReducer
56.
57. extends Reducer<Text,IntWritable,Text,IntWritable> {
58.
59. private IntWritable result = new IntWritable();
60.
61. public void reduce(Text key, Iterable<IntWritable> values,Context context)
62.
63. throws IOException, InterruptedException {
64.
65. int sum = 0;
66.
67. for (IntWritable val : values) {
68.
69. sum += val.get();
70.
71. }
72.
73. result.set(sum);
74.
75. context.write(key, result);
76.
77. }
78.
79. }
80.
81.
82.
83. public static void main(String[] args) throws Exception {
84.
85. new Configuration();
86.
87. new GenericOptionsParser(conf, args).getRemainingArgs();
88.
89. if (otherArgs.length != 2) {
90.
91. "Usage: wordcount <in> <out>");
92.
93. 2);
94.
95. }
96.
97. new Job(conf, "word count");
98.
99. class);
100.
101. class);
102.
103. class);
104.
105. class);
106.
107. class);
108.
109. class);
110.
111. new Path(otherArgs[0]));
112.
113. new Path(otherArgs[1]));
114.
115. true) ? 0 : 1);
116.
117. }
118.
119. }
本节将对WordCount进行更详细的讲解。详细执行步骤如下
1)将文件拆分成splits,由于测试用的文件较小,所以每个文件为一个split,并将文件按行分割形成<key,value>对,如图所示。这一步由MapReduce框架自动完成,其中偏移量(即key值)包括了回车所占的字符数
2)将分割好的<key,value>对交给用户定义的map方法进行处理,生成新的<key,value>对,如图所示
3)得到map方法输出的<key,value>对后,Mapper会将它们按照key值进行排序,并执行Combine过程,将key值相同的value值累加,得到Mapper的最终输出结果。如图所示
4)Reducer先对从Mapper接收的数据进行排序,再交由用户自定义的reduce方法进行处理,得到新的<key,value>对,并作为WordCount的输出结果,如图所示