=============hadoop-0.12.2-core 版本===========================
MyMap.java
map方法把文本文件的单词输出到中间过程output中,格式:<key,value>
Hadoop 1
Bye 1
Hadoop 1
World 1
// Old (mapred) API word-count mapper: for each whitespace-separated token
// in the input line, emits the pair <word, 1> to the intermediate output.
public class MyMap extends MapReduceBase implements Mapper {
    // Reused singleton count; every emitted pair carries the value 1.
    private final static IntWritable one = new IntWritable(1);
    // Reused key holder to avoid allocating a Text per token.
    private Text word = new Text();

    /**
     * Tokenizes one line of input and emits <token, 1> per token.
     *
     * @param key      input key (byte offset of the line — TODO confirm, depends on InputFormat)
     * @param value    one line of input text
     * @param output   collector receiving the intermediate <word, 1> pairs
     * @param reporter progress reporter (unused here)
     * @throws IOException if the collector fails
     */
    @Override
    public void map(WritableComparable key,
                    Writable value, OutputCollector output,
                    Reporter reporter) throws IOException {
        String line = value.toString();
        StringTokenizer stz = new StringTokenizer(line);
        while (stz.hasMoreTokens()) {
            word.set(stz.nextToken());
            output.collect(word, one);
        }
    }
}
MyReduce.java
reduce方法
遍历values 就可以得到同一个key的所有value
// Old (mapred) API word-count reducer: sums all 1-counts emitted for a key.
public class MyReduce extends MapReduceBase implements Reducer {
    /**
     * Iterates over all values of one key and emits <key, sum>.
     *
     * @param key      the word
     * @param values   iterator over the IntWritable partial counts for this key
     * @param output   collector receiving the final <word, count> pair
     * @param reporter progress reporter (unused here)
     * @throws IOException if the collector fails
     */
    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter) throws IOException {
        int sum = 0;
        while (values.hasNext()) {
            // FIX: read the count directly instead of the fragile
            // Integer.parseInt(values.next().toString()) round-trip through String.
            sum += ((IntWritable) values.next()).get();
        }
        output.collect(key, new IntWritable(sum));
    }
}
任务,主调方法
public class JobTest{
public int run(String... args) throws IOException{
JobConf conf = new JobConf(new Configuration());
conf.setJobName("wordCount");
conf.setInputPath(new Path(args[0]));
conf.setOutputPath(new Path(args[1]));
conf.setMapperClass(MyMap.class);
conf.setReducerClass(MyReduce.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args){
try {
new JobTest().run("D:\\files\\wordCount.txt","D:\\files\\wordCoutOut");
} catch (IOException e) {
e.printStackTrace();
}
}
打开D:\files\wordCoutOut\part-00000文件如下结果:
Bye 3
Hadoop 4
Hello 3
World 2
===========hadoop-0.20.2-core版本========================
MyMap.java
public class MyMap extends Mapper<Object, Text, Text, IntWritable> {
Text t = new Text();
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
//output 和reporter 都集成到Context 中
StringTokenizer itr = new StringTokenizer(value.toString());
while(itr.hasMoreTokens()){
word.set(itr.nextToken());
context.write(word, one);
}
}
}
MyReduce.java
// New (mapreduce) API word-count reducer: totals the 1-counts per word.
public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reused output holder to avoid allocating an IntWritable per key.
    private IntWritable result = new IntWritable();

    /**
     * Sums every partial count emitted for {@code key} and writes
     * the pair <key, total> to the job output.
     *
     * @param key     the word
     * @param values  all partial counts collected for this word
     * @param context sink for the final <word, total> pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable partial : values) {
            total += partial.get();
        }
        result.set(total);
        context.write(key, result);
    }
}
JobTest.java
public class JobTest{
public int run(String... args) throws IOException, InterruptedException, ClassNotFoundException{
Job job = new Job(new Configuration(),"word count");
job.setJarByClass(JobTest.class);
job.setMapperClass(MyMap.class);
job.setCombinerClass(MyReduce.class);
job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);//设置reduce输出Key 类型
job.setOutputValueClass(IntWritable.class);//设置输出value 类型
FileInputFormat.addInputPath(job, new Path(args[0]));//设置输入路径
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)?0:1);
return 0;
}
public static void main(String[] args) throws InterruptedException, ClassNotFoundException{
try {
new JobTest().run("D:\\files\\wordCount.txt","D:\\files\\wordCoutOut");
} catch (IOException e) {
e.printStackTrace();
}
}