1) Requirement: count the number of lines in the input file that share the same first word.

2) Input file:

banzhang ni hao
xihuan hadoop banzhang dc
banzhang ni hao
xihuan hadoop banzhang dc

3) Expected output:

banzhang	2
xihuan	2
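
KeyValueTextInputFormat splits each input line into a key and a value at the first occurrence of the configured separator. With the separator set to a single space, the line "banzhang ni hao" reaches the mapper as key "banzhang" and value "ni hao", so the task reduces to an ordinary word count over the keys: "banzhang" and "xihuan" each begin two lines.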

4) Implementation

(1) Write the Mapper

package com.atguigu.mapreduce.keyvaluetextinputformat;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class KVTextMapper extends Mapper<Text, Text, Text, LongWritable> {

    // Output objects are reused across map() calls; the framework serializes
    // them on each write, so reuse avoids a per-record allocation
    private final Text k = new Text();
    private final LongWritable v = new LongWritable();

    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        // For the line "banzhang ni hao", KeyValueTextInputFormat has already
        // split it: key = "banzhang", value = "ni hao"

        // 1. Set the output key and value: each line counts as one
        //    occurrence of its first word
        k.set(key);
        v.set(1);

        // 2. Write out
        context.write(k, v);
    }
}
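
The mapper can be sanity-checked in isolation with MRUnit. This is a minimal sketch, assuming the mrunit test dependency is on the classpath; the test class and method names are illustrative, not part of the original project.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class KVTextMapperTest {

    @Test
    public void firstWordIsEmittedWithCountOne() throws Exception {
        // KeyValueTextInputFormat hands the mapper a (Text, Text) pair,
        // so the test feeds the already-split key and value
        MapDriver.newMapDriver(new KVTextMapper())
                .withInput(new Text("banzhang"), new Text("ni hao"))
                .withOutput(new Text("banzhang"), new LongWritable(1))
                .runTest();
    }
}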

(2) Write the Reducer

package com.atguigu.mapreduce.keyvaluetextinputformat;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class KVTextReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private final LongWritable v = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values,
            Context context) throws IOException, InterruptedException {

        long count = 0L;
        // 1. Sum the counts for this key
        for (LongWritable value : values) {
            count += value.get();
        }
        v.set(count);

        // 2. Write out
        context.write(key, v);
    }
}
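
After the shuffle, all counts for one key arrive at a single reduce call; for the sample input, the reducer for "banzhang" receives the values [1, 1] and writes 2. A matching MRUnit sketch, under the same assumptions as the mapper test above:

import java.util.Arrays;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class KVTextReducerTest {

    @Test
    public void countsAreSummedPerKey() throws Exception {
        ReduceDriver.newReduceDriver(new KVTextReducer())
                .withInput(new Text("banzhang"),
                        Arrays.asList(new LongWritable(1), new LongWritable(1)))
                .withOutput(new Text("banzhang"), new LongWritable(2))
                .runTest();
    }
}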

(3) Write the Driver

package com.atguigu.mapreduce.keyvaluetextinputformat;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MyDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();
        // Set the key/value separator: each line is split at its first space
        // (the Hadoop constant really is spelled SEPERATOR)
        conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, " ");

        // Get the Job object
        Job job = Job.getInstance(conf);

        // Set the jar location and wire up the mapper and reducer
        job.setJarByClass(MyDriver.class);
        job.setMapperClass(KVTextMapper.class);
        job.setReducerClass(KVTextReducer.class);

        // Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // Set the input format
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // Set the input and output data paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job
        job.waitForCompletion(true);
    }
}
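
To run the job, package the three classes into a jar and submit it with the input and output paths as the two program arguments. The jar name and HDFS paths below are illustrative; note that the output directory must not already exist.

hadoop jar kvtext.jar com.atguigu.mapreduce.keyvaluetextinputformat.MyDriver /user/atguigu/input /user/atguigu/output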