import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;;
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private static final int MISSING=9999;
public void map(LongWritable key, Text value, Context context )
throws IOException, InterruptedException {
//将输入的Text值转换为java的String类型
String line=value.toString();
//用substring()方法提取我们感兴趣的列
String year=line.substring(15, 19);
int airTemperature;
if(line.charAt(87)=='+'){
airTemperature=Integer.parseInt(line.substring(88, 92));
}else{
airTemperature=Integer.parseInt(line.substring(87, 92));
}
String quality=line.substring(92, 93);
if(airTemperature!=MISSING&&quality.matches("[01459]")){
//输出写入内容
context.write(new Text(year), new IntWritable(airTemperature));
}
}
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
// Like the mapper, the reducer declares four type parameters for its input and
// output pairs; the reduce input types must match the map output types.
public class MaxTemperatureReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Emits the maximum temperature observed for the given year key.
     * The values parameter is an Iterable over every temperature mapped to this key.
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Reducer<Text, IntWritable, Text, IntWritable>.Context context)
            throws IOException, InterruptedException {
        // Scan all readings for this key, keeping the largest seen so far.
        int max = Integer.MIN_VALUE;
        for (IntWritable reading : values) {
            int current = reading.get();
            if (current > max) {
                max = current;
            }
        }
        // Output must be expressed with Hadoop's own writable types.
        context.write(key, new IntWritable(max));
    }
}
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that configures and launches the MaxTemperature MapReduce job.
 */
public class App
{
    /**
     * Configures and submits the MaxTemperature job.
     *
     * Expects exactly two arguments: the input path and the output path.
     * Exits with 0 on job success, 1 on job failure, -1 on bad arguments.
     */
    public static void main( String[] args ) throws IOException, ClassNotFoundException, InterruptedException
    {
        if (args.length != 2) {
            // Usage errors go to stderr so they are not mixed into normal output.
            System.err.println("Usage: MaxTemperature <input path> <output path>");
            System.exit(-1);
        }
        // The Job object holds the whole job specification and controls the run.
        // (new Job() is deprecated in newer Hadoop in favor of Job.getInstance().)
        Job job = new Job();
        // On a cluster the code ships as a jar; passing a class here lets Hadoop
        // locate that jar without the file being named explicitly.
        job.setJarByClass(App.class);
        // Human-readable job name shown in the cluster UI.
        job.setJobName("Max temperature");
        // Input data path (a file or a directory of files).
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // BUG FIX: the output path must be registered via FileOutputFormat —
        // the original called FileInputFormat.addInputPath on args[1], which
        // added a second input and left the job with no output path at all.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Wire up the mapper and reducer implementations.
        job.setMapperClass(MaxTemperatureMapper.class);
        job.setReducerClass(MaxTemperatureReduce.class);
        // Output types of the reduce function.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Block until the job finishes and propagate success/failure as exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}