1. WordCount

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// WordCount: counts how many times each word occurs in the input files
public class WordCountExample {
// Mapper: splits each line on single spaces and emits (word, 1)
private static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str=value.toString();
String []strArray=str.split(" ");
for(String s:strArray){
context.write(new Text(s), new IntWritable(1));
}
}

}

// Reducer: sums the counts emitted for each word
private static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
int sum=0;
for(IntWritable count:values){
sum+=count.get();
}
context.write(key, new IntWritable(sum));
}

}

/**
* @param args
*/
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
String []argArray=new GenericOptionsParser(conf,args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Two arguments are required");
System.exit(1);
}
Job job=new Job(conf,"wordcount");
job.setJarByClass(WordCountExample.class);
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setReducerClass(WordCountReducer.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);
}

}
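
The map and reduce logic above can be sanity-checked without a cluster by simulating the shuffle in plain Java. The sketch below is illustrative only: the sample lines and the TreeMap standing in for the shuffle-and-sort phase are assumptions, not part of the job.

import java.util.Map;
import java.util.TreeMap;

public class WordCountLocalCheck {
    public static void main(String[] args) {
        String[] lines = {"hello hadoop", "hello world"}; // illustrative input lines
        // The TreeMap plays the role of shuffle + reduce: it groups identical words and keeps a running sum
        Map<String, Integer> counts = new TreeMap<String, Integer>();
        for (String line : lines) {
            for (String word : line.split(" ")) { // same tokenization as WordCountMapper
                Integer old = counts.get(word);
                counts.put(word, old == null ? 1 : old + 1); // same summation as WordCountReducer
            }
        }
        for (Map.Entry<String, Integer> e : counts.entrySet()) {
            System.out.println(e.getKey() + "\t" + e.getValue()); // key<TAB>count, like TextOutputFormat
        }
    }
}

Separately, because WordCountReducer only sums its inputs, job.setCombinerClass(WordCountReducer.class) could be added to the driver to pre-aggregate counts on the map side; that call is an optional addition and is not in the original main.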

2. Deduplication

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// Deduplication: outputs each distinct input line exactly once
public class DeleteRepeatExample {
// Mapper: the whole input line becomes the key; the IntWritable value is only a placeholder
private static class DeleteRepeatMapper extends Mapper<Object, Text, Text, IntWritable>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
context.write(value, new IntWritable(0));
}

}

// Reducer: identical lines are grouped under one key, so writing each key once removes duplicates
private static class DeleteRepeatReducer extends Reducer<Text, IntWritable, Text, NullWritable>{

@Override
protected void reduce(Text key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}

}

/**
* @param args
*/
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
String[]argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Please provide two arguments");
System.exit(1);
}
Job job=new Job(conf,"delete repeat");
job.setJarByClass(DeleteRepeatExample.class);
job.setMapperClass(DeleteRepeatMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(DeleteRepeatReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job,new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}

}
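
Within a single map task, duplicate lines already collapse to the same key, so a combiner can discard the redundant records before the shuffle. The class below is an optional addition and is not part of the original code; it would be registered in the driver with job.setCombinerClass(DeleteRepeatCombiner.class). Note that a combiner has to keep the map output types (Text, IntWritable), which is why DeleteRepeatReducer itself cannot be reused here.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Combiner: writes each key once per map task, keeping the (Text, IntWritable) map output types
public class DeleteRepeatCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // One record per distinct line is enough; the value remains a placeholder
        context.write(key, new IntWritable(0));
    }
}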

3. Sorting

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

// Sort: outputs the input integers in ascending order, each preceded by its rank
public class SortExample {
// Mapper: parses each line as an integer key; the value is only a placeholder
private static class SortMapper extends Mapper<Object, Text, IntWritable, IntWritable>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
context.write(new IntWritable(Integer.parseInt(value.toString())), new IntWritable(0));
}

}

// Reducer: keys arrive in ascending order; a running index assigns each value its rank (duplicates each get their own rank)
private static class SortReducer extends Reducer<IntWritable, IntWritable, Text,Text>{
private int index=0;
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,
Context context)
throws IOException, InterruptedException {
for(IntWritable i:values){
index++;
context.write(new Text(index+""),new Text(key.get()+""));
}
}

}

/**
* @param args
*/
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
String[]argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Please enter two arguments");
System.exit(1);
}
Job job=new Job(conf,"sort");
job.setJarByClass(SortExample.class);
job.setMapperClass(SortMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(SortReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}

}
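
SortExample leaves the actual sorting to the framework: IntWritable keys are sorted in ascending order during the shuffle, and SortReducer only attaches a running rank. That rank is globally correct only if every key reaches the same reducer, so the driver should pin the job to a single reduce task. In the excerpt below, only the setNumReduceTasks call is new; it is an assumption of mine and does not appear in the original main.

job.setReducerClass(SortReducer.class);
job.setNumReduceTasks(1); // one reducer sees all keys in sorted order, so the running index is a global rank
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);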

4. Table self-join

package demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.vod.Ejob;

public class SelfJoin {
// Mapper: each child-parent line is written twice: keyed by the parent with tag 1 (carrying the child) and keyed by the child with tag 2 (carrying the parent)
private static class SelfJoinMapper extends Mapper<Object, Text, Text, Text>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str=value.toString();
String[] nameArray=str.split(" ");
context.write(new Text(nameArray[1]), new Text("1-"+nameArray[0]+"-"+nameArray[1]));
context.write(new Text(nameArray[0]), new Text("2-"+nameArray[0]+"-"+nameArray[1]));

}

}
// Reducer: for each person, tag-1 values yield that person's children (grandchild candidates) and tag-2 values yield that person's parents (grandparent candidates); their cross product is the output
private static class SelfJoinReducer extends Reducer<Text, Text, Text, Text>{

@Override
protected void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
List<String> outKey=new ArrayList<String>();
List<String> outValue=new ArrayList<String>();
for(Text value:values){
String[] relationArray=value.toString().split("-");
if(relationArray[0].equals("1")){
outKey.add(relationArray[1]);
}else if(relationArray[0].equals("2")){
outValue.add(relationArray[2]);
}
}
for(String k:outKey){
for(int i=0;i<outValue.size();i++){
context.write(new Text(k), new Text(outValue.get(i)));
}
}
}

}
public static void main(String[] args) throws Exception{
File jarFile = Ejob.createTempJar("bin");
//Ejob.addClasspath("/opt/hadoop/conf");
ClassLoader classLoader = Ejob.getClassLoader();
Thread.currentThread().setContextClassLoader(classLoader);

Configuration conf=new Configuration();
String [] argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Invalid arguments");
System.exit(1);
}
JobConf jobConf=new JobConf(conf);
jobConf.setJar(jarFile.toString());
Job job=new Job(jobConf,"self join");
job.setJarByClass(SelfJoin.class);
job.setMapperClass(SelfJoinMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(SelfJoinReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}

}

Data:

Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma

Result:

Tom Alice
Tom Jesse
Jone Alice
Jone Jesse
Tom Mary
Tom Ben
Jone Mary
Jone Ben
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
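
Each child-parent line is emitted twice by SelfJoinMapper: keyed by the parent with tag 1 (carrying the child) and keyed by the child with tag 2 (carrying the parent). For a person who shows up on both sides, the reducer's cross product of the two lists produces grandchild/grandparent pairs; for example, "Tom Alice" appears because Tom's parent is Jack and Jack's parent is Alice. The plain-Java sketch below reproduces the same join in memory on the data above; it is only an illustration, not part of the MapReduce job.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SelfJoinLocalCheck {
    public static void main(String[] args) {
        // The child-parent list from the example data above
        String[] lines = {"Tom Lucy", "Tom Jack", "Jone Lucy", "Jone Jack", "Lucy Mary", "Lucy Ben",
                "Jack Alice", "Jack Jesse", "Terry Alice", "Terry Jesse",
                "Philip Terry", "Philip Alma", "Mark Terry", "Mark Alma"};
        // childrenOf.get(p): people whose parent is p (what the tag-1 records collect)
        // parentsOf.get(c): parents of c (what the tag-2 records collect)
        Map<String, List<String>> childrenOf = new HashMap<String, List<String>>();
        Map<String, List<String>> parentsOf = new HashMap<String, List<String>>();
        for (String line : lines) {
            String[] f = line.split(" ");
            if (!childrenOf.containsKey(f[1])) childrenOf.put(f[1], new ArrayList<String>());
            childrenOf.get(f[1]).add(f[0]);
            if (!parentsOf.containsKey(f[0])) parentsOf.put(f[0], new ArrayList<String>());
            parentsOf.get(f[0]).add(f[1]);
        }
        // A person who is both a parent and a child links grandchildren to grandparents
        for (String person : childrenOf.keySet()) {
            if (!parentsOf.containsKey(person)) continue;
            for (String grandchild : childrenOf.get(person)) {
                for (String grandparent : parentsOf.get(person)) {
                    System.out.println(grandchild + "\t" + grandparent);
                }
            }
        }
    }
}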


 

5. Multi-table join

package demo;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.vod.Ejob;

public class MultiTableJoin {
// Mapper: rows from table2 start with a digit id and are tagged 2 (city name); rows from table1 end with an id and are tagged 1 (factory name); both are keyed by the id
private static class MultiTableMapper extends Mapper<Object, Text, Text, Text>{

@Override
protected void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String str=value.toString();
if(Character.isDigit(str.charAt(0))){
context.write(new Text(str.charAt(0)+""), new Text("2-"+str.substring(1).trim()));
}else{
context.write(new Text(str.substring(str.length()-1)), new Text("1-"+str.substring(0, str.length()-1).trim()));
}
}

}

// Reducer: for each id, pairs every factory name (tag 1) with every city name (tag 2)
private static class MultiTableReducer extends Reducer<Text, Text, Text, Text>{

@Override
protected void reduce(Text key, Iterable<Text> values,
Context context)
throws IOException, InterruptedException {
List<String>keyList=new ArrayList<String>();
List<String>valueList=new ArrayList<String>();
for(Text value:values){
String str=value.toString();
String []strArray=str.split("-");
if(strArray[0].equals("1")){
keyList.add(strArray[1]);
}else if(strArray[0].equals("2")){
valueList.add(strArray[1]);
}
}
for(String skey:keyList){
for(String svalue:valueList){
context.write(new Text(skey), new Text(svalue));
}
}
}

}


public static void main(String[] args) throws Exception{
File jarFile=Ejob.createTempJar("bin");
ClassLoader classLoader=Ejob.getClassLoader();
Thread.currentThread().setContextClassLoader(classLoader);

Configuration conf=new Configuration();
String [] argArray=new GenericOptionsParser(conf, args).getRemainingArgs();
if(argArray.length!=2){
System.out.println("Invalid arguments");
System.exit(1);
}
JobConf jobConf=new JobConf(conf);
jobConf.setJar(jarFile.toString());
Job job=new Job(jobConf,"multiTable join");
job.setMapperClass(MultiTableMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setReducerClass(MultiTableReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(argArray[0]));
FileOutputFormat.setOutputPath(job, new Path(argArray[1]));
System.exit(job.waitForCompletion(true)?0:1);

}
}

Data: table1.txt


Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1

table2.txt

1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian

Result:

Beijing Red Star Beijing
Beijing Rising Beijing
Bank of Beijing Beijing
Guangzhou Honda Guangzhou
Guangzhou Development Bank Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
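
MultiTableMapper has to guess which table a record comes from by checking whether the first character is a digit. An alternative is to give each input file its own mapper with MultipleInputs, so no guessing is needed. The sketch below shows that variant; the class names (MultiTableJoinWithMultipleInputs, FactoryMapper, AddressMapper, JoinReducer) are illustrative and not part of the original code, and it assumes three arguments: the table1 path, the table2 path, and the output path.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MultiTableJoinWithMultipleInputs {
    // Mapper for table1.txt lines like "Beijing Red Star 1": key = id, value = "1-" + factory name
    private static class FactoryMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String str = value.toString();
            context.write(new Text(str.substring(str.length() - 1)), new Text("1-" + str.substring(0, str.length() - 1).trim()));
        }
    }

    // Mapper for table2.txt lines like "1 Beijing": key = id, value = "2-" + city name
    private static class AddressMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String str = value.toString();
            context.write(new Text(str.substring(0, 1)), new Text("2-" + str.substring(1).trim()));
        }
    }

    // Same tag-based join as MultiTableReducer above
    private static class JoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            List<String> factories = new ArrayList<String>();
            List<String> cities = new ArrayList<String>();
            for (Text value : values) {
                String[] strArray = value.toString().split("-");
                if (strArray[0].equals("1")) {
                    factories.add(strArray[1]);
                } else if (strArray[0].equals("2")) {
                    cities.add(strArray[1]);
                }
            }
            for (String factory : factories) {
                for (String city : cities) {
                    context.write(new Text(factory), new Text(city));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "multiTable join with MultipleInputs");
        job.setJarByClass(MultiTableJoinWithMultipleInputs.class);
        job.setReducerClass(JoinReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Each table gets its own mapper and input path; args: table1 path, table2 path, output path
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, FactoryMapper.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, AddressMapper.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}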