MapReduce Join

Joining two datasets, data1 and data2, on a key is a very common problem. If the data is small, the join can be done entirely in memory.

If the data is large, an in-memory join will run out of memory (OOM). A MapReduce join solves the join for large datasets.

1 Approach

1.1 Reduce join

In the map phase, emit the join key as the output key, and tag each value with whether it came from data1 or data2. Since the shuffle phase already groups records by key, the reduce phase only has to check each value's tag, split the values into two groups, and compute the cross product of the two groups.

This approach has two problems:

1. The map phase does not slim the data down, so the shuffle pays the full network-transfer and sorting cost; a mitigation sketch follows this list.

2. The reduce side computes a cross product of the two record groups, which is memory-hungry and easily causes OOM.
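
A common mitigation for problem 1 is to project in the map phase: emit only the columns the join actually needs, tagged with a single character, instead of shipping whole records through the shuffle. A minimal sketch, assuming (hypothetically) that downstream only needs cname, oname and oprice; it uses plain Text values rather than the DataJoin wrapper from the full listing below:

// Hypothetical "slim" mapper: ships a one-char tag plus only the needed columns.
public static class SlimJoinMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
    private IntWritable outputkey = new IntWritable();
    private Text outputvalue = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] cols = value.toString().split(",");
        if (cols.length == 3) {           // customer record: cid,cname,cphone
            outputkey.set(Integer.parseInt(cols[0]));
            outputvalue.set("C," + cols[1]);                    // keep cname only
            context.write(outputkey, outputvalue);
        } else if (cols.length == 4) {    // order record: cid,oname,oprice,otime
            outputkey.set(Integer.parseInt(cols[0]));
            outputvalue.set("O," + cols[1] + "," + cols[2]);    // drop otime
            context.write(outputkey, outputvalue);
        }
    }
}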

The full implementation is as follows.

Main program (driver) code:

package com.ibeifeng.mapreduce.join;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
  

public class MapReduceJoin extends Configured implements Tool{
    // Mapper: tag each record with its source table
    public static class map extends Mapper<LongWritable, Text, IntWritable, DataJoin>{
        private IntWritable outputkey = new IntWritable();
        private DataJoin datajoin = new DataJoin();
        @Override
        protected void map(LongWritable key, Text values, Context context)
                throws IOException, InterruptedException {
            // 1. Get the line as a string
            String str = values.toString();
            // 2. Split the line on commas
            String[] value = str.split(",");
            // 3. Filter out malformed records
            int len = value.length;
            if(len != 3 && len != 4) {
                return;
            }
            // 4. Extract the cid (the join key)
            String cid = value[0];
            // 5. Decide whether this is a customer record or an order record
            if(len == 3) {
                // customer record: cid,cname,cphone
                String cname = value[1];
                String cphone = value[2];
                datajoin.set("Customer", cid + "," + cname + "," + cphone);
            }
            if(len == 4) {
                // order record: cid,oname,oprice,otime
                String oname = value[1];
                String oprice = value[2];
                String otime = value[3];
                datajoin.set("Order", cid + "," + oname + "," + oprice + "," + otime);
            }
            outputkey.set(Integer.valueOf(cid));
            context.write(outputkey, datajoin);
        }
    }
     
    // Reducer: split the values by tag, then cross-join the two groups
    public static class reduce extends Reducer<IntWritable, DataJoin, NullWritable, Text>{
        private Text outputvalue = new Text();
        @Override
        protected void reduce(IntWritable key, Iterable<DataJoin> values,
                Context context) throws IOException, InterruptedException {
            // Holds the customer info for this cid
            String customerInfo = null;
            // Holds all order records for this cid
            List<String> list = new ArrayList<String>();
            for(DataJoin datajoin : values) {
                if(datajoin.getTag().equals("Customer")) {
                    customerInfo = datajoin.getData();
                }
                if(datajoin.getTag().equals("Order")) {
                    list.add(datajoin.getData());
                }
            }
            // Skip cids that have orders but no matching customer (inner join)
            if(customerInfo == null) {
                return;
            }
            // Emit one joined row per order
            for(String s : list) {
                outputvalue.set(customerInfo + "," + s);
                context.write(NullWritable.get(), outputvalue);
            }
        }
    }
    // Driver configuration
    public int run(String[] args) {
        // 1. Get the configuration object
        Configuration configuration = new Configuration();
        // 2. Create the MapReduce job
        Job job = null;
        try {
            job = Job.getInstance(configuration, this.getClass().getSimpleName());
        } catch (IOException e) {
            e.printStackTrace();
        }
        try {
            // 3. Set the input path
            Path source_path = new Path(args[0]);
            FileInputFormat.addInputPath(job, source_path);
            // 4. Set the output path
            Path des_path = new Path(args[1]);
            FileOutputFormat.setOutputPath(job, des_path);
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Package the job as a jar for execution
        job.setJarByClass(MapReduceJoin.class);
        // 5. Configure the map phase
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(DataJoin.class);
        //================ shuffle ========================
        // 1. Partitioning
//      job.setPartitionerClass(MyPartitioner.class);
        // 2. Sorting
//      job.setSortComparatorClass(cls);
        // 3. Grouping
//      job.setGroupingComparatorClass(MyGroup.class);
        // 4. Optional combiner: a map-side pre-reduce, used as an optimization
//      job.setCombinerClass(Combiner.class);
        // Number of reduce tasks
//      job.setNumReduceTasks(2);
        //================ shuffle ========================
        // 6. Configure the reduce phase
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // 7. Submit the job to YARN
        boolean isSuccess = false;
        try {
            isSuccess = job.waitForCompletion(true);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        return isSuccess ? 0 : 1;
    }
    // Main entry point
    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        // 1. Input and output paths
        String[] args1 = new String[] {
                "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/input",
                "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/output"
        };
        // 2. The user the job runs as
        System.setProperty("HADOOP_USER_NAME", "beifeng");
        // 3. Run the job
        int status = 0;
        try {
            status = ToolRunner.run(configuration, new MapReduceJoin(), args1);
        } catch (Exception e) {
            e.printStackTrace();
        }
//      int status = new MyWordCountMapReduce().run(args1);
        // 4. Exit
        System.exit(status);
    }
}

The custom Writable wrapper class:

package com.ibeifeng.mapreduce.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Wraps a record together with a tag marking its source table
// ("Customer" or "Order"). Hadoop instantiates this class reflectively
// during deserialization, so it must keep a public no-arg constructor
// (the implicit default suffices here).
public class DataJoin implements Writable{
    private String tag;
    private String data;
    public String getTag() {
        return tag;
    }
    public String getData() {
        return data;
    }
    public void set(String tag, String data) {
        this.tag = tag;
        this.data = data;
    }
    @Override
    public String toString() {
        return tag + "," + data;
    }
    // Serialize tag and data for the shuffle
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.tag);
        out.writeUTF(this.data);
    }
    // Deserialize in the same field order
    public void readFields(DataInput in) throws IOException {
        this.tag = in.readUTF();
        this.data = in.readUTF();
    }
}

Prepare the test data: two CSV files.

(Screenshots in the original post show the contents of the two CSV files, customers.csv and orders.csv.)
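
Based on the field layout the code parses (customer records have three fields, order records four), the files would look something like this; the rows below are hypothetical sample values, not the original test data:

customers.csv (cid,cname,cphone):

1,zhangsan,13800000001
2,lisi,13800000002

orders.csv (cid,oname,oprice,otime):

1,iphone,4999,2017-06-01
1,ipad,3999,2017-06-02
2,thinkpad,6999,2017-06-03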

 

Upload the CSV files to HDFS, package the code into a jar, and run the following command:

bin/yarn jar datas/mapreduce_join.jar /user/beifeng/wordcount/input/ /user/beifeng/wordcount/output


The result is as follows:

(Screenshot in the original post shows the joined output.)

1.2 Map join

Map join suits the case where one side of the join is small. The idea is to load that small dataset entirely into memory and index it by the join key. The large dataset then serves as the MapTask input, and each call to map() probes the in-memory index to perform the join, emitting the joined results. This relies on Hadoop's DistributedCache to distribute the small dataset to every compute node: each node that runs a map task loads the data into memory and builds an index on the join key.

Here Customer is assumed to be the small table and Orders the large one, which also matches typical production data.

The distributed-cache usage is easiest to see directly in the code:

Main program (driver) code:

package com.ibeifeng.mapreduce.join;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
  

public class MapJoin extends Configured implements Tool{
    // Path of the cache file to read
    private static String cacheFile = "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/input1/customers.csv";
    // Mapper: joins each order against the in-memory customer index
    public static class map extends Mapper<LongWritable, Text, NullWritable, Text>{
        private Text outputValue = new Text();
        Map<Integer,Customer> map = null;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Read the cached file (opened here directly from HDFS)
            FileSystem fs = FileSystem.get(URI.create(cacheFile), context.getConfiguration());
            FSDataInputStream fdis = fs.open(new Path(cacheFile));
            BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
            // Index the customer records by cid
            map = new HashMap<Integer,Customer>();
            String line = null;
            while((line = br.readLine()) != null) {
                String[] split = line.split(",");
                Customer customer = new Customer(Integer.parseInt(split[0]), split[1], split[2]);
                map.put(customer.getCid(), customer);
            }
            // Close the stream
            br.close();
        }
        @Override
        protected void map(LongWritable key, Text values, Context context)
                throws IOException, InterruptedException {
            // Combine each Orders record with its Customer record
            String str = values.toString();
            String[] Orders = str.split(",");
            int joinID = Integer.valueOf(Orders[0]);
            Customer customerid = map.get(joinID);
            // Skip orders whose cid has no matching customer (inner join)
            if(customerid == null) {
                return;
            }
            StringBuffer sbf = new StringBuffer();
            sbf.append(Orders[0]).append(",")
                    .append(customerid.getCname()).append(",")
                    .append(customerid.getCphone()).append(",")
                    .append(Orders[1]).append(",")
                    .append(Orders[2]).append(",")
                    .append(Orders[3]);
            outputValue.set(sbf.toString());
            context.write(NullWritable.get(), outputValue);
        }
    }
    // No reduce phase
    // Driver configuration
    public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Get the configuration object
        Configuration configuration = new Configuration();
        // Create the MapReduce job
        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());
        // Register the file that map tasks will load into memory
        job.addCacheFile(URI.create(cacheFile));
        // Input path
        Path source_path = new Path(args[0]);
        // Output path
        Path des_path = new Path(args[1]);
        // Delete the output path if it already exists
        FileSystem fs = FileSystem.get(configuration);
        if (fs.exists(des_path)) {
            fs.delete(des_path, true);
        }
        FileInputFormat.addInputPath(job, source_path);
        FileOutputFormat.setOutputPath(job, des_path);
        // Package the job as a jar for execution
        job.setJarByClass(MapJoin.class);
        // Configure the map phase
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Zero reduce tasks: no reduce or shuffle phase
        job.setNumReduceTasks(0);
        // Submit the job to YARN
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }
    // Main entry point
    public static void main(String[] args) {
        Configuration configuration = new Configuration();
        // 1. Input and output paths
        String[] args1 = new String[] {
                "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/input",
                "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/output"
        };
        // 2. The user the job runs as
        System.setProperty("HADOOP_USER_NAME", "beifeng");
        // 3. Run the job
        int status = 0;
        try {
            status = ToolRunner.run(configuration, new MapJoin(), args1);
        } catch (Exception e) {
            e.printStackTrace();
        }
//      int status = new MyWordCountMapReduce().run(args1);
        // 4. Exit
        System.exit(status);
    }
}
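
One caveat about the code above: setup() opens customers.csv directly from HDFS by its absolute path, so every map task re-reads it over the network, even though job.addCacheFile() has already localized the file onto each node. A minimal sketch of reading the localized copy instead, assuming the cache file is registered with a hypothetical #customers symlink fragment (FileReader is referenced by its full name to avoid an extra import):

// In run(), register the cache file together with a symlink name:
//   job.addCacheFile(URI.create(cacheFile + "#customers"));

// In setup(), read the localized copy from the task's working directory:
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    map = new HashMap<Integer, Customer>();
    BufferedReader br = new BufferedReader(new java.io.FileReader("customers"));
    String line = null;
    while ((line = br.readLine()) != null) {
        String[] split = line.split(",");
        // Index the customer record by its cid, as before
        map.put(Integer.parseInt(split[0]),
                new Customer(Integer.parseInt(split[0]), split[1], split[2]));
    }
    br.close();
}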

The Customer helper class:

package com.ibeifeng.mapreduce.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Holds one customer record (cid, cname, cphone).
// Note: declaring the parameterized constructor removes the implicit no-arg
// constructor that Hadoop would need to deserialize this class; that is fine
// here because Customer is only used in memory and never shuffled.
public class Customer implements Writable{
    private int cid;
    private String cname;
    private String cphone;
    public int getCid() {
        return cid;
    }
    public void setCid(int cid) {
        this.cid = cid;
    }
    public String getCname() {
        return cname;
    }
    public void setCname(String cname) {
        this.cname = cname;
    }
    public String getCphone() {
        return cphone;
    }
    public void setCphone(String cphone) {
        this.cphone = cphone;
    }
    public Customer(int cid, String cname, String cphone) {
        super();
        this.cid = cid;
        this.cname = cname;
        this.cphone = cphone;
    }
    // Serialize the fields
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.cid);
        out.writeUTF(this.cname);
        out.writeUTF(this.cphone);
    }
    // Deserialize in the same field order
    public void readFields(DataInput in) throws IOException {
        this.cid = in.readInt();
        this.cname = in.readUTF();
        this.cphone = in.readUTF();
    }
    @Override
    public String toString() {
        return "Customer [cid=" + cid + ", cname=" + cname + ", cphone=" + cphone + "]";
    }
}


Running bin/yarn jar datas/map_join.jar produces the same result:

(Screenshot in the original post shows the same joined output.)