Table of Contents

  • I. Overview
  • II. Requirements
  • III. Implementing the join with map + reduce
  • IV. Map-side join in MapReduce (no reduce phase)

I. Overview

Anyone familiar with SQL knows that expressing a join takes only a single SQL statement. In a big-data setting, however, implementing a join with the MapReduce programming model is fairly tedious. In production we usually rely on frameworks such as Hive or Spark SQL for joins, but it is still worth mastering how a join is actually implemented, since that helps in understanding what those frameworks do underneath. This article shows how to implement a join with the MapReduce API.

II. Requirements

Implement the equivalent of the following SQL: select c.customer_id,c.customer_name,o.orderId,o.order_status from customer c join order o on c.customer_id=o.customer_id. Data files: https://pan.baidu.com/s/1GziR0W7pNwk26lHf-ZZ8NA (extraction code: 2piw)

III. Implementing the join with map + reduce

map

  • Check the number of fields in each line: 4 fields means the record comes from the order table, 9 fields means it comes from the customer table.
  • Emit pairs of the form ( customer_id, (customer_id, customer_name, orderId, order_status, flag) ), where flag tells the reducer which table the record came from.

reduce
For each customer_id key, merge the values: copy the customer_name from the customer record into every order record and emit the joined rows.
Implementation
  • 1) First, write the entity class holding the required fields:

package hadoop.mapreduce.reducejoin;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @author sunyong
 * @date 2020/07/02
 * @description
 */
public class CustomerOrders implements Writable {
    // (key, value) layout: ( customer_id, (customer_id, customer_name, orderId, order_status, flag) )
    private String customer_id;   // customer id
    private String customer_name; // customer name
    private String orderId;       // order id
    private String order_status;  // order status
    private String flag;          // tag that tells the reducer which input file the record came from

    public CustomerOrders() {
    }

    public CustomerOrders(String customer_id, String customer_name, String orderId, String order_status, String flag) {
        this.customer_id = customer_id;
        this.customer_name = customer_name;
        this.orderId = orderId;
        this.order_status = order_status;
        this.flag = flag;
    }
    // serialization: write the fields in a fixed order
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(customer_id);
        dataOutput.writeUTF(customer_name);
        dataOutput.writeUTF(orderId);
        dataOutput.writeUTF(order_status);
        dataOutput.writeUTF(flag);
    }
    // deserialization: read the fields in the same order as write()
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.customer_id=dataInput.readUTF();
        this.customer_name=dataInput.readUTF();
        this.orderId=dataInput.readUTF();
        this.order_status=dataInput.readUTF();
        this.flag=dataInput.readUTF();
    }

    public String getCustomer_id() {
        return customer_id;
    }

    public void setCustomer_id(String customer_id) {
        this.customer_id = customer_id;
    }

    public String getCustomer_name() {
        return customer_name;
    }

    public void setCustomer_name(String customer_name) {
        this.customer_name = customer_name;
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getOrder_status() {
        return order_status;
    }

    public void setOrder_status(String order_status) {
        this.order_status = order_status;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public String toString() {
        return customer_id + ',' + customer_name + ',' + orderId + ',' + order_status;
    }
}
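
A quick way to convince yourself that the Writable contract is implemented correctly is to serialize an instance and read it back outside of Hadoop. The following standalone sketch is not part of the original post; the class name and the sample field values are made up for illustration:

package hadoop.mapreduce.reducejoin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class CustomerOrdersRoundTrip {
    public static void main(String[] args) throws IOException {
        // serialize a sample bean into an in-memory byte array
        CustomerOrders original = new CustomerOrders("1", "Alice", "1001", "CLOSED", "1");
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize into a fresh instance; readFields() must consume the fields in the same order as write()
        CustomerOrders copy = new CustomerOrders();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(original); // 1,Alice,1001,CLOSED
        System.out.println(copy);     // prints the same line if the round trip worked
    }
}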
  • 2) Write the Mapper class:
package hadoop.mapreduce.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author sunyong
 * @date 2020/07/02
 * @description
 */
public class CustomerOrderMapper extends Mapper<LongWritable, Text,Text,CustomerOrders> {
    CustomerOrders v = new CustomerOrders();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the line into fields
        String[] fields = value.toString().split(",");
        // 4 fields means an order record, otherwise it is a customer record
        if(fields.length==4){
            // fill in the fields available from the order record
            v.setCustomer_id(fields[2]);
            v.setCustomer_name("");
            v.setOrderId(fields[0]);
            v.setOrder_status(fields[3]);
            v.setFlag("1");
        }else{
            // fill in the fields available from the customer record
            v.setCustomer_id(fields[0]);
            v.setOrderId("");
            v.setOrder_status("");
            v.setCustomer_name(fields[1]);
            v.setFlag("0");
        }
        // emit from the map side, keyed by customer_id
        context.write(new Text(v.getCustomer_id()),v);
    }
}
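
To make the flag-based tagging concrete, the small standalone sketch below runs the same split-and-tag logic as CustomerOrderMapper on two invented input lines (they are not taken from the real dataset) and prints the key/value pair the mapper would emit for each:

package hadoop.mapreduce.reducejoin;

public class TagLogicDemo {
    public static void main(String[] args) {
        // hypothetical input lines: an order record (4 fields) and a customer record (9 fields)
        String[] lines = {
                "1001,2020-07-02 00:00:00.0,1,CLOSED",
                "1,Mary,Jones,XXXX,XXXX,123 Main St,Dallas,TX,75001"
        };
        for (String line : lines) {
            String[] fields = line.split(",");
            CustomerOrders v = new CustomerOrders();
            if (fields.length == 4) {          // order record
                v.setCustomer_id(fields[2]);
                v.setCustomer_name("");
                v.setOrderId(fields[0]);
                v.setOrder_status(fields[3]);
                v.setFlag("1");
            } else {                           // customer record
                v.setCustomer_id(fields[0]);
                v.setCustomer_name(fields[1]);
                v.setOrderId("");
                v.setOrder_status("");
                v.setFlag("0");
            }
            // key = customer_id, value = the tagged bean
            System.out.println(v.getCustomer_id() + " -> " + v + " (flag=" + v.getFlag() + ")");
            // prints: 1 -> 1,,1001,CLOSED (flag=1)   and   1 -> 1,Mary,, (flag=0)
        }
    }
}

Both sample lines share customer_id 1, so in a real job they would arrive in the same reduce() call and be stitched together there.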
  • 3) Write the Reducer class:
package hadoop.mapreduce.reducejoin;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

/**
 * @author sunyong
 * @date 2020/07/02
 * @description
 */
public class CustomerOrderReducer extends Reducer<Text,CustomerOrders,CustomerOrders, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<CustomerOrders> values, Context context) throws IOException, InterruptedException {
        //1. Collect the order records (one customer id can correspond to several orders, so a list is needed)
        ArrayList<CustomerOrders> orderBeans = new ArrayList<>();
        // Prepare the customer bean (each customer id corresponds to exactly one customer record)
        CustomerOrders cusBean = new CustomerOrders();
        //2. Iterate over the map output for this key and sort the beans by their flag.
        //   Hadoop reuses the object behind this iterator, so each order record is copied out before it is stored.
        for (CustomerOrders bean : values) {
            if (bean.getFlag().equals("1")) { // order record
                CustomerOrders orderBean = new CustomerOrders();
                try {
                    BeanUtils.copyProperties(orderBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
                orderBeans.add(orderBean);
            } else { // customer record
                try {
                    BeanUtils.copyProperties(cusBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }
        //3. Walk the order records and fill in the blank customer_name field
        for (CustomerOrders bean : orderBeans) {
            // copy the customer's name into each order bean
            bean.setCustomer_name(cusBean.getCustomer_name());
            //4. Emit the joined record
            context.write(bean, NullWritable.get());
        }
    }
}
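
BeanUtils.copyProperties does the job, but it copies fields via reflection for every value, which can be noticeably slow in a reducer that handles large inputs. A plain copy helper, sketched below as a hypothetical addition to CustomerOrderReducer (it is not in the original code), achieves the same result without reflection:

    // hypothetical helper: field-by-field copy of a CustomerOrders bean, no reflection needed
    private static CustomerOrders copyOf(CustomerOrders src) {
        return new CustomerOrders(
                src.getCustomer_id(),
                src.getCustomer_name(),
                src.getOrderId(),
                src.getOrder_status(),
                src.getFlag());
    }

With this helper the order branch becomes orderBeans.add(copyOf(bean)); and the customer branch cusBean = copyOf(bean);, and the commons-beanutils dependency can be dropped.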
  • 4) Write the Driver class and run it:
package hadoop.mapreduce.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author sunyong
 * @date 2020/07/01
 * @description
 */
public class CustomerOrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1. Create the configuration and the Job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf,"sqlJoin");

        //2. Set the jar by the driver class
        job.setJarByClass(CustomerOrderDriver.class);

        //3. Set the Mapper and Reducer classes
        job.setMapperClass(CustomerOrderMapper.class);
        job.setReducerClass(CustomerOrderReducer.class);

        //4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CustomerOrders.class);

        //5. Set the reduce output key/value types
        job.setOutputKeyClass(CustomerOrders.class);
        job.setOutputValueClass(NullWritable.class);
        //6. Set the input and output paths
        FileInputFormat.setInputPaths(job,new Path("F:\\sunyong\\Java\\codes\\javaToHdfs\\join"));
        FileOutputFormat.setOutputPath(job,new Path("joinOut"));

        //7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result?0:1);
    }
}
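
One practical wrinkle when re-running this driver: FileOutputFormat refuses to start if the output directory already exists, so a second run fails until joinOut is deleted by hand. A common pattern is to remove the output path in main() before submitting the job; a minimal sketch using the standard FileSystem API (placed before FileOutputFormat.setOutputPath) could look like this:

        // optional: delete a stale output directory so the job can be re-run
        org.apache.hadoop.fs.FileSystem fs = org.apache.hadoop.fs.FileSystem.get(conf);
        org.apache.hadoop.fs.Path outPath = new org.apache.hadoop.fs.Path("joinOut");
        if (fs.exists(outPath)) {
            fs.delete(outPath, true); // true = delete recursively
        }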
  • 5) After the job finishes, check the output files in joinOut:

IV. Map-side join in MapReduce (no reduce phase)

The idea is to cache the small table entirely in memory (for example in a HashMap), then stream through the other table and probe the HashMap with the join key of each record.
Customer table: one record per customer --> the small table
Order table: one customer can have many orders --> the large table

  • 1. Write the entity class (the same one as in section III).
  • 2. Write the Mapper class; this is where the map-side join differs:
package hadoop.mapreduce.join;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

/**
 * @author sunyong
 * @date 2020/07/02
 * @description
 */
public class MapJoinMapper extends Mapper<LongWritable, Text,CustomerOrders, NullWritable> {
    // HashMap caching customer id -> customer name
    HashMap<String,String> customerMap = new HashMap<>();
    // reusable output object
    CustomerOrders customerOrders = new CustomerOrders();
    // setup() handles the cached customer table
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the URIs of the cached files; there is only one file here
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles != null && cacheFiles.length > 0) {
            // take the path of the first (and only) cached file
            String fileName = cacheFiles[0].getPath().toString();
            // buffered reader with UTF-8 encoding
            BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName), "UTF-8"));
            String line;
            // read the file and use the first and second columns as the map key and value
            while (StringUtils.isNotEmpty(line = br.readLine())) {
                String[] split = line.split(",");
                customerMap.put(split[0], split[1]);
            }
            // release the resource
            br.close();
        }

    }
    // map() processes the order records
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split the current line into fields
        String[] fields = value.toString().split(",");
        // fill in the order-side fields
        customerOrders.setCustomer_id(fields[2]);
        customerOrders.setOrderId(fields[0]);
        customerOrders.setOrder_status(fields[3]);
        // look up the customer name in the cached HashMap
        customerOrders.setCustomer_name(customerMap.get(fields[2]));
        // emit one joined object per order line (map() runs once per input record)
        context.write(customerOrders,NullWritable.get());
    }
}
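
Two small robustness points about this mapper: the reader in setup() is never closed if an exception is thrown mid-read, and customerMap.get(fields[2]) returns null when an order references a customer id that is missing from the cached file, which would later make writeUTF fail with a NullPointerException. A hardened sketch of setup(), plus a null-safe lookup for map(), might look like this (same logic as above, only defensive):

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles != null && cacheFiles.length > 0) {
            String fileName = cacheFiles[0].getPath().toString();
            // try-with-resources closes the reader even if an exception occurs while reading
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(fileName), "UTF-8"))) {
                String line;
                while ((line = br.readLine()) != null && !line.isEmpty()) {
                    String[] split = line.split(",");
                    customerMap.put(split[0], split[1]);
                }
            }
        }
    }

    // in map(): fall back to an empty name instead of null for unknown customer ids
    // customerOrders.setCustomer_name(customerMap.getOrDefault(fields[2], ""));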
  • 3. Write the Driver class and run it:
package hadoop.mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * @author sunyong
 * @date 2020/07/01
 * @description
 */
public class CustomerOrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        //1. Create the configuration and the Job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf,"mapJoin");

        //2. Set the jar by the driver class
        job.setJarByClass(CustomerOrderDriver.class);

        //3. Set the Mapper class (no Reducer is needed here)
        job.setMapperClass(MapJoinMapper.class);
        // set the number of reduce tasks to 0
        job.setNumReduceTasks(0);
        //4. Set the output key/value types; with zero reducers the map output is written directly,
        //   so the types must match what MapJoinMapper emits
        job.setOutputKeyClass(CustomerOrders.class);
        job.setOutputValueClass(NullWritable.class);
        //5. No separate reduce output types to set
        //6. Set the cache file and the input/output paths
        // note: a URI cannot contain backslashes; use forward slashes, otherwise the path is not recognized
        job.addCacheFile(new URI("file:///F:///sunyong///Java///codes///javaToHdfs///join///customers.csv"));// cache the small table
        FileInputFormat.setInputPaths(job,new Path("F:\\sunyong\\Java\\codes\\javaToHdfs\\join\\orders.csv"));
        FileOutputFormat.setOutputPath(job,new Path("mapOut"));
        //7. Submit the job and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result?0:1);
    }
}
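
A final note on the cache file: the file:/// URI above only works while the job runs in local mode, because every map task must be able to open that exact local path. On a real cluster the small table would normally be uploaded to HDFS first and registered from there; the path below is hypothetical:

        // hypothetical HDFS location of the small customer table
        job.addCacheFile(new URI("hdfs:///data/join/customers.csv"));

Hadoop then copies the file next to each task, and setup() can typically open it by its base name (customers.csv) rather than by the full cached path.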
  • 4) Run the job and check the output files (the records are not in any particular order):