Table of Contents

  • Multiple Join Applications in MapReduce
  • 1. How Reduce Join Works
  • 2. Reduce Join Hands-On Example
  • 2.1 Requirements Analysis
  • 2.2 Code Implementation
  • TableBean
  • TableMapper
  • TableReduce
  • TableDriver
  • 3. Drawbacks of Reduce Join and the Solution
  • 4. How Map Join Works
  • 5. Map Join Hands-On Example
  • 5.1 Requirements Analysis
  • 5.2 Code Implementation
  • DistributedCacheDriver
  • DistributedCacheMapper

Multiple Join Applications in MapReduce

1. How Reduce Join Works

Main work on the Map side: tag each key/value pair so that records coming from different tables or files can be told apart. Then use the join field as the key, and the remaining fields plus the newly added tag as the value, and emit the pair.

Main work on the Reduce side: by the time data reaches the Reduce side, grouping by the join field (the key) is already done. Within each group we only need to separate the records that came from different files (tagged during the Map phase) and then merge them. For example, for product id 01, the Mapper emits both the order records and the product record under the key 01, so a single reduce() call sees everything needed to join them.

2. Reduce Join Hands-On Example

2.1 Requirements Analysis

The join condition becomes the key of the Map output, so that rows from both tables that satisfy the join condition, each carrying a tag identifying the file it came from, are sent to the same ReduceTask, where the rows are stitched together in the Reduce phase.

(Figure: Reduce Join requirements analysis)
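The original figure is not reproduced here, so the following hypothetical sample (field layout inferred from the code below) illustrates the inputs and the expected join result. order.txt holds one order per line as order_id, p_id, amount (tab-separated):

1001    01    1
1002    02    2
1003    03    3

pd.txt holds one product per line as p_id, pname:

01    productA
02    productB
03    productC

The joined output, formatted by TableBean.toString() as order_id, amount, pname, would then be:

1001    1    productA
1002    2    productB
1003    3    productC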

2.2 Code Implementation
TableBean
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @Date 2020/7/12 17:14
 * @Version 10.21
 * @Author DuanChaojie
 */
public class TableBean implements Writable {
    // order id
    private String order_id;
    // product id
    private String p_id;
    // product quantity
    private int amount;
    // product name
    private String pname;
    // tag marking which table the record came from ("order" or "pd")
    private String flag;

    public TableBean() {
        super();
    }

    public TableBean(String order_id, String p_id, int amount, String pname, String flag) {
        super();
        this.order_id = order_id;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(order_id);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // fields must be read in exactly the same order write() wrote them
        this.order_id = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.flag = in.readUTF();
    }

    @Override
    public String toString() {
        return order_id + "\t" + amount + "\t" + pname;
    }

}
TableMapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    String name;

    // reused across map() calls; context.write() serializes immediately, so reuse is safe
    TableBean v = new TableBean();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the input split for this task
        FileSplit split = (FileSplit) context.getInputSplit();

        // get the name of the input file
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // the input files are GBK-encoded, so decode the raw bytes explicitly
        String line = new String(value.getBytes(), 0, value.getLength(), "GBK");

        // handle each file differently
        if (name.startsWith("order")) {
            // order table: order_id, p_id, amount
            String[] fields = line.split("\t");
            v.setOrder_id(fields[0]);
            v.setP_id(fields[1]);
            v.setAmount(Integer.parseInt(fields[2]));
            v.setPname("");
            v.setFlag("order");

            // the join field (p_id) is the map output key
            k.set(fields[1]);
        } else {
            // product table: p_id, pname
            String[] fields = line.split("\t");
            v.setP_id(fields[0]);
            v.setPname(fields[1]);
            v.setFlag("pd");
            v.setAmount(0);
            v.setOrder_id("");

            k.set(fields[0]);
        }

        // emit
        context.write(k, v);
    }
}
TableReduce

A small gotcha here: if you iterate over values first to print the keys and values (the commented-out lines below), the output file comes out empty. The reason is that Hadoop's values Iterable is single-pass: the debug loop exhausts the iterator, so the join loop afterwards sees no records. Relatedly, the framework reuses the same TableBean instance across iterations, which is why each order record must be deep-copied with BeanUtils.copyProperties rather than added to the list directly.
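A minimal sketch of the pitfall (for illustration only, not part of the working reducer below):

// Anti-pattern: the values Iterable in a Hadoop reducer is single-pass.
for (TableBean value : values) {
    System.out.println("value = " + value);  // this pass exhausts the iterator
}
for (TableBean value : values) {
    // never reached: the iterator is already consumed,
    // so nothing gets written and the output file is empty
}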

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

/**
 * @Date 2020/7/12 17:31
 * @Version 10.21
 * @Author DuanChaojie
 */
public class TableReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
//        Do NOT debug like this: values is single-pass, so printing here
//        leaves nothing for the join loop below.
//        System.out.println("key = " + key);
//        for (TableBean value : values) {
//            System.out.println("value = " + value);
//        }
        // collection of order records for this key
        ArrayList<TableBean> list = new ArrayList<>();

        // one TableBean to cache the matching product record
        TableBean pdBean = new TableBean();

        for (TableBean value : values) {
            // put order records into the list
            if ("order".equals(value.getFlag())) {
                // Hadoop reuses the value object, so deep-copy each order record before storing it
                TableBean orderBean = new TableBean();

                try {
                    BeanUtils.copyProperties(orderBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                list.add(orderBean);
            } else {
                // product record
                try {
                    BeanUtils.copyProperties(pdBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        // stitch the tables together: fill in the product name on every order record
        for (TableBean tableBean : list) {
            tableBean.setPname(pdBean.getPname());
            context.write(tableBean, NullWritable.get());
        }
    }

}
TableDriver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // local input/output paths; adjust to your own machine
        args = new String[]{"E:\\file\\input\\join", "E:\\file\\output7"};
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);

        System.exit(result ? 0 : 1);
    }
}

3. Drawbacks of Reduce Join and the Solution

Drawbacks:

  • With this approach the merging is done entirely in the Reduce phase. The Reduce side carries heavy processing pressure while the Map nodes do very little work, so resource utilization is low, and the Reduce phase is highly prone to data skew.

Solution:

  • Merge the data on the Map side instead (Map Join, covered next).

4. How Map Join Works

Use case: Map Join fits scenarios where one table is very small and the other is very large.

Advantages:

Processing too many tables on the Reduce side makes data skew very likely. The remedy: cache the small table(s) on the Map side and apply the business logic there up front. This adds work to the Map side but relieves the data pressure on the Reduce side, reducing data skew as far as possible.

Concrete approach: use DistributedCache.

  1. In the Mapper's setup phase, read the cached file into an in-memory collection.
  2. Load the cache file in the driver:
// Cache a regular file so it is shipped to every Task node.
job.addCacheFile(new URI("file:///E:/file/input/join/pd.txt"));

5. Map Join Hands-On Example

5.1 Requirements Analysis

Map Join is appropriate when one of the tables being joined is small: here the small product table (pd.txt) is cached in memory on every MapTask, while the large order table is streamed through map().

(Figure: Map Join requirements analysis)
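Reusing the hypothetical sample data from section 2.1, each output line here is simply the original order line with the product name appended (see k.set(line + "\t" + pName) in the Mapper below):

1001    01    1    productA
1002    02    2    productB
1003    03    3    productC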

5.2 Code Implementation
DistributedCacheDriver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * @Date 2020/7/12 20:21
 * @Version 10.21
 * @Author DuanChaojie
 */
public class DistributedCacheDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 0 Adjust the paths for your own machine
        args = new String[]{"E:\\file\\input\\join\\order.txt", "E:\\file\\output8"};

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar path
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Hook up the Mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Load the cached file
        job.addCacheFile(new URI("file:///E:/file/input/join/pd.txt"));

        // 7 A Map-side join needs no Reduce phase, so set the number of ReduceTasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
DistributedCacheMapper
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // p_id -> pname, built once from the cached product file
    Map<String, String> pdMap = new HashMap<>();

    // reused across map() calls
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the cached file (URI.getPath() already returns a String)
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();
        FileInputStream fis = new FileInputStream(path);
        // the product file is GBK-encoded
        InputStreamReader isr = new InputStreamReader(fis, "GBK");

        BufferedReader reader = new BufferedReader(isr);

        String line;

        while (StringUtils.isNotBlank(line = reader.readLine())) {
            // split into p_id and pname
            String[] fields = line.split("\t");
            pdMap.put(fields[0], fields[1]);
        }

        // close resources
        reader.close();
        isr.close();
        fis.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // read one order line (GBK-encoded)
        String line = new String(value.getBytes(), 0, value.getLength(), "GBK");

        // split the fields
        String[] fields = line.split("\t");

        // product id
        String pId = fields[1];
        // look up the product name in the cached map
        String pName = pdMap.get(pId);

        // append the product name to the original line
        k.set(line + "\t" + pName);

        // emit
        context.write(k, NullWritable.get());
    }
}