Table of Contents

  • Multiple Join Applications in MapReduce
  • 1. How Reduce Join Works
  • 2. Reduce Join Hands-On Example
  • 2.1 Requirements Analysis
  • 2.2 Code Implementation
  • TableBean
  • TableMapper
  • TableReduce
  • TableDriver
  • 3. Drawbacks of Reduce Join and the Solution
  • 4. How Map Join Works
  • 5. Map Join Hands-On Example
  • 5.1 Requirements Analysis
  • 5.2 Code Implementation
  • DistributedCacheDriver
  • DistributedCacheMapper

Multiple Join Applications in MapReduce

1. How Reduce Join Works

Main work on the Map side: tag each key/value pair so that records coming from different tables or files can be told apart. Then use the join field as the key, and the remaining fields plus the newly added tag as the value, and emit the pair.

Main work on the Reduce side: by the time data reaches the Reduce side, grouping by the join field (the key) is already done. Within each group we only need to separate the records that came from different files (tagged during the Map phase) and then merge them. For example, for product id 01, the Mapper emits both the order records and the product record under the key 01, so a single reduce() call sees everything needed to join them.

2. Reduce Join Hands-On Example

2.1 Requirements Analysis

The join condition becomes the key of the Map output, so that rows from both tables that satisfy the join condition, each carrying a tag identifying the file it came from, are sent to the same ReduceTask, where the rows are stitched together in the Reduce phase.

(Figure: Reduce Join requirements analysis)
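The original figure is not reproduced here, so the following hypothetical sample (field layout inferred from the code below) illustrates the inputs and the expected join result. order.txt holds one order per line as order_id, p_id, amount (tab-separated):

1001    01    1
1002    02    2
1003    03    3

pd.txt holds one product per line as p_id, pname:

01    productA
02    productB
03    productC

The joined output, formatted by TableBean.toString() as order_id, amount, pname, would then be:

1001    1    productA
1002    2    productB
1003    3    productC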

2.2 Code Implementation
TableBean
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @Date 2020/7/12 17:14
 * @Version 10.21
 * @Author DuanChaojie
 */
public class TableBean implements Writable {
    // order id
    private String order_id;
    // product id
    private String p_id;
    // product quantity
    private int amount;
    // product name
    private String pname;
    // tag marking which table the record came from ("order" or "pd")
    private String flag;

    public TableBean() {
        super();
    }

    public TableBean(String order_id, String p_id, int amount, String pname, String flag) {
        super();
        this.order_id = order_id;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(order_id);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // fields must be read in exactly the same order write() wrote them
        this.order_id = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.flag = in.readUTF();
    }

    @Override
    public String toString() {
        return order_id + "\t" + amount + "\t" + pname;
    }

}
TableMapper
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    String name;

    // reused across map() calls; context.write() serializes immediately, so reuse is safe
    TableBean v = new TableBean();
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the input split for this task
        FileSplit split = (FileSplit) context.getInputSplit();

        // get the name of the input file
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // the input files are GBK-encoded, so decode the raw bytes explicitly
        String line = new String(value.getBytes(), 0, value.getLength(), "GBK");

        // handle each file differently
        if (name.startsWith("order")) {
            // order table: order_id, p_id, amount
            String[] fields = line.split("\t");
            v.setOrder_id(fields[0]);
            v.setP_id(fields[1]);
            v.setAmount(Integer.parseInt(fields[2]));
            v.setPname("");
            v.setFlag("order");

            // the join field (p_id) is the map output key
            k.set(fields[1]);
        } else {
            // product table: p_id, pname
            String[] fields = line.split("\t");
            v.setP_id(fields[0]);
            v.setPname(fields[1]);
            v.setFlag("pd");
            v.setAmount(0);
            v.setOrder_id("");

            k.set(fields[0]);
        }

        // emit
        context.write(k, v);
    }
}
TableReduce

A small gotcha here: if you iterate over values first to print the keys and values (the commented-out lines below), the output file comes out empty. The reason is that Hadoop's values Iterable is single-pass: the debug loop exhausts the iterator, so the join loop afterwards sees no records. Relatedly, the framework reuses the same TableBean instance across iterations, which is why each order record must be deep-copied with BeanUtils.copyProperties rather than added to the list directly.
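A minimal sketch of the pitfall (for illustration only, not part of the working reducer below):

// Anti-pattern: the values Iterable in a Hadoop reducer is single-pass.
for (TableBean value : values) {
    System.out.println("value = " + value);  // this pass exhausts the iterator
}
for (TableBean value : values) {
    // never reached: the iterator is already consumed,
    // so nothing gets written and the output file is empty
}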

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

/**
 * @Date 2020/7/12 17:31
 * @Version 10.21
 * @Author DuanChaojie
 */
public class TableReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
//        Do NOT debug like this: values is single-pass, so printing here
//        leaves nothing for the join loop below.
//        System.out.println("key = " + key);
//        for (TableBean value : values) {
//            System.out.println("value = " + value);
//        }
        // collection of order records for this key
        ArrayList<TableBean> list = new ArrayList<>();

        // one TableBean to cache the matching product record
        TableBean pdBean = new TableBean();

        for (TableBean value : values) {
            // put order records into the list
            if ("order".equals(value.getFlag())) {
                // Hadoop reuses the value object, so deep-copy each order record before storing it
                TableBean orderBean = new TableBean();

                try {
                    BeanUtils.copyProperties(orderBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                list.add(orderBean);
            } else {
                // product record
                try {
                    BeanUtils.copyProperties(pdBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }

        // stitch the tables together: fill in the product name on every order record
        for (TableBean tableBean : list) {
            tableBean.setPname(pdBean.getPname());
            context.write(tableBean, NullWritable.get());
        }
    }

}
TableDriver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // local input/output paths; adjust to your own machine
        args = new String[]{"E:\\file\\input\\join", "E:\\file\\output7"};
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(TableDriver.class);
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);

        System.exit(result ? 0 : 1);
    }
}

3. Drawbacks of Reduce Join and the Solution

Drawbacks:

  • With this approach the merging is done entirely in the Reduce phase. The Reduce side carries heavy processing pressure while the Map nodes do very little work, so resource utilization is low, and the Reduce phase is highly prone to data skew.

Solution:

  • Merge the data on the Map side instead (Map Join, covered next).

4. How Map Join Works

Use case: Map Join fits scenarios where one table is very small and the other is very large.

Advantages:

Processing too many tables on the Reduce side makes data skew very likely. The remedy: cache the small table(s) on the Map side and apply the business logic there up front. This adds work to the Map side but relieves the data pressure on the Reduce side, reducing data skew as far as possible.

Concrete approach: use DistributedCache.

  1. In the Mapper's setup phase, read the cached file into an in-memory collection.
  2. Load the cache file in the driver:
// Cache a regular file so it is shipped to every Task node.
job.addCacheFile(new URI("file:///E:/file/input/join/pd.txt"));

5. Map Join Hands-On Example

5.1 Requirements Analysis

Map Join is appropriate when one of the tables being joined is small: here the small product table (pd.txt) is cached in memory on every MapTask, while the large order table is streamed through map().

(Figure: Map Join requirements analysis)
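Reusing the hypothetical sample data from section 2.1, each output line here is simply the original order line with the product name appended (see k.set(line + "\t" + pName) in the Mapper below):

1001    01    1    productA
1002    02    2    productB
1003    03    3    productC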

5.2 Code Implementation
DistributedCacheDriver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

/**
 * @Date 2020/7/12 20:21
 * @Version 10.21
 * @Author DuanChaojie
 */
public class DistributedCacheDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 0 Adjust the paths for your own machine
        args = new String[]{"E:\\file\\input\\join\\order.txt", "E:\\file\\output8"};

        // 1 Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Set the jar path
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Hook up the Mapper
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Load the cached file
        job.addCacheFile(new URI("file:///E:/file/input/join/pd.txt"));

        // 7 A Map-side join needs no Reduce phase, so set the number of ReduceTasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
DistributedCacheMapper
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // p_id -> pname, built once from the cached product file
    Map<String, String> pdMap = new HashMap<>();

    // reused across map() calls
    Text k = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // get the cached file (URI.getPath() already returns a String)
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();
        FileInputStream fis = new FileInputStream(path);
        // the product file is GBK-encoded
        InputStreamReader isr = new InputStreamReader(fis, "GBK");

        BufferedReader reader = new BufferedReader(isr);

        String line;

        while (StringUtils.isNotBlank(line = reader.readLine())) {
            // split into p_id and pname
            String[] fields = line.split("\t");
            pdMap.put(fields[0], fields[1]);
        }

        // close resources
        reader.close();
        isr.close();
        fis.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // read one order line (GBK-encoded)
        String line = new String(value.getBytes(), 0, value.getLength(), "GBK");

        // split the fields
        String[] fields = line.split("\t");

        // product id
        String pId = fields[1];
        // look up the product name in the cached map
        String pName = pdMap.get(pId);

        // append the product name to the original line
        k.set(line + "\t" + pName);

        // emit
        context.write(k, NullWritable.get());
    }
}