MapReduce Join
Joining two datasets, data1 and data2, on a common key is a very common problem. If the data is small, the join can be done entirely in memory; if the data is large, an in-memory join will run out of memory (OOM). A MapReduce join solves the problem of joining large datasets.
1 Approach
1.1 Reduce join
In the map phase, emit the join key as the output key and tag each value with its source, data1 or data2. Because the shuffle phase already groups records by key, the reduce phase only has to check whether each value came from data1 or data2, split the values into the two groups, and output the Cartesian product of the two groups.
This approach has two problems:
1. The map phase does not shrink the data at all, so the shuffle pays the full cost of network transfer and sorting, which hurts performance.
2. The reduce side has to compute the Cartesian product of the two groups, which is memory-hungry and easily leads to OOM; for example, a key with one customer record and a very large number of order records forces the reducer to buffer all of those orders before it can emit the joined rows.
The implementation is as follows.
Main program entry point:
package com.ibeifeng.mapreduce.join;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MapReduceJoin extends Configured implements Tool{
//Mapper class template
public static class map extends Mapper<LongWritable, Text, IntWritable, DataJoin>{
private IntWritable outputkey = new IntWritable();
private DataJoin datajoin = new DataJoin();
protected void map(LongWritable key, Text values, Context context)
throws IOException, InterruptedException {
//1. Read the input line as a string
String str = values.toString();
//2. Split the line on commas
String[] value = str.split(",");
//3. Filter out malformed records
int len = value.length;
if(len!=3&&len!=4) {
return;
}
//4. Extract the cid (join key)
String cid = value[0];
//5. Decide whether the record is from the customer table or the order table
if(len == 3) {
//customer record: cid,cname,cphone
String cname = value[1];
String cphone = value[2];
datajoin.set("Customer", cid+","+cname+","+cphone);
}
if(len == 4) {
//order record: cid,oname,oprice,otime
String oname = value[1];
String oprice = value[2];
String otime = value[3];
datajoin.set("Order", cid+","+oname+","+oprice+","+otime);
}
outputkey.set(Integer.valueOf(cid));
context.write(outputkey, datajoin);
}
}
//Reducer class template
public static class reduce extends Reducer<IntWritable, DataJoin, NullWritable, Text>{
private Text outputvalue = new Text();
@Override
protected void reduce(IntWritable key, Iterable<DataJoin> values,
Context context) throws IOException, InterruptedException {
//String holding the customer info for the current key
String customerInfo = null;
//List holding all order records for the current key
List<String> list = new ArrayList<String>();
List<String> list = new ArrayList<String>();
for(DataJoin datajoin : values) {
if(datajoin.getTag().equals("Customer")) {
System.out.println(datajoin.getData());
customerInfo = datajoin.getData();
}
if(datajoin.getTag().equals("Order")) {
list.add(datajoin.getData());
}
}
//Emit one joined row per order
for(String s :list) {
outputvalue.set(customerInfo+","+s);
context.write(NullWritable.get(), outputvalue);
}
}
}
//Driver configuration
public int run(String[] args) {
//1. Get the configuration object
Configuration configuration = new Configuration();
//2. Create the MapReduce job
Job job = null;
try {
job = Job.getInstance(configuration,this.getClass().getSimpleName());
} catch (IOException e) {
e.printStackTrace();
}
try {
//3. Set the input path
Path source_path = new Path(args[0]);
FileInputFormat.addInputPath(job, source_path);
//4. Set the output path
Path des_path = new Path(args[1]);
FileOutputFormat.setOutputPath(job, des_path);
} catch (IllegalArgumentException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
//Package the job into a jar for execution
job.setJarByClass(MapReduceJoin.class);
//5. Configure the map phase
job.setMapperClass(map.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(DataJoin.class);
//================shuffle========================
//1. Partitioner
// job.setPartitionerClass(MyPartitioner.class);
//2. Sort comparator
// job.setSortComparatorClass(cls);
//3. Grouping comparator
// job.setGroupingComparatorClass(MyGroup.class);
//4. Optional combiner, a map-side "reduce" used as an optimization
// job.setCombinerClass(Combiner.class);
//Number of reduce tasks
// job.setNumReduceTasks(2);
//================shuffle========================
//6. Configure the reduce phase
job.setReducerClass(reduce.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
//7. Submit the job to YARN
boolean isSuccess = false;
try {
isSuccess = job.waitForCompletion(true);
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
e.printStackTrace();
}
return isSuccess?0:1;
}
//Main entry point
public static void main(String[] args) {
Configuration configuration = new Configuration();
//1. Input and output paths
String[] args1 = new String[] {
"hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/input",
"hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/output"
};
//2. Set the user that runs the job
System.setProperty("HADOOP_USER_NAME", "beifeng");
//3. Run the job
int status = 0;
try {
status = ToolRunner.run(configuration, new MapReduceJoin(), args1);
} catch (Exception e) {
e.printStackTrace();
}
// int status = new MyWordCountMapReduce().run(args1);
//4. Exit with the job status
System.exit(status);
}
}
Custom Writable wrapper class:
package com.ibeifeng.mapreduce.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class DataJoin implements Writable{
private String tag;
private String data;
public String getTag() {
return tag;
}
public String getData() {
return data;
}
public void set(String tag,String data) {
this.tag = tag;
this.data = data;
}
@Override
public String toString() {
return tag+","+data;
}
public void write(DataOutput out) throws IOException {
out.writeUTF(this.tag);
out.writeUTF(this.data);
}
public void readFields(DataInput in) throws IOException {
this.tag = in.readUTF();
this.data = in.readUTF();
}
}
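The tag/data pair survives the shuffle because DataJoin follows the Writable contract: write() serializes both fields and readFields() restores them in the same order. The following stand-alone round-trip check is a minimal illustrative sketch (the class name and sample values are made up; it is not part of the job itself):
package com.ibeifeng.mapreduce.join;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class DataJoinRoundTrip {
public static void main(String[] args) throws IOException {
DataJoin original = new DataJoin();
original.set("Customer", "1,Tom,13512345678");
//serialize the fields the same way the shuffle does
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
original.write(new DataOutputStream(bytes));
//deserialize into a fresh instance and print it
DataJoin copy = new DataJoin();
copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
System.out.println(copy.toString());
}
}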
Prepare the test data, two csv files, as follows:
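The original files are not reproduced here; based on the parsing logic in the mapper, the customer file has three columns (cid,cname,cphone) and the order file has four (cid,oname,oprice,otime). The rows below are purely hypothetical placeholders that only illustrate that layout:
customers.csv (hypothetical):
1,Tom,13512345678
2,Jack,13698765432
orders.csv (hypothetical):
1,iphone7,6999,2017-01-01
1,macbook,12999,2017-01-02
2,ipad,3999,2017-01-03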
Upload the csv files to HDFS, package the code into a jar, and run the following command:
bin/yarn jar datas/mapreduce_join.jar /user/beifeng/wordcount/input/ /user/beifeng/wordcount/output
The result is one joined line per order, in the form cid,cname,cphone,cid,oname,oprice,otime:
1.2 Map join
Map join is suitable when one of the two datasets is small. The idea is to load the small dataset entirely into memory and index it by the join key. The large dataset is then fed to the map tasks as normal input, and every call to map() looks the incoming record up in the in-memory index to perform the join, writing the joined records out keyed as needed. This approach uses Hadoop's DistributedCache to ship the small dataset to every compute node: each node running a map task loads it into memory and builds the index on the join key.
Here we assume that Customer is the small table and Orders is the large one, which matches a typical production scenario.
The code below demonstrates how to use this distributed cache.
Main entry point:
package com.ibeifeng.mapreduce.join;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MapJoin extends Configured implements Tool{
//Path of the file to be read from the distributed cache
private static String cacheFile = "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/input1/customers.csv";
//Mapper class template
public static class map extends Mapper<LongWritable, Text, NullWritable, Text>{
private Text outputValue = new Text();
Map<Integer,Customer> map = null;
@Override
protected void setup(Context context)throws IOException, InterruptedException {
//Read the customer file (opened here directly from HDFS rather than from the localized cache copy)
FileSystem fs = FileSystem.get(URI.create(cacheFile),context.getConfiguration());
FSDataInputStream fdis = fs.open(new Path(cacheFile));
BufferedReader br = new BufferedReader(new InputStreamReader(fdis));
//HashMap holding the customer records, keyed by cid
map = new HashMap<Integer,Customer>();
String line = null;
while((line = br.readLine())!=null) {
String[] split = line.split(",");
Customer customer = new Customer(Integer.parseInt(split[0]), split[1], split[2]);
map.put(customer.getCid(),customer);
}
//Close the stream
br.close();
}
@Override
protected void map(LongWritable key, Text values, Context context)
throws IOException, InterruptedException {
//Join each order record with its customer record
String str = values.toString();
String[] Orders = str.split(",");
int joinID = Integer.valueOf(Orders[0]);
Customer customerid = map.get(joinID);
StringBuffer sbf = new StringBuffer();
sbf.append(Orders[0]).append(",")
.append(customerid.getCname()).append(",")
.append(customerid.getCphone()).append(",")
.append(Orders[1]).append(",")
.append(Orders[2]).append(",")
.append(Orders[3]);
outputValue.set(sbf.toString());
context.write(NullWritable.get(),outputValue);
}
}
//No reducer is needed
//Driver configuration
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
//Get the configuration object
Configuration configuration = new Configuration();
//Create the MapReduce job
Job job = Job.getInstance(configuration,this.getClass().getSimpleName());
//Register the file that will be loaded into memory with the distributed cache
job.addCacheFile(URI.create(cacheFile));
//Input path
Path source_path = new Path(args[0]);
//Output path
Path des_path = new Path(args[1]);
//FileSystem object for HDFS operations; delete the output path if it already exists
FileSystem fs = FileSystem.get(configuration);
if (fs.exists(des_path)) {
fs.delete(des_path,true);
}
FileInputFormat.addInputPath(job, source_path);
FileOutputFormat.setOutputPath(job, des_path);
//Package the job into a jar for execution
job.setJarByClass(MapJoin.class);
//Configure the map phase
job.setMapperClass(map.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(Text.class);
//Set the number of reduce tasks to 0, i.e. no reduce phase and no shuffle phase
job.setNumReduceTasks(0);
//Submit the job to YARN
boolean isSuccess = job.waitForCompletion(true);
return isSuccess?0:1;
}
//Main entry point
public static void main(String[] args) {
Configuration configuration = new Configuration();
//1. Input and output paths
String[] args1 = new String[] {
"hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/input",
"hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/wordcount/output"
};
//2. Set the user that runs the job
System.setProperty("HADOOP_USER_NAME", "beifeng");
//3. Run the job
int status = 0;
try {
status = ToolRunner.run(configuration, new MapJoin(), args1);
} catch (Exception e) {
e.printStackTrace();
}
// int status = new MyWordCountMapReduce().run(args1);
//4. Exit with the job status
System.exit(status);
}
}
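Note that setup() above re-opens customers.csv from HDFS through its absolute path, even though run() also registers the file with job.addCacheFile(). A minimal sketch of reading the localized copy instead, assuming the default behaviour that a file added via addCacheFile() is symlinked into the task's working directory under its own file name, could look like this (it would also need import java.io.FileReader):
@Override
protected void setup(Context context) throws IOException, InterruptedException {
map = new HashMap<Integer,Customer>();
//open the localized copy by name; assumes the symlink is called customers.csv
BufferedReader br = new BufferedReader(new FileReader("customers.csv"));
String line = null;
while((line = br.readLine())!=null) {
String[] split = line.split(",");
map.put(Integer.parseInt(split[0]), new Customer(Integer.parseInt(split[0]), split[1], split[2]));
}
br.close();
}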
The Customer helper class:
package com.ibeifeng.mapreduce.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Customer implements Writable{
private int cid;
private String cname;
private String cphone;
public int getCid() {
return cid;
}
public void setCid(int cid) {
this.cid = cid;
}
public String getCname() {
return cname;
}
public void setCname(String cname) {
this.cname = cname;
}
public String getCphone() {
return cphone;
}
public void setCphone(String cphone) {
this.cphone = cphone;
}
public Customer(int cid, String cname, String cphone) {
super();
this.cid = cid;
this.cname = cname;
this.cphone = cphone;
}
public void write(DataOutput out) throws IOException {
out.writeInt(this.cid);
out.writeUTF(this.cname);
out.writeUTF(this.cphone);
}
public void readFields(DataInput in) throws IOException {
this.cid = in.readInt();
this.cname = in.readUTF();
this.cphone = in.readUTF();
}
@Override
public String toString() {
return "Customer [cid=" + cid + ", cname=" + cname + ", cphone=" + cphone + "]";
}
}
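Customer implements Writable but is only ever used in the in-memory HashMap, so Hadoop never instantiates it by reflection in this job. If it were ever used as a map or reduce value type, Hadoop would create instances through a no-argument constructor before calling readFields(); adding one is a one-line precaution (a sketch, not required by the code above):
//no-arg constructor, needed only if Hadoop ever creates Customer instances by reflection
public Customer() {
}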
Running the command bin/yarn jar datas/map_join.jar produces the same result: