1、map端join算法实现
1.1、原理阐述
适用于关联表中有小表的情形;
可以将小表分发到所有的map节点,这样,map节点就可以在本地对自己所读到的大表数据进行join并输出最终结果,可以大大提高join操作的并发度,加快处理速度。
1.2、实现示例
先在mapper类中预先定义好小表,进行join。
引入实际场景中的解决方案:一次加载数据库或者用distributedcache。
public class TestDistributedCache {
static class TestDistributedCacheMapper extends Mapper<LongWritable, Text, Text, Text>{
FileReader in = null;
BufferedReader reader = null;
HashMap<String,String> b_tab = new HashMap<String, String>();
String localpath =null;
String uirpath = null;
//是在map任务初始化的时候调用一次
@Override
protected void setup(Context context) throws IOException, InterruptedException {
//通过这几句代码可以获取到cache file的本地绝对路径,测试验证用
Path[] files = context.getLocalCacheFiles();
localpath = files[0].toString();
URI[] cacheFiles = context.getCacheFiles();
//缓存文件的用法——直接用本地IO来读取
//这里读的数据是map task所在机器本地工作目录中的一个小文件
in = new FileReader("b.txt");
reader =new BufferedReader(in);
String line =null;
while(null!=(line=reader.readLine())){
String[] fields = line.split(",");
b_tab.put(fields[0],fields[1]);
}
IOUtils.closeStream(reader);
IOUtils.closeStream(in);
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//这里读的是这个map task所负责的那一个切片数据(在hdfs上)
String[] fields = value.toString().split("\t");
String a_itemid = fields[0];
String a_amount = fields[1];
String b_name = b_tab.get(a_itemid);
// 输出结果 1001 98.9 banan
context.write(new Text(a_itemid), new Text(a_amount + "\t" + ":" + localpath + "\t" +b_name ));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(TestDistributedCache.class);
job.setMapperClass(TestDistributedCacheMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
//这里是我们正常的需要处理的数据所在路径
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//不需要reducer
job.setNumReduceTasks(0);
//分发一个文件到task进程的工作目录
job.addCacheFile(new URI("hdfs://hadoop-server01:9000/cachefile/b.txt"));
//分发一个归档文件到task进程的工作目录
// job.addArchiveToClassPath(archive);
//分发jar包到task节点的classpath下
// job.addFileToClassPath(jarfile);
job.waitForCompletion(true);
}
}
2、reduce端join算法实现
2.1、需求
订单数据表t_order:
id | date | pid | amount |
1001 | 20150710 | P0001 | 2 |
1002 | 20150710 | P0001 | 3 |
1002 | 20150710 | P0002 | 3 |
商品信息表t_product
id | pname | category_id | price |
P0001 | 小米5 | 1000 | 2 |
P0002 | 锤子T1 | 1000 | 3 |
假如数据量巨大,两表的数据是以文件的形式存储在HDFS中,需要用mapreduce程序来实现一下SQL查询运算:
select a.id,a.date,b.name,b.category_id,b.price from t_order a join t_product b on a.pid = b.id
2.2、实现机制
通过将关联的条件作为map输出的key,将两表满足join条件的数据并携带数据所来源的文件信息,发往同一个reduce task,在reduce中进行数据的串联。
public class RJoin {
static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
InfoBean bean = new InfoBean();
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
FileSplit inputSplit = (FileSplit) context.getInputSplit();
String name = inputSplit.getPath().getName();
// 通过文件名判断是哪种数据
String pid = "";
if (name.startsWith("order")) {
String[] fields = line.split(",");
// id date pid amount
pid = fields[2];
bean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]), "", 0, 0, "0");
} else {
String[] fields = line.split(",");
// id pname category_id price
pid = fields[0];
bean.set(0, "", pid, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), "1");
}
k.set(pid);
context.write(k, bean);
}
}
static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
@Override
protected void reduce(Text pid, Iterable<InfoBean> beans, Context context) throws IOException, InterruptedException {
InfoBean pdBean = new InfoBean();
ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();
for (InfoBean bean : beans) {
if ("1".equals(bean.getFlag())) {
try {
BeanUtils.copyProperties(pdBean, bean);
} catch (Exception e) {
e.printStackTrace();
}
} else {
InfoBean odbean = new InfoBean();
try {
BeanUtils.copyProperties(odbean, bean);
orderBeans.add(odbean);
} catch (Exception e) {
e.printStackTrace();
}
}
}
// 拼接两类数据形成最终结果
for (InfoBean bean : orderBeans) {
bean.setPname(pdBean.getPname());
bean.setCategory_id(pdBean.getCategory_id());
bean.setPrice(pdBean.getPrice());
context.write(bean, NullWritable.get());
}
}
}
public static void main(String[] args) throws Exception {
args = new String[]{"D:/srcdata/wgj/", "D:/temp/out3"};
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 指定本程序的jar包所在的本地路径
// job.setJarByClass(RJoin.class);
// job.setJar("c:/join.jar");
// 指定本业务job要使用的mapper/Reducer业务类
job.setMapperClass(RJoinMapper.class);
job.setReducerClass(RJoinReducer.class);
// 指定mapper输出数据的kv类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(InfoBean.class);
// 指定最终输出的数据的kv类型
job.setOutputKeyClass(InfoBean.class);
job.setOutputValueClass(NullWritable.class);
// 指定job的输入原始文件所在目录
FileInputFormat.setInputPaths(job, new Path(args[0]));
// 指定job的输出结果所在目录
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 将job中配置的相关参数,以及job所用的java类所在的jar包,提交给yarn去运行
/* job.submit(); */
boolean res = job.waitForCompletion(true);
System.exit(res ? 0 : 1);
}
}
public class InfoBean implements Writable {
private int order_id;
private String dateString;
private String p_id;
private int amount;
private String pname;
private int category_id;
private float price;
// flag=0表示这个对象是封装订单表记录
// flag=1表示这个对象是封装产品信息记录
private String flag;
public InfoBean() {
}
public void set(int order_id, String dateString, String p_id, int amount, String pname, int category_id, float price, String flag) {
this.order_id = order_id;
this.dateString = dateString;
this.p_id = p_id;
this.amount = amount;
this.pname = pname;
this.category_id = category_id;
this.price = price;
this.flag = flag;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(order_id);
out.writeUTF(dateString);
out.writeUTF(p_id);
out.writeInt(amount);
out.writeUTF(pname);
out.writeInt(category_id);
out.writeFloat(price);
out.writeUTF(flag);
}
@Override
public void readFields(DataInput in) throws IOException {
this.order_id = in.readInt();
this.dateString = in.readUTF();
this.p_id = in.readUTF();
this.amount = in.readInt();
this.pname = in.readUTF();
this.category_id = in.readInt();
this.price = in.readFloat();
this.flag = in.readUTF();
}
@Override
public String toString() {
return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag;
}
}
2.3、缺点
这种方式中,join的操作是在reduce阶段完成,reduce端的处理压力太大,map节点的运算负载则很低,资源利用率不高,且在reduce阶段极易产生数据倾斜。
解决方案: map端join实现方式。