一  前言

在很多时候,我们可能需要处理的不是一个单独的文件,而是几个有关联的文件,比如账户信息和订单信息=>

账户信息:customerId name address telephone

订单信息:orderId customerId price productName

我们很可能就需要用到这2个文件,并且他们的关系通过用户id进行关联或者join.

 

两个文件的关联点作为key,后面的字段作为value。

 

二 Map-Reduce join的三种方式

2.1 Reduce端Join: Join的操作在Reduce执行

2.2 Map端Join:

两个待连接的表,其中一个非常大,一个非常小,可以将小表直接存放于内存,通过DistributedCache实现分发。

2.3 半连接 SemiJoin: map端join和reduce端的join结合

 

三 Reduce端 Join

3.1场景:对于那种需要连接的文件大小差不多,且不需要过滤无效数据;这种情况效率比较低,特别是文件都很大的时候,还要经历shuffle过程。

 

3.2实现步骤:

#一般情况下,我们需要自定义key的类型,提供区别两种数据的一个标记
public class ReduceSideJoin extends Configured implements  Tool
public static class JoinWritable implements  Writable
      private String  tag;
      private String  data;
 
      public JoinWritable() {
 
      }
 
      public JoinWritable(String tag, String data) {
           this.tag = tag;
           this.data = data;
      }
 
      public void write(DataOutput out) throws IOException {
out.writeUTF(getTag());
out.writeUTF(getData());
      }
 
      public void readFields(DataInput in) throws IOException {
           this.setTag(in.readUTF());
           this.setData(in.readUTF());
      }
 
      public void set(String tag, String data) {
           this.setTag(tag);
           this.setData(data);
      }
 
 @Override
      public int hashCode() {
           final int prime = 31;
           int result = 1;
result = prime * result + ((data == null) ?  0 : data.hashCode());
result = prime * result + ((tag == null) ?  0 : tag.hashCode());
           return  result;
      }
 
 @Override
      public boolean equals(Object obj) {
           if (this == obj)
                 return true;
           if (obj == null)
                 return false;
           if (getClass() != obj.getClass())
                 return false;
           JoinWritable other = (JoinWritable)  obj;
           if (data == null) {
                 if (other.data != null)
                      return false;
           } else if (!data.equals(other.data))
                 return false;
           if (tag == null) {
                 if (other.tag != null)
                      return false;
           } else if (!tag.equals(other.tag))
                 return false;
           return true;
      }
 
 @Override
      public String  toString() {
           return  tag+","+data;
      }
 
      public String  getTag() {
           return  tag;
      }
 
      public void setTag(String tag) {
           this.tag = tag;
      }
 
      public String  getData() {
           return  data;
      }
 
      public void setData(String data) {
           this.data = data;
      }
}
 
public static class JoinMapper extends Mapper<LongWritable, Text, LongWritable, JoinWritable> {

    // Reused across map() calls to avoid allocating one pair per record.
    private LongWritable outputKey = new LongWritable();
    private JoinWritable outputValue = new JoinWritable();

    /**
     * Tags each CSV input line by its source and emits it keyed by customer id.
     *
     * <p>Line layout (inferred from the field counts below — TODO confirm
     * against the input files): 3 fields = customer record
     * {@code cid,name,phone}; 4 fields = order record
     * {@code cid,name,price,date}. Anything else is dropped.
     *
     * <p>FIX: the original called {@code Long.valueOf(fields[0])} unguarded, so
     * a single malformed key line would throw NumberFormatException and fail
     * the whole task; malformed keys are now skipped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        if (ArrayUtils.isEmpty(fields)) {
            return;
        }
        if (fields.length != 3 && fields.length != 4) {
            return;
        }

        long cid;
        try {
            cid = Long.parseLong(fields[0]);
        } catch (NumberFormatException e) {
            // Malformed join key: skip the record rather than fail the task.
            return;
        }
        outputKey.set(cid);

        String name = fields[1];
        if (fields.length == 3) {
            String phone = fields[2];
            outputValue.set("customer", name + "," + phone);
        } else {
            String price = fields[2];
            String date = fields[3];
            outputValue.set("order", name + "," + price + "," + date);
        }
        context.write(outputKey, outputValue);
    }
}
 
public static class JoinReduce extends Reducer<LongWritable, JoinWritable, NullWritable, Text> {

    // Reused output value to avoid per-row allocation.
    private Text outputValue = new Text();

    /**
     * Joins the customer record with every order record sharing the same key,
     * emitting one "cid,customerInfo,orderInfo" line per order.
     *
     * <p>FIX: the original emitted a literal "null" customer column for orders
     * whose key had no matching customer record; such groups are now skipped
     * (inner-join semantics).
     */
    @Override
    protected void reduce(LongWritable key, Iterable<JoinWritable> values, Context context)
            throws IOException, InterruptedException {
        String customerInfo = null;
        List<String> orderList = new ArrayList<String>();
        // One pass over the group: remember the customer row, buffer the orders.
        for (JoinWritable value : values) {
            if ("customer".equals(value.getTag())) {
                customerInfo = value.getData();
            } else if ("order".equals(value.getTag())) {
                orderList.add(value.getData());
            }
        }

        if (customerInfo == null) {
            // No customer side for this key: nothing to join.
            return;
        }

        for (String order : orderList) {
            outputValue.set(key.get() + "," + customerInfo + "," + order);
            context.write(NullWritable.get(), outputValue);
        }
    }
}
 
/**
 * Configures and submits the reduce-side join job.
 *
 * @param args args[0] = input directory, args[1] = output directory
 * @return 0 on success, 1 on job failure, 2 on bad arguments
 *
 * <p>FIX: the original called {@code System.exit(2)} from inside run(),
 * bypassing the {@link Tool} exit-code contract; we now return the code and
 * let main()/ToolRunner exit. Also {@code waitForCompletion(true)} instead
 * of needlessly boxing {@code Boolean.TRUE}.
 */
public int run(String[] args) throws Exception {
    if (ArrayUtils.isEmpty(args) || args.length < 2) {
        return 2;
    }
    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, this.getClass().getSimpleName());

    job.setJarByClass(ReduceSideJoin.class);

    Path in = new Path(args[0]);
    FileInputFormat.addInputPath(job, in);

    Path out = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, out);

    job.setMapperClass(JoinMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(JoinWritable.class);

    job.setReducerClass(JoinReduce.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    // verbose=true: stream job progress to the client.
    return job.waitForCompletion(true) ? 0 : 1;
}
 
/**
 * Entry point. When no arguments are supplied, falls back to demo HDFS paths;
 * the random suffix keeps repeated demo runs from colliding on the output dir.
 */
public static void main(String[] args) throws Exception {
    if (args == null || args.length == 0) {
        int num = new Random().nextInt(1000);
        args = new String[] {
            "hdfs://hdfs-cluster/user/hadoop/input/join",
            "hdfs://hdfs-cluster/user/hadoop/output/join" + num
        };
    }

    int code = ToolRunner.run(new Configuration(), new ReduceSideJoin(), args);
    System.exit(code);
}
}

四 Map端Join

4.1场景

如果需要连接的文件,一个比较大,一个比较小。这时候我们是比较适合在Map 端做join操作的。因为我们知道在Reduce端做join操作效率是比较低下的。

 

在Map阶段join会在数据达到map函数之前取出来,放在内存里的,以便task运行的时候可以去取,所以必须保证数据量不是很大,否则内存会撑不住。

我们要取文件需要通过DistributedCache来实现的,它会把我们上传的文件分发到各个节点一份,从而保证每个Mapper在工作的时候都能从本地缓存目录去取出这个文件。

4.2实现

#setup 阶段去读取文件,然后放入内存
#map阶段进行数据合并
#在提交job的时候,需要将这个文件路径放入分布式缓存
public class MapSideJoin extends Configured implements Tool {

    /**
     * In-memory representation of one row of the small (cached) customer
     * table: {@code cid,name,phone}.
     *
     * <p>IMPROVEMENT: fields are now {@code final} — the class has no setters,
     * so making it explicitly immutable documents intent and is safe for the
     * shared lookup map built in the mapper's setup().
     */
    public static class CustomerInfo {

        private final long cid;
        private final String name;
        private final String phone;

        /** Kept for backward compatibility; yields an all-default instance. */
        public CustomerInfo() {
            this(0L, null, null);
        }

        public CustomerInfo(long cid, String name, String phone) {
            this.cid = cid;
            this.name = name;
            this.phone = phone;
        }

        public long getCid() {
            return cid;
        }

        public String getName() {
            return name;
        }

        public String getPhone() {
            return phone;
        }
    }
 
public static class MapSideJoinMapper extendsMapper<LongWritable, Text, LongWritable, Text> {
      private Map<Long, CustomerInfo> customerInfos = new HashMap<Long, CustomerInfo>();
      private LongWritable  outputKey = new  LongWritable();
      private Text  outputValue = new  Text();
 
 @Override
      protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable,Text>.Context context)
                 throws IOException, InterruptedException {
           if (value == null) {
                 return;
           }
 
           String[] fields =  value.toString().split(",");
           if (ArrayUtils.isEmpty(fields) || fields.length < 4) {
                 return;
           }
 
           if (customerInfos.size() == 0) {
                 return;
           }
           try {
                 long cid = Long.valueOf(fields[0]);
                 CustomerInfo cInfo =  customerInfos.get(cid);
                 if (cInfo == null) {
                      return;
                 }
                 StringBuilder builder = new StringBuilder();
builder.append(cid).append("=>").append(cInfo.getName()).append("\t").append(cInfo.getPhone())
append("\t").append(fields[1]).append("\t").append(fields[2]).append("\t").append(fields[3]);
outputKey.set(cid);
outputValue.set(builder.toString());
context.write(outputKey, outputValue);
           } catch (NumberFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
           }
      }
 
 @Override
      protected void setup(Mapper<LongWritable, Text, LongWritable,Text>.Context context)
                 throws IOException, InterruptedException {
           Configuration conf =  context.getConfiguration();
// 获取小表中的缓存文件的URI
           URI[] localCacheFiles =  context.getCacheFiles();
           if (ArrayUtils.isEmpty(localCacheFiles)) {
                 return;
           }
// 通过URI构造HDFS 路径
           Path path = new Path(localCacheFiles[0]);
// 获取文件系统
FileSystem fs = FileSystem.get(conf);
// 打开文件
           FSDataInputStream in =  fs.open(path);
           if (in == null) {
                 return;
           }
           InputStreamReader isr = new InputStreamReader(in);
           BufferedReader br = new BufferedReader(isr);
           String line = null;
           String[] fields = null;
           CustomerInfo info = null;
           while ((line = br.readLine()) != null) {
fields = line.split(",");
                 if (ArrayUtils.isEmpty(fields) || fields.length != 3) {
                      continue;
                 }
info = new CustomerInfo(Long.valueOf(fields[0]), fields[1],  fields[2]);
customerInfos.put(Long.valueOf(fields[0]), info);
           }
           if (br != null) {
br.close();
           }
           if (isr != null) {
isr.close();
           }
      }
}
 
public static class MapSideJoinReducer extendsReducer<LongWritable, Text, NullWritable, Text> {
 @Override
      protected void reduce(LongWritable  key, Iterable<Text>  values,
Context context) throws IOException, InterruptedException {
           for (Text value : values) {
context.write(NullWritable.get(), value);
           }
      }
}
 
/**
 * Configures and submits the map-side join job.
 *
 * @param args args[0] = input dir, args[1] = output dir,
 *             args[2] = small-table file to distribute via the cache
 * @return 0 on success, 1 on job failure, 3 on bad arguments
 *
 * <p>FIX: return the error code instead of calling {@code System.exit(3)}
 * inside run(); pass {@code true} to waitForCompletion instead of boxing
 * {@code Boolean.TRUE}.
 */
public int run(String[] args) throws Exception {
    if (ArrayUtils.isEmpty(args) || args.length < 3) {
        return 3;
    }

    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, this.getClass().getSimpleName());

    job.setJarByClass(MapSideJoin.class);

    Path in = new Path(args[0]);
    FileInputFormat.addInputPath(job, in);

    Path out = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, out);

    // Register the small table so every mapper can read it from its local cache.
    Path cacheURL = new Path(args[2]);
    job.addCacheFile(cacheURL.toUri());

    job.setMapperClass(MapSideJoinMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(MapSideJoinReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
 
/**
 * Entry point. Without arguments, falls back to demo HDFS paths; the random
 * suffix keeps repeated demo runs from colliding on the output directory.
 */
public static void main(String[] args) throws Exception {
    if (args == null || args.length == 0) {
        int num = new Random().nextInt(1000);
        args = new String[] {
            "hdfs://hdfs-cluster/user/hadoop/input/join",
            "hdfs://hdfs-cluster/user/hadoop/output/join" + num,
            "hdfs://hdfs-cluster/user/hadoop/cache/customers.csv"
        };
    }

    int code = ToolRunner.run(new Configuration(), new MapSideJoin(), args);
    System.exit(code);
}
}

五 Semi-Join(半连接)

如果我们需要过滤很多无效数据,但是在Reduce阶段才来过滤,显然并不是一件很好的方法,因为我们让这些数据经历了shuffle过程,这会涉及到很多磁盘I/O 和网络I/O操作。

 

如果我们在Map阶段就把这些数据过滤了,那么这部分数据就不会经历shuffle阶段,从而有助于性能提升。

5.1实现

利用DistributedCache将小表分发到各个节点上,在Map过程的setup()函数里,读取缓存里的文件,只将小表的连接键存储在hashSet中。

 

在map()函数执行时,对每一条数据进行判断,如果这条数据的连接键为空或者在hashSet里不存在,那么则认为这条数据无效,使这条数据不参与reduce的过程。

 

需要注意小表的key集合是否过大,如果过大,内存会不够用。

 

 

 

public class SemiJoin extends Configured implements  Tool
public static class JoinWritable implements  Writable
      private String  tag;
      private String  data;
 
      public JoinWritable() {
 
      }
 
      public JoinWritable(String tag, String data) {
           this.tag = tag;
           this.data = data;
      }
 
      public void write(DataOutput out) throws IOException {
out.writeUTF(getTag());
out.writeUTF(getData());
      }
 
      public void readFields(DataInput in) throws IOException {
           this.setTag(in.readUTF());
           this.setData(in.readUTF());
      }
 
      public void set(String tag, String data) {
           this.setTag(tag);
           this.setData(data);
      }
 
 @Override
      public int hashCode() {
           final int prime = 31;
           int result = 1;
result = prime * result + ((data == null) ?  0 : data.hashCode());
result = prime * result + ((tag == null) ?  0 : tag.hashCode());
           return  result;
      }
 
 @Override
      public boolean equals(Object obj) {
           if (this == obj)
                 return true;
           if (obj == null)
                 return false;
           if (getClass() != obj.getClass())
                 return false;
           JoinWritable other = (JoinWritable)  obj;
           if (data == null) {
                 if (other.data != null)
                      return false;
           } else if (!data.equals(other.data))
                 return false;
           if (tag == null) {
                 if (other.tag != null)
                      return false;
           } else if (!tag.equals(other.tag))
                 return false;
           return true;
      }
 
 @Override
      public String  toString() {
           return  tag + "," +  data;
      }
 
      public String  getTag() {
           return  tag;
      }
 
      public void setTag(String tag) {
           this.tag = tag;
      }
 
      public String  getData() {
           return  data;
      }
 
      public void setData(String data) {
           this.data = data;
      }
}
 
public static class SemiJoinMapper extendsMapper<LongWritable, Text, LongWritable, JoinWritable> {
      private Set<Long> keySet = new HashSet<Long>();
      private LongWritable  outputKey = new  LongWritable();
      private JoinWritable  outputValue = new  JoinWritable();
 
 @Override
      protected void map(LongWritable key, Text value,
Context context)
                 throws IOException, InterruptedException {
           if (value == null) {
                 return;
           }
 
           String[] fields =  value.toString().split(",");
           if (ArrayUtils.isEmpty(fields) || fields.length < 3) {
                 return;
           }
 
           try {
                 long cid = Long.valueOf(fields[0]);
                 if (!keySet.contains(cid)) {
                      return;
                 }
outputKey.set(cid);
 
                 String name =  fields[1];
                 if (fields.length == 3) {
                      String phone =  fields[2];
outputValue.set("customer", name + "\t" + phone);
                 }
 
                 if (fields.length == 4) {
                      String price =  fields[2];
                      String date =  fields[3];
outputValue.set("order", name + "\t" + price + "\t" + date);
                 }
context.write(outputKey, outputValue);
           } catch (NumberFormatException e) {
// TODO Auto-generated catch block
e.printStackTrace();
           }
      }
 
 @Override
      protected void setup(Mapper<LongWritable, Text, LongWritable,JoinWritable>.Context context)
                 throws IOException, InterruptedException {
           Configuration conf =  context.getConfiguration();
// 获取小表中的缓存文件的URI
           URI[] localCacheFiles =  context.getCacheFiles();
           if (ArrayUtils.isEmpty(localCacheFiles)) {
                 return;
           }
// 通过URI构造HDFS 路径
           Path path = new Path(localCacheFiles[0]);
// 获取文件系统
FileSystem fs = FileSystem.get(conf);
// 打开文件
           FSDataInputStream in =  fs.open(path);
           if (in == null) {
                 return;
           }
            InputStreamReaderisr = new  InputStreamReader(in);
           BufferedReader br = new BufferedReader(isr);
           String line = null;
           String[] fields = null;
           while ((line = br.readLine()) != null) {
fields = line.split(",");
                 if (ArrayUtils.isEmpty(fields) || fields.length != 3) {
                      continue;
                 }
keySet.add(Long.valueOf(fields[0]));
           }
           if (br != null) {
br.close();
           }
           if (isr != null) {
isr.close();
           }
      }
}
 
public static class SemiJoinReducer extendsReducer<LongWritable, JoinWritable, NullWritable, Text> {
      private Text  outputValue = new  Text();
 
 @Override
      protected void reduce(LongWritable  key, Iterable<JoinWritable>  values,
Context context)
                 throws IOException, InterruptedException {
List<String> orderList = new  ArrayList<String>();
           String customerInfo = null;
           for (JoinWritable value : values) {
                 if ("customer".equals(value.getTag())) {
customerInfo = value.getData();
                 } else if ("order".equals(value.getTag())) {
orderList.add(value.getData());
                 }
           }
 
           for (String order : orderList) {
outputValue.set(key.get() + "\t" + customerInfo + "\t" + order);
context.write(NullWritable.get(), outputValue);
           }
      }
}
 
/**
 * Configures and submits the semi-join job.
 *
 * @param args args[0] = input dir, args[1] = output dir,
 *             args[2] = small-table file to distribute via the cache
 * @return 0 on success, 1 on job failure, 3 on bad arguments
 *
 * <p>FIX: return the error code instead of calling {@code System.exit(3)}
 * inside run(); pass {@code true} to waitForCompletion instead of boxing
 * {@code Boolean.TRUE}.
 */
public int run(String[] args) throws Exception {
    if (ArrayUtils.isEmpty(args) || args.length < 3) {
        return 3;
    }

    Configuration conf = this.getConf();
    Job job = Job.getInstance(conf, this.getClass().getSimpleName());

    job.setJarByClass(SemiJoin.class);

    Path in = new Path(args[0]);
    FileInputFormat.addInputPath(job, in);

    Path out = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, out);

    // Register the small table so every mapper can load its keys in setup().
    Path cacheURL = new Path(args[2]);
    job.addCacheFile(cacheURL.toUri());

    job.setMapperClass(SemiJoinMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(JoinWritable.class);

    job.setReducerClass(SemiJoinReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
 
public static void main(String[] args) throws Exception {
      int num = new  Random().nextInt(1000);
      if (args == null ||  args.length ==  0) {
args = new String[] { "hdfs://hdfs-cluster/user/hadoop/input/join",
"hdfs://hdfs-cluster/user/hadoop/output/join" + num,
"hdfs://hdfs-cluster/user/hadoop/cache/customers.csv"
      }
 
      Configuration conf = new Configuration();
      int code = ToolRunner.run(conf, new  SemiJoin(), args);
      System.exit(code);
}
}