This follows a blog post: https://www.liangzl.com/get-article-detail-131008.html
The idea is to define a single Writable bean that can carry the fields of both tables,
load all input files at once via FileInputFormat.setInputPaths(job, input),
and tell which table a record belongs to by the name of the file it came from.
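
For concreteness, the input could look like the following (hypothetical sample data, not from the original post; the field layout is what the mapper below assumes: order files hold orderId,userId and user files hold userId,userName,age,friend):

order.txt
order001,u001
order002,u001
order003,u002

user.txt
u001,Alice,25,Bob
u002,Carol,30,Dave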
JoinBean
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// A Writable bean that carries the fields of both tables; tableName marks
// which table a record came from ("order" or "user").
public class JoinBean implements Writable {
    private String orderId;
    private String userId;
    private String userName;
    private int userAge;
    private String userFriend;
    private String tableName;

    public void set(String orderId, String userId, String userName, int userAge,
                    String userFriend, String tableName) {
        this.orderId = orderId;
        this.userId = userId;
        this.userName = userName;
        this.userAge = userAge;
        this.userFriend = userFriend;
        this.tableName = tableName;
    }

    public String getOrderId() { return orderId; }
    public void setOrderId(String orderId) { this.orderId = orderId; }

    public String getUserId() { return userId; }
    public void setUserId(String userId) { this.userId = userId; }

    public String getUserName() { return userName; }
    public void setUserName(String userName) { this.userName = userName; }

    public int getUserAge() { return userAge; }
    public void setUserAge(int userAge) { this.userAge = userAge; }

    public String getUserFriend() { return userFriend; }
    public void setUserFriend(String userFriend) { this.userFriend = userFriend; }

    public String getTableName() { return tableName; }
    public void setTableName(String tableName) { this.tableName = tableName; }

    @Override
    public String toString() {
        return this.orderId + "," + this.userId + "," + this.userAge
                + "," + this.userName + "," + this.userFriend;
    }

    // Serialization used by Hadoop between map and reduce;
    // the field order here must match readFields() exactly.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(this.orderId);
        dataOutput.writeUTF(this.userId);
        dataOutput.writeUTF(this.userName);
        dataOutput.writeInt(this.userAge);
        dataOutput.writeUTF(this.userFriend);
        dataOutput.writeUTF(this.tableName);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.orderId = dataInput.readUTF();
        this.userId = dataInput.readUTF();
        this.userName = dataInput.readUTF();
        this.userAge = dataInput.readInt();
        this.userFriend = dataInput.readUTF();
        this.tableName = dataInput.readUTF();
    }
}
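
Because Hadoop serializes the bean between the map and reduce phases, write() and readFields() must stay in sync. A quick local sanity check (a minimal sketch, not part of the original post; the class name JoinBeanRoundTrip is made up) is a round trip through a byte buffer:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class JoinBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        JoinBean in = new JoinBean();
        in.set("order001", "u001", "NULL", -1, "NULL", "order");

        // Serialize with write(), as Hadoop would when shuffling map output.
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));

        // Deserialize into a fresh bean with readFields().
        JoinBean out = new JoinBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        System.out.println(out); // expect: order001,u001,-1,NULL,NULL
    }
}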
ReduceSideJoin
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
public class ReduceSideJoin {
    // Hadoop creates one Mapper instance per input split, calls setup() once,
    // then calls map() for every record in the split.
    public static class ReduceSideJoinMapper extends Mapper<LongWritable, Text, Text, JoinBean> {

        String fileName = null;
        JoinBean bean = new JoinBean();
        Text k = new Text();

        /**
         * setup() runs exactly once, before any map() call; we use it to
         * record which input file this split belongs to.
         * @param context
         * @throws IOException
         * @throws InterruptedException
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            fileName = inputSplit.getPath().getName();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split(",");
            if (fileName.startsWith("order")) { // tell the tables apart by file name
                bean.set(fields[0], fields[1], "NULL", -1, "NULL", "order");
            } else {
                bean.set("NULL", fields[0], fields[1], Integer.parseInt(fields[2]), fields[3], "user");
            }
            // Key by userId so matching order and user records meet in one reduce call.
            k.set(bean.getUserId());
            context.write(k, bean);
        }
    }
    public static class ReducerSideJoinReduce extends Reducer<Text, JoinBean, JoinBean, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<JoinBean> beans, Context context) throws IOException, InterruptedException {
            ArrayList<JoinBean> orderList = new ArrayList<JoinBean>();
            JoinBean userBean = null;
            try {
                // Separate the two kinds of records. Hadoop reuses the bean object
                // across iterations, so each record must be copied out.
                for (JoinBean bean : beans) {
                    if ("order".equals(bean.getTableName())) {
                        JoinBean newBean = new JoinBean();
                        BeanUtils.copyProperties(newBean, bean);
                        orderList.add(newBean);
                    } else {
                        userBean = new JoinBean();
                        BeanUtils.copyProperties(userBean, bean);
                    }
                }
                // Join: copy the user fields into every order record with this userId.
                // Guard against orders that have no matching user record.
                if (userBean != null) {
                    for (JoinBean bean : orderList) {
                        bean.setUserAge(userBean.getUserAge());
                        bean.setUserFriend(userBean.getUserFriend());
                        bean.setUserName(userBean.getUserName());
                        context.write(bean, NullWritable.get());
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Locate the jar that contains this class.
        job.setJarByClass(ReduceSideJoin.class);
        // Mapper and Reducer implementations for this job.
        job.setMapperClass(ReduceSideJoinMapper.class);
        job.setReducerClass(ReducerSideJoinReduce.class);
        // Key/value types emitted by the Mapper.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);
        // Key/value types emitted by the Reducer.
        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        Path input = new Path("/home/tqc/Desktop/mrjoin/input");
        Path output = new Path("/home/tqc/Desktop/mrjoin/output");
        // Delete the output directory if it exists, or the job will refuse to start.
        FileSystem fs = output.getFileSystem(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : -1);
    }
}
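
Since the input and output paths are hard-coded local paths, the job presumably runs under the local job runner. Assuming the classes are packaged into a jar (the jar name below is made up), it can be launched with the standard hadoop jar command:

hadoop jar mrjoin.jar ReduceSideJoin

With the hypothetical sample input shown earlier, the output file part-r-00000 would contain one joined line per order, in the format produced by JoinBean.toString():

order001,u001,25,Alice,Bob
order002,u001,25,Alice,Bob
order003,u002,30,Carol,Dave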