案例总结目录
- 1. Reduce Join案例
- 2. Map Join案例
- 3. 数据清洗(ETL)
1. Reduce Join案例
- 需求:将下列两个表进行合并,订单中的pid经过合并之后编程pname
订单数据表t_order
id | pid | amount |
1001 | 01 | 1 |
1002 | 02 | 2 |
1003 | 03 | 3 |
1004 | 01 | 4 |
1005 | 02 | 5 |
1006 | 03 | 6 |
商品表:
pid | pname |
01 | 小米 |
02 | 华为 |
03 | 格力 |
合并后:
id | pname | amount |
1001 | 小米 | 1 |
1002 | 华为 | 2 |
1003 | 格力 | 3 |
1004 | 小米 | 4 |
1005 | 华为 | 5 |
1006 | 格力 | 6 |
- 需求分析:通过将关联条件作为Map输出的key(pid),将两表满足Join条件的数据并携带数据所来源的文件信息,发往同一个ReduceTask,在Reduce中进行数据的串联。
实现代码:
TableBean.java
public class TableBean implements Writable {
private String id; // 订单id
private String pid; // 商品id
private int amount; // 商品数量
private String pname;// 商品名称
private String flag; // 标记是什么表 order pd
// 空参构造
public TableBean() {
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(id);
out.writeUTF(pid);
out.writeInt(amount);
out.writeUTF(pname);
out.writeUTF(flag);
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readUTF();
this.pid = in.readUTF();
this.amount = in.readInt();
this.pname = in.readUTF();
this.flag = in.readUTF();
}
@Override
public String toString() {
// id pname amount
return id + "\t" + pname + "\t" + amount ;
}
}
TableMapper.java
public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
private String fileName;
private Text outK = new Text();
private TableBean outV = new TableBean();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// 初始化 order pd(获取文件的名称)
FileSplit split = (FileSplit) context.getInputSplit();
fileName = split.getPath().getName();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// 1 获取一行
String line = value.toString();
// 2 判断是哪个文件的
if (fileName.contains("order")){// 处理的是订单表
String[] split = line.split("\t");
// 封装k v
outK.set(split[1]);
outV.setId(split[0]);
outV.setPid(split[1]);
outV.setAmount(Integer.parseInt(split[2]));
outV.setPname("");
outV.setFlag("order");
}else {// 处理的是商品表
String[] split = line.split("\t");
outK.set(split[0]);
outV.setId("");
outV.setPid(split[0]);
outV.setAmount(0);
outV.setPname(split[1]);
outV.setFlag("pd");
}
// 写出
context.write(outK, outV);
}
}
TableReducer.java
public class TableReducer extends Reducer<Text, TableBean,TableBean, NullWritable> {
@Override
protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
// 01 1001 1 order
// 01 1004 4 order
// 01 小米 pd
// 准备初始化集合
ArrayList<TableBean> orderBeans = new ArrayList<>();
TableBean pdBean = new TableBean();
// 循环遍历
for (TableBean value : values) {
if ("order".equals(value.getFlag())){// 订单表
TableBean tmptableBean = new TableBean();
try {
BeanUtils.copyProperties(tmptableBean,value);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
orderBeans.add(tmptableBean);
}else {// 商品表
try {
BeanUtils.copyProperties(pdBean,value);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
}
// 循环遍历orderBeans,赋值 pdname
for (TableBean orderBean : orderBeans) {
orderBean.setPname(pdBean.getPname());
context.write(orderBean,NullWritable.get());
}
}
}
TableDriver.java
public class TableDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(TableDriver.class);
job.setMapperClass(TableMapper.class);
job.setReducerClass(TableReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TableBean.class);
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("D:\\hadoop\\input\\inputtable"));
FileOutputFormat.setOutputPath(job, new Path("D:\\hadoop\\output12"));
boolean b = job.waitForCompletion(true);
System.exit(b ? 0 : 1);
}
}
2. Map Join案例
- 需求:将下列两个表进行合并,订单中的pid经过合并之后编程pname
订单数据表t_order
id | pid | amount |
1001 | 01 | 1 |
1002 | 02 | 2 |
1003 | 03 | 3 |
1004 | 01 | 4 |
1005 | 02 | 5 |
1006 | 03 | 6 |
商品表:
pid | pname |
01 | 小米 |
02 | 华为 |
03 | 格力 |
合并后:
id | pname | amount |
1001 | 小米 | 1 |
1002 | 华为 | 2 |
1003 | 格力 | 3 |
1004 | 小米 | 4 |
1005 | 华为 | 5 |
1006 | 格力 | 6 |
- 需求分析:在驱动中设置缓存文件,在Map初始化阶段读取缓存文件,将商品表信息封装到一个Map集合中,在map()方法中获取数据【根据map中的pid获取pname】进行拼接
MapJoinMapper .java
public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
private HashMap<String, String> pdMap = new HashMap<>();
private Text OutK = new Text();
@Override
protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
//获取缓存文件
URI[] cacheFiles = context.getCacheFiles();
//获取系统对象,创建输入流
FileSystem fs = FileSystem.get(context.getConfiguration());
FSDataInputStream fis = fs.open(new Path(cacheFiles[0]));
BufferedReader reader = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
String line;
//将商品信息放入到Map集合中
while (StringUtils.isNotEmpty(line = reader.readLine())) {
String[] split = line.split("\t");
pdMap.put(split[0], split[1]);
}
IOUtils.closeStream(reader);
}
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
//获取订单表信息,并进行赋值
String line = value.toString();
String[] split = line.split("\t");
//将需要输出的结果进行拼接
String pname = pdMap.get(split[1]);
OutK.set(split[0] + "\t" + pname + "\t" + split[2]);
context.write(OutK,NullWritable.get());
}
}
MapJoinDriver .java
public class MapJoinDriver {
public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(MapJoinDriver.class);
job.setMapperClass(MapJoinMapper.class);
//设置map的kv
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.addCacheFile(new URI("file:///D:/hadoop/input/tablecahe/pd.txt"));
// Map端Join的逻辑不需要Reduce阶段,设置reduceTask数量为0
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job,new Path("D:\\hadoop\\input\\inputtable"));
FileOutputFormat.setOutputPath(job,new Path("D:\\hadoop\\output2"));
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
3. 数据清洗(ETL)
在运行核心业务MapReduce程序之前,往往要先对数据进行清洗,清理掉不符合用户要求的数据。清理的过程往往只需要运行Mapper程序,不需要运行Reduce程序。
- 需求:去除日志中字段个数小于等于11的日志。
- 需求分析:需要在Map阶段对输入的数据根据规则进行过滤清洗。
WebLogMapper.java
public class WebLogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
//1.获取一行数据
String line = value.toString();
//2.ETL处理(数据清洗)
boolean result = parseLog(line, context);
//3.判断是否选择输出
if (!result) {
return;//如果日志长度小于11,则直接返回将数据过滤
}
//4.写出
context.write(value, NullWritable.get());
}
private boolean parseLog(String line, Context context) {
String[] fields = line.split(" ");
if (fields.length > 11) {
return true;
} else {
return false;
}
}
}
WebLogDriver.java
public class WebLogDriver {
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
Job job = Job.getInstance(new Configuration());
job.setJarByClass(WebLogDriver.class);
job.setMapperClass(WebLogMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job,new Path("D:\\hadoop\\input\\inputlog"));
FileOutputFormat.setOutputPath(job,new Path("D:\\hadoop\\output5"));
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1) ;
}
}