Table of Contents
- Custom data types (serialization)
- Custom data types
- Rules for custom data types
- Example 1
- Implementing the output format above with Hadoop data types
- Custom data type: FlowWritable
- Implementing the map method
- Implementing the reduce method
- Main class: Driver
- Map input splits
- Custom partitioning
- Example 2
- Source code of the default partitioner
- The enclosing source code
- Process analysis
- Extending the Partitioner class
- Modifying the Driver main class
- Sorting
- Example 3
- Modifying FlowWritable to implement the WritableComparable interface
- Implementing the map function
- Implementing the reduce function
- Implementing the Driver main class
Custom data types (serialization)
Custom data types
| Java type | Hadoop type |
| --- | --- |
| boolean | BooleanWritable |
| byte | ByteWritable |
| int | IntWritable |
| float | FloatWritable |
| long | LongWritable |
| double | DoubleWritable |
| String | Text |
| map | MapWritable |
| array | ArrayWritable |
Hadoop ships implementations for all of the types above, which cover most basic development needs; some requirements, however, cannot be met with them.
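As a quick illustration of the built-in wrappers (a minimal sketch; the class and variable names are only for demonstration), each Hadoop type simply boxes the corresponding Java value:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
public class BuiltinTypesDemo {
    public static void main(String[] args) {
        // wrap plain Java values in their Hadoop counterparts
        IntWritable count = new IntWritable(42);
        LongWritable bytes = new LongWritable(1024L);
        Text phone = new Text("13726230503");
        // get() / toString() unwrap them again
        System.out.println(count.get() + " " + bytes.get() + " " + phone.toString());
    }
}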
Rules for custom data types
- The type must implement the **Writable** interface.
- It must provide a no-argument constructor, because Hadoop instantiates it by reflection, which calls the no-arg constructor by default.
- Data types serve either as keys or as **values**; if the custom type is used as a key, it must implement the **WritableComparable** interface (see the minimal skeleton after this list).
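A minimal skeleton that follows these rules might look like the sketch below (the class and field names are placeholders, not part of the example that follows):
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
// hypothetical key type: implements WritableComparable so it can be used as a map output key
public class MyKeyWritable implements WritableComparable<MyKeyWritable> {
    private long field;
    public MyKeyWritable() {
        // no-arg constructor required for reflection-based instantiation
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(field);            // serialize
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.field = in.readLong();      // deserialize in the same order as write()
    }
    @Override
    public int compareTo(MyKeyWritable o) {
        return Long.compare(this.field, o.field);  // defines the sort order when used as a key
    }
}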
Example 1
The input data looks like this:
13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
The desired output format:
phone number	upstream traffic	downstream traffic	total traffic
13726230503	2481	24681	27162
Implementing the output format above with Hadoop data types
- Analysis: define a custom data type that stores the upstream and downstream traffic.
Custom data type: FlowWritable
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class FlowWritable implements Writable {
    private long upFlow;    // total upstream traffic
    private long downFlow;  // total downstream traffic
    public FlowWritable() {
        // no-arg constructor required for reflection-based instantiation
    }
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
    }
    public long getUpFlow() {
        return upFlow;
    }
    public long getDownFlow() {
        return downFlow;
    }
    /**
     * Serialization method
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }
    /**
     * Deserialization method: fields are read in the same order they were written
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
    }
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + (upFlow + downFlow);
    }
}
Implementing the map method
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FlowMap extends Mapper<LongWritable, Text, Text, FlowWritable> {
    private Text outputKey = new Text();
    private FlowWritable outputValue = new FlowWritable();
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // input line: 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
        // 1. get the raw line
        String line = value.toString();
        // 2. split it into fields
        String[] fields = line.split("\t");
        // 3. phone number
        String phoneNumber = fields[0];
        // 4. upstream and downstream traffic (third- and second-to-last fields)
        long upFlowValue = Long.valueOf(fields[fields.length - 3]);
        long downFlowValue = Long.valueOf(fields[fields.length - 2]);
        // emit the phone number as key and the traffic values in the custom Writable
        outputKey.set(phoneNumber);
        outputValue.set(upFlowValue, downFlowValue);
        context.write(outputKey, outputValue);
    }
}
Implementing the reduce method
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowReduce extends Reducer<Text, FlowWritable, Text, FlowWritable> {
    private FlowWritable outputValue = new FlowWritable();
    @Override
    protected void reduce(Text key, Iterable<FlowWritable> values,
            Context context) throws IOException, InterruptedException {
        long totalUpFlow = 0;
        long totalDownFlow = 0;
        // accumulate the upstream and downstream traffic of the same phone number
        for (FlowWritable value : values) {
            totalUpFlow += value.getUpFlow();
            totalDownFlow += value.getDownFlow();
        }
        outputValue.set(totalUpFlow, totalDownFlow);
        context.write(key, outputValue);
    }
}
Main class: Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration
        Configuration config = new Configuration();
        // instantiate the Job and pass it the configuration
        Job job = Job.getInstance(config);
        // locate the main class (loaded by reflection from the jar)
        job.setJarByClass(Driver.class);
        // set the map and reduce classes
        job.setMapperClass(FlowMap.class);
        job.setReducerClass(FlowReduce.class);
        // set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);
        // set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
- Note: fields must be deserialized in exactly the same order they were serialized (see the short sketch below).
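To illustrate the point, here is a hypothetical, deliberately broken Writable (not part of the example above): write() emits upFlow first, but readFields() reads in the opposite order, so the two fields silently swap after the shuffle.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
// hypothetical broken Writable: the read order does not match the write order
public class SwappedOrderWritable implements Writable {
    private long upFlow;
    private long downFlow;
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);              // upFlow is written first
        out.writeLong(downFlow);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.downFlow = in.readLong();      // BUG: the first long read is actually upFlow
        this.upFlow = in.readLong();        // the fields end up swapped after deserialization
    }
}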
Map input splits
- The number of map tasks is determined by the input splits.
- Each split is processed by one MapTask.
- By default the split size equals the HDFS block size (blocksize); it can be changed, as shown in the sketch after this list.
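A minimal sketch of changing the split size on a job (assuming the Job object from the Driver above; the 64 MB / 256 MB values are arbitrary examples):
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class SplitSizeConfig {
    // call this on the job in the Driver before submitting it
    public static void configureSplits(Job job) {
        // request splits of at least 64 MB and at most 256 MB instead of the block-size default
        FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
    }
}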
Custom partitioning
Example 2
- Requirement: apply a custom partitioner to this file, partitioning the records by province ID.
Source code of the default partitioner
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
                        int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
With two reduce tasks, key.hashCode() % numReduceTasks yields either 0 or 1.
Integer.MAX_VALUE = 01111111 11111111 11111111 11111111
Suppose key.hashCode() = 10101011 10101010 11011011 11101000 (a negative int)
The & operation:
  01111111 11111111 11111111 11111111
& 10101011 10101010 11011011 11101000
= 00101011 10101010 11011011 11101000
Notes:
- The & with Integer.MAX_VALUE simply clears the sign bit, so a negative hashCode cannot produce a negative remainder, which would not be a valid partition number.
- The default partition rule is therefore key.hashCode() % (number of reduce tasks); a small standalone sketch follows these notes.
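A small standalone sketch of the effect (the hash value is an arbitrary example): without the mask, a negative hashCode gives a negative remainder in Java, which is not a usable partition index.
public class PartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 2;
        int hash = -1716283585;                  // an arbitrary negative hashCode value
        // Java's % keeps the sign of the dividend: prints -1
        System.out.println(hash % numReduceTasks);
        // clearing the sign bit first keeps the result in [0, numReduceTasks): prints 1
        System.out.println((hash & Integer.MAX_VALUE) % numReduceTasks);
    }
}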
上一层源码
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}从上面的源码可以发现,一个类继承了Partitoner就可以实现一个分区规则
Process analysis
- Take the phone number.
- Take its first three digits.
- A lookup store holds the province ID for each prefix, so the province can be resolved from the phone number.
- Query that store for the corresponding province and obtain its ID.
Extending the Partitioner class
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<Text, FlowWritable> {
    static HashMap<String, Integer> provinceID = new HashMap<>();
    static {
        provinceID.put("136", 0);
        provinceID.put("137", 1);
        provinceID.put("138", 2);
        provinceID.put("139", 3);
    }
    @Override
    public int getPartition(Text key, FlowWritable value, int numPartitions) {
        // 1. the key is the phone number
        String phoneNumber = key.toString();
        // 2. take its first three digits
        String prefix = phoneNumber.substring(0, 3);
        Integer provinceId = provinceID.get(prefix);
        // unknown prefixes all go to partition 4
        return provinceId == null ? 4 : provinceId;
    }
}
Modifying the Driver main class
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration
        Configuration config = new Configuration();
        // instantiate the Job and pass it the configuration
        Job job = Job.getInstance(config);
        // locate the main class (loaded by reflection from the jar)
        job.setJarByClass(Driver.class);
        // set the map and reduce classes (the same FlowMap/FlowReduce as in Example 1)
        job.setMapperClass(FlowMap.class);
        job.setReducerClass(FlowReduce.class);
        // set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);
        // set the number of reduce tasks (4 provinces + 1 for everything else)
        job.setNumReduceTasks(5);
        // set the custom partitioner class
        job.setPartitionerClass(ProvincePartitioner.class);
        // set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Sorting
Example 3
Sort the results of Example 1 by total traffic in descending order.
Analysis: MapReduce sorts map output by key, so make FlowWritable the map output key, have it implement WritableComparable, and compare on the total traffic; the phone number becomes the value.
Modifying FlowWritable to implement the WritableComparable interface
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class FlowWritable implements WritableComparable<FlowWritable> {
    private long upFlow;     // total upstream traffic
    private long downFlow;   // total downstream traffic
    private long totalFlow;  // total traffic
    public long getUpFlow() {
        return upFlow;
    }
    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }
    public long getDownFlow() {
        return downFlow;
    }
    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.totalFlow = this.upFlow + this.downFlow;
    }
    /**
     * Serialization method
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }
    /**
     * Deserialization method: read in the same order as write()
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        // totalFlow is not serialized, so recompute it here; otherwise it would be 0 after the shuffle
        this.totalFlow = this.upFlow + this.downFlow;
    }
    @Override
    public String toString() {
        return this.upFlow + "\t" + this.downFlow + "\t" + this.totalFlow;
    }
    @Override
    public int compareTo(FlowWritable o) {
        // sort by total traffic in descending order
        return Long.compare(o.totalFlow, this.totalFlow);
    }
}
Implementing the map function
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// input line: 13480253104    180    180    360 -> key: FlowWritable (traffic), value: phone number
public class CountSortMap extends Mapper<LongWritable, Text, FlowWritable, Text> {
    FlowWritable outputKey = new FlowWritable();
    Text outputValue = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. get one line of text (the output of Example 1)
        String line = value.toString();
        // 2. split out the fields
        String[] fields = line.split("\t");
        // 3. phone number
        String phoneNumber = fields[0];
        // 4. upstream traffic
        long upFlow = Long.valueOf(fields[1]);
        // 5. downstream traffic
        long downFlow = Long.valueOf(fields[2]);
        outputKey.set(upFlow, downFlow);
        outputValue.set(phoneNumber);
        // 6. emit with the traffic as the key so the framework sorts by it
        context.write(outputKey, outputValue);
    }
}
Implementing the reduce function
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class CountSortReduce extends Reducer<FlowWritable, Text, Text, FlowWritable> {
    @Override
    protected void reduce(FlowWritable key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {
        // keys arrive already sorted by total traffic; swap key and value back for the output
        for (Text text : values) {
            System.out.println(key.getUpFlow());  // debug output
            context.write(text, key);
        }
    }
}
Implementing the Driver main class
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration
        Configuration config = new Configuration();
        // instantiate the Job and pass it the configuration
        Job job = Job.getInstance(config);
        // locate the main class (loaded by reflection from the jar)
        job.setJarByClass(Driver.class);
        // set the map and reduce classes
        job.setMapperClass(CountSortMap.class);
        job.setReducerClass(CountSortReduce.class);
        // set the map output types (the flow is now the key, the phone number the value)
        job.setMapOutputKeyClass(FlowWritable.class);
        job.setMapOutputValueClass(Text.class);
        // set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}