Table of Contents
- Custom data types (serialization)
- Custom data types
- Rules for custom data types
- Example 1
- Implementing the output format above with Hadoop data types
- Custom data type: FlowWritable
- Implementing the map method
- Implementing the reduce method
- Main class: Driver
- Map input splits
- Custom partitioning
- Example 2
- Source code of the default partitioner
- The enclosing source code
- Process analysis
- Extending the Partitioner class
- Modifying the Driver main class
- Sorting
- Example 3
- Modifying FlowWritable to implement the WritableComparable interface
- Implementing the map function
- Implementing the reduce function
- Implementing the Driver main class
Custom data types (serialization)
Custom data types
| Java type | Hadoop type |
| --- | --- |
| boolean | BooleanWritable |
| byte | ByteWritable |
| int | IntWritable |
| float | FloatWritable |
| long | LongWritable |
| double | DoubleWritable |
| String | Text |
| map | MapWritable |
| array | ArrayWritable |
Hadoop ships implementations for all of the types above, which cover most basic development needs; some requirements, however, cannot be met with them.
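As a quick illustration of the built-in wrappers (a minimal sketch; the class and variable names are only for demonstration), each Hadoop type simply boxes the corresponding Java value:
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
public class BuiltinTypesDemo {
    public static void main(String[] args) {
        // wrap plain Java values in their Hadoop counterparts
        IntWritable count = new IntWritable(42);
        LongWritable bytes = new LongWritable(1024L);
        Text phone = new Text("13726230503");
        // get() / toString() unwrap them again
        System.out.println(count.get() + " " + bytes.get() + " " + phone.toString());
    }
}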
Rules for custom data types
- The type must implement the **Writable** interface.
- It must provide a no-argument constructor, because Hadoop instantiates it by reflection, which calls the no-arg constructor by default.
- Data types serve either as keys or as **values**; if the custom type is used as a key, it must implement the **WritableComparable** interface (see the minimal skeleton after this list).
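A minimal skeleton that follows these rules might look like the sketch below (the class and field names are placeholders, not part of the example that follows):
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
// hypothetical key type: implements WritableComparable so it can be used as a map output key
public class MyKeyWritable implements WritableComparable<MyKeyWritable> {
    private long field;
    public MyKeyWritable() {
        // no-arg constructor required for reflection-based instantiation
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(field);            // serialize
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.field = in.readLong();      // deserialize in the same order as write()
    }
    @Override
    public int compareTo(MyKeyWritable o) {
        return Long.compare(this.field, o.field);  // defines the sort order when used as a key
    }
}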
Example 1
The input data looks like this:
13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
The desired output format:
phone number	upstream traffic	downstream traffic	total traffic
13726230503	2481	24681	27162
Implementing the output format above with Hadoop data types
- Analysis: define a custom data type that stores the upstream and downstream traffic.
Custom data type: FlowWritable
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class FlowWritable implements Writable {
    private long upFlow;    // total upstream traffic
    private long downFlow;  // total downstream traffic
    public FlowWritable() {
        // no-arg constructor required for reflection-based instantiation
    }
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
    }
    public long getUpFlow() {
        return upFlow;
    }
    public long getDownFlow() {
        return downFlow;
    }
    /**
     * Serialization method
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }
    /**
     * Deserialization method: fields are read in the same order they were written
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
    }
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + (upFlow + downFlow);
    }
}
Implementing the map method
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FlowMap extends Mapper<LongWritable, Text, Text, FlowWritable> {
    private Text outputKey = new Text();
    private FlowWritable outputValue = new FlowWritable();
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // input line: 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
        // 1. get the raw line
        String line = value.toString();
        // 2. split it into fields
        String[] fields = line.split("\t");
        // 3. phone number
        String phoneNumber = fields[0];
        // 4. upstream and downstream traffic (third- and second-to-last fields)
        long upFlowValue = Long.valueOf(fields[fields.length - 3]);
        long downFlowValue = Long.valueOf(fields[fields.length - 2]);
        // emit the phone number as key and the traffic values in the custom Writable
        outputKey.set(phoneNumber);
        outputValue.set(upFlowValue, downFlowValue);
        context.write(outputKey, outputValue);
    }
}
Implementing the reduce method
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowReduce extends Reducer<Text, FlowWritable, Text, FlowWritable> {
    private FlowWritable outputValue = new FlowWritable();
    @Override
    protected void reduce(Text key, Iterable<FlowWritable> values,
            Context context) throws IOException, InterruptedException {
        long totalUpFlow = 0;
        long totalDownFlow = 0;
        // accumulate the upstream and downstream traffic of the same phone number
        for (FlowWritable value : values) {
            totalUpFlow += value.getUpFlow();
            totalDownFlow += value.getDownFlow();
        }
        outputValue.set(totalUpFlow, totalDownFlow);
        context.write(key, outputValue);
    }
}
Main class: Driver
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration
        Configuration config = new Configuration();
        // instantiate the Job and pass it the configuration
        Job job = Job.getInstance(config);
        // locate the main class (loaded by reflection from the jar)
        job.setJarByClass(Driver.class);
        // set the map and reduce classes
        job.setMapperClass(FlowMap.class);
        job.setReducerClass(FlowReduce.class);
        // set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);
        // set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
- Note: fields must be deserialized in exactly the same order they were serialized (see the short sketch below).
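To illustrate the point, here is a hypothetical, deliberately broken Writable (not part of the example above): write() emits upFlow first, but readFields() reads in the opposite order, so the two fields silently swap after the shuffle.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
// hypothetical broken Writable: the read order does not match the write order
public class SwappedOrderWritable implements Writable {
    private long upFlow;
    private long downFlow;
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);              // upFlow is written first
        out.writeLong(downFlow);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.downFlow = in.readLong();      // BUG: the first long read is actually upFlow
        this.upFlow = in.readLong();        // the fields end up swapped after deserialization
    }
}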
Map input splits
- The number of map tasks is determined by the input splits.
- Each split is processed by one MapTask.
- By default the split size equals the HDFS block size (blocksize); it can be changed, as shown in the sketch after this list.
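A minimal sketch of changing the split size on a job (assuming the Job object from the Driver above; the 64 MB / 256 MB values are arbitrary examples):
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
public class SplitSizeConfig {
    // call this on the job in the Driver before submitting it
    public static void configureSplits(Job job) {
        // request splits of at least 64 MB and at most 256 MB instead of the block-size default
        FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
    }
}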
Custom partitioning
Example 2
- Requirement: apply a custom partitioner to this file, partitioning the records by province ID.
Source code of the default partitioner
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
                        int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
With two reduce tasks, key.hashCode() % numReduceTasks yields either 0 or 1.
Integer.MAX_VALUE = 01111111 11111111 11111111 11111111
Suppose key.hashCode() = 10101011 10101010 11011011 11101000 (a negative int)
The & operation:
  01111111 11111111 11111111 11111111
& 10101011 10101010 11011011 11101000
= 00101011 10101010 11011011 11101000
Notes:
- The & with Integer.MAX_VALUE simply clears the sign bit, so a negative hashCode cannot produce a negative remainder, which would not be a valid partition number.
- The default partition rule is therefore key.hashCode() % (number of reduce tasks); a small standalone sketch follows these notes.
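A small standalone sketch of the effect (the hash value is an arbitrary example): without the mask, a negative hashCode gives a negative remainder in Java, which is not a usable partition index.
public class PartitionDemo {
    public static void main(String[] args) {
        int numReduceTasks = 2;
        int hash = -1716283585;                  // an arbitrary negative hashCode value
        // Java's % keeps the sign of the dividend: prints -1
        System.out.println(hash % numReduceTasks);
        // clearing the sign bit first keeps the result in [0, numReduceTasks): prints 1
        System.out.println((hash & Integer.MAX_VALUE) % numReduceTasks);
    }
}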
上一层源码
@InterfaceAudience.Public
@InterfaceStability.Stable
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}从上面的源码可以发现,一个类继承了Partitoner就可以实现一个分区规则
Process analysis
- Take the phone number.
- Take its first three digits.
- A lookup store holds the province ID for each prefix, so the province can be resolved from the phone number.
- Query that store for the corresponding province and obtain its ID.
Extending the Partitioner class
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class ProvincePartitioner extends Partitioner<Text, FlowWritable> {
    static HashMap<String, Integer> provinceID = new HashMap<>();
    static {
        provinceID.put("136", 0);
        provinceID.put("137", 1);
        provinceID.put("138", 2);
        provinceID.put("139", 3);
    }
    @Override
    public int getPartition(Text key, FlowWritable value, int numPartitions) {
        // 1. the key is the phone number
        String phoneNumber = key.toString();
        // 2. take its first three digits
        String prefix = phoneNumber.substring(0, 3);
        Integer provinceId = provinceID.get(prefix);
        // unknown prefixes all go to partition 4
        return provinceId == null ? 4 : provinceId;
    }
}
Modifying the Driver main class
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration
        Configuration config = new Configuration();
        // instantiate the Job and pass it the configuration
        Job job = Job.getInstance(config);
        // locate the main class (loaded by reflection from the jar)
        job.setJarByClass(Driver.class);
        // set the map and reduce classes (the same FlowMap/FlowReduce as in Example 1)
        job.setMapperClass(FlowMap.class);
        job.setReducerClass(FlowReduce.class);
        // set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowWritable.class);
        // set the number of reduce tasks (4 provinces + 1 for everything else)
        job.setNumReduceTasks(5);
        // set the custom partitioner class
        job.setPartitionerClass(ProvincePartitioner.class);
        // set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
Sorting
Example 3
Sort the results of Example 1 by total traffic in descending order.
Analysis: MapReduce sorts map output by key, so make FlowWritable the map output key, have it implement WritableComparable, and compare on the total traffic; the phone number becomes the value.
Modifying FlowWritable to implement the WritableComparable interface
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class FlowWritable implements WritableComparable<FlowWritable> {
    private long upFlow;     // total upstream traffic
    private long downFlow;   // total downstream traffic
    private long totalFlow;  // total traffic
    public long getUpFlow() {
        return upFlow;
    }
    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }
    public long getDownFlow() {
        return downFlow;
    }
    public void setDownFlow(long downFlow) {
        this.downFlow = downFlow;
    }
    public void set(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.totalFlow = this.upFlow + this.downFlow;
    }
    /**
     * Serialization method
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
    }
    /**
     * Deserialization method: read in the same order as write()
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        // totalFlow is not serialized, so recompute it here; otherwise it would be 0 after the shuffle
        this.totalFlow = this.upFlow + this.downFlow;
    }
    @Override
    public String toString() {
        return this.upFlow + "\t" + this.downFlow + "\t" + this.totalFlow;
    }
    @Override
    public int compareTo(FlowWritable o) {
        // sort by total traffic in descending order
        return Long.compare(o.totalFlow, this.totalFlow);
    }
}
Implementing the map function
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
// input line: 13480253104    180    180    360 -> key: FlowWritable (traffic), value: phone number
public class CountSortMap extends Mapper<LongWritable, Text, FlowWritable, Text> {
    FlowWritable outputKey = new FlowWritable();
    Text outputValue = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. get one line of text (the output of Example 1)
        String line = value.toString();
        // 2. split out the fields
        String[] fields = line.split("\t");
        // 3. phone number
        String phoneNumber = fields[0];
        // 4. upstream traffic
        long upFlow = Long.valueOf(fields[1]);
        // 5. downstream traffic
        long downFlow = Long.valueOf(fields[2]);
        outputKey.set(upFlow, downFlow);
        outputValue.set(phoneNumber);
        // 6. emit with the traffic as the key so the framework sorts by it
        context.write(outputKey, outputValue);
    }
}
Implementing the reduce function
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class CountSortReduce extends Reducer<FlowWritable, Text, Text, FlowWritable> {
    @Override
    protected void reduce(FlowWritable key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {
        // keys arrive already sorted by total traffic; swap key and value back for the output
        for (Text text : values) {
            System.out.println(key.getUpFlow());  // debug output
            context.write(text, key);
        }
    }
}
Implementing the Driver main class
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Driver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. get the configuration
        Configuration config = new Configuration();
        // instantiate the Job and pass it the configuration
        Job job = Job.getInstance(config);
        // locate the main class (loaded by reflection from the jar)
        job.setJarByClass(Driver.class);
        // set the map and reduce classes
        job.setMapperClass(CountSortMap.class);
        job.setReducerClass(CountSortReduce.class);
        // set the map output types (the flow is now the key, the phone number the value)
        job.setMapOutputKeyClass(FlowWritable.class);
        job.setMapOutputValueClass(Text.class);
        // set the reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowWritable.class);
        // set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // submit the job
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}