Hadoop provides many built-in data types, such as Text and IntWritable.
To define your own data type, first implement the Writable interface, which mainly consists of two methods: readFields and write. If the new type is going to be emitted as the key of the Map function, it has to be compared during the sort that happens in the shuffle phase, so in that case it is better to implement the WritableComparable interface, which adds one method on top of Writable: compareTo.
Custom data type:
Note: a custom data type must have a no-argument constructor, and it should call the parent constructor. It is also a good idea to initialize all fields in that constructor: write serializes an instance and readFields deserializes it, and if a field has not been initialized when deserialization runs, a NullPointerException can occur (primitive fields are fine; the problem only appears when a field is a composite type). So initialize every field in the no-argument constructor. The toString method is what gets used when the type is written as reduce output.
package pers.kefault.entity;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class IndexDoc implements WritableComparable<IndexDoc> {
private int docId;
private String urlString;
private String siteName;
public IndexDoc() {
super();
// initialize every field here so readFields() never touches an uninitialized value (see the note above)
this.docId = -1;
this.urlString = "";
this.siteName = "";
}
public IndexDoc(int docId, String urlString, String siteName) {
super();
this.docId = docId;
this.urlString = urlString;
this.siteName = siteName;
}
//make a copy of this object
public IndexDoc clone()
{
return new IndexDoc(getDocId(),getUrlString(),getSiteName());
}
public void setDocId(int docId) {
this.docId = docId;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public void setSiteName(String siteName) {
this.siteName = siteName;
}
public int getDocId() {
return docId;
}
public String getSiteName() {
if(siteName == null || siteName.isEmpty())
{
siteName = urlString.split("/")[0];
}
return siteName;
}
public String getUrlString() {
return urlString;
}
//read the crawled page file from the local file system and return its contents as Text
public Text text()
{
//location of the crawled pages; adjust this path to your own setup
String fileName = "/home/monster/spider/" + urlString;
File file = new File(fileName);
StringBuilder content = new StringBuilder();
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"GB2312")))
{
String tempString = null;
while((tempString = reader.readLine())!=null)
{
content.append(tempString);
}
}catch (Exception e) {
e.printStackTrace();
}
return new Text(content.toString());
}
@Override
public void readFields(DataInput in) throws IOException {
// deserialize the fields in exactly the same order they were written in write()
docId = in.readInt();
urlString = in.readUTF();
siteName = urlString.split("/")[0];
}
@Override
public void write(DataOutput out) throws IOException {
// siteName is not written out; readFields() rebuilds it from urlString
out.writeInt(docId);
out.writeUTF(urlString);
}
@Override
public int compareTo(IndexDoc o) {
// sort ascending by docId; this ordering is applied in the shuffle when IndexDoc is a map output key
return Integer.compare(docId, o.docId);
}
@Override
public String toString() {
return docId + "," + urlString + "," + siteName;
}
}
Custom input format:
package pers.kefault.format;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import pers.kefault.entity.IndexDoc;
public class IndexDocInputFormat extends FileInputFormat<Text, IndexDoc> {
@Override
public RecordReader<Text, IndexDoc> createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// report the current split in the task's status string (visible in the task logs / web UI)
context.setStatus(split.toString());
IndexDocRecordReader reader = new IndexDocRecordReader(context.getConfiguration());
return reader;
}
}
Custom RecordReader for the input format:
package pers.kefault.format;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import pers.kefault.entity.IndexDoc;
public class IndexDocRecordReader extends RecordReader<Text, IndexDoc> {
public static final String KEY_VALUE_SEPERATOR = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
private final LineRecordReader lineRecordReader;
private byte separator = (byte) '\t';
private Text innerValue;
private Text key;
private IndexDoc value;
public Class<Text> getKeyClass() {
return Text.class;
}
@Override
public void close() throws IOException {
System.out.println("close()");
lineRecordReader.close();
}
public IndexDocRecordReader(Configuration conf) {
// wrap a plain LineRecordReader and read the key/value separator from the job configuration (default '\t')
lineRecordReader = new LineRecordReader();
String sepStr = conf.get(KEY_VALUE_SEPERATOR,"\t");
this.separator = (byte) sepStr.charAt(0);
}
// return the index of the first occurrence of sep within line[start, start+length), or -1 if it is absent
public static int findSeparator(byte[] utf, int start, int length, byte sep) {
for (int i = start; i < (start + length); i++) {
if (utf[i] == sep) {
return i;
}
}
return -1;
}
public static void setKeyValue(Text key, IndexDoc value, byte[] line, int lineLen, int pos) {
if (pos == -1) {
// no separator found: the whole line becomes the key and the value gets placeholder fields
key.set(line, 0, lineLen);
value.setDocId(-1);
value.setSiteName("www.kefault.com");
value.setUrlString("www.kefault.com/null");
} else {
// key = the bytes from position 0 up to (but not including) the separator
key.set(line, 0, pos);
// value = the rest of the line, expected in the form "docId,urlString,siteName"
Text text = new Text();
text.set(line, pos + 1, lineLen - pos - 1);
String[] str = text.toString().split(",");
value.setDocId(Integer.parseInt(str[0]));
value.setUrlString(str[1]);
value.setSiteName(str[2]);
}
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public IndexDoc getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return lineRecordReader.getProgress();
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
// delegate handling of the input split to the wrapped LineRecordReader
lineRecordReader.initialize(genericSplit, context);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// pull the next raw line from the LineRecordReader, then split it into the key and the IndexDoc value
byte[] line = null;
int lineLen = -1;
if (lineRecordReader.nextKeyValue()) {
innerValue = lineRecordReader.getCurrentValue();
line = innerValue.getBytes();
lineLen = innerValue.getLength();
} else {
System.out.println("return false;");
return false;
}
if (line == null){
System.out.println("return false;");
return false;
}
if (key == null) {
key = new Text();
}
if (value == null) {
value = new IndexDoc();
}
int pos = findSeparator(line, 0, lineLen, this.separator);
setKeyValue(key, value, line, lineLen, pos);
return true;
}
}
The heart of the whole program is the RecordReader. Put simply, the input is read line by line and each line is turned into one key-value pair: the raw line comes in as Text, and that Text is then parsed into the key and the value. Although there is a fair amount of code here, very little of it actually has to change for your own format. The line is split at a separator character, '\t' in this case, but you can choose your own; the key object and the value object are then built from the resulting strings.
If you are working in Eclipse, errors thrown while the MapReduce job is running are not shown in the console, so you may want to take a look at this link; hopefully it makes debugging your code a little easier.