Hadoop provides many built-in data types, such as Text and IntWritable.
To define your own data type, first implement the Writable interface, which mainly consists of two methods: readFields and write. If the new type is going to be emitted as the key of the Map function, it has to be compared during the sort that happens in the shuffle phase, so in that case it is better to implement the WritableComparable interface, which adds one method on top of Writable: compareTo.
Custom data type:
Note: a custom data type must have a no-argument constructor, and it should call the parent constructor. It is also a good idea to initialize all fields in that constructor: write serializes an instance and readFields deserializes it, and if a field has not been initialized when deserialization runs, a NullPointerException can occur (primitive fields are fine; the problem only appears when a field is a composite type). So initialize every field in the no-argument constructor. The toString method is what gets used when the type is written as reduce output.
package pers.kefault.entity;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
public class IndexDoc implements WritableComparable<IndexDoc> {
private int docId;
private String urlString;
private String siteName;
public IndexDoc() {
super();
// initialize every field here so readFields() never touches an uninitialized value (see the note above)
this.docId = -1;
this.urlString = "";
this.siteName = "";
}
public IndexDoc(int docId, String urlString, String siteName) {
super();
this.docId = docId;
this.urlString = urlString;
this.siteName = siteName;
}
//make a copy of this object
public IndexDoc clone()
{
return new IndexDoc(getDocId(),getUrlString(),getSiteName());
}
public void setDocId(int docId) {
this.docId = docId;
}
public void setUrlString(String urlString) {
this.urlString = urlString;
}
public void setSiteName(String siteName) {
this.siteName = siteName;
}
public int getDocId() {
return docId;
}
public String getSiteName() {
if(siteName == null || siteName.isEmpty())
{
siteName = urlString.split("/")[0];
}
return siteName;
}
public String getUrlString() {
return urlString;
}
//read the crawled page file from the local file system and return its contents as Text
public Text text()
{
//location of the crawled pages; adjust this path to your own setup
String fileName = "/home/monster/spider/" + urlString;
File file = new File(fileName);
StringBuilder content = new StringBuilder();
try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"GB2312")))
{
String tempString = null;
while((tempString = reader.readLine())!=null)
{
content.append(tempString);
}
}catch (Exception e) {
e.printStackTrace();
}
return new Text(content.toString());
}
@Override
public void readFields(DataInput in) throws IOException {
// deserialize the fields in exactly the same order they were written in write()
docId = in.readInt();
urlString = in.readUTF();
siteName = urlString.split("/")[0];
}
@Override
public void write(DataOutput out) throws IOException {
// siteName is not written out; readFields() rebuilds it from urlString
out.writeInt(docId);
out.writeUTF(urlString);
}
@Override
public int compareTo(IndexDoc o) {
// sort ascending by docId; this ordering is applied in the shuffle when IndexDoc is a map output key
return Integer.compare(docId, o.docId);
}
@Override
public String toString() {
return docId + "," + urlString + "," + siteName;
}
}
Custom input format:
package pers.kefault.format;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import pers.kefault.entity.IndexDoc;
public class IndexDocInputFormat extends FileInputFormat<Text, IndexDoc> {
@Override
public RecordReader<Text, IndexDoc> createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// report the current split in the task's status string (visible in the task logs / web UI)
context.setStatus(split.toString());
IndexDocRecordReader reader = new IndexDocRecordReader(context.getConfiguration());
return reader;
}
}
Custom RecordReader for the input format:
package pers.kefault.format;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import pers.kefault.entity.IndexDoc;
public class IndexDocRecordReader extends RecordReader<Text, IndexDoc> {
public static final String KEY_VALUE_SEPERATOR = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
private final LineRecordReader lineRecordReader;
private byte separator = (byte) '\t';
private Text innerValue;
private Text key;
private IndexDoc value;
public Class<Text> getKeyClass() {
return Text.class;
}
@Override
public void close() throws IOException {
System.out.println("close()");
lineRecordReader.close();
}
public IndexDocRecordReader(Configuration conf) {
// wrap a plain LineRecordReader and read the key/value separator from the job configuration (default '\t')
lineRecordReader = new LineRecordReader();
String sepStr = conf.get(KEY_VALUE_SEPERATOR,"\t");
this.separator = (byte) sepStr.charAt(0);
}
// return the index of the first occurrence of sep within line[start, start+length), or -1 if it is absent
public static int findSeparator(byte[] utf, int start, int length, byte sep) {
for (int i = start; i < (start + length); i++) {
if (utf[i] == sep) {
return i;
}
}
return -1;
}
public static void setKeyValue(Text key, IndexDoc value, byte[] line, int lineLen, int pos) {
if (pos == -1) {
// no separator found: the whole line becomes the key and the value gets placeholder fields
key.set(line, 0, lineLen);
value.setDocId(-1);
value.setSiteName("www.kefault.com");
value.setUrlString("www.kefault.com/null");
} else {
// key = the bytes from position 0 up to (but not including) the separator
key.set(line, 0, pos);
// value = the rest of the line, expected in the form "docId,urlString,siteName"
Text text = new Text();
text.set(line, pos + 1, lineLen - pos - 1);
String[] str = text.toString().split(",");
value.setDocId(Integer.parseInt(str[0]));
value.setUrlString(str[1]);
value.setSiteName(str[2]);
}
}
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public IndexDoc getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return lineRecordReader.getProgress();
}
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
// delegate handling of the input split to the wrapped LineRecordReader
lineRecordReader.initialize(genericSplit, context);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
// pull the next raw line from the LineRecordReader, then split it into the key and the IndexDoc value
byte[] line = null;
int lineLen = -1;
if (lineRecordReader.nextKeyValue()) {
innerValue = lineRecordReader.getCurrentValue();
line = innerValue.getBytes();
lineLen = innerValue.getLength();
} else {
System.out.println("return false;");
return false;
}
if (line == null){
System.out.println("return false;");
return false;
}
if (key == null) {
key = new Text();
}
if (value == null) {
value = new IndexDoc();
}
int pos = findSeparator(line, 0, lineLen, this.separator);
setKeyValue(key, value, line, lineLen, pos);
return true;
}
}
The heart of the whole program is the RecordReader. Put simply, the input is read line by line and each line is turned into one key-value pair: the raw line comes in as Text, and that Text is then parsed into the key and the value. Although there is a fair amount of code here, very little of it actually has to change for your own format. The line is split at a separator character, '\t' in this case, but you can choose your own; the key object and the value object are then built from the resulting strings.
If you are working in Eclipse, errors thrown while the MapReduce job is running are not shown in the console, so you may want to take a look at this link; hopefully it makes debugging your code a little easier.