Hadoop provides a fairly rich set of built-in data input and output formats that cover most needs, but in some cases you have to define your own input or output format.

An input format describes the input specification of a MapReduce job. The framework relies on it to validate that specification (for example, checking the input directories), to split the input files into InputSplits, and to supply a RecordReader that reads records from each split and turns them into the key/value pairs consumed by the Map phase. Hadoop ships with many input formats, such as TextInputFormat and KeyValueTextInputFormat, and each has a matching RecordReader, such as LineRecordReader and KeyValueLineRecordReader. To write a custom input format you mainly implement createRecordReader() and getSplits() in InputFormat, and in the RecordReader you implement initialize(), nextKeyValue(), getCurrentKey(), getCurrentValue(), getProgress() and close().

For example:


package com.rpc.nefu;

import java.io.IOException;   
import org.apache.hadoop.fs.FSDataInputStream;  
import org.apache.hadoop.fs.FileSystem;  
import org.apache.hadoop.fs.Path;   
import org.apache.hadoop.io.IntWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapreduce.InputSplit;    
import org.apache.hadoop.mapreduce.RecordReader;  
import org.apache.hadoop.mapreduce.TaskAttemptContext;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.util.LineReader;  
import org.apache.hadoop.mapreduce.lib.input.FileSplit;  
 
//A custom input format extends the abstract class FileInputFormat
public class ZInputFormat extends FileInputFormat<IntWritable,IntWritable>{  
          
        @Override  //supply the custom RecordReader
        public RecordReader<IntWritable, IntWritable> createRecordReader(  
                InputSplit split, TaskAttemptContext context)  
                throws IOException, InterruptedException {  
            return new ZRecordReader();                                                  
        }  
  
        //the custom RecordReader  
        public static class ZRecordReader extends RecordReader<IntWritable,IntWritable>  
        {  
            //data  
            private LineReader in;      //input stream  
            private boolean more = true;//whether there is more data to read  
              
            private IntWritable key = null;  
            private IntWritable value = null;  
              
            //these three fields track the current read position within the file  
            private long start;  
            private long end;  
            private long pos;  
              
            //private Log LOG = LogFactory.getLog(ZRecordReader.class);//optional logging  
                          
            @Override  
            public void initialize(InputSplit split, TaskAttemptContext context)  
                    throws IOException, InterruptedException {  
                // initialization  
                  
                FileSplit inputsplit = (FileSplit)split;  
                start = inputsplit.getStart();          //start offset of this split  
                end   = start + inputsplit.getLength(); //end offset of this split  
                final Path file = inputsplit.getPath();  
          
                // open the file  
                FileSystem fs = file.getFileSystem(context.getConfiguration());  
                FSDataInputStream fileIn = fs.open(inputsplit.getPath());  
                  
                //move the file pointer to the start of this split; a newly opened  
                //stream is positioned at offset 0  
                fileIn.seek(start);  
                  
                in = new LineReader(fileIn, context.getConfiguration());  
  
                if (start != 0)   
                {  
                    //if this is not the first split, the reader of the previous split has  
                    //already consumed the line that straddles the boundary, so skip the  
                    //partial first line to avoid reading it twice  
                    start += in.readLine(new Text(), 0, maxBytesToConsume(start));  
                }  
                pos = start;  
            }  
              
            private int maxBytesToConsume(long pos)   
            {  
                    return (int) Math.min(Integer.MAX_VALUE, end - pos);  
             }  
              
            @Override  
            public boolean nextKeyValue() throws IOException,  
                    InterruptedException {  
                //advance to the next key/value pair  
                //tip: avoid printing or logging here; this method runs once per record  
                //LOG.info("reading the next record");  
                if(null == key)  
                {  
                    key = new IntWritable();  
                }  
                if(null == value)  
                {  
                    value = new IntWritable();  
                }  
                Text nowline = new Text();//holds the current line  
                int readsize = in.readLine(nowline);  
                //update the current read position  
                pos += readsize;  
              
                //if pos is at or beyond end, this split has been fully consumed  
                if(pos >= end)  
                {  
                    more = false;  
                    return false;  
                }  
                  
                if(0 == readsize)  
                {  
                    key = null;  
                    value = null;  
                    more = false;//end of file reached  
                    return false;  
                }  
                String[] keyandvalue = nowline.toString().split(",");  
                  
                //skip the header line  
                if(keyandvalue[0].endsWith("\"CITING\""))  
                {  
                    readsize = in.readLine(nowline);  
                    //update the current read position  
                    pos += readsize;  
                    if(0 == readsize)  
                    {  
                        more = false;//end of file reached  
                        return false;  
                    }  
                    //split the new line again  
                    keyandvalue = nowline.toString().split(",");  
                }  
                  
                //parse key and value  
                //LOG.info("key is :" + key +"value is" + value);  
                key.set(Integer.parseInt(keyandvalue[0]));  
                value.set(Integer.parseInt(keyandvalue[1]));  
                  
                return true;  
            }  
  
            @Override  
            public IntWritable getCurrentKey() throws IOException,  
                    InterruptedException {  
                //return the current key  
                return key;  
            }  
  
            @Override  
            public IntWritable getCurrentValue() throws IOException,  
                    InterruptedException {  
                //return the current value  
                return value;  
            }  
  
            @Override  
            public float getProgress() throws IOException, InterruptedException {  
                //report progress within the current split  
                if( false == more || end == start)  
                {  
                    return 0f;  
                }  
                else  
                {  
                    //cast to float, otherwise the long division truncates to 0 or 1  
                    return Math.min(1.0f, (pos - start) / (float)(end - start));  
                }  
            }  
  
            @Override  
            public void close() throws IOException {  
                //close the input stream  
                if(null != in)  
                {  
                    in.close();  
                }  
            }  
              
        }  
}
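
The text does not show how this input format is wired into a job. A minimal driver sketch, under the assumption of a hypothetical ZInputFormatDriver class and an identity map-only pipeline (job name and paths are placeholders, and Job.getInstance assumes Hadoop 2.x or later):

// Minimal driver sketch (assumed, not part of the original example).
package com.rpc.nefu;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ZInputFormatDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "zinputformat-example");
        job.setJarByClass(ZInputFormatDriver.class);

        job.setInputFormatClass(ZInputFormat.class);   // plug in the custom input format
        // with no mapper or reducer set, the identity implementations pass the
        // (IntWritable, IntWritable) pairs straight through
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The next example, taken from an inverted-index application, defines an input format whose keys combine the source file name with the offset of the current line, delegating the actual reading to a LineRecordReader.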



package reverseIndex;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class FileNameLocInputFormat extends FileInputFormat<Text, Text>{

	@Override
	public RecordReader<Text, Text> createRecordReader(
			InputSplit split, TaskAttemptContext context)
			throws IOException, InterruptedException {
		// return the custom RecordReader
		return new FileNameLocRecordReader();
	}
	public static class FileNameLocRecordReader extends RecordReader<Text,Text>{
		
		String FileName;
		LineRecordReader line = new LineRecordReader();
		/**
		 * ......
		 */ 

		@Override
		public Text getCurrentKey() throws IOException, InterruptedException {
			// build the key as "(fileName@offset)", where the offset comes from the wrapped reader
			return new Text("("+FileName+"@"+line.getCurrentKey()+")");
		}

		@Override
		public Text getCurrentValue() throws IOException, InterruptedException {
			// the value is the line itself, taken from the wrapped LineRecordReader
			return line.getCurrentValue();
		}

		

		@Override
		public void initialize(InputSplit split, TaskAttemptContext arg1)
				throws IOException, InterruptedException {
			// initialize the wrapped reader and record the file name of this split
			line.initialize(split, arg1);
			FileSplit inputsplit = (FileSplit)split;
			FileName = (inputsplit).getPath().getName();	
		}

		@Override
		public void close() throws IOException {
			// close the wrapped reader
			line.close();
		}

		@Override
		public float getProgress() throws IOException, InterruptedException {
			// delegate progress reporting to the wrapped reader
			return line.getProgress();
		}

		@Override
		public boolean nextKeyValue() throws IOException, InterruptedException {
			// advance the wrapped reader; it returns false at the end of the split
			return line.nextKeyValue();
		}
	}
}
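
Each map input key produced by this format is a Text of the form (fileName@offset) and the value is the line itself. A mapper for an inverted index built on top of it could look roughly like the following sketch (InvertedIndexMapper is a hypothetical name, not from the original source):

// Hypothetical mapper that consumes the (fileName@offset, line) pairs produced by
// FileNameLocInputFormat and emits (word, location) pairs for an inverted index.
package reverseIndex;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class InvertedIndexMapper extends Mapper<Text, Text, Text, Text> {
	private final Text word = new Text();

	@Override
	protected void map(Text key, Text value, Context context)
			throws IOException, InterruptedException {
		for (String token : value.toString().split("\\s+")) {
			if (token.isEmpty()) {
				continue;
			}
			word.set(token);
			// the key already carries "(fileName@offset)", so it becomes the posting
			context.write(word, key);
		}
	}
}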

Hadoop also ships with many built-in output formats and RecordWriters. An output format validates the job's output specification and writes the job's result data.

A custom output format. AlphaOutputFormat extends the generic multiformat base class (its full source follows the driver snippet below) and routes each record to a file named after the first letter of its key:


public static class AlphaOutputFormat extends multiformat<Text, IntWritable>{
		
		@Override
		protected String generateFileNameForKeyValue(Text key,
				IntWritable value, Configuration conf) {
			// route each record to a file named after the first letter of its key
			char c = key.toString().toLowerCase().charAt(0);
			if( c>='a' && c<='z'){
				return c+".txt";
			}else{
				return "other.txt";
			}
		}
		
	}



// set the custom output format in the job driver
		job.setOutputFormatClass(AlphaOutputFormat.class);



package com.rpc.nefu;
import java.io.DataOutputStream;  
import java.io.IOException;  
import java.util.HashMap;  
import java.util.Iterator;  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.FSDataOutputStream;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.Writable;  
import org.apache.hadoop.io.WritableComparable;  
import org.apache.hadoop.io.compress.CompressionCodec;  
import org.apache.hadoop.io.compress.GzipCodec;  
import org.apache.hadoop.mapreduce.OutputCommitter;  
import org.apache.hadoop.mapreduce.RecordWriter;  
import org.apache.hadoop.mapreduce.TaskAttemptContext;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
import org.apache.hadoop.util.ReflectionUtils;  

public abstract class multiformat<K extends WritableComparable<?>, V extends Writable>  
        extends FileOutputFormat<K, V> {  
    private MultiRecordWriter writer = null;  
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException,  
            InterruptedException {  
        if (writer == null) {  
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));  
        }  
        return writer;  
    }  
    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {  
        Path workPath = null;  
        OutputCommitter committer = super.getOutputCommitter(conf);  
        if (committer instanceof FileOutputCommitter) {  
            workPath = ((FileOutputCommitter) committer).getWorkPath();  
        } else {  
            Path outputPath = super.getOutputPath(conf);  
            if (outputPath == null) {  
                throw new IOException("Undefined job output-path");  
            }  
            workPath = outputPath;  
        }  
        return workPath;  
    }  
    /** derive the output file name (including extension) from key, value and conf */  
    protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);  
    public class MultiRecordWriter extends RecordWriter<K, V> {  
        /** cache of RecordWriters, one per output file */  
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;  
        private TaskAttemptContext job = null;  
        /** output directory */  
        private Path workPath = null;  
        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {  
            super();  
            this.job = job;  
            this.workPath = workPath;  
            recordWriters = new HashMap<String, RecordWriter<K, V>>();  
        }  
        @Override  
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {  
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();  
            while (values.hasNext()) {  
                values.next().close(context);  
            }  
            this.recordWriters.clear();  
        }  
        @Override  
        public void write(K key, V value) throws IOException, InterruptedException {  
            //derive the output file name for this record  
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());  
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);  
            if (rw == null) {  
                rw = getBaseRecordWriter(job, baseName);  
                this.recordWriters.put(baseName, rw);  
            }  
            rw.write(key, value);  
        }  
        // ${mapred.out.dir}/_temporary/_${taskid}/${nameWithExtension}  
        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)  
                throws IOException, InterruptedException {  
            Configuration conf = job.getConfiguration();  
            boolean isCompressed = getCompressOutput(job);  
            String keyValueSeparator = ",";  
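            // lineRecordWrite below is a line-oriented RecordWriter; its source is not  
            // included in this excerpt (a sketch follows after this class)  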
            RecordWriter<K, V> recordWriter = null;  
            if (isCompressed) {  
                Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job,  
                        GzipCodec.class);  
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);  
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());  
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);  
                recordWriter = new lineRecordWrite<K, V>(new DataOutputStream(codec  
                        .createOutputStream(fileOut)), keyValueSeparator);  
            } else {  
                Path file = new Path(workPath, baseName);  
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);  
                recordWriter = new lineRecordWrite<K, V>(fileOut, keyValueSeparator);  
            }  
            return recordWriter;  
        }  
    }  
}
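
getBaseRecordWriter() above constructs a lineRecordWrite, a line-oriented RecordWriter whose source is not included in this excerpt. A minimal sketch of such a class, modeled on Hadoop's TextOutputFormat.LineRecordWriter (the details below are assumptions, not the original implementation):

// Sketch of the line-oriented writer referenced above as lineRecordWrite.
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class lineRecordWrite<K, V> extends RecordWriter<K, V> {
    private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);

    private final DataOutputStream out;
    private final byte[] keyValueSeparator;

    public lineRecordWrite(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        this.keyValueSeparator = keyValueSeparator.getBytes(StandardCharsets.UTF_8);
    }

    // Text objects are written raw; anything else falls back to toString()
    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text t = (Text) o;
            out.write(t.getBytes(), 0, t.getLength());
        } else {
            out.write(o.toString().getBytes(StandardCharsets.UTF_8));
        }
    }

    @Override
    public synchronized void write(K key, V value) throws IOException {
        writeObject(key);
        out.write(keyValueSeparator);
        writeObject(value);
        out.write(NEWLINE);
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }
}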