- Personally, I don't think you can deeply understand how a MapReduce program runs until you have implemented an input format yourself. The goal here: use a custom input format to count word occurrences in a local text file. Sounds familiar, right?
- Step one is to extend the abstract class InputFormat. It has two methods to implement: one returns the split information, and the other returns the record reader (RecordReader). Here is its source:
public abstract class InputFormat<K, V> {

    public abstract List<InputSplit> getSplits(JobContext context)
            throws IOException, InterruptedException;

    public abstract RecordReader<K, V> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException;
}
Now look at the custom input format class:
public class TextInputFormat extends InputFormat<IntWritable, Text> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // For simplicity, records 1-3 form one split and records 4-6 form
        // another; the split objects are collected into a list.
        ArrayList<InputSplit> splits = new ArrayList<>();
        splits.add(new TestSplit(1, 3));
        splits.add(new TestSplit(4, 6));
        return splits;
    }

    @Override
    public RecordReader<IntWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new TestReader();
    }
}
3. First, the split information. The return type of getSplits() is a List of InputSplit, so a custom split clearly has to be an InputSplit subclass. We therefore extend the abstract class InputSplit, which has two methods to implement. getLength() returns the split size; this matters mainly because splits are later sorted by size. getLocations() returns the names of the nodes the split's data comes from; returning null here causes an exception, so return some array of node hostname strings. Here is the source:
public class TestSplit extends InputSplit implements Writable {

    private int start = 0;
    private int end = 0;

    // A split must have a no-arg constructor, or deserialization fails.
    public TestSplit() {
    }

    public TestSplit(int start, int end) {
        this.start = start;
        this.end = end;
    }

    // Return the split size.
    @Override
    public long getLength() throws IOException, InterruptedException {
        return end - start + 1;
    }

    // Return the location (node hostname) information.
    @Override
    public String[] getLocations() throws IOException, InterruptedException {
        return new String[]{"hadoop1", "hadoop2"};
    }

    // Serialization: write the fields out...
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(start);
        out.writeInt(end);
    }

    // ...and read them back in the same order.
    @Override
    public void readFields(DataInput in) throws IOException {
        this.start = in.readInt();
        this.end = in.readInt();
    }

    public int getStart() {
        return start;
    }

    public void setStart(int start) {
        this.start = start;
    }

    public int getEnd() {
        return end;
    }

    public void setEnd(int end) {
        this.end = end;
    }
}
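Since TestSplit implements Writable, Hadoop ships it between processes using exactly the write/readFields pair above. The round trip can be sketched with plain java.io streams and no Hadoop dependency (SplitRoundTrip is an illustrative name, not part of the project):

```java
import java.io.*;

public class SplitRoundTrip {

    // Mirror of TestSplit.write(): two ints onto a DataOutput.
    static byte[] serialize(int start, int end) throws IOException {
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(buf);
        out.writeInt(start);
        out.writeInt(end);
        return buf.toByteArray();
    }

    // Mirror of TestSplit.readFields(): read the ints back in the same order.
    static int[] deserialize(byte[] bytes) throws IOException {
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes));
        return new int[]{in.readInt(), in.readInt()};
    }

    public static void main(String[] args) throws IOException {
        int[] restored = deserialize(serialize(1, 3));
        System.out.println(restored[0] + ".." + restored[1]); // 1..3
    }
}
```

This is also why the no-arg constructor is mandatory: the framework first creates an empty split, then calls readFields() to fill it in.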
4. Now the RecordReader class, which breaks records into key/value pairs; here the key is the line number and the value is the line's contents. The work happens in nextKeyValue(), which returns true as long as there are still records left to hand out. First, though, we need the records themselves: I wrote a utility class that reads a local file and stores each line in a map. Here is the source:
public class TestReader extends RecordReader<IntWritable, Text> {

    private TestSplit testsplit;
    private int start;                            // first record of the split
    private int end;                              // last record of the split
    private IntWritable key = new IntWritable();  // key handed back to the mapper
    private Text value = new Text();              // value handed back to the mapper
    private Map<Integer, String> map = new HashMap<>();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        testsplit = (TestSplit) split;
        start = testsplit.getStart();
        end = testsplit.getEnd();
        map = TestText.getText(); // load the file contents
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (start <= end) {
            key.set(start);            // set the key and value for this record
            value.set(map.get(start));
            start++;
            return true;
        }
        return false;
    }

    @Override
    public IntWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    // The last two methods can be left as minimal stubs.
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return 0;
    }

    @Override
    public void close() throws IOException {
    }
}
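Stripped of the Hadoop types, the reader is just a walk over line numbers start..end with a map lookup per line. A stdlib-only sketch of that loop (ReaderDemo is a hypothetical name; each list entry stands for one key/value pair handed out by nextKeyValue()):

```java
import java.util.*;

public class ReaderDemo {

    // Emulates TestReader over a split [start, end]: collect the
    // (lineNumber, lineText) records in the order nextKeyValue() yields them.
    static List<String> readSplit(Map<Integer, String> lines, int start, int end) {
        List<String> records = new ArrayList<>();
        while (start <= end) {                            // nextKeyValue() returns true
            records.add(start + "\t" + lines.get(start)); // key = line number, value = text
            start++;
        }
        return records;                                   // after this, nextKeyValue() returns false
    }

    public static void main(String[] args) {
        Map<Integer, String> lines = new HashMap<>();
        for (int i = 1; i <= 6; i++) lines.put(i, "line " + i);
        System.out.println(readSplit(lines, 1, 3)); // the three records of the first split
    }
}
```

With the two splits (1,3) and (4,6) defined earlier, each mapper task sees exactly three such records.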
The utility class:
public class TestText {

    public static Map<Integer, String> getText() {
        Map<Integer, String> map = new HashMap<>();
        File file = new File("D:/Test.txt");
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String tempString;
            int line = 1;
            while ((tempString = reader.readLine()) != null) {
                map.put(line, tempString);
                line++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e1) {
                    // ignore failure on close
                }
            }
        }
        return map;
    }
}
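The same line-numbering logic is easier to test when it accepts any Reader instead of the hard-coded D:/Test.txt path. A sketch under that assumption (LineMapDemo and toLineMap are illustrative names):

```java
import java.io.*;
import java.util.*;

public class LineMapDemo {

    // Same logic as TestText.getText(), but over an arbitrary Reader,
    // so no file on disk is needed to exercise it.
    static Map<Integer, String> toLineMap(Reader source) throws IOException {
        Map<Integer, String> map = new HashMap<>();
        try (BufferedReader reader = new BufferedReader(source)) {
            String line;
            int lineNo = 1;                 // line numbers start at 1, as in TestText
            while ((line = reader.readLine()) != null) {
                map.put(lineNo++, line);
            }
        }
        return map;
    }

    public static void main(String[] args) throws IOException {
        Map<Integer, String> map = toLineMap(new StringReader("hello world\nhello hadoop"));
        System.out.println(map.get(2)); // hello hadoop
    }
}
```

The try-with-resources block also sidesteps the manual close-in-finally bookkeeping that the original utility needs.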
5. The Mapper class
public class TestMapper extends Mapper<IntWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    @Override
    public void map(IntWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer st = new StringTokenizer(line, " ");
        while (st.hasMoreTokens()) {
            word.set(st.nextToken());   // emit (word, 1) for every token in the line
            context.write(word, one);
        }
    }
}
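The body of map() is ordinary StringTokenizer splitting; for each token it emits a (word, 1) pair. A Hadoop-free sketch of that inner loop (MapperDemo is a hypothetical name; a tab-separated string stands in for each emitted pair):

```java
import java.util.*;

public class MapperDemo {

    // What map() does to one line: split on spaces, emit (word, 1) per token.
    static List<String> mapLine(String line) {
        List<String> emitted = new ArrayList<>();
        StringTokenizer st = new StringTokenizer(line, " ");
        while (st.hasMoreTokens()) {
            emitted.add(st.nextToken() + "\t1"); // stands for context.write(word, one)
        }
        return emitted;
    }

    public static void main(String[] args) {
        System.out.println(mapLine("hello world hello")); // [hello	1, world	1, hello	1]
    }
}
```

Note that the input key type is IntWritable (the line number from TestReader), not the LongWritable offset the built-in text input format would supply.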
6. The Reducer class
public class TestReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
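By the time reduce() runs, the framework has already grouped the mapper output by key, so the method only has to sum the 1s in one group. The core of it, as a stdlib sketch (ReducerDemo is an illustrative name):

```java
import java.util.*;

public class ReducerDemo {

    // The body of reduce(): sum the counts grouped under a single key.
    static int reduce(Iterable<Integer> values) {
        int sum = 0;
        for (int v : values) {
            sum += v;
        }
        return sum;
    }

    public static void main(String[] args) {
        // e.g. the key "hello" arriving with three 1s from the map phase:
        System.out.println(reduce(Arrays.asList(1, 1, 1))); // 3
    }
}
```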
7. The driver class
public class WordCount {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "wordcount"); // new Job(conf, ...) is deprecated
        job.setJarByClass(WordCount.class);
        job.setInputFormatClass(TextInputFormat.class); // our class above, not Hadoop's built-in one
        job.setMapperClass(TestMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setReducerClass(TestReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // No input path is set: our getSplits() ignores the input directory.
        FileOutputFormat.setOutputPath(job, new Path("01/")); // output saved under the local project directory
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
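Putting the pieces together, the whole job can be traced without Hadoop: walk the two splits (1,3) and (4,6), feed each line through the map step, and let a map merge do the grouping and summing the shuffle/reduce phase performs. PipelineDemo and its six sample lines are illustrative, not the project's test data:

```java
import java.util.*;

public class PipelineDemo {

    // End-to-end trace: splits -> records -> map -> group/sum (reduce).
    static Map<String, Integer> wordCount(Map<Integer, String> lines, int[][] splits) {
        Map<String, Integer> counts = new TreeMap<>();
        for (int[] split : splits) {                           // one mapper task per split
            for (int row = split[0]; row <= split[1]; row++) { // nextKeyValue() loop
                StringTokenizer st = new StringTokenizer(lines.get(row), " ");
                while (st.hasMoreTokens()) {                   // map(): emit (word, 1)
                    counts.merge(st.nextToken(), 1, Integer::sum); // reduce(): sum per word
                }
            }
        }
        return counts;
    }

    public static void main(String[] args) {
        Map<Integer, String> lines = new HashMap<>();
        lines.put(1, "hello world"); lines.put(2, "hello hadoop"); lines.put(3, "map reduce");
        lines.put(4, "hello map");   lines.put(5, "world count"); lines.put(6, "word count");
        System.out.println(wordCount(lines, new int[][]{{1, 3}, {4, 6}}));
    }
}
```

The real job writes the same word/count pairs, one per line, into part files under the 01/ output directory.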
8. GitHub project: https://github.com/iareuniqe/InputFormat