Working with Local Files in the Map and Reduce Methods
You can work with local files directly inside the map and reduce methods, for example reading from or writing to the local file system. Keep in mind, though, that these reads and writes are distributed: each task reads from or writes to the local disk of the node it happens to run on.
Caveat: once the MapReduce program is written, be sure to package it as a jar and submit it from the command line. For a while no data showed up when I wrote to the local file system, and I assumed that map and reduce simply could not write there; that is not the case. My mistake was compiling and running the job directly from Eclipse on the master node, so the slave nodes never had the code and the run had no visible effect.
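For reference, packaging and submission from the command line look roughly like this (the jar name WordCountZKL.jar and the bin/ class directory are placeholders for your own build layout):

jar cf WordCountZKL.jar -C bin/ .
hadoop jar WordCountZKL.jar org.apache.hadoop.examples.WordCountZKL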
The code below writes a file to the local file system from within map. After it runs successfully, a LogInfo file appears under /home/hadoop on the slave nodes.
package org.apache.hadoop.examples;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
* AUTHOR: zhankunlin 2010-8-16
*/
public class WordCountZKL {
    public static class LogInfo {
        public static String LogFile = "/home/hadoop/LogInfo";

        // append a "begin" record for the given region (map/reduce/job) and task id
        public static void Begin(String region, String taskID) {
            FileOutputStream out;
            try {
                out = new FileOutputStream(LogFile, true); // open in append mode
                out.write((region + " " + taskID + " begin\n").getBytes());
                out.close();
            } catch (FileNotFoundException e) {
            } catch (IOException e) {
            }
        }

        // append an "end" record for the given region and task id
        public static void End(String region, String taskID) {
            FileOutputStream out;
            try {
                out = new FileOutputStream(LogFile, true);
                out.write((region + " " + taskID + " end\n").getBytes());
                out.close();
            } catch (FileNotFoundException e) {
            } catch (IOException e) {
            }
        }
    }
    // The standard WordCount mapper and reducer, kept here commented out for comparison
    // with the ZKL versions below.
    /*
    public static class WordCountMapper extends
            Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class WordCountReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
    */
    public static class WordCountMapperZKL extends
            Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Context context) throws IOException,
                InterruptedException {
            // a LogInfo file will be created on the slave node that runs this task
            LogInfo.Begin("map", context.getTaskAttemptID().getTaskID().toString());
            while (context.nextKeyValue()) {
                Object key = context.getCurrentKey();
                Text value = (Text) context.getCurrentValue();
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
            LogInfo.End("map", context.getTaskAttemptID().getTaskID().toString());
        }

        /**
         * Expert users can override this method for more complete control over the
         * execution of the Mapper.
         * @param context
         * @throws IOException
         */
        public void run(Context context) throws IOException, InterruptedException {
            setup(context);
            map(context);
            cleanup(context);
        }
    }
    public static class WordCountReducerZKL extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Context context) throws IOException, InterruptedException {
            while (context.nextKey()) {
                Text key = context.getCurrentKey();
                Iterable<IntWritable> values = context.getValues();
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        /**
         * Advanced application writers can use the
         * {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
         * control how the reduce task works.
         */
        public void run(Context context) throws IOException, InterruptedException {
            setup(context);
            reduce(context);
            cleanup(context);
        }
    }
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        LogInfo.Begin("job", "job_1"); // a LogInfo file will be created on the master node
        Configuration conf = new Configuration();
        /*
         * String[] otherArgs = new GenericOptionsParser(conf,
         * args).getRemainingArgs(); if (otherArgs.length != 2) {
         * System.err.println("Usage: wordcount <in> <out>"); System.exit(2); }
         */
        String[] inputPars = { "wcinZKL", "wcoutZKL" };
        String[] otherArgs = new GenericOptionsParser(conf, inputPars)
                .getRemainingArgs();
        Path outputPaths = new Path(otherArgs[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPaths)) { // please see the code of the exists() method
            // throw new FileAlreadyExistsException("Output directory " +
            // outputPaths + " already exists");
            FileStatus fsStatus = fs.getFileStatus(outputPaths);
            if (fsStatus.isDir()) // only exercises the HDFS API; not strictly necessary
                fs.delete(outputPaths, true);
            else
                fs.delete(outputPaths, false); // true is also ok
            System.out.println("Output directory \"" + outputPaths
                    + "\" already exists, deleting it first");
        }
        /*
         * FileStatus fsStatus = fs.getFileStatus(outputPaths);
         * if (fsStatus != null) {
         *     throw new FileAlreadyExistsException("Output directory " + outputPaths
         *             + " already exists");
         * }
         */
        Job job = new Job(conf, "word count zkl");
        job.setJarByClass(WordCountZKL.class);
        job.setMapperClass(WordCountMapperZKL.class);
        job.setCombinerClass(WordCountReducerZKL.class);
        job.setReducerClass(WordCountReducerZKL.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.out.println("job " + job.getJobName() + "(" + job.getJobID() + ")"
                + " finished? " + job.waitForCompletion(true));
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
        LogInfo.End("job", "job_1");
    }
}
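A second, separate example: compressing and decompressing a local file with Hadoop's compression codec classes. It assumes a file named uploadFile exists in the current working directory.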
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;
public class CompressionTest {
    // Assume a file named uploadFile exists in the local working directory;
    // gzip-compress it into uploadFile.gz.
    public static void StreamCompresson() throws IOException {
        Configuration conf = new Configuration();
        // CompressionCodec is the interface that wraps a compressor/decompressor;
        // the line below instantiates the gzip implementation via reflection.
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(GzipCodec.class, conf);
        FileOutputStream outFile = new FileOutputStream("uploadFile.gz"); // destination for the compressed data
        FileInputStream in = new FileInputStream("uploadFile"); // source data to be compressed
        // Wrapping the file output stream in a CompressionOutputStream compresses
        // everything written to it.
        CompressionOutputStream out = codec.createOutputStream(outFile);
        IOUtils.copyBytes(in, out, 4096, true);
    }

    // Decompress uploadFile.gz back into a file named uploadFile.
    public static void FileDecompressor() throws IOException {
        Configuration conf = new Configuration();
        FileSystem local = FileSystem.getLocal(conf);
        Path input = new Path("uploadFile.gz");
        // The factory knows about all available codecs...
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // ...and picks the right one from the file name suffix.
        CompressionCodec codec = factory.getCodec(input);
        // Strip the codec's suffix (.gz) to get the output file name.
        String outputUri = CompressionCodecFactory.removeSuffix("uploadFile.gz", codec.getDefaultExtension());
        InputStream in = null;
        OutputStream out = null;
        // Read from the decompressing input stream and copy into the output file.
        in = codec.createInputStream(local.open(input));
        out = local.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf, true);
    }

    public static void main(String[] args) throws IOException {
        StreamCompresson();
        FileDecompressor();
    }
}
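Note the design choice in FileDecompressor: instead of hard-coding GzipCodec, it lets CompressionCodecFactory choose the codec from the file name suffix, so the same code can decompress any codec registered with the factory, not just gzip.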