Map[Reduce] 的 setup 中读取 HDFS 文件夹信息

精选转载

xztelecomlcs 2015-12-31 14:50:33

文章标签 文件夹信息 private null public 文章分类 后端开发

有时候，我们想在 Map 或者 Reduce 执行前读取一些数据信息（量相对比较小），又不想通过另一个 Map 来读取，就可以在该 Map（或 Reduce）的 setup 方法中完成该操作：setup 在每个任务开始时只执行一次，早于所有 map/reduce 调用。

相关定义

1.在HDFS上某一文件夹下存放用户信息列表：

/user/data/userinfo/part-00000

/user/data/userinfo/part-00001

...

/user/data/userinfo/part-00020

2.每行数据格式：

userid + \t + [其他参数]

/**
 * Mapper that loads a small list of user ids from an HDFS directory once per
 * task, in {@code setup}, so the ids are available to every {@code map} call
 * without running a separate job.
 *
 * <p>The directory is read from the job configuration key
 * {@code userIdFilePath}. Each line of each file is expected to have the form
 * {@code userId \t [other fields]}; only the first field is used.
 */
public static class TestMapper extends
    Mapper<Text, Text, NullWritable, NullWritable> {
  /** Distinct user ids in first-seen order; {@code null} until first loaded. */
  ArrayList<Integer> userIdList = null;

  /** HDFS path of the directory that holds the user id files. */
  private String userIdFilePath = "";

  /**
   * Reads every file directly under {@code userIdFilePath} and replaces the
   * contents of {@code userIdList} with the distinct user ids found.
   *
   * <p>Sub-directories and blank lines are skipped.
   *
   * @param context task context; its configuration is used to obtain the
   *     {@code FileSystem}
   * @throws IOException if the path is not configured, the directory cannot
   *     be listed, a file cannot be read, or a non-blank line does not start
   *     with an integer user id
   */
  public void readUserIdInfo(Context context) throws IOException {
    if (userIdFilePath == null || userIdFilePath.isEmpty()) {
      // Fail with a clear message instead of the IllegalArgumentException
      // that new Path(null) / new Path("") would raise.
      throw new IOException(
          "userIdFilePath is not set (configuration key 'userIdFilePath')");
    }

    if (userIdList == null) {
      userIdList = new ArrayList<Integer>();
    } else {
      userIdList.clear();
    }

    // FileSystem.get normally hands out a cached, shared instance, so it is
    // deliberately not closed here.
    FileSystem fs = FileSystem.get(context.getConfiguration());
    FileStatus[] stats = fs.listStatus(new Path(userIdFilePath));
    if (stats == null) {
      // Some Hadoop versions return null rather than throwing for a missing path.
      throw new IOException("Cannot list user id directory: " + userIdFilePath);
    }

    // An insertion-ordered set drops duplicates in constant time per id while
    // keeping the first-seen order the list previously had. Fully qualified
    // so that no additional import is required.
    java.util.Set<Integer> distinctIds = new java.util.LinkedHashSet<Integer>();
    for (FileStatus stat : stats) {
      if (stat.isDirectory()) {
        // Only plain files can be opened; nested directories are ignored.
        continue;
      }
      readUserIdsFromFile(fs, stat.getPath(), distinctIds);
    }
    userIdList.addAll(distinctIds);
  }

  /**
   * Parses the leading user id of every non-blank line in {@code file} and
   * adds it to {@code ids}. The stream is always closed, even on failure.
   */
  private static void readUserIdsFromFile(FileSystem fs, Path file,
      java.util.Collection<Integer> ids) throws IOException {
    FSDataInputStream fin = fs.open(file);
    try {
      BufferedReader input =
          new BufferedReader(new InputStreamReader(fin, "UTF-8"));
      String line;
      while ((line = input.readLine()) != null) {
        if (line.trim().isEmpty()) {
          continue;
        }
        // Only the userId field (everything before the first tab) is needed.
        int tab = line.indexOf('\t');
        String idField = (tab < 0 ? line : line.substring(0, tab)).trim();
        try {
          ids.add(Integer.parseInt(idField));
        } catch (NumberFormatException e) {
          throw new IOException(
              "Malformed user id in " + file + ": \"" + line + "\"", e);
        }
      }
    } finally {
      // Closing the underlying stream releases everything the reader holds.
      fin.close();
    }
  }

  /**
   * Loads the user id list before any {@code map} call is made.
   *
   * @param context task context providing the job configuration
   * @throws IOException if the user id files cannot be loaded
   */
  @Override
  protected void setup(Context context) throws IOException {
    userIdFilePath = context.getConfiguration().get("userIdFilePath");
    readUserIdInfo(context);
  }

  /**
   * Releases the loaded user ids.
   *
   * @param context task context (unused)
   * @throws IOException never thrown by this implementation
   */
  @Override
  protected void cleanup(Context context) throws IOException {
    // userIdList is still null if loading failed before the list was created.
    if (userIdList != null) {
      userIdList.clear();
    }
  }

  /**
   * Per-record map step; intentionally left empty in this example.
   */
  @Override
  public void map(Text key, Text value, Context context)
      throws IOException, InterruptedException {
    // map logic goes here
  }
}