
  • 小文件同样需要对应的元数据,过多的小文件元数据浪费内存空间
  • 寻址大量小文件浪费时间

hadoop archive

hadoop archive实际上底层实现是运行了一个MR任务。

  • name: 生成的压缩包文件名。文件名必须以.har结尾
  • parent path: 需要治理的小文件所在文件夹的父目录
  • r : 可选参数,指定归档文件的副本数(replication factor);不指定时默认为3。部分旧版本的官方文档中没有提到此参数。
  • src :源文件目录
  • dest: 目标文件目录
//dir1 和dir2是testfile目录下的两个子文件夹,可以指定大于等于1个子文件夹
 //也可以省略dir1 dir2,这样会直接治理testfile下的全部文件
 hadoop archive -archiveName testhar.har -p /testfile -r 2 dir1 dir2 /tmp


hdfs dfs -lsr <file path>

  • 此命令查看治理后的文件目录,与web ui上显示一致

hdfs dfs -lsr har://<filepath>

  • 此命令查看治理前的文件目录,与压缩前的目录显示一致


  • hdfs dfs -cp

Sequence Files

  • SequenceFile文件,其中数据格式为二进制。
  • SequenceFile文件主要由一条条record记录组成;每个record是键值对形式的。
  • 将SequenceFile文件作为小文件容器,将大量的小文件压缩成一个SequenceFile文件,小文件名作为recordkey,小文件内容作为recordvalue


  • 一个4字节的header(文件版本号)
  • 若干个record记录
  • 若干个位置随机的同步点sync marker
  • sync marker用于方便定位到记录边界。当seek寻找record错误时,直接从下一个sync marker查找

Sequence File 压缩方式

  • 不压缩
  • 以record为单位压缩
  • 以Sequence File中的block为单位压缩。
  • 多数情况以此压缩
  • 因为一个block包含多条记录,利用record间的相似性进行压缩,压缩效率更高
  • Sequence File中两个Sync marker之间相连的多个record为一个block

Sequence File写数据

import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class HDFSOperate {
    private static final String[] TESTDATA = {
            "The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models.",
            "It is designed to scale up from single servers to thousands of machines, each offering local computation and storage.",
            "Rather than rely on hardware to deliver high-availability, the library itself is designed to detect and handle failures at the application layer",
            "o delivering a highly-available service on top of a cluster of computers, each of which may be prone to failures.",
            "Hadoop Common: The common utilities that support the other Hadoop modules."

    public static void main(String[] args) throws IOException, URISyntaxException {
        //输出路径:要生成的Sequence File文件名
        String uri = "hdfs://node01:8020/writeSeFile";
        Configuration conf = new Configuration();
        FileSystem fileSystem = FileSystem.get(URI.create(uri), conf);
        //创建HDFS上Sequence File的路径实例
        Path path = new Path(uri);
        IntWritable key = new IntWritable();
        Text value = new Text();
        //创建向SequenceFile文件写入数据时的一些选项:path keyOption valueOption compressionTpye
        SequenceFile.Writer.Option pathOption = SequenceFile.Writer.file(path);
        SequenceFile.Writer.Option keyOption = SequenceFile.Writer.keyClass(key.getClass());
        SequenceFile.Writer.Option valueOption = SequenceFile.Writer.valueClass(value.getClass());
        //SequenceFile压缩方式:NONE | RECORD | BLOCK三选一
        SequenceFile.Writer.Option compressionOption = SequenceFile.Writer.compression(SequenceFile.CompressionType.RECORD);
        //SequenceFile.Writer.Option compressionOption = SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK);
        BZip2Codec bZip2Codec = new BZip2Codec();
        SequenceFile.Writer.Option compressAlgorithm = SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK,bZip2Codec);
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, pathOption, keyOption, valueOption,compressAlgorithm);*/
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, pathOption, keyOption, valueOption);

        for (int i = 0; i < 100000;i++){
            key.set(100 - i);
            value.set(TESTDATA[i% TESTDATA.length]);

查看Sequence File

// | head -100为可选参数
 hadoop fs -text hdfs://node01:8020/writeSeFile | head -100

Sequence File读数据

public static void main(String[] args) throws IOException {
        String uri = "hdfs://node01:8020/writeSeFile";
        Configuration conf = new Configuration();
        Path path =  new Path(uri);
        SequenceFile.Reader reader = null;

            SequenceFile.Reader.Option pathOption = SequenceFile.Reader.file(path);
            reader = new SequenceFile.Reader(conf, pathOption);
            IntWritable key = (IntWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Text value = (Text) ReflectionUtils.newInstance(reader.getValueClass(), conf);
            long position = reader.getPosition();

            while (reader.next(key,value)){
                String syncSeen = reader.syncSeen() ? "*" : "";
                position =reader.getPosition(); //beginning of next record
        } finally{

/** Read the next key/value pair in the file into <code>key</code> and
     * <code>val</code>.  Returns true if such a pair exists and false when at
     * end of file */
    public synchronized boolean next(Writable key, Writable val)
      throws IOException {
      if (val.getClass() != getValueClass())
        throw new IOException("wrong value class: "+val+" is not "+valClass);

      boolean more = next(key);
      if (more) {

      return more;