hadoop lzo解压命令 hadoop解压命令 hadoop解压文件

转载

mob64ca140b0bc8 2024-08-20 08:42:35

文章标签 hadoop 压缩文件 hadoop 压缩 apache Text 文章分类 Hadoop 大数据

hadoop计算需要在hdfs文件系统上进行，因此每次计算之前必须把需要用到的文件(我们称为原始文件)都上传到hdfs上。文件上传到hdfs上通常有两种方法：

a hadoop自带的dfs服务，put；

b hadoop的API，通过SequenceFile.Writer对象把原始文件以压缩的SequenceFile形式写入hdfs，可以实现这一功能；

将a、b方案进行对比，如下：

1 空间：方案a在hdfs上占用空间同本地，因此假设只上传日志文件，则保存一个月日志文件将消耗掉约10T空间，如果加上这期间的各种维表、事实表，将占用大约25T空间

方案b经测试，压缩比大约为3~4:1，因此假设hdfs空间为100T，原来只能保存约4个月的数据，现在可以保存约1年

2 上传时间：方案a的上传时间经测试，200G数据上传约1小时

方案b的上传时间，程序不做任何优化，大约是以上的4~6倍，但存在一定程度提升速度的余地

3 运算时间：经过对200G数据，大约4亿条记录的测试，如果程序以IO操作为主，则压缩数据的计算可以提高大约50%的速度，但如果程序以内存操作为主，则只能提高5%~10%的速度

4 其它：未压缩的数据还有一个好处是可以直接在hdfs上查看原始数据。压缩数据想看原始数据只能用程序把它导到本地，或者利用本地备份数据

压缩类型：按照hadoop api的介绍，SequenceFile的压缩类型(CompressionType)除不压缩的NONE外分两种：BLOCK和RECORD，其中RECORD是逐条记录只对value进行压缩，BLOCK是把多条记录的key和value分别汇集成块后一起压缩，一般采用BLOCK进行压缩。

对压缩文件进行计算，需要用SequenceFileInputFormat类来读入压缩文件，以下是计算程序的典型配置代码：

压缩wordcount程序

package fh;
 /*
  * 题目描述：在英文环境下，给定一篇文章，统计每个单词出现的次数，单词不区分大小写，标点符号不进行统计
  * 程序设计约束：程序需要两个输入参数，第一个为输入文件的路径，第二个为输出文件的路径，输出文件的格式为每行一条记录：“单词:”加上该单词出现的次数
  */

 import java.io.IOException;
 import java.util.Locale;
 import java.util.StringTokenizer;
 import java.util.regex.Pattern;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.filecache.DistributedCache;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.SequenceFile.CompressionType;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.util.GenericOptionsParser;

 public class MyWordCount {
     enum Counter {
         LINESKIP, // 出错行
     }

     public static class TokenizerMapper extends
             Mapper<Object, Text, Text, IntWritable> {

         private final static IntWritable one = new IntWritable(1);
         private Text word = new Text();
         // private String delim=" ,.?!:;-_()[]{}'"+'"';
         private String pattern = "[^\\w]"; // 正则表达式，代表不是0-9, a-z, A-Z的所有其它字符

         public void map(Object key, Text value, Context context) {
             try {
                 String line = value.toString().toLowerCase(); // 全部转为小写字母
                 line = line.replaceAll(pattern, " "); // 将非0-9, a-z, A-Z的字符替换为空格
                 StringTokenizer itr = new StringTokenizer(line);
                 while (itr.hasMoreTokens()) {
                     word.set(itr.nextToken());

                     context.write(word, one);
                 }
             } catch (Exception e) {
                 // 出错让计数器加一
                 context.getCounter(Counter.LINESKIP).increment(1);
                 return;
             }
         }
     }

     public static class IntSumReducer extends
             Reducer<Text, IntWritable, Text, IntWritable> {
         private IntWritable result = new IntWritable();

         public void reduce(Text key, Iterable<IntWritable> values,
                 Context context) throws IOException, InterruptedException {
             int sum = 0;
             /*
              * 获取conf配置，获取其中内容 Configuration conf=context.getConfiguration();
              * System.out.println(conf.get("fh"));
              */
             for (IntWritable val : values) {
                 sum += val.get();
             }
             result.set(sum);

             key.set(key.toString() + ":");
             // System.out.println(key.toString() + "--" + result);
             context.write(key, result);
         }
     }

     public static void main(String[] args) throws Exception {
         Configuration conf = new Configuration();
         
         //设置对map输出文件进行压缩
         conf.setBoolean("mapred.compress.map.output", true);
         conf.setClass("mapred.map.output.compression.codec", GzipCodec.class,CompressionCodec.class);
         
         String[] otherArgs = new GenericOptionsParser(conf, args)
                 .getRemainingArgs();

         if (otherArgs.length != 2) {
             System.err.println("Usage: wordcount <in> <out>");
             System.exit(2);
         }
         Job job = new Job();
         job.setJobName("wordcount");
         // 如果需要打成jar运行，需要下面这句
         job.setJarByClass(MyWordCount.class);

         job.setMapperClass(TokenizerMapper.class);
         job.setCombinerClass(IntSumReducer.class);
         job.setReducerClass(IntSumReducer.class);
         job.setOutputKeyClass(Text.class);
         job.setOutputValueClass(IntWritable.class);
         
         //压缩输出文件---压缩文件到hdfs上，不能直接查看，需要下载到本地查看
 //        FileOutputFormat.setCompressOutput(job, true);
 //        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
 //        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);//默认为RECORD,每条记录；建议改为BLOCK
         
         SequenceFileInputFormat.addInputPath(job, new Path(otherArgs[0]));
         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
         System.exit(job.waitForCompletion(true) ? 0 : 1);
     }
 }

未压缩的数据还有一个好处是可以直接在hdfs上查看原始数据。压缩数据想看原始数据只能用程序把它导到本地，或者利用本地备份数据

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。