MapReduce in Action: Compression and Decompression Examples
1 Compressing and Decompressing Data Streams
CompressionCodec provides two methods that make it easy to compress or decompress a stream. To compress data being written to an output stream, call createOutputStream(OutputStream out) to obtain a CompressionOutputStream, which writes the data to the underlying stream in compressed form. Conversely, to decompress data read from an input stream, call createInputStream(InputStream in) to obtain a CompressionInputStream, which reads decompressed data from the underlying stream. The TestCompress example below exercises both methods.
We will test the following compression codecs:
Compression format | Hadoop CompressionCodec class
DEFLATE            | org.apache.hadoop.io.compress.DefaultCodec
gzip               | org.apache.hadoop.io.compress.GzipCodec
bzip2              | org.apache.hadoop.io.compress.BZip2Codec
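Each codec appends its own default file extension when it writes output; the TestCompress example below relies on this via getDefaultExtension(). As a quick sanity check, here is a minimal sketch that prints the extension for each codec in the table; the class name PrintCodecExtensions is ours for illustration, not part of Hadoop:

package com.atguigu.mapreduce.compress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class PrintCodecExtensions {

    public static void main(String[] args) throws ClassNotFoundException {
        String[] codecClasses = {
                "org.apache.hadoop.io.compress.DefaultCodec",
                "org.apache.hadoop.io.compress.GzipCodec",
                "org.apache.hadoop.io.compress.BZip2Codec"
        };
        Configuration conf = new Configuration();
        for (String name : codecClasses) {
            // Instantiate the codec reflectively, the same way TestCompress does
            CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(Class.forName(name), conf);
            // Prints ".deflate", ".gz" and ".bz2" respectively
            System.out.println(name + " -> " + codec.getDefaultExtension());
        }
    }
}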
package com.atguigu.mapreduce.compress;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

public class TestCompress {

    public static void main(String[] args) throws Exception {
        compress("e:/hello.txt", "org.apache.hadoop.io.compress.BZip2Codec");
//      decompress("e:/hello.txt.bz2");
    }

    // Compress a file with the codec named by "method"
    private static void compress(String filename, String method) throws Exception {
        // 1 Open the input stream
        FileInputStream fis = new FileInputStream(new File(filename));
        Class<?> codecClass = Class.forName(method);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration());

        // 2 Open the output stream, appending the codec's default extension
        FileOutputStream fos = new FileOutputStream(new File(filename + codec.getDefaultExtension()));
        CompressionOutputStream cos = codec.createOutputStream(fos);

        // 3 Copy the input stream to the compressed output stream
        IOUtils.copyBytes(fis, cos, 1024 * 1024 * 5, false);

        // 4 Close resources
        fis.close();
        cos.close();
        fos.close();
    }

    // Decompress a file, inferring the codec from its extension
    private static void decompress(String filename) throws FileNotFoundException, IOException {
        // 0 Check whether we have a codec that can decompress this file
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        CompressionCodec codec = factory.getCodec(new Path(filename));
        if (codec == null) {
            System.out.println("cannot find codec for file " + filename);
            return;
        }

        // 1 Open the decompressing input stream
        CompressionInputStream cis = codec.createInputStream(new FileInputStream(new File(filename)));

        // 2 Open the output stream
        FileOutputStream fos = new FileOutputStream(new File(filename + ".decoded"));

        // 3 Copy the decompressed data to the output stream
        IOUtils.copyBytes(cis, fos, 1024 * 1024 * 5, false);

        // 4 Close resources
        cis.close();
        fos.close();
    }
}
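Running main as written compresses e:/hello.txt into e:/hello.txt.bz2. Commenting out the compress call and enabling decompress("e:/hello.txt.bz2") instead restores the data to e:/hello.txt.bz2.decoded. Because decompress resolves the codec from the file extension via CompressionCodecFactory, a file with an unrecognized extension is reported and skipped.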
2 Compressing the Map Output
Even when a MapReduce job's input and output files are uncompressed, you can still compress the intermediate output of the map tasks. That data is written to local disk and shipped across the network to the reduce nodes, so compressing it can improve performance considerably. Enabling it takes only two configuration properties; the driver below shows how to set them.
1) Compression codecs supported by the Hadoop build provided for this course include BZip2Codec and DefaultCodec.
package com.atguigu.mapreduce.compress;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();

        // Enable compression of the map output
        configuration.setBoolean("mapreduce.map.output.compress", true);
        // Set the codec used for the map output
        configuration.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);

        Job job = Job.getInstance(configuration);

        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean result = job.waitForCompletion(true);

        // Exit with 0 on success, 1 on failure
        System.exit(result ? 0 : 1);
    }
}
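Note that map-output compression only affects the intermediate shuffle data: it is decompressed automatically before the reducers consume it, and the final output in args[1] remains uncompressed. The benefit shows up as fewer bytes spilled to disk and shuffled across the network.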
2) The Mapper stays unchanged
package com.atguigu.mapreduce.compress;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1 Read one line
        String line = value.toString();

        // 2 Split it into words
        String[] words = line.split(" ");

        // 3 Emit each word with a count of 1
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
3) The Reducer stays unchanged
package com.atguigu.mapreduce.compress;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;

        // 1 Sum the counts for this key
        for (IntWritable value : values) {
            count += value.get();
        }

        // 2 Emit the total
        context.write(key, new IntWritable(count));
    }
}
3 Compressing the Reduce Output
This builds on the WordCount example above.
1) Modify the driver
package com.atguigu.mapreduce.compress;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();

        Job job = Job.getInstance(configuration);

        job.setJarByClass(WordCountDriver.class);

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Enable compression of the reduce output
        FileOutputFormat.setCompressOutput(job, true);

        // Choose the output codec
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
//      FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
//      FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

        boolean result = job.waitForCompletion(true);

        // Exit with 0 on success, 1 on failure
        System.exit(result ? 0 : 1);
    }
}
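After a successful run, the output directory contains compressed files such as part-r-00000.bz2 rather than plain part-r-00000. They can be opened with any bzip2-capable tool, or read back programmatically with the decompress method from the TestCompress example in section 1.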