Working with Local Files in the Map and Reduce Methods
You can work with local files directly inside the map and reduce methods, for example reading from or writing to the local file system. Keep in mind, though, that these reads and writes are distributed: each task reads from or writes to the local disk of the node it happens to run on.
Caveat: once the MapReduce program is written, be sure to package it as a jar and submit it from the command line. For a while no data showed up when I wrote to the local file system, and I assumed that map and reduce simply could not write there; that is not the case. My mistake was compiling and running the job directly from Eclipse on the master node, so the slave nodes never had the code and the run had no visible effect.
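For reference, packaging and submission from the command line look roughly like this (the jar name WordCountZKL.jar and the bin/ class directory are placeholders for your own build layout):

jar cf WordCountZKL.jar -C bin/ .
hadoop jar WordCountZKL.jar org.apache.hadoop.examples.WordCountZKL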
The code below writes a file to the local file system from within map. After it runs successfully, a LogInfo file appears under /home/hadoop on the slave nodes.
package org.apache.hadoop.examples;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/*
* AUTHOR: zhankunlin 2010-8-16
*/
public class WordCountZKL {
    public static class LogInfo {
        public static String LogFile = "/home/hadoop/LogInfo";

        // append a "begin" record for the given region (map/reduce/job) and task id
        public static void Begin(String region, String taskID) {
            FileOutputStream out;
            try {
                out = new FileOutputStream(LogFile, true); // open in append mode
                out.write((region + " " + taskID + " begin\n").getBytes());
                out.close();
            } catch (FileNotFoundException e) {
            } catch (IOException e) {
            }
        }

        // append an "end" record for the given region and task id
        public static void End(String region, String taskID) {
            FileOutputStream out;
            try {
                out = new FileOutputStream(LogFile, true);
                out.write((region + " " + taskID + " end\n").getBytes());
                out.close();
            } catch (FileNotFoundException e) {
            } catch (IOException e) {
            }
        }
    }
    // The standard WordCount mapper and reducer, kept here commented out for comparison
    // with the ZKL versions below.
    /*
    public static class WordCountMapper extends
            Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class WordCountReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
    */
    public static class WordCountMapperZKL extends
            Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Context context) throws IOException,
                InterruptedException {
            // a LogInfo file will be created on the slave node that runs this task
            LogInfo.Begin("map", context.getTaskAttemptID().getTaskID().toString());
            while (context.nextKeyValue()) {
                Object key = context.getCurrentKey();
                Text value = (Text) context.getCurrentValue();
                StringTokenizer itr = new StringTokenizer(value.toString());
                while (itr.hasMoreTokens()) {
                    word.set(itr.nextToken());
                    context.write(word, one);
                }
            }
            LogInfo.End("map", context.getTaskAttemptID().getTaskID().toString());
        }

        /**
         * Expert users can override this method for more complete control over the
         * execution of the Mapper.
         * @param context
         * @throws IOException
         */
        public void run(Context context) throws IOException, InterruptedException {
            setup(context);
            map(context);
            cleanup(context);
        }
    }
    public static class WordCountReducerZKL extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Context context) throws IOException, InterruptedException {
            while (context.nextKey()) {
                Text key = context.getCurrentKey();
                Iterable<IntWritable> values = context.getValues();
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                result.set(sum);
                context.write(key, result);
            }
        }

        /**
         * Advanced application writers can use the
         * {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
         * control how the reduce task works.
         */
        public void run(Context context) throws IOException, InterruptedException {
            setup(context);
            reduce(context);
            cleanup(context);
        }
    }
    @SuppressWarnings("deprecation")
    public static void main(String[] args) throws Exception {
        LogInfo.Begin("job", "job_1"); // a LogInfo file will be created on the master node
        Configuration conf = new Configuration();
        /*
         * String[] otherArgs = new GenericOptionsParser(conf,
         * args).getRemainingArgs(); if (otherArgs.length != 2) {
         * System.err.println("Usage: wordcount <in> <out>"); System.exit(2); }
         */
        String[] inputPars = { "wcinZKL", "wcoutZKL" };
        String[] otherArgs = new GenericOptionsParser(conf, inputPars)
                .getRemainingArgs();
        Path outputPaths = new Path(otherArgs[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPaths)) { // please see the code of the exists() method
            // throw new FileAlreadyExistsException("Output directory " +
            // outputPaths + " already exists");
            FileStatus fsStatus = fs.getFileStatus(outputPaths);
            if (fsStatus.isDir()) // only exercises the HDFS API; not strictly necessary
                fs.delete(outputPaths, true);
            else
                fs.delete(outputPaths, false); // true is also ok
            System.out.println("Output directory \"" + outputPaths
                    + "\" already exists, deleting it first");
        }
        /*
         * FileStatus fsStatus = fs.getFileStatus(outputPaths);
         * if (fsStatus != null) {
         *     throw new FileAlreadyExistsException("Output directory " + outputPaths
         *             + " already exists");
         * }
         */
        Job job = new Job(conf, "word count zkl");
        job.setJarByClass(WordCountZKL.class);
        job.setMapperClass(WordCountMapperZKL.class);
        job.setCombinerClass(WordCountReducerZKL.class);
        job.setReducerClass(WordCountReducerZKL.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.out.println("job " + job.getJobName() + "(" + job.getJobID() + ")"
                + " finished? " + job.waitForCompletion(true));
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
        LogInfo.End("job", "job_1");
    }
}
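A second, separate example: compressing and decompressing a local file with Hadoop's compression codec classes. It assumes a file named uploadFile exists in the current working directory.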
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;
public class CompressionTest {
    // Assume a file named uploadFile exists in the local working directory;
    // gzip-compress it into uploadFile.gz.
    public static void StreamCompresson() throws IOException {
        Configuration conf = new Configuration();
        // CompressionCodec is the interface that wraps a compressor/decompressor;
        // the line below instantiates the gzip implementation via reflection.
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(GzipCodec.class, conf);
        FileOutputStream outFile = new FileOutputStream("uploadFile.gz"); // destination for the compressed data
        FileInputStream in = new FileInputStream("uploadFile"); // source data to be compressed
        // Wrapping the file output stream in a CompressionOutputStream compresses
        // everything written to it.
        CompressionOutputStream out = codec.createOutputStream(outFile);
        IOUtils.copyBytes(in, out, 4096, true);
    }

    // Decompress uploadFile.gz back into a file named uploadFile.
    public static void FileDecompressor() throws IOException {
        Configuration conf = new Configuration();
        FileSystem local = FileSystem.getLocal(conf);
        Path input = new Path("uploadFile.gz");
        // The factory knows about all available codecs...
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        // ...and picks the right one from the file name suffix.
        CompressionCodec codec = factory.getCodec(input);
        // Strip the codec's suffix (.gz) to get the output file name.
        String outputUri = CompressionCodecFactory.removeSuffix("uploadFile.gz", codec.getDefaultExtension());
        InputStream in = null;
        OutputStream out = null;
        // Read from the decompressing input stream and copy into the output file.
        in = codec.createInputStream(local.open(input));
        out = local.create(new Path(outputUri));
        IOUtils.copyBytes(in, out, conf, true);
    }

    public static void main(String[] args) throws IOException {
        StreamCompresson();
        FileDecompressor();
    }
}
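Note the design choice in FileDecompressor: instead of hard-coding GzipCodec, it lets CompressionCodecFactory choose the codec from the file name suffix, so the same code can decompress any codec registered with the factory, not just gzip.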