There are plenty of MapReduce demos online, but for learning it still pays to go step by step.

Enough preamble; straight to the code.

I set up a Hadoop pseudo-distributed cluster on a Linux machine; for details see: Hadoop - 1 - Cluster Setup


 

All of the code from my Hadoop learning series is in a complete project on GitHub; grab it if you need it.

If you find it useful, please give it a star:

https://github.com/Simba-cheng/HadoopCode

 

Running the job from IDEA on Windows against the remote Hadoop cluster

The code follows. The test data isn't pasted here; fetch it from GitHub or make up your own (see the sketch right after this paragraph for the line format the mapper expects).
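The real test data is in the GitHub repo. Purely as an illustration (these exact lines are made up; only the offsets matter), a line the mapper can parse puts the year in the first four characters, four characters it ignores, and the temperature from offset 8 onwards:

2014010138
2014010225
2015010330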

Log output configuration (log4j, set up programmatically; referencing LoadConfig triggers the static block):

package com.server.conf;

import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.apache.log4j.RollingFileAppender;

/**
 * @author CYX
 * @create 2018-08-30-23:38
 */
public class LoadConfig {

    public static final Logger logger = Logger.getLogger(LoadConfig.class);

    static {
        String pattern = "%d{yyyy-MM-dd HH:mm:ss} %p %l - %m%n";
        Logger rootLogger = Logger.getRootLogger();
        rootLogger.setLevel(Level.INFO);
        RollingFileAppender rollingFileAppender = new RollingFileAppender();
        // keep at most 4 rolled-over backup files
        rollingFileAppender.setMaxBackupIndex(4);
        rollingFileAppender.setMaxFileSize("50MB");
        rollingFileAppender.setFile("./log/logger.log");
        rollingFileAppender.setEncoding("UTF-8");
        rollingFileAppender.setLayout(new PatternLayout(pattern));
        rollingFileAppender.activateOptions();
        // log to the console as well as to the rolling file
        rootLogger.addAppender(new ConsoleAppender(new PatternLayout(pattern)));
        rootLogger.addAppender(rollingFileAppender);
    }

    private static class SingletonHolder {
        private static final LoadConfig INSTANCE = new LoadConfig();
    }

    public static final LoadConfig getInstance() {
        return SingletonHolder.INSTANCE;
    }

}

The mapper:

package com.server.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * The four generic type parameters are:
 * KeyIn        the mapper's input key: the byte offset at which each line starts (0, 11, ...)
 * ValueIn      the mapper's input value: the text of the line
 * KeyOut       the mapper's output key: the "year" taken from the line
 * ValueOut     the mapper's output value: the "temperature" taken from the line
 *
 * @author CYX
 * @create 2018-08-30-23:43
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    public static final Logger logger = Logger.getLogger(WordCountMapper.class);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        logger.info("key : " + key + " , value : " + value);

        // the whole line of text
        String line = value.toString();
        // year: the first four characters
        String year = line.substring(0, 4);
        // temperature: everything from offset 8 onwards
        int temperature = Integer.parseInt(line.substring(8));

        context.write(new Text(year), new IntWritable(temperature));

        logger.info("======" + "After Mapper:" + new Text(year) + ", " + new IntWritable(temperature));
    }
}
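If you want to sanity-check the mapper without touching the cluster, MRUnit can drive it in memory. This is only a sketch and assumes MRUnit and JUnit are added as test dependencies; they are not part of the original project.

package com.server.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordCountMapperTest {

    @Test
    public void extractsYearAndTemperature() throws IOException {
        // "2014010138": year in characters 0-3, temperature from offset 8 onwards
        new MapDriver<LongWritable, Text, Text, IntWritable>()
                .withMapper(new WordCountMapper())
                .withInput(new LongWritable(0), new Text("2014010138"))
                .withOutput(new Text("2014"), new IntWritable(38))
                .runTest();
    }
}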

The reducer:

package com.server.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Logger;

import java.io.IOException;

/**
 * The four generic type parameters are:
 * KeyIn        the reducer's input key: the "year" emitted by the mapper
 * ValueIn      the reducer's input values: the "temperature" values emitted for that year
 * KeyOut       the reducer's output key: each distinct "year"
 * ValueOut     the reducer's output value: the maximum "temperature" for that year
 *
 * @author CYX
 * @create 2018-08-30-23:44
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    public static final Logger logger = Logger.getLogger(WordCountReduce.class);

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        int maxValue = Integer.MIN_VALUE;
        StringBuffer sb = new StringBuffer();

        for (IntWritable intWritable : values) {
            maxValue = Math.max(maxValue, intWritable.get());
            sb.append(intWritable).append(" , ");
        }
        logger.info("Before Reduce: " + key + ", " + sb.toString());
        context.write(key, new IntWritable(maxValue));
        logger.info("======" + "After Reduce: " + key + ", " + maxValue);

    }
}
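The reducer can be checked the same way; again just a sketch, under the same assumption that MRUnit and JUnit are available as test dependencies.

package com.server.mapreduce;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class WordCountReduceTest {

    @Test
    public void keepsMaximumTemperaturePerYear() throws IOException {
        // the reducer should keep only the maximum of the grouped temperatures
        new ReduceDriver<Text, IntWritable, Text, IntWritable>()
                .withReducer(new WordCountReduce())
                .withInput(new Text("2014"), Arrays.asList(new IntWritable(25), new IntWritable(38)))
                .withOutput(new Text("2014"), new IntWritable(38))
                .runTest();
    }
}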

The driver (main):

package com.server;

import com.server.conf.LoadConfig;
import com.server.mapreduce.WordCountMapper;
import com.server.mapreduce.WordCountReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.log4j.Logger;

/**
 * @author CYX
 */
public class MapReduceAppDemo01 {


    public static final Logger logger = Logger.getLogger(MapReduceAppDemo01.class);

    public static void main(String[] args) throws Exception {

        // point Hadoop at the local Hadoop installation on Windows
        System.setProperty("hadoop.home.dir", "G:\\hadoop\\hadoop-2.8.1");

        LoadConfig.getInstance();

        // input file on HDFS
        String wordCountInputFileHDFS = "hdfs://192.168.137.160:9000/MapReduce-Demo-01/input/wordcount.txt";

        // output path on HDFS - it must not exist yet
        String wordCountOutputFileHDFS = "hdfs://192.168.137.160:9000/MapReduce-Demo-01/output/outresult01";

        Configuration hadoopConfig = new Configuration();
        // register the HDFS and local filesystem implementations explicitly,
        // which avoids "No FileSystem for scheme" errors when running from the IDE
        hadoopConfig.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
        hadoopConfig.set("fs.file.impl", LocalFileSystem.class.getName());

        Job job = Job.getInstance(hadoopConfig);

        // input and output paths for the job
        FileInputFormat.addInputPath(job, new Path(wordCountInputFileHDFS));
        FileOutputFormat.setOutputPath(job, new Path(wordCountOutputFileHDFS));

        // use the custom Mapper and Reducer for the two phases
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);

        // types of the final output key and value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // submit the job and wait until it finishes
        job.waitForCompletion(true);
        logger.info("Finished");

    }


}
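Note that the driver reads wordcount.txt straight from HDFS, so the test file has to be uploaded before the job runs. A rough sketch (the local file name is whatever you fetched from GitHub or created yourself):

#create the input directory and upload the test file
hadoop fs -mkdir -p /MapReduce-Demo-01/input
hadoop fs -put ./wordcount.txt /MapReduce-Demo-01/input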

If you just run the program at this point, though, it will fail.

Before you can drive the Linux Hadoop cluster remotely from IDEA, you need a local Hadoop installation on Windows.

There is plenty of material on that online, so I won't repeat it here.

Once the local Windows Hadoop is in place, running the program will still fail with the following error:

org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z

This is usually down to a missing or version-mismatched winutils.exe / hadoop.dll on the Windows side; search online for the details of the fix, or just grab the patched code from my GitHub.

 

Once all of the installation, setup, and fixes above are done, the program runs.

Then check the output directory on HDFS: the result files contain the maximum temperature for each year.

OK, driving the Hadoop cluster remotely from Windows works fine.

 

 

Packaging the MapReduce job as a jar and running it on the Linux server

Let's walk through another example to see how to package a MapReduce program into a jar, upload it, and run it on Linux.

The map phase:

package com.server.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Map phase<br>
 * The map phase extends org.apache.hadoop.mapreduce.Mapper and overrides the map method.
 *
 * @author CYX
 * @create 2018-09-01-12:28
 */
public class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);

    private Text word = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

        System.out.println("map() , key:" + key + " , value : " + value);

        StringTokenizer itr = new StringTokenizer(value.toString());

        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }

    }
}
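A side note on the splitting: StringTokenizer breaks the line on whitespace by default, which is why each whitespace-separated word becomes its own (word, 1) pair. A tiny standalone illustration, not part of the project:

package com.server.mapreduce;

import java.util.StringTokenizer;

public class TokenizerDemo {

    public static void main(String[] args) {
        // StringTokenizer splits on spaces, tabs and newlines by default
        StringTokenizer itr = new StringTokenizer("Hello Hadoop\nHello World");
        while (itr.hasMoreTokens()) {
            // the mapper emits exactly one (token, 1) pair per token
            System.out.println(itr.nextToken() + "\t1");
        }
        // prints: Hello 1, Hadoop 1, Hello 1, World 1 (one pair per line)
    }
}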

The reduce phase:

package com.server.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reduce phase
 *
 * @author CYX
 * @create 2018-09-01-13:16
 */
public class InSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

        System.out.println("reduce() , key : " + key + " , values : " + values);

        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }

        result.set(sum);
        context.write(key, result);

    }


}

The driver (main):

package com.server;

import com.server.mapreduce.InSumReducer;
import com.server.mapreduce.TokenizerMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * @author CYX
 */
public class MapReduceAppDemo02 {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        if (otherArgs.length != 2) {

            System.err.println("Usage : wrodcount <in><out>");
            System.exit(2);

        }

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(MapReduceAppDemo02.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(InSumReducer.class);
        job.setReducerClass(InSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

}
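One thing worth pointing out in the driver: job.setCombinerClass(InSumReducer.class) reuses the reducer as a combiner. That is safe here because summing counts is associative and commutative. For example, if one map task emits (Hello, 1) twice, the combiner can collapse that to (Hello, 2) locally, and the reducer still produces the correct total when it adds up the partial sums from all map tasks.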

Then package the program into a jar, using the Maven package goal from IDEA.
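If you prefer the command line to the IDEA Maven panel, the equivalent (assuming a standard Maven project layout) is simply:

mvn clean package

The jar then ends up under the module's target/ directory.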


Then upload the jar to the target directory on the server.


 

To be clear, the steps below assume Hadoop is installed on the server and its environment variables are configured; if not, it is worth setting that up first, since it makes everything easier.
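For reference, the relevant environment variables usually look something like the lines below in ~/.bashrc; the install path here is an assumption, so adjust it to wherever your Hadoop actually lives:

export HADOOP_HOME=/home/yxcheng/hadoop/hadoop-2.8.1
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin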

Create a new directory on the server:

/home/yxcheng/hadoop/mr/mapreducedemo02

and drop the jar into it.

Then create some test data:

echo "Hello World" > file-1.bcp
echo "Hello Hadoop" > file-2.bcp

The directory then contains the following files:


one runnable MapReduce jar and two test data files.

 

Next, create the directories on HDFS and upload the bcp files:

# list everything under the HDFS root
hadoop fs -ls /

# create the top-level directory
hadoop fs -mkdir /MapReduce-Demo-02

# create the second-level directories
hadoop fs -mkdir /MapReduce-Demo-02/input
hadoop fs -mkdir /MapReduce-Demo-02/output

# upload the test data to /MapReduce-Demo-02/input
# './file-1.bcp' is the local Linux path
# '/MapReduce-Demo-02/input' is the HDFS destination
hadoop fs -put ./file-1.bcp /MapReduce-Demo-02/input
hadoop fs -put ./file-2.bcp /MapReduce-Demo-02/input

# verify what landed in the input directory
hadoop fs -ls /MapReduce-Demo-02/input

After the upload, run the jar with the following command:

# run the jar with hadoop
hadoop jar Hadoop-MapReduceDemo-02-1.0-SNAPSHOT.jar com.server.MapReduceAppDemo02 /MapReduce-Demo-02/input /MapReduce-Demo-02/output/outResult02

A quick breakdown:

Hadoop-MapReduceDemo-02-1.0-SNAPSHOT.jar : the jar to run
com.server.MapReduceAppDemo02 : the fully qualified main class inside the jar
/MapReduce-Demo-02/input : the HDFS directory holding the test data
/MapReduce-Demo-02/output/outResult02 : the HDFS path where the results are written when the job finishes (it must not exist beforehand)

Run the command and let the job finish.


Check the result data on HDFS:
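With the default output format the reducer writes its results into part files (typically part-r-00000); given the two test files above, the counts should come out as Hadoop 1, Hello 2, World 1. For example:

hadoop fs -ls /MapReduce-Demo-02/output/outResult02
hadoop fs -cat /MapReduce-Demo-02/output/outResult02/part-r-00000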


Done.