[Figure: MapReduce data flow from input files through map, shuffle, and reduce]

Input files 1, 2, and 3 are processed by mappers into intermediate results, which are then shuffled to the reducers to produce the final output; the keys arriving at each reducer are sorted by default. In Hadoop 1 the default HDFS block size is 64 MB; from Hadoop 2 onward the default is 128 MB.
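
The default can be changed through the dfs.blocksize property, either cluster-wide in hdfs-site.xml or from the client side on the Configuration object. A minimal sketch of the latter (the 256 MB value is only an illustration):

Configuration conf = new Configuration();
// Files created through this configuration get 256 MB blocks;
// the Hadoop 2+ default is 134217728 (128 MB)
conf.setLong("dfs.blocksize", 256L * 1024 * 1024);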

Example 1:

Given a batch of words as input, count how many times each word appears.

The input format is shown below: words separated by spaces.

[Figure: sample input file with space-separated words]


Write the mapper that emits the intermediate results

/**
 * The input to the map is (line offset, line contents)
 * The output of the map is (word, 1)
 * So the mapper extends Mapper<LongWritable (offset), Text (line contents), Text (word), IntWritable (1)>
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
     * 
     * @param key the byte offset at which this line starts in the input file
     * @param value the contents of the line
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Words on each line are separated by spaces
        String[] words = line.split(" ");
        for (String word : words) {
            // Emit the word as the key and a count of 1 as the value
            context.write(new Text(word), new IntWritable(1));
        }
    }
}

Write the reducer that produces the final result

/**
 * The mapper's output is <Text, IntWritable>, which is also the reducer's input
 * The reducer's output is <Text (word), IntWritable (number of times the word appears)>
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable value : values) {
            // Sum the 1s emitted by the mapper to get the word's total count
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}

Write the main method that runs the job

public class WordCountRunner {
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            Job wcjob = Job.getInstance(conf);
            wcjob.setJarByClass(WordCountRunner.class);
            wcjob.setMapperClass(WordCountMapper.class);
            wcjob.setReducerClass(WordCountReducer.class);
            // Key type of the mapper output
            wcjob.setMapOutputKeyClass(Text.class);
            // Value type of the mapper output
            wcjob.setMapOutputValueClass(IntWritable.class);
            // Key type of the reducer output
            wcjob.setOutputKeyClass(Text.class);
            // Value type of the reducer output
            wcjob.setOutputValueClass(IntWritable.class);

            // Input path
            FileInputFormat.setInputPaths(wcjob, new Path("hdfs://localhost:9000/wordcount/input/"));
            // Output path; it must not already exist, otherwise the job fails
            FileOutputFormat.setOutputPath(wcjob, new Path("hdfs://localhost:9000/wordcount/output/"));
            boolean res = wcjob.waitForCompletion(true);
            System.out.println(res);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
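
An optional tweak, not required for correctness: because the reduce step is just a sum, the same class can also be registered as a combiner so that counts are pre-aggregated on the map side before the shuffle. A sketch of the extra line, placed next to the other job settings above:

            // Optional: pre-aggregate counts on the map side; safe here because summing is associative
            wcjob.setCombinerClass(WordCountReducer.class);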

Example 2: compute the product of two matrices

Matrix 1:

[Figure: contents of matrix 1]

Matrix 2:

[Figure: contents of matrix 2]


In each line, the first number is the row index of the matrix, and the part after the tab lists the values of that row's columns as column_value pairs. In the figure above, for example, 1_0 means that column 1 holds the value 0; combined with the leading row number 1, the element in row 1, column 1 of the matrix is 0.

To make the multiplication easier to compute, matrix 2 first needs to be transposed; that is what step 1 below does.
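
In terms of the usual definition of matrix multiplication, the element of the product at row $i$, column $j$ is

$$c_{ij} = \sum_{k} a_{ik}\, b_{kj} = \sum_{k} a_{ik}\, (B^{T})_{jk},$$

so after transposing matrix 2 each output element is the dot product of one row of matrix 1 with one row of the transposed matrix 2, which lets both operands be read in the same row-oriented format described above.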

public class Mapper1 extends Mapper<LongWritable, Text, Text, Text> {
    private Text outKey = new Text();
    private Text outValue = new Text();

    /**
     *
     * @param key the byte offset of the line
     * @param value one line of the matrix file: the row number, a tab, then the column_value pairs
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The row number and the column values are separated by a tab
        String[] rowAndLine = value.toString().split("\t");
        String row = rowAndLine[0];
        String[] lines = rowAndLine[1].split(",");

        for (int i = 0; i < lines.length; i++) {
            String column = lines[i].split("_")[0];
            String valueStr = lines[i].split("_")[1];
            // The key is the column index
            outKey.set(column);
            // The value is (row + "_" + element value)
            outValue.set(row + "_" + valueStr);
            context.write(outKey, outValue);
        }
    }
}
public class Reducer1 extends Reducer<Text, Text, Text, Text> {
    private Text outKey = new Text();
    private Text outValue = new Text();

    /**
     * Transposes matrix 2: for each column index, concatenate the (row_value) pairs of that column
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Concatenate all "row_value" pairs belonging to this column, separated by commas
        StringBuilder sb = new StringBuilder();
        for (Text text : values) {
            if (sb.length() > 0) {
                sb.append(",");
            }
            sb.append(text.toString());
        }
        outKey.set(key);
        outValue.set(sb.toString());
        context.write(outKey, outValue);
    }
}
public class MR1 {
    private static String inPath = "/matrix/step1_input/matrix2.txt";
    private static String outPath = "/matrix/step1_output";
    private static String hdfs = "hdfs://localhost:9000";

    public int run() {
        try {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", hdfs);
            Job job = Job.getInstance(conf, "step1");
            job.setJarByClass(MR1.class);
            job.setMapperClass(Mapper1.class);
            job.setReducerClass(Reducer1.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileSystem fs = FileSystem.get(conf);
            Path inputPath = new Path(inPath);
            if (fs.exists(inputPath)) {
                FileInputFormat.addInputPath(job, inputPath);
            }
            Path outputPath = new Path(outPath);
            // Delete the output directory if it already exists; otherwise the job would fail
            fs.delete(outputPath, true);
            FileOutputFormat.setOutputPath(job, outputPath);
            return job.waitForCompletion(true) ? 1 : -1;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

    public static void main(String[] args) {
        BasicConfigurator.configure();
        int result = -1;
        result = new MR1().run();
        if (result == 1) {
            System.out.println("step1 finished successfully");
        } else if (result == -1) {
            System.out.println("step1 failed");
        }
    }
}

Running MR1's main method produces the transposed matrix 2. MapReduce writes the result into the /matrix/step1_output directory, which ends up containing two files: an empty _SUCCESS file that marks the job as successful, and part-r-00000, which holds the actual output.
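
To inspect the step1 output from Java instead of the HDFS shell, the same FileSystem API can stream the part file to standard output. A minimal sketch (the class name PrintStep1Output is just an illustration):

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class PrintStep1Output {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://localhost:9000");
        FileSystem fs = FileSystem.get(conf);
        // Stream the reducer output of step1 to stdout
        try (InputStream in = fs.open(new Path("/matrix/step1_output/part-r-00000"))) {
            IOUtils.copyBytes(in, System.out, 4096, false);
        }
    }
}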

Write the entry point for MR2

public class MR2 {
    private static String inPath = "/matrix/step2_input/matrix1.txt";
    private static String outPath = "/matrix/output";
    private static String cache = "/matrix/step1_output/part-r-00000";
    private static String hdfs = "hdfs://localhost:9000";

    public int run() {
        try {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", hdfs);
            Job job = Job.getInstance(conf, "step2");
            // Ship the transposed matrix 2 (the step1 output) to every task through the distributed cache;
            // "#matrix2" creates a symlink named matrix2 in the task's working directory
            job.addCacheFile(new URI(cache + "#matrix2"));
            job.setJarByClass(MR2.class);
            job.setMapperClass(Mapper2.class);
            job.setReducerClass(Reducer2.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            FileSystem fs = FileSystem.get(conf);
            Path inputPath = new Path(inPath);
            if (fs.exists(inputPath)) {
                FileInputFormat.addInputPath(job, inputPath);
            }
            Path outputPath = new Path(outPath);
            fs.delete(outputPath, true);
            FileOutputFormat.setOutputPath(job, outputPath);
            return job.waitForCompletion(true) ? 1 : -1;
        } catch (Exception e) {
            e.printStackTrace();
        }
        return -1;
    }

    public static void main(String[] args) {
        BasicConfigurator.configure();
        int result = 0;
        result = new MR2().run();
        if (result == 1) {
            System.out.println("step2 finished successfully");
        } else if (result == -1) {
            System.out.println("step2 failed");
        }
    }
}
public class Mapper2 extends Mapper<LongWritable, Text, Text, Text> {
    private Text outKey = new Text();
    private Text outValue = new Text();

    private List<String> cacheList = new ArrayList<>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the transposed matrix 2 from the distributed cache through the "matrix2" symlink
        try (BufferedReader br = new BufferedReader(new FileReader("matrix2"))) {
            String line;
            while ((line = br.readLine()) != null) {
                cacheList.add(line);
            }
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String row_matrix1 = value.toString().split("\t")[0];
        String[] column_value_array_matrix1 = value.toString().split("\t")[1].split(",");
        for (String line : cacheList) {
            String row_matrix2 = line.split("\t")[0];
            String[] column_value_array_matrix2 = line.split("\t")[1].split(",");

            int result = 0;
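            // Accumulate a(i,k) * b(k,j): the dot product of row i of matrix 1
            // with row j of matrix 2 (stored here as row j of the transposed matrix 2)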
            for (String column_value_matrix1 : column_value_array_matrix1) {
                String column_matrix1 = column_value_matrix1.split("_")[0];
                String value_matrix1 = column_value_matrix1.split("_")[1];

                for (String column_value_matrix2 : column_value_array_matrix2) {
                    if (column_value_matrix2.startsWith(column_matrix1 + "_")) {
                        String value_matrix2 = column_value_matrix2.split("_")[1];
                        result += Integer.valueOf(value_matrix1) * Integer.valueOf(value_matrix2);
                    }
                }
            }
            outKey.set(row_matrix1);
            // When the result is written to the output file, the key and value are separated by a tab by default
            outValue.set(row_matrix2 + "_" + result);
            context.write(outKey, outValue);
        }
    }
}
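
MR2 also needs a Reducer2, which is not listed above. A plausible implementation simply mirrors Reducer1 and concatenates the per-row results (a sketch, not the author's original code):

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class Reducer2 extends Reducer<Text, Text, Text, Text> {
    private Text outValue = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Concatenate all "column_result" pairs computed for this row, separated by commas
        StringBuilder sb = new StringBuilder();
        for (Text value : values) {
            if (sb.length() > 0) {
                sb.append(",");
            }
            sb.append(value.toString());
        }
        outValue.set(sb.toString());
        context.write(key, outValue);
    }
}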

The output looks like this:

[Figure: final result of the matrix multiplication]


The rows come out in the order 1, 2, 3, 4, 5 because MapReduce sorts the output keys lexicographically by default; the order of the column_result pairs after each key is not guaranteed, but that does not matter, since the result is still correct.