MapReduce is a programming framework for writing distributed computation programs; it is the core framework for developing Hadoop-based data analysis applications.

A distributed MapReduce program usually runs in two phases:

Map phase: the map task instances run fully in parallel, each working on its own slice of the input without interfering with the others;

Reduce phase: the reduce task instances are likewise independent of one another, but their input depends on the output of the map-phase instances;

The MRAppMaster is responsible for coordinating and monitoring the map and reduce tasks.

Joining two tables in MapReduce:

Option 1: a map-side join using the distributed cache. Treat the small table as a file, ship that file to every task's local disk, load it into memory when the task starts, and join it against the other table inside map(); only map-side code is needed.


For example:

package cn.doitedu.basic.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;

public class MapJoin {

public static class MapJoinMapper extends Mapper<LongWritable, Text,Text, NullWritable>{
HashMap<String, String> dictB = new HashMap<>(); // in-memory copy of the small table B, keyed by its join column
@Override
protected void setup(Context context) throws IOException, InterruptedException {

// b1.txt is the cached small table; the distributed cache symlinks it into the task's working directory, so it can be read like a local file
BufferedReader br = new BufferedReader(new FileReader("b1.txt"));
String line = null;
while((line=br.readLine())!=null){
String[] split = line.split(",");
dictB.put(split[0],line); // key = the join column (first field of table B)
}


}


@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

// the join column of table A is its second field; look up the matching record of table B
String did = value.toString().split(",")[1];
String b_data = dictB.get(did); // null when table B has no matching record

context.write(new Text(value.toString()+","+b_data),NullWritable.get());

}
}

public static void main(String[] args) throws Exception {

Configuration conf = new Configuration();
Job job = Job.getInstance(conf);

job.setJarByClass(MapJoin.class);

job.setMapperClass(MapJoinMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);

job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);

FileInputFormat.setInputPaths(job,new Path("basic_knowledge/data/join/t_a"));
FileOutputFormat.setOutputPath(job,new Path("basic_knowledge/data/join/output"));

job.addCacheFile(new URI("hdfs://doitedu01:8020/cachefiles/b1.txt"));

boolean b = job.waitForCompletion(true);
System.exit(b?0:2);

}


}
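One detail worth noting: the driver above never configures a reducer, so Hadoop still runs a single default (identity) reduce task and the mapper output passes through an unnecessary shuffle. Since the join is finished entirely on the map side, the reduce phase can be switched off; a minimal, hypothetical addition to the main method above:

job.setNumReduceTasks(0); // map-only job: mapper output is written straight to the output directory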

Option 2: a regular job with both map and reduce code; the join is performed on the reduce side (a reduce-side join).


For example:

package cn.doitedu.basic.mr;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

public class ReduceJoin {

public static class JoinMapper extends Mapper<LongWritable, Text,Text,Text>{
String name = "";
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// remember which file this split comes from, so map() can tell table A from table B
FileSplit inputSplit = (FileSplit) context.getInputSplit();
name = inputSplit.getPath().getName();

}

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

String line = value.toString();

String deviceId = "";
String[] split = line.split(",");

String v = value.toString();

if(name.startsWith("a")){ // records from table A: the join key is the 2nd column; tag the value with "a"
deviceId = split[1];
v = "a,"+v;
}else{ // records from table B: the join key is the 1st column; tag the value with "b"
deviceId = split[0];
v = "b,"+v;
}

context.write(new Text(deviceId),new Text(v));
}
}


public static class JoinReducer extends Reducer<Text,Text,Text, NullWritable>{

@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
ArrayList<String> aDatas = new ArrayList<>(); // all table-A records that share this device id
String bData = ""; // the matching table-B record (at most one expected)

for (Text value : values) {
if(value.toString().startsWith("a")){
aDatas.add(value.toString());
}else{
bData = value.toString();
}
}


// emit one joined line per table-A record (a 1:N join on the device id)
for (String aData : aDatas) {
context.write(new Text(aData+","+bData),NullWritable.get());
}
}
}


}
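The snippet above only contains the mapper and the reducer; the driver is not shown. A minimal driver sketch, to be added inside ReduceJoin (the input/output paths are placeholders, and it needs imports for Configuration, Path, Job, FileInputFormat and FileOutputFormat):

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    job.setJarByClass(ReduceJoin.class);
    job.setMapperClass(JoinMapper.class);
    job.setReducerClass(JoinReducer.class);

    // mapper output: <device id, tagged record>
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // reducer output: <joined line, nothing>
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // both tables go through the same mapper, which tells them apart by file name in setup()
    FileInputFormat.setInputPaths(job,
            new Path("basic_knowledge/data/join/t_a"),
            new Path("basic_knowledge/data/join/t_b"));
    FileOutputFormat.setOutputPath(job, new Path("basic_knowledge/data/join/rjoin_output"));

    boolean b = job.waitForCompletion(true);
    System.exit(b ? 0 : 1);
}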

In big data work, relatively little of the data comes from databases; most of it comes from users' browsing behaviour, which is recorded in log files and analysed from there.

The word-count flow is: the mappers split each input line into words and emit <word, 1> pairs, the framework groups the pairs by word, and the reducers sum the counts for each word.


If the business logic is too complex for a single job, the only option is to write several MapReduce programs and run them serially, each job consuming the previous job's output.
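A minimal sketch of such a chain inside one driver's main method; Step1Mapper/Step1Reducer, Step2Mapper/Step2Reducer and ChainDriver are hypothetical classes and the paths are placeholders. The only real requirement is that the second job reads the directory the first job wrote:

Configuration conf = new Configuration();

Job job1 = Job.getInstance(conf, "step-1");
job1.setJarByClass(ChainDriver.class);
job1.setMapperClass(Step1Mapper.class);
job1.setReducerClass(Step1Reducer.class);
// ... key/value types for step 1 ...
FileInputFormat.setInputPaths(job1, new Path("/data/raw"));
FileOutputFormat.setOutputPath(job1, new Path("/data/step1_out"));
if (!job1.waitForCompletion(true)) System.exit(1); // stop the chain if step 1 fails

Job job2 = Job.getInstance(conf, "step-2");
job2.setJarByClass(ChainDriver.class);
job2.setMapperClass(Step2Mapper.class);
job2.setReducerClass(Step2Reducer.class);
// ... key/value types for step 2 ...
FileInputFormat.setInputPaths(job2, new Path("/data/step1_out")); // step 1's output is step 2's input
FileOutputFormat.setOutputPath(job2, new Path("/data/final_out"));
System.exit(job2.waitForCompletion(true) ? 0 : 1);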

二、Word count

1、Import the packages the project needs

2、The WordcountMapper class

  package cn.itcast.bigdata.mr.wcdemo;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
* KEYIN: by default, the starting byte offset of the line of text read by the MR framework; conceptually a Long,
* but Hadoop ships its own, more compact serialization types, so LongWritable is used instead of Long
*
* VALUEIN: by default, the content of the line of text read by the MR framework; a String, so Text is used for the same reason
*
* KEYOUT: the key of the output produced by the user-defined logic; here it is the word, a String, so Text
* VALUEOUT: the value of the output produced by the user-defined logic; here it is the word count, an Integer, so IntWritable
*
*/

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{

/**
* The business logic of the map phase lives in the custom map() method.
* The map task calls our map() method once for every line of input.
*/
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

//first convert the text handed over by the map task into a String
String line = value.toString();
//split the line into words on spaces
String[] words = line.split(" ");

//emit each word as <word, 1>
for(String word:words){
//use the word as the key and 1 as the value, so the framework can distribute pairs by word and all occurrences of the same word end up at the same reduce task
context.write(new Text(word), new IntWritable(1));
}
}

}

3、The WordcountReducer class

  package cn.itcast.bigdata.mr.wcdemo;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
* KEYIN, VALUEIN correspond to the mapper's KEYOUT, VALUEOUT types
*
* KEYOUT, VALUEOUT are the output types of the custom reduce logic:
* KEYOUT is the word
* VALUEOUT is the total count
* @author
*
*/
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{

/**
* <angelababy,1><angelababy,1><angelababy,1><angelababy,1><angelababy,1>
* <hello,1><hello,1><hello,1><hello,1><hello,1><hello,1>
* <banana,1><banana,1><banana,1><banana,1><banana,1><banana,1>
* The key parameter is the key shared by one group of kv pairs with the same word
*/
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

int count=0;
/*Iterator<IntWritable> iterator = values.iterator();
while(iterator.hasNext()){
count += iterator.next().get();
}*/

for(IntWritable value:values){

count += value.get();
}

context.write(key, new IntWritable(count));

}

}

4、The WordcountDriver class

  package cn.itcast.bigdata.mr.wcdemo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
* Acts as a client of the YARN cluster:
* it is where the MR program's runtime parameters are assembled and the jar is specified,
* and finally everything is submitted to YARN
*
*/
public class WordcountDriver {

public static void main(String[] args) throws Exception {

Configuration conf = new Configuration();

Job job = Job.getInstance(conf);

//the local path of the jar containing this program
job.setJarByClass(WordcountDriver.class);

//the mapper/reducer business classes this job uses
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);

//the kv types of the mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);

//the kv types of the final output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

//the directory of the job's input files
FileInputFormat.setInputPaths(job, new Path(args[0]));
//the directory of the job's output
FileOutputFormat.setOutputPath(job, new Path(args[1]));

//submit the job's configuration, and the jar containing its classes, to YARN for execution
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);

}

}

5、Package the jar and run it on Linux: hadoop jar <jar file> <full package path and class name of WordcountDriver> <input path> <output path>
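A concrete example of that command (the jar name and the paths are made up):

hadoop jar wordcount.jar cn.itcast.bigdata.mr.wcdemo.WordcountDriver /wordcount/input /wordcount/output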


三、Summing traffic per phone number

Serialization: turning an in-memory object into a byte sequence so it can be stored or sent over the network;

Deserialization: the reverse process, rebuilding the object from that byte sequence.


1、Import the packages the project needs

2、Create the FlowBean entity class

  package cn.itcast.bigdata.mr.flowsum;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable{

private long upFlow;
private long dFlow;
private long sumFlow;

//deserialization instantiates the bean via reflection, which needs the no-arg constructor, so it must be defined explicitly
public FlowBean(){}

public FlowBean(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}

public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getdFlow() {
return dFlow;
}
public void setdFlow(long dFlow) {
this.dFlow = dFlow;
}


public long getSumFlow() {
return sumFlow;
}


public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}


/**
* serialization method
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(dFlow);
out.writeLong(sumFlow);

}


/**
* deserialization method
* note: the fields must be read back in exactly the same order they were written
*/
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readLong();
dFlow = in.readLong();
sumFlow = in.readLong();
}

@Override
public String toString() {

return upFlow + "\t" + dFlow + "\t" + sumFlow;
}

}
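To make the Writable contract concrete, here is a small round-trip sketch that is not part of the original code; it uses only java.io (ByteArrayOutputStream, DataOutputStream and their input counterparts) inside a main that throws IOException. readFields() must consume the longs in exactly the order write() produced them:

ByteArrayOutputStream bos = new ByteArrayOutputStream();
new FlowBean(1024, 2048).write(new DataOutputStream(bos));   // serialize

FlowBean copy = new FlowBean();                              // the no-arg constructor is needed here
copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray()))); // deserialize
System.out.println(copy);                                    // prints: 1024  2048  3072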

3、The FlowCount class

  package cn.itcast.bigdata.mr.flowsum;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCount {

static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

//convert the line into a String
String line = value.toString();
//split it into fields
String[] fields = line.split("\t");
//extract the phone number
String phoneNbr = fields[1];
//extract the upstream and downstream traffic
long upFlow = Long.parseLong(fields[fields.length-3]);
long dFlow = Long.parseLong(fields[fields.length-2]);

context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));

}

}

static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{

//<183323,bean1><183323,bean2><183323,bean3><183323,bean4>.......
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

long sum_upFlow = 0;
long sum_dFlow = 0;

//iterate over all beans and accumulate their upstream and downstream traffic separately
for(FlowBean bean: values){
sum_upFlow += bean.getUpFlow();
sum_dFlow += bean.getdFlow();
}

FlowBean resultBean = new FlowBean(sum_upFlow, sum_dFlow);
context.write(key, resultBean);

}

}

public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
/*conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resoucemanager.hostname", "mini1");*/
Job job = Job.getInstance(conf);

/*job.setJar("/home/hadoop/wc.jar");*/
//the local path of the jar containing this program
job.setJarByClass(FlowCount.class);

//the mapper/reducer business classes this job uses
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);

//the kv types of the mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);

//the kv types of the final output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);

//the directory of the job's input files
FileInputFormat.setInputPaths(job, new Path(args[0]));
//the directory of the job's output
FileOutputFormat.setOutputPath(job, new Path(args[1]));

//submit the job's configuration, and the jar containing its classes, to YARN for execution
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);

}

}

4、Package the jar and run it on Linux: hadoop jar <jar file> <full package path and class name of the FlowCount driver> <input path> <output path>


四、Summing traffic per phone number by province

1、Import the packages the project needs

2、The FlowBean entity class

  package cn.itcast.bigdata.mr.provinceflow;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
public class FlowBean implements Writable{

private long upFlow;
private long dFlow;
private long sumFlow;

//deserialization instantiates the bean via reflection, which needs the no-arg constructor, so it must be defined explicitly
public FlowBean(){}

public FlowBean(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}


public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getdFlow() {
return dFlow;
}
public void setdFlow(long dFlow) {
this.dFlow = dFlow;
}


public long getSumFlow() {
return sumFlow;
}


public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}


/**
* serialization method
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(dFlow);
out.writeLong(sumFlow);

}

/**
* deserialization method
* note: the fields must be read back in exactly the same order they were written
*/
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readLong();
dFlow = in.readLong();
sumFlow = in.readLong();
}

@Override
public String toString() {

return upFlow + "\t" + dFlow + "\t" + sumFlow;
}

}

3、The ProvincePartitioner class

  package cn.itcast.bigdata.mr.provinceflow;

import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
* K2, V2 are the kv types of the map output
* @author
*
*/
public class ProvincePartitioner extends Partitioner<Text, FlowBean>{

public static HashMap<String, Integer> proviceDict = new HashMap<String, Integer>();
static{
proviceDict.put("136", 0);
proviceDict.put("137", 1);
proviceDict.put("138", 2);
proviceDict.put("139", 3);
}



@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
String prefix = key.toString().substring(0, 3);
Integer provinceId = proviceDict.get(prefix);

return provinceId==null?4:provinceId;
}
}
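A quick sanity check of the partition logic, e.g. in a small main or test (the phone numbers are made up; 5 matches the number of reduce tasks configured in the driver below):

ProvincePartitioner p = new ProvincePartitioner();
System.out.println(p.getPartition(new Text("13600001111"), new FlowBean(1, 1), 5)); // prefix 136 -> partition 0
System.out.println(p.getPartition(new Text("13900001111"), new FlowBean(1, 1), 5)); // prefix 139 -> partition 3
System.out.println(p.getPartition(new Text("15000001111"), new FlowBean(1, 1), 5)); // unknown prefix -> partition 4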

4、FlowCount

  package cn.itcast.bigdata.mr.provinceflow;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCount {

static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

String line = value.toString(); //convert the line into a String
String[] fields = line.split("\t"); //split it into fields
String phoneNbr = fields[1]; //extract the phone number

long upFlow = Long.parseLong(fields[fields.length-3]); //extract the upstream and downstream traffic
long dFlow = Long.parseLong(fields[fields.length-2]);

context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
}
}


static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean>{
//<183323,bean1><183323,bean2><183323,bean3><183323,bean4>.......
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

long sum_upFlow = 0;
long sum_dFlow = 0;

//iterate over all beans and accumulate their upstream and downstream traffic separately
for(FlowBean bean: values){
sum_upFlow += bean.getUpFlow();
sum_dFlow += bean.getdFlow();
}

FlowBean resultBean = new FlowBean(sum_upFlow, sum_dFlow);
context.write(key, resultBean);
}
}

public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
/*conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resoucemanager.hostname", "mini1");*/
Job job = Job.getInstance(conf);

/*job.setJar("/home/hadoop/wc.jar");*/
//the local path of the jar containing this program
job.setJarByClass(FlowCount.class);

//the mapper/reducer business classes this job uses
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);

//register our custom partitioner
job.setPartitionerClass(ProvincePartitioner.class);
//and set the number of reduce tasks to match the number of partitions
job.setNumReduceTasks(5);

//the kv types of the mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);

//the kv types of the final output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);

//the directory of the job's input files
FileInputFormat.setInputPaths(job, new Path(args[0]));
//the directory of the job's output
FileOutputFormat.setOutputPath(job, new Path(args[1]));

//submit the job's configuration, and the jar containing its classes, to YARN for execution
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}

}

5、Package the jar and run it on Linux: hadoop jar <jar file> <full package path and class name of the driver class> <input path> <output path>

五、Sorting phone numbers by total traffic

1、Import the packages the project needs

2、The FlowBean entity class

  package cn.itcast.bigdata.mr.flowsum;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean>{

private long upFlow;
private long dFlow;
private long sumFlow;

//deserialization instantiates the bean via reflection, which needs the no-arg constructor, so it must be defined explicitly
public FlowBean(){}

public FlowBean(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}


public void set(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow = upFlow + dFlow;
}

public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getdFlow() {
return dFlow;
}
public void setdFlow(long dFlow) {
this.dFlow = dFlow;
}


public long getSumFlow() {
return sumFlow;
}

public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}

/**
* serialization method
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(dFlow);
out.writeLong(sumFlow);

}

/**
* deserialization method
* note: the fields must be read back in exactly the same order they were written
*/
@Override
public void readFields(DataInput in) throws IOException {
upFlow = in.readLong();
dFlow = in.readLong();
sumFlow = in.readLong();
}

@Override
public String toString() {

return upFlow + "\t" + dFlow + "\t" + sumFlow;
}

@Override
public int compareTo(FlowBean o) {
return this.sumFlow>o.getSumFlow()?-1:1; //descending order: when the current object's total flow is larger it returns -1, so it is sorted in front of the object it is compared with
}

}

3、The FlowCountSort class

  package cn.itcast.bigdata.mr.flowsum;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import cn.itcast.bigdata.mr.flowsum.FlowCount.FlowCountMapper;
import cn.itcast.bigdata.mr.flowsum.FlowCount.FlowCountReducer;

/**
* Input is the output of the previous FlowCount job, one line per phone number, e.g.:
* 13480253104 180 180 360
* 13502468823 7335 110349 117684
* 13560436666 1116 954
* @author
*
*/
public class FlowCountSort {

static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

FlowBean bean = new FlowBean();
Text v = new Text();

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

// the input is the output of the previous aggregation job: each line already contains one phone number's total traffic
String line = value.toString();

String[] fields = line.split("\t");

String phoneNbr = fields[0];

long upFlow = Long.parseLong(fields[1]);
long dFlow = Long.parseLong(fields[2]);

bean.set(upFlow, dFlow);
v.set(phoneNbr);

context.write(bean, v);

}

}

/**
* reduce() is called once per key; the keys here are FlowBean objects and every bean is a distinct object, so reduce() is invoked once per bean
* @author: 张政
*/
static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

// <bean(),phonenbr>
@Override
protected void reduce(FlowBean bean, Iterable<Text> values, Context context) throws IOException, InterruptedException {

context.write(values.iterator().next(), bean);

}

}

public static void main(String[] args) throws Exception {

Configuration conf = new Configuration();
/*conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resoucemanager.hostname", "mini1");*/
Job job = Job.getInstance(conf);

/*job.setJar("/home/hadoop/wc.jar");*/
//the local path of the jar containing this program
job.setJarByClass(FlowCountSort.class);

//the mapper/reducer business classes this job uses
job.setMapperClass(FlowCountSortMapper.class);
job.setReducerClass(FlowCountSortReducer.class);

//the kv types of the mapper output
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);

//the kv types of the final output
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);

//the directory of the job's input files
FileInputFormat.setInputPaths(job, new Path(args[0]));
//the directory of the job's output

Path outPath = new Path(args[1]);
/*FileSystem fs = FileSystem.get(conf);
if(fs.exists(outPath)){
fs.delete(outPath, true);
}*/
FileOutputFormat.setOutputPath(job, outPath);

//submit the job's configuration, and the jar containing its classes, to YARN for execution
/*job.submit();*/
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);

}

}

六、Ways to run a MapReduce program:

1、Local mode

2、Cluster mode (the default; nothing extra needs to be written in code, the cluster settings are picked up when the job is launched with hadoop jar)
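The two modes boil down to a handful of configuration properties; a minimal sketch (host names and ports are placeholders, and in cluster mode these values normally come from the cluster's *-site.xml files rather than from code):

Configuration conf = new Configuration();

// Local mode: everything runs in a single JVM against the local file system; handy for debugging in the IDE.
conf.set("mapreduce.framework.name", "local");
conf.set("fs.defaultFS", "file:///");

// Cluster mode: submit to YARN and read/write HDFS.
// conf.set("mapreduce.framework.name", "yarn");
// conf.set("yarn.resourcemanager.hostname", "mini1");
// conf.set("fs.defaultFS", "hdfs://mini1:9000");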

七、How the combiner works

The combiner is a component of an MR program in addition to the Mapper and the Reducer; it runs on the map side and pre-aggregates map output before it is shuffled to the reducers.
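The text above does not show how a combiner is wired in. For the word-count job, the reduce logic (summing counts) is associative and commutative, so the reducer class can double as the combiner; a minimal sketch, added to the WordcountDriver:

// pre-aggregate <word,1> pairs on the map side before the shuffle, which shrinks the data sent to the reducers
job.setCombinerClass(WordcountReducer.class);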

By default, input splits are planned per file: every file, no matter how small, becomes at least one separate split handled by its own map task, so a large number of small files produces a large number of map tasks and very poor efficiency.

Optimization strategies:

Option 1: merge the small files into larger files at the very front of the data pipeline, before they are uploaded to HDFS;

Option 2: use CombineFileInputFormat (in practice its CombineTextInputFormat subclass) to pack multiple small files into one logical split, so that several small files are handled by a single map task.

Code omitted; a sketch of option 2 follows.
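A minimal sketch, to be added to a driver (the 4 MB limit is just an illustrative value; CombineTextInputFormat lives in org.apache.hadoop.mapreduce.lib.input):

// pack many small files into a few logical splits, so one map task handles several files
job.setInputFormatClass(CombineTextInputFormat.class);
// upper bound on the size of each combined split, in bytes
CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);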

A MapReduce input split is a logical division of the input data; each split is processed by one map task instance.
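For ordinary (non-combined) input, the split size is derived from the HDFS block size together with configurable lower and upper bounds, roughly max(minSize, min(maxSize, blockSize)), so the number of map tasks can be tuned without touching the data; a sketch with placeholder values:

FileInputFormat.setMinInputSplitSize(job, 1L);                 // lower bound, in bytes
FileInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024); // upper bound, in bytes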
