halt: shut down the machine

YARN (ResourceManager web UI) port: 8088

Delete an HDFS directory: hadoop fs -rm -r /wc/output

If both NameNodes show the standby state, the cause is that ZooKeeper was not started before HDFS.
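
A minimal recovery sketch, assuming a ZooKeeper-based HA setup; the NameNode IDs nn1/nn2 below are placeholders for whatever dfs.ha.namenodes.<nameservice> defines in your configuration:

# start ZooKeeper on every ZooKeeper node first, then (re)start HDFS
zkServer.sh start
stop-dfs.sh
start-dfs.sh
# check which NameNode became active
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2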

Now let's walk through a traffic-flow statistics example:

The input data is a table like the one below (see the attachment): a tab-separated traffic log in which, as the Mapper below assumes, the phone number is the first field and the upstream/downstream byte counts are the 8th and 9th fields.


Flow summation (code):


1. FlowBean:

package cn.itcast.hadoop.mr.flowsum;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {

    private String phoneNB;
    private long up_flow;
    private long d_flow;
    private long s_flow;

    // Deserialization instantiates the bean via reflection, which requires a
    // no-arg constructor, so we define one explicitly.
    public FlowBean() {}

    // A constructor with parameters, for convenient initialization.
    public FlowBean(String phoneNB, long up_flow, long d_flow) {
        this.phoneNB = phoneNB;
        this.up_flow = up_flow;
        this.d_flow = d_flow;
        this.s_flow = up_flow + d_flow;
    }

    @Override
    public String toString() {
        return "" + up_flow + "\t" + d_flow + "\t" + s_flow;
    }

    public String getPhoneNB() {
        return phoneNB;
    }

    public void setPhoneNB(String phoneNB) {
        this.phoneNB = phoneNB;
    }

    public long getUp_flow() {
        return up_flow;
    }

    public void setUp_flow(long up_flow) {
        this.up_flow = up_flow;
    }

    public long getD_flow() {
        return d_flow;
    }

    public void setD_flow(long d_flow) {
        this.d_flow = d_flow;
    }

    public long getS_flow() {
        return s_flow;
    }

    public void setS_flow(long s_flow) {
        this.s_flow = s_flow;
    }

    // Deserialize the object's fields from the data stream.
    // Fields must be read back in exactly the same order they were written.
    @Override
    public void readFields(DataInput in) throws IOException {
        phoneNB = in.readUTF();
        up_flow = in.readLong();
        d_flow = in.readLong();
        s_flow = in.readLong();
    }

    // Serialize the object's fields into the data stream.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(phoneNB);
        out.writeLong(up_flow);
        out.writeLong(d_flow);
        out.writeLong(s_flow);
    }

    // Comparison; this is what drives the sorting (descending by total flow).
    @Override
    public int compareTo(FlowBean o) {
        return s_flow > o.getS_flow() ? -1 : 1;
    }

}
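
The Writable contract is easiest to see in isolation: write() pushes the fields onto a byte stream, and readFields() must read them back in exactly the same order. Below is a minimal round-trip sketch (a hypothetical helper class, not part of the original project; the phone number and byte counts are arbitrary sample values):

package cn.itcast.hadoop.mr.flowsum;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        // serialize a bean the same way Hadoop does when shipping it between nodes
        FlowBean original = new FlowBean("13726230503", 2481, 24681);
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // deserialize into a fresh bean created through the no-arg constructor
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        // both lines print "2481	24681	27162" if the round trip worked
        System.out.println(original);
        System.out.println(copy);
    }
}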

2. FlowSumMapper:

package cn.itcast.hadoop.mr.flowsum;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * @author yw.wang
 * FlowBean is our custom data type. Because it has to be shipped between
 * Hadoop nodes, it must follow Hadoop's serialization mechanism, i.e.
 * implement Hadoop's serialization interface.
 */
public class FlowSumMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    // Take one line of the log, split it into fields, extract the fields we
    // need (phone number, upstream flow, downstream flow), wrap them as a
    // key/value pair and emit them to the reduce phase.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // read one line
        String line = value.toString();
        // split it into fields
        String[] fields = StringUtils.split(line, "\t");
        // pick out the fields we need
        String phoneNB = fields[0];
        long u_flow = Long.parseLong(fields[7]);
        long d_flow = Long.parseLong(fields[8]);
        // wrap the data as a key/value pair and emit it
        context.write(new Text(phoneNB), new FlowBean(phoneNB, u_flow, d_flow));
    }
}

3. FlowSumReducer:

package cn.itcast.hadoop.mr.flowsum;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowSumReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

    // The framework passes in one group at a time, e.g.
    // <1237435262, {flowbean, flowbean, flowbean, ...}>.
    // The reduce logic is simply to iterate over the values, accumulate the
    // sums and write out the result.
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        long up_flow_counter = 0;
        long d_flow_counter = 0;
        for (FlowBean bean : values) {
            up_flow_counter += bean.getUp_flow();
            d_flow_counter += bean.getD_flow();
        }
        context.write(key, new FlowBean(key.toString(), up_flow_counter, d_flow_counter));
    }

}

4. FlowSumRunner:

package cn.itcast.hadoop.mr.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// The standard way to write the job-description-and-submission class.
public class FlowSumRunner extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowSumRunner.class);

        job.setMapperClass(FlowSumMapper.class);
        job.setReducerClass(FlowSumReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FlowSumRunner(), args);
        System.exit(res);
    }

}

Package the classes into a jar:


Then run it on the cluster with:

hadoop  jar  /root/Documents/sum.jar   cn.itcast.hadoop.mr.flowsum.FlowSumRunner  /wc/data/  /wc/sumoutput

Explanation: the arguments to hadoop jar are, in order, the jar file, the fully-qualified driver class (FlowSumRunner), the HDFS input directory, and the HDFS output directory. The output directory must not exist yet, otherwise the job fails (hence the hadoop fs -rm -r command noted at the top).
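
Once the job finishes, the summed flow per phone number can be inspected directly from HDFS (part-r-00000 is the default name of the single reducer's output file):

hadoop fs -cat /wc/sumoutput/part-r-00000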

Sorting:


Code:

package cn.itcast.hadoop.mr.flowsort;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.hadoop.mr.flowsum.FlowBean;

public class SortMR {

    public static class SortMapper extends Mapper<LongWritable, Text, FlowBean, NullWritable> {

        // Take one line, split out the fields, wrap them into a FlowBean and
        // emit the bean as the key, so the framework sorts by it.
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = StringUtils.split(line, "\t");

            String phoneNB = fields[0];
            long u_flow = Long.parseLong(fields[1]);
            long d_flow = Long.parseLong(fields[2]);

            context.write(new FlowBean(phoneNB, u_flow, d_flow), NullWritable.get());
        }
    }

    public static class SortReducer extends Reducer<FlowBean, NullWritable, Text, FlowBean> {

        @Override
        protected void reduce(FlowBean key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            String phoneNB = key.getPhoneNB();
            context.write(new Text(phoneNB), key);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // the class that contains the main method; here it is this class itself
        job.setJarByClass(SortMR.class);

        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);

        // setOutputKeyClass/setOutputValueClass describe both the map and the
        // reduce output; if the map output types differ, declare them
        // separately, as done here.
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // these two arguments are exactly the last two arguments of the
        // "hadoop jar ..." command
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // exit with the standard 0/1 status
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Sorting is applied to the results of the summation step, so the input here is the output file that the summary job wrote (the part-r-00000 file in its output directory).
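
For example, assuming the sort job is packaged as sort.jar (the jar name and output path here are illustrative), it is launched the same way as the previous job, with the input pointed at the summary job's output directory:

hadoop jar /root/Documents/sort.jar cn.itcast.hadoop.mr.flowsort.SortMR /wc/sumoutput /wc/sortoutput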

Partitioning (grouping by province):


FlowSumArea

package cn.itcast.hadoop.mr.areapartition;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.hadoop.mr.flowsum.FlowBean;

/**
 * Run the flow statistics over the raw traffic log and write the results for
 * users from different provinces to different files.
 * Two mechanisms have to be customized:
 * 1. the partitioning logic, via a custom Partitioner
 * 2. the number of concurrent reduce tasks
 */
public class FlowSumArea {

    public static class FlowSumAreaMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // read one line
            String line = value.toString();
            // split it into fields
            String[] fields = StringUtils.split(line, "\t");
            // pick out the fields we need
            String phoneNB = fields[1];
            long u_flow = Long.parseLong(fields[7]);
            long d_flow = Long.parseLong(fields[8]);
            // wrap the data as a key/value pair and emit it
            context.write(new Text(phoneNB), new FlowBean(phoneNB, u_flow, d_flow));
        }
    }

    public static class FlowSumAreaReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context)
                throws IOException, InterruptedException {
            long up_flow_counter = 0;
            long d_flow_counter = 0;
            for (FlowBean bean : values) {
                up_flow_counter += bean.getUp_flow();
                d_flow_counter += bean.getD_flow();
            }
            context.write(key, new FlowBean(key.toString(), up_flow_counter, d_flow_counter));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowSumArea.class);

        job.setMapperClass(FlowSumAreaMapper.class);
        job.setReducerClass(FlowSumAreaReducer.class);

        // plug in our custom partitioning logic
        job.setPartitionerClass(AreaPartitioner.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // The number of concurrent reduce tasks should match the number of
        // partitions: if it is larger, the extra output files are empty; if it
        // is smaller, the job fails; if it is 1, no partitioning happens.
        job.setNumReduceTasks(6);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

AreaPartitioner

package cn.itcast.hadoop.mr.areapartition;

import java.util.HashMap;

import org.apache.hadoop.mapreduce.Partitioner;

public class AreaPartitioner<KEY, VALUE> extends Partitioner<KEY, VALUE> {

    private static HashMap<String, Integer> areaMap = new HashMap<>();

    static {
        areaMap.put("135", 0);
        areaMap.put("136", 1);
        areaMap.put("137", 2);
        areaMap.put("138", 3);
        areaMap.put("139", 4);
    }

    @Override
    public int getPartition(KEY key, VALUE value, int numPartitions) {
        // Take the phone number from the key, look its prefix up in the
        // "home area" dictionary, and return a different partition number per
        // province (5 for any prefix not in the dictionary).
        int areaCoder = areaMap.get(key.toString().substring(0, 3)) == null
                ? 5
                : areaMap.get(key.toString().substring(0, 3));
        return areaCoder;
    }
}

Run:

hadoop jar /root/Documents/area.jar cn.itcast.hadoop.mr.areapartition.FlowSumArea /wc/data /wc/areasoutput

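Because the job runs six reduce tasks, the output directory should contain six part files (part-r-00000 through part-r-00005), one per partition; they can be listed and inspected with:

hadoop fs -ls /wc/areasoutput
hadoop fs -cat /wc/areasoutput/part-r-00000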

With that, the MapReduce flow summation, partitioning, and sorting work is complete.