1、对流量日志中的用户统计总上、下行流量

技术点:自定义javaBean用来在mapreduce中充当value

注意: javaBean要实现Writable接口,实现两个方法

package com.mr.flowsum;

import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * 自定义JavaBean:FlowBean
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;//上行流量
    private long dFlow;//下行流量
    private long sumFlow;//总流量

    //反序列化时,需要反射调用空参构造函数,所以要显示定义一个
    public FlowBean() {
    }

    public FlowBean(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    public void set(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    /**
     * 序列化方法
     */
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(dFlow);
        out.writeLong(sumFlow);
    }

    /**
     * 反序列化方法
     * 注意:反序列化的顺序跟序列化的顺序完全一致
     */
    public void readFields(DataInput in) throws IOException {
        upFlow = in.readLong();
        dFlow = in.readLong();
        sumFlow = in.readLong();
    }


    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getdFlow() {
        return dFlow;
    }

    public void setdFlow(long dFlow) {
        this.dFlow = dFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    @Override
    public String toString() {
        return upFlow + "\t" + dFlow + "\t" + sumFlow;
    }

    public int compareTo(FlowBean o) {
        return this.getSumFlow() > o.getSumFlow() ? -1 : 1;//从大到小, 当前对象和要比较的对象比, 如果当前对象大, 返回-1, 交换他们的位置
    }
}
package com.mr.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * 统计每一个用户(手机号)所耗费的总上行流量、下行流量,总流量
 */
public class FlowCount {
    //Mapper
    public static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        //1363157985066 	13726230503	00-FD-07-A4-72-B8:CMCC	120.196.100.82	i02.c.aliimg.com		24	27	2481	24681	200
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //将一行内容转成string
            String line = value.toString();
            //切分字段
            String[] fields = line.split("\t");
            //取出手机号
            String phoneNbr = fields[1];
            //取出上下行流量
            long upFlow = Long.parseLong(fields[fields.length - 3]);
            long dFlow = Long.parseLong(fields[fields.length - 2]);

            context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
        }
    }

    //Reduce
    public static class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {
        //<183323,bean1><183323,bean2><183323,bean3><183323,bean4>.......
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
            long sum_upFlow = 0;
            long sum_dFlow = 0;

            //遍历所有bean,将其中的上行流量,下行流量分别累加
            for (FlowBean bean : values) {
                sum_upFlow += bean.getUpFlow();
                sum_dFlow += bean.getdFlow();
            }
            FlowBean resultBean = new FlowBean(sum_upFlow, sum_dFlow);
            context.write(key, resultBean);
        }
    }

    //Driver
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        //指定本程序的jar包所在的本地路径
        job.setJarByClass(FlowCount.class);

        //指定本业务job要使用的mapper/Reducer业务类
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);

        //指定mapper输出数据的key,value类型
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        //指定最终输出的数据的kv类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        //指定job的输入原始文件所在目录
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //指定job的输出结果所在目录
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //将job中配置的相关参数,以及job所用的java类所在的jar包,提交给yarn去运行
        /*job.submit();*/
        boolean res = job.waitForCompletion(true);
        System.out.println(res ? 0 : 1);

    }
    /**
     * mvn clean package -DskipTests
     * hadoop jar hadoop-train-1.0.jar com.mr.flowsum.FlowCount /flowsum/input/flow.log /flowsum/output
     */
}

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


flow.log数据

1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
 1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0  264 0 200
 1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4  132 1512  200
 1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0  240 0 200
 1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12  1527 2106  200
 1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
 1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15  1116 954  200
 1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20  3156 2936  200
 1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0  240 0 200
 1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9  6960 690  200
 1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27  3659 3538  200
 1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3  1938 180  200
 1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9  918 4938  200
 1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3  180 180  200
 1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12  1938 2910  200
 1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
 1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102  7335 110349  200
 1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18  9531 2412  200
 1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63  11058 48243  200
 1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2  120 120  200
 1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
 1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15  1116 954  200-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
清理后数据
13480253104     180     180     360
 13502468823     7335    110349  117684
 13560436666     1116    954     2070
 13560439658     2034    5892    7926
 13602846565     1938    2910    4848
 13660577991     6960    690     7650
 13719199419     240     0       240
 13726230503     2481    24681   27162
 13726238888     2481    24681   27162
 13760778710     120     120     240
 13826544101     264     0       264
 13922314466     3008    3720    6728
 13925057413     11058   48243   59301
 13926251106     240     0       240
 13926435656     132     1512    1644
 15013685858     3659    3538    7197
 15920133257     3156    2936    6092
 15989002119     1938    180     2118
 18211575961     1527    2106    3633
 18320173382     9531    2412    11943
 84138413        4116    1432    5548

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

1、统计流量且按照流量大小倒序排序

技术点:这种需求,用一个mapreduce -job 不好实现,需要两个mapreduce -job

第一个job负责流量统计,跟上题相同

第二个job读入第一个job的输出,然后做排序

要将flowBean作为map的key输出,这样mapreduce就会自动排序

     此时,flowBean要实现接口WritableComparable

     要实现其中的compareTo()方法,方法中,我们可以定义倒序比较的逻辑

package com.mr.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * 统计流量且按照流量大小倒序排序
 */
public class FlowCountSort {

    //13502468823     7335    110349  117684
    //mapper
    static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

        FlowBean bean = new FlowBean();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // 拿到的是上一个统计程序的输出结果,已经是各手机号的总流量信息
            String line = value.toString();
            String[] fields = line.split("\t");
            String phoneNbr = fields[0];

            long upFlow = Long.parseLong(fields[1]);
            long dFlow = Long.parseLong(fields[2]);

            bean.set(upFlow, dFlow);
            v.set(phoneNbr);

            context.write(bean, v);
        }
    }

    /**
     * 根据key来比较, 传过来的是对象, 每个对象都是不一样的, 所以每个对象都调用一次reduce方法
     */
    static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
        //<bean(),phoneNum>
        @Override
        protected void reduce(FlowBean bean, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.write(values.iterator().next(),bean);
        }
    }

    //Driver
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        //指定本程序的jar包所在的本地路径
        job.setJarByClass(FlowCountSort.class);

        //指定本业务job要使用的mapper/Reducer业务类
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        //指定mapper输出数据的key,value类型
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        //指定最终输出的数据的kv类型
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        //指定job的输入原始文件所在目录
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //指定job的输出结果所在目录
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //将job中配置的相关参数,以及job所用的java类所在的jar包,提交给yarn去运行
        /*job.submit();*/
        boolean res = job.waitForCompletion(true);
        System.out.println(res ? 0 : 1);
    }

}



-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

运行后数据:

13502468823     7335    110349  117684
 13925057413     11058   48243   59301
 13726238888     2481    24681   27162
 13726230503     2481    24681   27162
 18320173382     9531    2412    11943
 13560439658     2034    5892    7926
 13660577991     6960    690     7650
 15013685858     3659    3538    7197
 13922314466     3008    3720    6728
 15920133257     3156    2936    6092
 84138413        4116    1432    5548
 13602846565     1938    2910    4848
 18211575961     1527    2106    3633
 15989002119     1938    180     2118
 13560436666     1116    954     2070
 13926435656     132     1512    1644
 13480253104     180     180     360
 13826544101     264     0       264
 13926251106     240     0       240
 13760778710     120     120     240
 13719199419     240     0       240