Flume Installation and Usage

I. Introduction

1. Flow patterns

1. Single-hop flow
2. Multi-hop flow
3. Fan-in flow
4. Fan-out flow

II. Installation and Deployment

1. Unpack the tarball and enter the installation root
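For example, assuming the Flume 1.9.0 binary tarball (the version matches the Maven dependency used later; adjust the file name to your download):

tar -zxvf apache-flume-1.9.0-bin.tar.gz
cd apache-flume-1.9.0-bin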

III. Basic Usage and Configuration

mkdir data
cd data
vim basic.conf

1. netcat message listening

Events are buffered in a memory channel.

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

2. avro listening (important)

vim avro.conf

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = avro
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

3. Capturing the output of a command (template)

vim execsource.conf

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = exec
# Command whose output becomes events (tail -F would keep reading the newest lines)
a1.sources.s1.command = cat /home/a.txt
# Host to listen on (not used by exec)
#a1.sources.s1.bind = 0.0.0.0
# Port to listen on (not used by exec)
#a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
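For a continuously growing log file you would normally follow it rather than read it once; a variant of the command line above (assuming /home/a.txt keeps being appended to):

a1.sources.s1.command = tail -F /home/a.txt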

4. Watching files in a spool directory (important)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = spooldir
# Directory to watch for new files
a1.sources.s1.spoolDir = /usr/local
# Host to listen on (not used by spooldir)
#a1.sources.s1.bind = 0.0.0.0
# Port to listen on (not used by spooldir)
#a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
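Note that the spooling directory source renames each fully ingested file by appending a .COMPLETED suffix, so a dedicated directory is safer than a shared one such as /usr/local. A quick test (hypothetical file name):

echo "hello spooldir" > /usr/local/a.txt
# after ingestion the file becomes /usr/local/a.txt.COMPLETED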

5. Sequence generator (produces data continuously)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type: emits an ever-incrementing counter
a1.sources.s1.type = seq
# Host to listen on (not used by seq)
#a1.sources.s1.bind = 0.0.0.0
# Port to listen on (not used by seq)
#a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

6. HTTP listening (important)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = http
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

7. Starting and testing

cd bin
./flume-ng agent -n a1 -c ../conf -f ../data/basic.conf -Dflume.root.logger=INFO,console

  • Install the nc tool if it is missing
    yum install -y nc
  • Send a test message with nc (type hello after connecting)
    nc iZ2ze8tuatzv94fm6fgl1jZ 8090
    hello
  • Send a file through avro serialization with the avro client:
    ./flume-ng avro-client -H 0.0.0.0 -p 8090 -F /home/a.txt -c ../conf
  • Send an HTTP test request:
    curl -X POST -d '[{"headers":{"class":"123"},"body":"This is big data"}]' http://0.0.0.0:8090

8. Other configurations

Persisting the channel to disk with a file channel (staging storage)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the channel: file channel persisted to disk
a1.channels.c1.type = file
a1.channels.c1.dataDirs = /home/flumefile
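# checkpointDir is worth setting explicitly too; it defaults to ~/.flume/file-channel/checkpoint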
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

Changing the sink type to write events to local files (file_roll)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000

# Configure the sink to write events to local files
a1.sinks.k1.type = file_roll
a1.sinks.k1.sink.directory = /home/flumedata
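# the directory must exist before the agent starts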
# Roll to a new file every 600 seconds (10 minutes)
a1.sinks.k1.sink.rollInterval = 600

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

Sink writing to HDFS (partitioned by day)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Configure an interceptor
a1.sources.s1.interceptors = i1
# Timestamp interceptor: adds the timestamp header used by the %Y-%m-%d escapes below
a1.sources.s1.interceptors.i1.type = timestamp

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000

# Configure the sink to write to HDFS
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoopalone:9000/flume/time=%Y-%m-%d
# Roll to a new file every 600 seconds (10 minutes)
a1.sinks.k1.hdfs.rollInterval = 600
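# DataStream writes the raw event body as text; the default is SequenceFile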
a1.sinks.k1.hdfs.fileType = DataStream

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

avro sink (enables multi-hop flows)

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000

# Configure the sink to forward events to the next Flume agent
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 192.168.0.1
a1.sinks.k1.port = 8090

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

Fan-out (replicating mode, the default)

# Name the agent a1
a1.sources = s1
# Name the channels
a1.channels = c1 c2
# Name the sinks
a1.sinks = k1 k2

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
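# replicating is the default selector; it can also be set explicitly:
# a1.sources.s1.selector.type = replicating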

# Configure the channels
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000
# Configure the second channel the same way
a1.channels.c2.type = memory
a1.channels.c2.capacity = 10000
a1.channels.c2.transactionCapacity = 1000

# Configure the sinks to forward events to the next Flume agents
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 192.168.0.1
a1.sinks.k1.port = 8090
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = 192.168.0.2
a1.sinks.k2.port = 8090

# Bind the source to both channels
a1.sources.s1.channels = c1 c2
# Bind each sink to its channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2

Fan-out (multiplexing / routing mode)

# Name the agent a1
a1.sources = s1
# Name the channels
a1.channels = c1 c2
# Name the sinks
a1.sinks = k1 k2

# Configure the source type
a1.sources.s1.type = http
# Port to listen on
a1.sources.s1.port = 8090
# Use the multiplexing (routing) selector
a1.sources.s1.selector.type = multiplexing
# Route on the value of the class header
a1.sources.s1.selector.header = class
# header value big1902 goes to c1
a1.sources.s1.selector.mapping.big1902 = c1
# header value big1903 goes to c2
a1.sources.s1.selector.mapping.big1903 = c2
# everything else goes to c2
a1.sources.s1.selector.default = c2

# Configure the channels
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000
# Configure the second channel the same way
a1.channels.c2.type = memory
a1.channels.c2.capacity = 10000
a1.channels.c2.transactionCapacity = 1000

# Configure the sinks to forward events to the next Flume agents
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 192.168.0.1
a1.sinks.k1.port = 8090
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = 192.168.0.2
a1.sinks.k2.port = 8090

# Bind the source to both channels
a1.sources.s1.channels = c1 c2
# Bind each sink to its channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
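A quick routing test, reusing the curl call from section 7 (the class header picks the channel):

curl -X POST -d '[{"headers":{"class":"big1902"},"body":"routed to c1"}]' http://0.0.0.0:8090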

Interceptors

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090

# Configure the interceptor chain (a single space-separated list;
# assigning i1, i2, i3 on separate lines would overwrite each other)
a1.sources.s1.interceptors = i1 i2 i3

# Timestamp interceptor: adds a timestamp header
a1.sources.s1.interceptors.i1.type = timestamp

# Search-and-replace interceptor
a1.sources.s1.interceptors.i2.type = search_replace
# Regex matching the digits to search for
a1.sources.s1.interceptors.i2.searchPattern = [0-9]
# Replace each matched digit with *
a1.sources.s1.interceptors.i2.replaceString = *

# Regex-filter interceptor: events whose body contains a digit are dropped
a1.sources.s1.interceptors.i3.type = regex_filter
a1.sources.s1.interceptors.i3.regex = .*[0-9].*
# excludeEvents = true drops matching events instead of keeping them
a1.sources.s1.interceptors.i3.excludeEvents = true
# Note: i2 has already masked all digits by the time i3 runs,
# so i3 only fires if it is placed before i2 in the chain

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; 100,000 is common in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink, typically 1000-3000
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

Configuring a sink group for failover or load balancing (sink processor types: failover / load_balance); a minimal sketch follows:
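This sketch assumes two avro sinks k1 and k2 like those in the fan-out examples:

a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
# failover: always send to the highest-priority live sink
a1.sinkgroups.g1.processor.type = failover
a1.sinkgroups.g1.processor.priority.k1 = 10
a1.sinkgroups.g1.processor.priority.k2 = 5
a1.sinkgroups.g1.processor.maxpenalty = 10000
# for load balancing instead:
# a1.sinkgroups.g1.processor.type = load_balance
# a1.sinkgroups.g1.processor.selector = round_robin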

IV. Custom Source

1. Configuration file

# Name the agent a1
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1

# Configure the source type: fully qualified class name of the custom source
a1.sources.s1.type = com.source.CustomSource
# Custom property names are up to you; they are read in configure()
a1.sources.s1.file = /home/person.log
a1.sources.s1.file1 = /home/person1.log
a1.sources.s1.path2 = /home/person2.log
# Host to listen on (not used here)
#a1.sources.s1.bind = 0.0.0.0
# Port to listen on (not used here)
#a1.sources.s1.port = 8090

# Configure the channel
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000

# Configure the sink
a1.sinks.k1.type = logger

# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1

2. Writing the custom logic (finally packaged as a jar and placed in Flume's lib directory; a packaging sketch follows the code)

  • Add the dependency
<!-- https://mvnrepository.com/artifact/org.apache.flume/flume-ng-core -->
        <dependency>
            <groupId>org.apache.flume</groupId>
            <artifactId>flume-ng-core</artifactId>
            <version>1.9.0</version>
        </dependency>
  • Implement the custom logic
package com.source;

import org.apache.flume.Context;
import org.apache.flume.EventDrivenSource;
import org.apache.flume.Source;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.conf.Configurable;
import org.apache.flume.source.AbstractSource;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Custom source.
 *
 * Pull style: the source actively fetches data (e.g. spooldir);
 * implement PollableSource:
 */
//public class CustomSource extends AbstractSource implements Source, PollableSource {
//
//}
/**
 * Event-driven style: data is pushed into the source (e.g. seq);
 * implement EventDrivenSource.
 */
public class CustomSource extends AbstractSource implements Configurable, Source, EventDrivenSource {

    private String path;
    private ExecutorService es;

    /**
     * Read properties from the agent configuration file.
     */
    @Override
    public void configure(Context context) {
        path = context.getString("file");
    }

    /**
     * Start the source: hand the channel processor to a worker thread
     * that reads the file and pushes events downstream.
     */
    @Override
    public synchronized void start() {
        ChannelProcessor cp = this.getChannelProcessor();
        es = Executors.newFixedThreadPool(10);
        es.submit(new FileThread(path, cp));
    }

    @Override
    public synchronized void stop() {
        es.shutdown();
    }
}
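The worker thread that reads the file and delivers events through the ChannelProcessor: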
package com.source;

import org.apache.flume.Event;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.event.EventBuilder;

import java.io.*;
import java.util.HashMap;
import java.util.Map;

public class FileThread implements Runnable {
    private BufferedReader br;
    private ChannelProcessor cp;

    public FileThread(String path, ChannelProcessor cp) {
        this.cp = cp;
        File file = new File(path);
        if (file.isFile()) {
            try {
                br = new BufferedReader(new FileReader(path));
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public void run() {
        if (br == null) {
            return; // the configured path was not a readable file
        }
        // each record spans three lines: name, age, description
        while (true) {
            try {
                String name = br.readLine();
                if (name == null) {
                    break;
                }
                String age = br.readLine();
                String description = br.readLine();
                Map<String, String> headers = new HashMap<>(2);
                headers.put("name", name);
                headers.put("age", age);
                // build the event: description as body, name/age as headers
                Event e = EventBuilder.withBody(description.getBytes(), headers);
                // hand the event to the channel(s)
                cp.processEvent(e);
            } catch (IOException e) {
                e.printStackTrace();
                break;
            }
        }
        try {
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
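Package the classes and drop the jar into Flume's lib directory so the agent can load them (a sketch; the artifact name depends on your pom.xml, and $FLUME_HOME is assumed to point at your Flume install):

mvn clean package
cp target/custom-source-1.0.jar $FLUME_HOME/lib/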

V. Custom Sink

package com.source;

import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;

/**
 * Custom sink: takes one event per transaction from the channel
 * and prints its body to stdout.
 */
public class CustomSink extends AbstractSink implements Configurable, Sink {
    @Override
    public Status process() throws EventDeliveryException {
        Channel channel = getChannel();
        Transaction tx = channel.getTransaction();
        tx.begin();
        try {
            Event event = channel.take();
            if (event == null) {
                // nothing available right now; back off and retry later
                tx.commit();
                return Status.BACKOFF;
            }
            System.out.println(new String(event.getBody()));
            tx.commit();
            return Status.READY;
        } catch (Exception e) {
            tx.rollback();
            throw new EventDeliveryException(e);
        } finally {
            tx.close();
        }
    }

    @Override
    public void configure(Context context) {
        // read custom properties here if needed
    }
}

VI. Transactions

source side: putList (a double-ended queue). A put transaction stages events in the putList and moves them into the channel queue on commit.
sink side: takeList (a double-ended queue). A take transaction stages taken events in the takeList so they can be returned to the channel on rollback.
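A conceptual sketch in Java (a simplification for illustration, not Flume's actual MemoryChannel code) of how the two staging deques interact with the channel queue:

import java.util.ArrayDeque;
import java.util.Deque;

public class ChannelTxSketch {
    private final Deque<String> queue = new ArrayDeque<>();    // the channel's event queue
    private final Deque<String> putList = new ArrayDeque<>();  // source-side staging
    private final Deque<String> takeList = new ArrayDeque<>(); // sink-side staging

    // source side: stage a put, then commit or roll back
    void doPut(String e) { putList.addLast(e); }
    void commitPut() { while (!putList.isEmpty()) queue.addLast(putList.pollFirst()); }
    void rollbackPut() { putList.clear(); }

    // sink side: stage a take, then commit or roll back
    String doTake() {
        String e = queue.pollFirst();
        if (e != null) takeList.addLast(e);
        return e;
    }
    void commitTake() { takeList.clear(); }
    void rollbackTake() { while (!takeList.isEmpty()) queue.addFirst(takeList.pollLast()); }

    public static void main(String[] args) {
        ChannelTxSketch ch = new ChannelTxSketch();
        ch.doPut("e1");
        ch.commitPut();      // e1 becomes visible in the channel
        ch.doTake();
        ch.rollbackTake();   // e1 goes back to the head of the queue
    }
}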