Flume Installation and Usage
I. Introduction
1. Flow Topologies
1. Single-hop flow: a single agent carries data from source to sink.
2. Multi-hop flow: several agents are chained, each forwarding events to the next (typically via an avro sink/source pair).
3. Fan-in flow: multiple agents converge into one downstream agent.
4. Fan-out flow: one source fans events out to several channels and sinks.
II. Installation and Deployment
1. Extract the archive and enter its root directory (see the sketch below).
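For example, assuming the apache-flume-1.9.0-bin tarball (matching the 1.9.0 dependency used in section IV; adjust the file name to your download):
tar -zxvf apache-flume-1.9.0-bin.tar.gz
cd apache-flume-1.9.0-bin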
III. Basic Usage and Configuration
Create a directory for the agent configs and open the first one:
mkdir data
cd data
vim base.conf
1. Netcat Message Listening
Events are buffered in a memory channel:
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: netcat (a plain TCP text listener)
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
2. Avro Listening (Important)
vim avro.conf
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: avro (receives Avro-serialized events, e.g. from avro-client or an upstream avro sink)
a1.sources.s1.type = avro
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
3. Watching a Command's Output (exec source template)
vim execsource.conf
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: exec (captures the stdout of a command)
a1.sources.s1.type = exec
# tail -F keeps emitting lines as they are appended; cat would read the file once and exit
a1.sources.s1.command = tail -F /home/a.txt
# An exec source does not listen on a host/port:
#a1.sources.s1.bind = 0.0.0.0
#a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
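With the agent running, append lines to the watched file; each new line should appear as an event in the logger sink:
echo "hello exec source" >> /home/a.txt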
4. Watching Files in a Spool Directory (Important)
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: spooldir (watches a directory for new files)
a1.sources.s1.type = spooldir
# Directory to watch; files are ingested once, renamed with a .COMPLETED suffix,
# and must not be modified after being placed here
a1.sources.s1.spoolDir = /usr/local
# A spooldir source does not listen on a host/port:
#a1.sources.s1.bind = 0.0.0.0
#a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
5. Sequence Generator (continuously produces data)
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: seq (emits an ever-incrementing counter; useful for testing)
a1.sources.s1.type = seq
# A seq source does not listen on a host/port:
#a1.sources.s1.bind = 0.0.0.0
#a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
6. HTTP Listening (Important)
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: http (accepts events via HTTP POST; JSON handler by default)
a1.sources.s1.type = http
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
7. Startup and Testing
cd bin
./flume-ng agent -n a1 -c ../conf -f ../data/base.conf -Dflume.root.logger=INFO,console
- Installing the nc utility:
yum install -y nc
- Sending a test message with nc:
nc iZ2ze8tuatzv94fm6fgl1jZ 8090
hello
- Sending a file via Avro serialization:
./flume-ng avro-client -H 0.0.0.0 -p 8090 -F /home/a.txt -c ../conf
- Sending an HTTP test request:
curl -X POST -d '[{"headers":{"class":"123"},"body":"This is big data"}]' http://0.0.0.0:8090
8. Other Configurations
File channel: events are persisted to disk (staged there until the sink drains them):
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: netcat
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Channel type: file (events are staged on disk until delivered)
a1.channels.c1.type = file
# Directory for the channel's data files
a1.channels.c1.dataDirs = /home/flumefile
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
Changing the sink type to write events to local files (file_roll):
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: netcat
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Sink type: file_roll, writes events to local files
a1.sinks.k1.type = file_roll
a1.sinks.k1.sink.directory = /home/flumedata
# Roll a new file every 600 seconds (10 minutes); the unit is seconds
a1.sinks.k1.sink.rollInterval = 600
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
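The file_roll sink expects the target directory to exist; assuming the path above, create it before starting the agent:
mkdir -p /home/flumedata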
Sending events to HDFS via the hdfs sink (partitioned by day):
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: netcat
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Attach an interceptor to the source
a1.sources.s1.interceptors = i1
# Timestamp interceptor: stamps each event so the %Y-%m-%d escapes in the HDFS path resolve
a1.sources.s1.interceptors.i1.type = timestamp
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Sink type: hdfs, writing into day-partitioned directories
# (%d is the day of month; %D would expand to mm/dd/yy and break the path)
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoopalone:9000/flume/time=%Y-%m-%d
# Roll a new file every 600 seconds (10 minutes); the unit is seconds
a1.sinks.k1.hdfs.rollInterval = 600
# Write plain text rather than the default SequenceFile
a1.sinks.k1.hdfs.fileType = DataStream
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
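Once events arrive, the day-partitioned output can be checked from the Hadoop side (directories follow the time=%Y-%m-%d pattern configured above):
hdfs dfs -ls /flume/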
Avro sink (enables multi-hop flows):
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: netcat
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Sink type: avro, forwarding events to the next Flume agent in the chain
a1.sinks.k1.type = avro
# Host and port of the downstream agent's avro source (a sketch of that agent follows)
a1.sinks.k1.hostname = 192.168.0.1
a1.sinks.k1.port = 8090
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
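A multi-hop flow also needs an agent on the receiving side. A minimal sketch of the downstream agent, assuming it runs on 192.168.0.1 and reuses the avro source from section III.2:
a1.sources = s1
a1.channels = c1
a1.sinks = k1
# Avro source: receives what the upstream avro sink forwards
a1.sources.s1.type = avro
a1.sources.s1.bind = 0.0.0.0
a1.sources.s1.port = 8090
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 1000
a1.sinks.k1.type = logger
a1.sources.s1.channels = c1
a1.sinks.k1.channel = c1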
Fan-out, replicating mode (the default: every event is broadcast to all channels):
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Two channels
a1.channels = c1 c2
# Two sinks
a1.sinks = k1 k2
# Source type: netcat
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# First channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Second channel, configured the same way
a1.channels.c2.type = memory
a1.channels.c2.capacity = 10000
a1.channels.c2.transactionCapacity = 1000
# Two avro sinks, each forwarding to a different downstream agent
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 192.168.0.1
a1.sinks.k1.port = 8090
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = 192.168.0.2
a1.sinks.k2.port = 8090
# Bind the source to both channels; the replicating selector is the default,
# so every event is copied to both c1 and c2
a1.sources.s1.channels = c1 c2
# Bind each sink to its channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
Fan-out, multiplexing (routing) mode:
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Two channels
a1.channels = c1 c2
# Two sinks
a1.sinks = k1 k2
# Source type: http
a1.sources.s1.type = http
# Port to listen on
a1.sources.s1.port = 8090
# Selector: multiplexing routes each event by a header value
a1.sources.s1.selector.type = multiplexing
# Route on the 'class' header
a1.sources.s1.selector.header = class
# Events whose class header is big1902 go to c1
a1.sources.s1.selector.mapping.big1902 = c1
# Events whose class header is big1903 go to c2
a1.sources.s1.selector.mapping.big1903 = c2
# Everything else also goes to c2
a1.sources.s1.selector.default = c2
# First channel
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Second channel, configured the same way
a1.channels.c2.type = memory
a1.channels.c2.capacity = 10000
a1.channels.c2.transactionCapacity = 1000
# Two avro sinks, each forwarding to a different downstream agent
a1.sinks.k1.type = avro
a1.sinks.k1.hostname = 192.168.0.1
a1.sinks.k1.port = 8090
a1.sinks.k2.type = avro
a1.sinks.k2.hostname = 192.168.0.2
a1.sinks.k2.port = 8090
# Bind the source to both channels
a1.sources.s1.channels = c1 c2
# Bind each sink to its channel
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
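To exercise the routing, post events with different class headers (same curl pattern as in section III.7); big1902 should land in c1 and any unmapped value should fall through to c2:
curl -X POST -d '[{"headers":{"class":"big1902"},"body":"routed to c1"}]' http://0.0.0.0:8090
curl -X POST -d '[{"headers":{"class":"other"},"body":"routed to c2"}]' http://0.0.0.0:8090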
Interceptors:
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: netcat
a1.sources.s1.type = netcat
# Host to listen on
a1.sources.s1.bind = 0.0.0.0
# Port to listen on
a1.sources.s1.port = 8090
# Declare the interceptor chain on one line (separate assignments would
# overwrite each other); interceptors run in the declared order
a1.sources.s1.interceptors = i1 i2 i3
# Timestamp interceptor: adds a timestamp header to each event
a1.sources.s1.interceptors.i1.type = timestamp
# Search-and-replace interceptor
a1.sources.s1.interceptors.i2.type = search_replace
# Regex to search for: any digit
a1.sources.s1.interceptors.i2.searchPattern = [0-9]
# Replace each matched digit with *
a1.sources.s1.interceptors.i2.replaceString = *
# Regex filter interceptor: matches events whose body contains a digit
a1.sources.s1.interceptors.i3.type = regex_filter
a1.sources.s1.interceptors.i3.regex = .*[0-9].*
# Exclude (drop) matching events instead of keeping them.
# Note: in this order i2 has already replaced every digit with *, so i3
# never matches; put i3 before i2 if the filter should take effect.
a1.sources.s1.interceptors.i3.excludeEvents = true
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events here; closer to 100,000 in production
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink (typically 1000-3000)
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
Configuring a sink group for failover (active/standby) or load balancing via the sink processor (processor.type = failover or load_balance); a minimal sketch follows.
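A minimal sketch of both processor types, reusing the two avro sinks k1 and k2 from the fan-out examples above:
# Group k1 and k2 into one sink group
a1.sinkgroups = g1
a1.sinkgroups.g1.sinks = k1 k2
# Failover: the higher-priority sink is used until it fails, then the other takes over
a1.sinkgroups.g1.processor.type = failover
a1.sinkgroups.g1.processor.priority.k1 = 10
a1.sinkgroups.g1.processor.priority.k2 = 5
# Load-balancing alternative: replace the three failover lines with
#a1.sinkgroups.g1.processor.type = load_balance
#a1.sinkgroups.g1.processor.selector = round_robin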
IV. Custom Source Configuration
1. Configuration File
# Name the agent's source (the agent itself is named a1)
a1.sources = s1
# Name the channel
a1.channels = c1
# Name the sink
a1.sinks = k1
# Source type: the fully qualified name of the custom class
a1.sources.s1.type = com.source.CustomSource
# Custom property names are up to you; the example code below only reads 'file'
a1.sources.s1.file = /home/person.log
a1.sources.s1.file1 = /home/person1.log
a1.sources.s1.path2 = /home/person2.log
# A custom source does not automatically listen on a host/port:
#a1.sources.s1.bind = 0.0.0.0
#a1.sources.s1.port = 8090
# Channel type: in-memory buffer
a1.channels.c1.type = memory
# Capacity: 10,000 events
a1.channels.c1.capacity = 10000
# Max events per transaction handed to the sink
a1.channels.c1.transactionCapacity = 1000
# Sink type: log events to the console
a1.sinks.k1.type = logger
# Bind the source to the channel
a1.sources.s1.channels = c1
# Bind the sink to the channel
a1.sinks.k1.channel = c1
2. Writing the Custom Logic (packaged as a jar and placed in Flume's lib directory)
- Add the dependency:
<!-- https://mvnrepository.com/artifact/org.apache.flume/flume-ng-core -->
<dependency>
<groupId>org.apache.flume</groupId>
<artifactId>flume-ng-core</artifactId>
<version>1.9.0</version>
</dependency>
- Implementation of the custom source:
package com.source;

import org.apache.flume.Context;
import org.apache.flume.EventDrivenSource;
import org.apache.flume.Source;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.conf.Configurable;
import org.apache.flume.source.AbstractSource;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**
 * Custom source.
 *
 * Pull mode: the source actively fetches data itself (e.g. spooldir),
 * implemented by extending AbstractSource and implementing PollableSource:
 *
 *   public class CustomSource extends AbstractSource implements Source, PollableSource { ... }
 *
 * Event-driven mode: data is pushed into the source (e.g. seq),
 * implemented via EventDrivenSource, as below.
 */
public class CustomSource extends AbstractSource implements Configurable, Source, EventDrivenSource {

    private String path;
    private ExecutorService es;

    /**
     * Reads properties from the agent configuration file.
     */
    @Override
    public void configure(Context context) {
        // context.getInteger("port") would read a numeric property the same way
        path = context.getString("file");
    }

    /**
     * Starts the source: hands the channel processor to a worker thread
     * that reads the file and pushes events onto the channel.
     */
    @Override
    public synchronized void start() {
        ChannelProcessor cp = this.getChannelProcessor();
        es = Executors.newFixedThreadPool(10);
        es.submit(new FileThread(path, cp));
        super.start();
    }

    @Override
    public synchronized void stop() {
        es.shutdown();
        super.stop();
    }
}
package com.source;

import org.apache.flume.Event;
import org.apache.flume.channel.ChannelProcessor;
import org.apache.flume.event.EventBuilder;

import java.io.*;
import java.util.HashMap;
import java.util.Map;

/**
 * Reads records of three lines each (name, age, description) and turns them
 * into Flume events: name and age become headers, the description the body.
 */
public class FileThread implements Runnable {

    private BufferedReader br;
    private ChannelProcessor cp;

    public FileThread(String path, ChannelProcessor cp) {
        this.cp = cp;
        File file = new File(path);
        if (file.isFile()) {
            try {
                br = new BufferedReader(new FileReader(path));
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public void run() {
        while (true) {
            try {
                String name = br.readLine();
                if (name == null) {
                    break; // end of file
                }
                String age = br.readLine();
                String description = br.readLine();
                Map<String, String> headers = new HashMap<>(2);
                headers.put("name", name);
                headers.put("age", age);
                // Build the event object
                Event e = EventBuilder.withBody(description.getBytes(), headers);
                // Put the event onto the channel
                cp.processEvent(e);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        try {
            br.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
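FileThread consumes three lines per record (name, age, description), so a hypothetical /home/person.log would look like:
zhangsan
23
works on the data platform
lisi
30
maintains the flume agents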
V. Custom Sink
package com.source;

import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;

/**
 * Custom sink: a minimal example that takes events from the channel
 * inside a transaction and prints the body to stdout.
 */
public class CustomSink extends AbstractSink implements Configurable, Sink {
    @Override
    public Status process() throws EventDeliveryException {
        Channel channel = getChannel();
        Transaction tx = channel.getTransaction();
        tx.begin();
        try {
            Event event = channel.take();
            if (event == null) {
                // Nothing available: commit the empty transaction and back off
                tx.commit();
                return Status.BACKOFF;
            }
            System.out.println(new String(event.getBody()));
            tx.commit();
            return Status.READY;
        } catch (Exception e) {
            // Roll back so the event stays on the channel and can be retried
            tx.rollback();
            throw new EventDeliveryException(e);
        } finally {
            tx.close();
        }
    }

    @Override
    public void configure(Context context) {
        // Read custom properties from the agent config here if needed
    }
}
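To deploy it, package the class into a jar, drop the jar into Flume's lib directory, and point a sink at the fully qualified class name, mirroring the custom source config in section IV:
a1.sinks.k1.type = com.source.CustomSink
a1.sinks.k1.channel = c1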
VI. Transactions
Source side: a putList (a double-ended queue) stages incoming events; on commit they move into the channel's queue, on rollback the putList is discarded.
Sink side: a takeList (also a double-ended queue) stages taken events; on commit the takeList is cleared, on rollback its events are returned to the channel's queue.