一:整体架构如下:有两台服务器A和B,需要把服务器A上的数据采集到服务器B上的HDFS。
二:首先是原始数据,我这里一直保存的是近3天的数据。
可以使用下面脚本实现
#!/bin/bash
# Keep only the most recent ~3 days of raw data: delete entries under the
# FTP drop directory whose names start with "202" (date-stamped files such
# as 20240101_acct_flow) and whose mtime is more than 2 days old.
# The target directory may be overridden via the first argument; it
# defaults to the original hard-coded path, so existing cron usage with no
# arguments is unchanged.
cleanup_old_data() {
  local dir="${1:-/home/ftpuser/home/ftpuser}"
  # Nothing to do if the directory is absent (e.g. fresh deployment).
  [ -d "$dir" ] || return 0
  # -prune stops find from descending into a directory it is about to
  # delete (the original `-exec rm -rf {} \;` made find complain about
  # paths vanishing underneath it); `--` protects against names starting
  # with a dash; `+` batches the rm invocations.
  find "$dir" -name "202*" -mtime +2 -prune -exec rm -rf -- {} +
}
cleanup_old_data "$@"
Flume的配置:
1.在服务器A的Flume安装目录的conf目录下新建aserver.conf
# Agent "a1" on server A: four exec sources tail the daily data files and
# feed a single memory channel, which an Avro sink forwards to server B.
a1.sources = r1 r2 r3 r4
a1.sinks = k1
a1.channels = c1
# Monitored files. NOTE: never drop files with duplicate names into the
# directory, or the agent will crash.
# Source r1: tail new lines appended to today's acct_flow file (exec source).
a1.sources.r1.type = exec
a1.sources.r1.shell = /bin/bash -c
# The first line of each file is an unwanted header, so collect from line 2.
# NOTE(review): the backticked `date` is evaluated when the agent starts,
# so the agent must be restarted daily (done via cron below) to pick up
# the new day's file — confirm this matches the operating schedule.
a1.sources.r1.command = tail -n +2 -F "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_acct_flow"
a1.sources.r1.interceptors = i1 i2
a1.sources.r1.interceptors.i1.type = static
# Two interceptors: i1 statically adds a key/value pair to the event
# header (consumed as %{type} in the HDFS path on server B); i2 adds a
# timestamp.
a1.sources.r1.interceptors.i1.key = type
# The value equals the file-name suffix, which keeps the mapping obvious.
a1.sources.r1.interceptors.i1.value = acct_flow
a1.sources.r1.interceptors.i2.type = timestamp
# Source r2: tail new lines from today's ntdflow_record file (exec source).
a1.sources.r2.type = exec
a1.sources.r2.shell = /bin/bash -c
a1.sources.r2.command = tail -n +2 -F "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_ntdflow_record"
a1.sources.r2.interceptors = i1 i2
a1.sources.r2.interceptors.i1.type = static
# Static header key/value plus timestamp interceptor, same pattern as r1.
a1.sources.r2.interceptors.i1.key = type
a1.sources.r2.interceptors.i1.value = ntdflow_record
a1.sources.r2.interceptors.i2.type = timestamp
# Source r3: tail new lines from today's online_detail file (exec source).
a1.sources.r3.type = exec
a1.sources.r3.shell = /bin/bash -c
a1.sources.r3.command = tail -n +2 -F "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_online_detail"
a1.sources.r3.interceptors = i1 i2
a1.sources.r3.interceptors.i1.type = static
# Static header key/value plus timestamp interceptor, same pattern as r1.
a1.sources.r3.interceptors.i1.key = type
a1.sources.r3.interceptors.i1.value = online_detail
a1.sources.r3.interceptors.i2.type = timestamp
# Source r4: tail new lines from today's userinfo file (exec source).
a1.sources.r4.type = exec
a1.sources.r4.shell = /bin/bash -c
a1.sources.r4.command = tail -n +2 -F "/home/ftpuser/home/ftpuser/`date +%Y%m%d`_userinfo"
a1.sources.r4.interceptors = i1 i2
a1.sources.r4.interceptors.i1.type = static
# Static header key/value plus timestamp interceptor, same pattern as r1.
a1.sources.r4.interceptors.i1.key = type
a1.sources.r4.interceptors.i1.value = userinfo
a1.sources.r4.interceptors.i2.type = timestamp
# Sink k1: forward events to the Avro source listening on server B.
a1.sinks.k1.type = avro
a1.sinks.k1.hostname=192.168.xxx.xx
a1.sinks.k1.port = 44444
# Channel c1: in-memory buffer shared by all four sources.
a1.channels.c1.type = memory
a1.channels.c1.capacity = 30000
a1.channels.c1.transactionCapacity = 10000
# Wire every source and the sink to channel c1.
a1.sources.r1.channels = c1
a1.sources.r2.channels = c1
a1.sources.r3.channels = c1
a1.sources.r4.channels = c1
a1.sinks.k1.channel = c1
2.在服务器B的Flume安装目录的conf目录下新建bserver.conf
#服务器B
# Agent "b1" on server B: receive Avro events from server A and write them
# to HDFS, partitioned by the %{type} header and by day.
b1.sources = r2
b1.sinks = k2
b1.channels = c2
# Source r2: Avro listener for events shipped by server A's sink.
b1.sources.r2.type = avro
# NOTE(review): the original comment said "this is server A's host", but
# an Avro source binds to a local address on server B — verify this IP is
# server B's own address (or 0.0.0.0).
b1.sources.r2.bind=192.168.xxx.xx
b1.sources.r2.port = 44444
# Timestamp interceptor kept disabled: useLocalTimeStamp on the sink
# supplies the timestamp for the %Y%m%d escapes instead.
#b1.sources.r2.interceptors = i1
#b1.sources.r2.interceptors.i1.type = timestamp
# Sink k2: HDFS writer.
b1.sinks.k2.type =hdfs
# BUG FIX: the original config also bound this sink to channel "c1" here,
# but agent b1 declares only channel c2; the single correct binding is at
# the bottom of this file with the other wiring.
# Target path on HDFS; %{type} comes from the static interceptor set on
# server A, and dt=%Y%m%d from the (local) timestamp.
b1.sinks.k2.hdfs.path = hdfs://192.168.xxx.xx/user/hive/warehouse/ods.db/%{type}/dt=%Y%m%d/
# Prefix of the generated files (previously the fixed "events-" prefix).
#b1.sinks.k2.hdfs.filePrefix = events-
b1.sinks.k2.hdfs.filePrefix = %{type}
# Round down event timestamps when resolving time escapes in the path.
b1.sinks.k2.hdfs.round = true
# Number of time units to round to.
b1.sinks.k2.hdfs.roundValue = 10
# The time unit used for rounding.
b1.sinks.k2.hdfs.roundUnit = minute
# Roll to a new HDFS file every 60 seconds...
b1.sinks.k2.hdfs.rollInterval = 60
# ...or once the current file reaches 128 MB...
b1.sinks.k2.hdfs.rollSize = 134217728
# ...but never roll based on event count.
b1.sinks.k2.hdfs.rollCount = 0
# Number of events to accumulate before flushing to HDFS.
b1.sinks.k2.hdfs.batchSize = 100
# Use local time (not an event-header timestamp) for the time escapes.
b1.sinks.k2.hdfs.useLocalTimeStamp = true
# Output file type: default is SequenceFile; DataStream writes plain text.
b1.sinks.k2.hdfs.fileType = DataStream
# Channel c2: in-memory buffer.
b1.channels.c2.type = memory
b1.channels.c2.capacity = 10000
b1.channels.c2.transactionCapacity = 100
# Wire the source and the sink to channel c2.
b1.sources.r2.channels = c2
b1.sinks.k2.channel = c2
3.脚本运行:这里使用脚本来启动和停止Flume,也可以直接用单条命令执行。
#!/bin/bash
# Flume agent control script: start / stop / restart.
# JDK used by flume-ng.
export JAVA_HOME=/opt/jdk1.8.0_181
# Flume installation root; printed for operator feedback.
path=/home/flume/flume-1.9
printf '%s\n' "$path"
# Marker string used to find the Flume JVM in the process table.
JAR="flume"
# Start the Flume agent unless a matching JVM is already running.
# Exits 0 after launching; exits 1 if a process already exists.
# (BUG FIX: the original exited 0 on the already-running failure path,
# hiding the failure from cron and callers.)
function start(){
echo "开始启动 ...."
# Count running Flume JVMs. The grep processes themselves never match
# both "java" and "$JAR", so they do not inflate the count.
num=$(ps -ef | grep java | grep "$JAR" | wc -l)
echo "进程数:$num"
if [ "$num" = "0" ] ; then
# Adjust the launch parameters below as needed for your deployment.
# eval was unnecessary here: plain redirection and & work directly.
nohup "$path"/bin/flume-ng agent --name a1 --conf "$path"/conf --conf-file "$path"/conf/aserver.conf >>"$path"/logs/flume.log 2>&1 &
echo "启动成功...."
echo "日志路径: $path/logs/flume.log"
exit 0
else
echo "进程已经存在,启动失败,请检查....."
exit 1
fi
}
# Gracefully stop any running Flume JVM (SIGTERM, not kill -9).
function stop(){
echo "开始stop ....."
num=$(ps -ef | grep java | grep "$JAR" | wc -l)
if [ "$num" = "0" ] ; then
echo "服务未启动,无需停止..."
else
# Graceful shutdown; the force-kill variant kept for reference:
#   ps -ef|grep java|grep $JAR|awk '{print $2;}'|xargs kill -9
ps -ef | grep java | grep "$JAR" | awk '{print $2;}' | xargs kill
echo "进程已经关闭..."
fi
}
# Restart: request a stop, wait until every matching JVM has actually
# exited, then start again.
function restart(){
echo "begin stop process ..."
stop
# SIGTERM shutdown is asynchronous: poll once a second until the
# process table no longer shows a matching Flume JVM.
until [ "$(ps -ef | grep java | grep "$JAR" | wc -l)" -eq 0 ]; do
sleep 1
done
echo "process stoped,and starting ..."
start
echo "started ..."
}
# Dispatch on the first argument; unknown usage prints help, exits 1.
case "$1" in
"start")
# Quote "$@" so arguments containing spaces are forwarded intact
# (the original unquoted $@ would word-split them).
start "$@"
exit 0
;;
"stop")
stop
exit 0
;;
"restart")
restart
exit 0
;;
*)
echo "用法: $0 {start|stop|restart}"
exit 1
;;
esac
4.定时启动与停止(CentOS 7):输入 crontab -e,加入以下两行:
30 2 * * * /home/flume/flume-1.9/timingFlume.sh start
50 2 * * * /home/flume/flume-1.9/timingFlume.sh stop
因为我的数据量不多,几分钟就结束了,所以采集完就stop了。
5.HDFS上的数据
注意:要先启动B服务器,再启动A服务器。