一、flume采集文件到HDFS(跟踪文件夹)

1.首先准备三台虚拟机和一个打包好项目的jar包(打包项目包双击package)

(此处原文为一张架构示意图,抓取时丢失,仅余图片占位文字)


(1)、注:这里我是将jar包放在hdp-2下apps下的

(2)、启动命令:java -jar frame.jar(jar包名以实际打包结果为准)

2、将下载好的flume包放在/root/apps下,进行解压

解压命令:tar -zxvf apache-flume-1.6.0-bin.tar.gz

3、配置
1、在/root/apps/apache-flume-1.6.0-bin目录下进行flume文件配置 vi dir-hdfs.conf(内容如下)
spooldir:flume中自带的读取目录的source,只要出现新文件就会被读走
#定义三大组件的名称
ag1.sources = source1
ag1.sinks = sink1
ag1.channels = channel1

ag1.sources.source1.type = spooldir
#官网flume.apache.org
ag1.sources.source1.spoolDir = /root/log/
#具体的目录
ag1.sources.source1.fileSuffix=.FINISHED
#文件后缀,读走了就改成这样了
ag1.sources.source1.deserializer.maxLineLength=5129
#每一行的大小

ag1.sinks.sink1.type = hdfs
ag1.sinks.sink1.hdfs.path =hdfs://hdp-1:9000/access_log/%y-%m-%d/%H-%M
ag1.sinks.sink1.hdfs.filePrefix = app_log
ag1.sinks.sink1.hdfs.fileSuffix = .log
ag1.sinks.sink1.hdfs.batchSize= 100
#多少条记录切一次
ag1.sinks.sink1.hdfs.fileType = DataStream
#普通的数据流读
ag1.sinks.sink1.hdfs.writeFormat = Text

ag1.sinks.sink1.hdfs.rollSize = 512000

按文件体积(字节)来切 500k

#需要讲的:hdfs.rollInterval hdfs.rollCount hdfs.writeFormat hdfs.fileType
ag1.sinks.sink1.hdfs.rollCount = 1000000
ag1.sinks.sink1.hdfs.rollInterval = 60
ag1.sinks.sink1.hdfs.round = true
ag1.sinks.sink1.hdfs.roundValue = 10
#多久切一次 10分钟
ag1.sinks.sink1.hdfs.roundUnit = minute
#单位

ag1.sinks.sink1.hdfs.useLocalTimeStamp = true
#使用本地机器的时间

ag1.channels.channel1.type = memory
ag1.channels.channel1.capacity = 500000

ag1.channels.channel1.transactionCapacity = 600

ag1.sources.source1.channels = channel1
ag1.sinks.sink1.channel = channel1

4、启动flume
命令为:./flume-ng agent -C ../conf/ -f ../dir-hdfs.conf -n ag1 -Dflume.root.logger=INFO,console

出现如下即启动成功

5、测试flume是否跟踪文件夹
在根目录下新建log文件夹,在里面新建文件
成功后被采集的文件后缀变为.FINISHED

6、配置tail-hdfs.conf
ag1.sources = source1
ag1.sinks = sink1
ag1.channels = channel1

ag1.sources.source1.type = exec
ag1.sources.source1.command = tail -F /usr/local/nginx/logs/frame.log

ag1.sinks.sink1.type = hdfs
ag1.sinks.sink1.hdfs.path =hdfs://hdp-1:9000/access_log/%y-%m-%d/%H-%M
ag1.sinks.sink1.hdfs.filePrefix = app_log
ag1.sinks.sink1.hdfs.fileSuffix = .log
ag1.sinks.sink1.hdfs.batchSize= 100
ag1.sinks.sink1.hdfs.fileType = DataStream
ag1.sinks.sink1.hdfs.writeFormat = Text

ag1.sinks.sink1.hdfs.rollSize = 512000
ag1.sinks.sink1.hdfs.rollCount = 1000000
ag1.sinks.sink1.hdfs.rollInterval = 60
ag1.sinks.sink1.hdfs.round = true
ag1.sinks.sink1.hdfs.roundValue = 10
ag1.sinks.sink1.hdfs.roundUnit = minute
ag1.sinks.sink1.hdfs.useLocalTimeStamp = true

ag1.channels.channel1.type = memory
ag1.channels.channel1.capacity = 500000
ag1.channels.channel1.transactionCapacity = 600

ag1.sources.source1.channels = channel1
ag1.sinks.sink1.channel = channel1
(1)、cd log 生成shell脚本文件makelog.sh
vi makelog.sh
# Endlessly append one test line every 0.1s to access.log,
# so the Flume exec source (tail -F) has data to pick up.
# Fix: the original used typographic quotes (‘…’), which are not
# shell quoting — the quote characters themselves were written
# into the log. Use plain ASCII single quotes.
while true
do
  echo '000000000000000' >> access.log
  sleep 0.1
done
(2)、增加makelog.sh的可执行权限 chmod +x makelog.sh
(3)执行makelog.sh:sh makelog.sh
启动flume采集程序:
在flume的bin目录下
./flume-ng agent -C ../conf/ -f ../tail-hdfs.conf -n ag1 -Dflume.root.logger=INFO,console
注意此处引用的配置文件为tail-hdfs.conf

注:flume采集文件成功

二、nginx安装配置
(1).安装make
yum -y install gcc automake autoconf libtool make

(2).安装g++:
yum install gcc gcc-c++

(3).安装openssl
yum -y install openssl openssl-devel

(4).安装PCRE库:
上传PCRE到虚拟机的apps路径下

解压:tar -zxvf pcre-8.39.tar.gz
cd pcre-8.39

./configure
make
make install

注:安装zlib库、安装nginx与安装PCRE库的步骤一样(./configure、make、make install)

1.启动nginx
cd /usr/local/nginx/sbin
启动命令: ./nginx
2.配置nginx
vi /usr/local/nginx/conf/nginx.conf

#user nobody;
worker_processes 1;

#error_log logs/error.log;
#error_log logs/error.log notice;
#error_log logs/error.log info;

#pid logs/nginx.pid;

events {
worker_connections 1024;
}

http {
include mime.types;
default_type application/octet-stream;
log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
                  '$status $body_bytes_sent "$http_referer" '
                  '"$http_user_agent" "$http_x_forwarded_for"';

#access_log  logs/access.log  main;

sendfile        on;
#tcp_nopush     on;

#keepalive_timeout  0;
keepalive_timeout  65;

#gzip  on;
upstream frame-tomcat {
      server hdp-2:8180; 
}
server {
    listen       80;
    server_name  hdp-1;

    #charset koi8-r;

    access_log  logs/log.frame.access.log  main;

    location / {
        # root   html;
        # index  index.html index.htm;
        proxy_pass http://frame-tomcat;
    }

    error_page   500 502 503 504  /50x.html;
    location = /50x.html {
        root   html;
    }
}
server {
    listen       80;
    server_name  localhost;

    #charset koi8-r;

    #access_log  logs/host.access.log  main;

    location / {
        root   html;
        index  index.html index.htm;
    }

    #error_page  404              /404.html;

    # redirect server error pages to the static page /50x.html
    #
    error_page   500 502 503 504  /50x.html;
    location = /50x.html {
        root   html;
    }

    # proxy the PHP scripts to Apache listening on 127.0.0.1:80
    #
    #location ~ \.php$ {
    #    proxy_pass   http://127.0.0.1;
    #}

    # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
    #
    #location ~ \.php$ {
    #    root           html;
    #    fastcgi_pass   127.0.0.1:9000;
    #    fastcgi_index  index.php;
    #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
    #    include        fastcgi_params;
    #}

    # deny access to .htaccess files, if Apache's document root
    # concurs with nginx's one
    #
    #location ~ /\.ht {
    #    deny  all;
    #}
}


# another virtual host using mix of IP-, name-, and port-based configuration
#
#server {
#    listen       8000;
#    listen       somename:8080;
#    server_name  somename  alias  another.alias;

#    location / {
#        root   html;
#        index  index.html index.htm;
#    }
#}


# HTTPS server
#
#server {
#    listen       443;
#    server_name  localhost;

#    ssl                  on;
#    ssl_certificate      cert.pem;
#    ssl_certificate_key  cert.key;

#    ssl_session_timeout  5m;

#    ssl_protocols  SSLv2 SSLv3 TLSv1;
#    ssl_ciphers  HIGH:!aNULL:!MD5;
#    ssl_prefer_server_ciphers   on;

#    location / {
#        root   html;
#        index  index.html index.htm;
#    }
#}

}
主要修改server (反向代理)
在hdp-1上启动nginx 在hdp-3上启动项目
在网页上搜hdp-1

– 测试流程
----刷新页面 http://hdp-1( 注意:如果浏览器有缓存,可能不会访问nginx,测试时需要禁用浏览器缓存)
---- cd /usr/local/nginx/logs
---- tail -f log.frame.access.log
这时虚拟机上hdp-1会根据在网页上操作会更新数据

3.利用flume采集数据到HDFS
----使用flume将nginx的accesslog收集到hdfs
(1).在hdp-3启动项目

成功启动项目:

(2).在hdp-1启动flume

启动成功:

(3).查看日志是否被收集:

显示日志被收集到

Flume:
是一种分布式,可靠且可用的服务,用于有效地收集,聚合和移动大量日志数据。 它具有基于流数据流的简单灵活的架构。 它具有可靠的可靠性机制和许多故障转移和恢复机制,具有强大的容错能力。 它使用简单的可扩展数据模型,允许在线分析应用程序。

Nginx :
是一个高性能的HTTP和反向代理web服务器