今天无意中发现了一个监控工具:Monit。它运行在 Linux/Unix 平台上,可用于监控本机的进程、磁盘等资源,并在满足设定条件时触发告警或执行相应动作(如重启服务)。

http://mmonit.com/monit/


安装

yum install monit -y

配置

[root@lvs122101master monit.d]# cat /etc/monit.conf |grep -v '^#'
set daemon 600
set mailserver 192.168.11.27
set alert shanks@null.com with reminder on 1 cycle
include /etc/monit.d/*

[root@lvs122101master monit.d]# cat /etc/monit.d/system
check device gen with path /dev/mapper/vg_livedvd-lv_root
        if space usage > 50% then alert

启动

/etc/init.d/monit start

查看日志

[root@lvs122101master log]# tail -f monit
[CST Nov 26 10:39:10] info     : Monit started
[CST Nov 26 10:39:10] error    : 'gen' space usage 76.6% matches resource limit [space usage>50.0%]
[CST Nov 26 10:40:11] info     : Monit has not changed
[CST Nov 26 10:40:11] error    : 'gen' space usage 76.6% matches resource limit [space usage>50.0%]
[CST Nov 26 10:40:59] info     : monit daemon with pid [21487] killed
[CST Nov 26 10:40:59] info     : Monit stopped
[CST Nov 26 10:41:03] info     : Monit started
[CST Nov 26 10:41:03] error    : 'gen' space usage 76.6% matches resource limit [space usage>50.0%]
[CST Nov 26 10:41:08] error    : Sendmail: error receiving data from the mailserver '192.168.80.27' -- Resource temporarily unavailable
[CST Nov 26 10:41:08] error    : Aborting event

告警邮件格式

Resource limit matched Service gen
    Date:        Tue, 26 Nov 2013 10:40:11 +0800
    Action:      alert
    Host:        lvs122101master
    Description: 'gen' space usage 76.6% matches resource limit [space usage>50.0%]
Your faithful employee,
monit

监控自定义服务

# Monitor nginx
#
# A process check needs the path to the daemon's pid file.
check process nginx with pidfile /var/run/nginx.pid
    # Command used to start the process. Note: the full path to the command is required.
    start program = "/etc/init.d/nginx start"
    # Command used to stop the process.
    stop program  = "/etc/init.d/nginx stop"
    # Connection test: if an HTTP request to www.example.com on port 80 fails, restart nginx.
    if failed host www.example.com port 80 protocol http then restart
    # Stop trying after 3 restarts within 5 cycles; repeated restart failures
    # point to a serious system error.
    if 3 restarts within 5 cycles then timeout
    # Optional: group membership.
    group server
#   Optional: also monitor the SSL port, if one is in use.
#    if failed port 443 type tcpssl protocol http
#       with timeout 15 seconds
#       then restart
#
# Monitor apache
#
check process apache with pidfile /var/run/apache2.pid
    start program = "/etc/init.d/apache2 start"
    stop program  = "/etc/init.d/apache2 stop"
    # apache tends to be heavy on CPU and memory, so add some extra resource checks.
    if cpu > 50% for 2 cycles then alert
    if cpu > 70% for 5 cycles then restart
    if totalmem > 1500 MB for 10 cycles then restart
    if children > 250 then restart
    if loadavg(5min) greater than 10 for 20 cycles then stop
    # Connection test: restart if an HTTP request to www.example.com on port 8080 fails.
    if failed host www.example.com port 8080 protocol http then restart
    # Stop trying after 3 restarts within 5 cycles.
    if 3 restarts within 5 cycles then timeout
    group server
    # Optional: declare a dependency on the nginx service.
    depends on nginx
#
# Monitor the spawn-fcgi process (i.e. the FastCGI process)
#
check process spawn-fcgi with pidfile /var/run/spawn-fcgi.pid
    # spawn-fcgi only writes a pid file when the -P option is given; it does not by default.
    start program = "/usr/bin/spawn-fcgi -a 127.0.0.1 -p 8081 -C 10 -u userxxx -g groupxxx -P /var/run/spawn-fcgi.pid -f /usr/bin/php-cgi"
    stop program = "/usr/bin/killall /usr/bin/php-cgi"
    # FastCGI does not speak HTTP, and monit's protocol option has no CGI setting,
    # so the connection test simply omits "protocol http".
    if failed host 127.0.0.1 port 8081 then restart
    # Stop trying after 3 restarts within 5 cycles.
    if 3 restarts within 5 cycles then timeout
    group server
    depends on nginx