用shell写一个简单的告警系统

  • 创建目录结构
mkdir -p /usr/local/sbin/mon/{bin,conf,shares,mail,log}
mon  //主目录
bin   //主程序目录
conf //配置文件目录
shares //子程序目录
mail //发邮件目录
log //日志目录
  • 主程序入口文件/usr/local/sbin/mon/bin/main.sh
#!/bin/bash
#
# Entry point of the monitoring system. It must be started from the "bin"
# directory because every path below is relative to it.
#
# Variables exported for the sub-scripts:
#   send - alert switch; the sub-scripts only send mail when it is 1
#   addr - address taken from the ifconfig output of interface ens33
#   log  - access log path read from mon.conf (set only when the 502 check
#          is enabled)

export send=1
addr=$(/usr/sbin/ifconfig | grep -A1 'ens33' | awk '/inet/{print $2}')
export addr

# `pwd` never prints a trailing slash, so the last path component is enough.
current_dir=$(pwd)
if [[ "${current_dir##*/}" == "bin" ]]; then
  conf_file="../conf/mon.conf"
else
  echo "you should cd bin dir." >&2
  exit 1
fi

#exec 1>>../log/mon.log 2>>../log/err.log
echo "$(date +'%F %T') load average"
# Run the load check as a child script; it inherits the exported variables.
/bin/bash ../shares/load.sh

# Run the 502 check only when it is switched on. The pattern is anchored so
# that comment lines or values such as "10" do not enable it by accident.
if grep -q '^[[:space:]]*to_mon_502=1[[:space:]]*$' "$conf_file"; then
  # Everything after "logfile=" with blanks removed.
  log=$(sed -n 's/^[[:space:]]*logfile=//p' "$conf_file" | sed 's@ @@g')
  export log
  /bin/bash ../shares/502.sh
fi
  • 主配置文件/usr/local/sbin/mon/conf/mon.conf(自定义变量)
## Switches and settings that decide what gets monitored.
## main.sh enables the 502 check when to_mon_502 is set to 1; the other
## to_mon_* switches are not read by any script shown alongside this file
## (presumably the same 0/1 convention - TODO confirm).
## MySQL server address, port, user and password
## NOTE(review): the password is kept in plain text here; restrict the
## permissions of this file.
to_mon_cdb=0
db_ip=192.168.221.10
db_port=3306
db_user=username
db_pass=passwd
## httpd
to_mon_httpd=0
to_mon_php_socket=0
## http_code_502
## main.sh passes the logfile value below (with blanks removed) to 502.sh.
to_mon_502=0
logfile=/data/log/xxx.xxx.com/access.log
## request_count: log path and domain name
to_mon_request_count=0
req_log=/data/log/www.discuz.net/access.log
domainname=www.discuz.net
  • 监控的脚本

监控系统负载的脚本/usr/local/sbin/mon/shares/load.sh

#!/bin/bash
#
# Load check: reads the 1-minute load average and, when the alert condition
# holds, hands an alert to ../mail/mail.sh.
#
# Environment (exported by main.sh):
#   send - alerts are only sent when this is 1
#   addr - host address used in the alert text
# All paths are relative, so the script has to run from the bin directory.

# Integer part of the 1-minute load average, e.g.
# "... load average: 0.52, 0.58, 0.59" -> 0.
# LC_ALL=C keeps "." as the decimal separator regardless of the locale.
load=$(LC_ALL=C /usr/bin/uptime \
  | awk -F'average:' '{gsub(/ /, "", $2); split($2, parts, /[.,]/); print parts[1]}')

if ! [[ "$load" =~ ^[0-9]+$ ]]; then
  echo "$(date +%T) cannot parse the load average from uptime" >&2
  exit 1
fi

# NOTE: "less than 10" is the author's test setting so that the alert fires
# on an idle machine; a production check would alert on a HIGH load instead.
if (( load < 10 )) && [[ "${send:-0}" == "1" ]]; then
  echo "${addr}-$(date +%T)-load-is-$load" > ../log/load.tmp
  # Arguments for mail.sh: recipient placeholder, subject, body text.
  # "${addr}_load" replaces "$addr\_load", which left a literal backslash in
  # the subject because "\_" is not an escape inside double quotes.
  /bin/bash ../mail/mail.sh "发邮件给谁" "${addr}_load:$load" "$(cat ../log/load.tmp)"
fi
echo "$(date +%T) load is $load"

监控磁盘的脚本/usr/local/sbin/mon/shares/disk.sh

#!/bin/bash
#
# Disk check: collects every filesystem whose usage is above 90% and sends
# ONE alert through ../mail/mail.sh after all filesystems were inspected.
#
# Environment (exported by main.sh):
#   send - over-limit filesystems are only recorded/alerted when this is 1
#   addr - host address used in the alert text
# All paths are relative, so the script has to run from the bin directory.

tmp_file=../log/disk.tmp
rm -f "$tmp_file"

max_used=0
# df -P keeps each filesystem on a single line; column 5 is the usage in
# percent. The header (line 1) and non-numeric values such as "-" are skipped.
while IFS= read -r used; do
  if (( used > 90 )) && [[ "${send:-0}" == "1" ]]; then
    echo "$addr $(date +%T) disk useage is $used" >> "$tmp_file"
    if (( used > max_used )); then
      max_used=$used
    fi
  fi
done < <(LC_ALL=C df -hP | awk 'NR > 1 { sub(/%/, "", $5); if ($5 ~ /^[0-9]+$/) print $5 }')

# Report once, after the loop. Doing this inside the loop sent a mail for
# every filesystem that followed the first one over the limit.
if [[ -f "$tmp_file" ]]; then
  df -h >> "$tmp_file"
  # Arguments for mail.sh: alert key, highest usage found, file with details.
  /bin/bash ../mail/mail.sh "${addr}_disk" "$max_used" "$tmp_file"
  echo "$(date +%T) disk useage is Alert!!!"
else
  echo "$(date +%T) disk useage is ok"
fi

监控网站出现502的脚本/usr/local/sbin/mon/shares/502.sh

#!/bin/bash
#
# 502 check: counts the access-log lines of the previous minute that contain
# "502" and alerts through ../mail/mail.sh when there are more than 10.
#
# Environment (exported by main.sh):
#   log  - path of the access log to scan
#   send - alerts are only sent when this is 1
#   addr - host address used in the alert text
# All paths are relative, so the script has to run from the bin directory.

# Without this guard an empty $log would make grep wait on standard input.
if [[ -z "${log:-}" || ! -r "$log" ]]; then
  echo "$(date +%T) 502 check: access log '${log:-}' is not set or not readable" >&2
  exit 1
fi

# Previous minute as HH:MM; matching log lines contain ":HH:MM:".
d=$(date -d '-1 min' +%H:%M)
# NOTE(review): '502' matches anywhere in the line (URL, byte count, ...),
# not only the status field - tighten the pattern once the log format is known.
c_502=$(grep -F -- ":$d:" "$log" | grep -c '502')

if (( c_502 > 10 )) && [[ "${send:-0}" == "1" ]]; then
  echo "$addr $d 502 count is $c_502" > ../log/502.tmp
  # Arguments for mail.sh: alert key, number of hits, file with details.
  /bin/bash ../mail/mail.sh "${addr}_502" "$c_502" ../log/502.tmp
fi
echo "$(date +%T) 502 $c_502"
  • 发邮件的脚本/usr/local/sbin/mon/mail/mail.py
脚本内容见 http://blog.51cto.com/13480443/2084118 一文中的 /usr/lib/zabbix/alertscripts/mail.py,直接使用该文件即可
  • 告警收敛脚本/usr/local/sbin/mon/mail/mail.sh
#!/bin/bash
#
# Alert throttling wrapper around mail.py.
#
# Usage: mail.sh KEY DETAIL BODY
#   $1 - alert key; names the state files /tmp/KEY and /tmp/KEY.txt and is
#        handed to mail.py as its first argument
#   $2 - short detail appended to the subject text
#   $3 - handed to mail.py unchanged as its third argument
#
# Behaviour:
#   * more than 3600 s since the previous call for this key: mail at once
#     and reset the counter
#   * otherwise: count the call and mail only when the count exceeds 10,
#     then reset the counter
# The elapsed seconds are printed on standard output.
#
# NOTE(review): the state lives under predictable names in /tmp and the
# timestamp file gains one line per call; both are kept for compatibility
# with existing state files.

if [[ -z "${1:-}" ]]; then
  echo "usage: ${0##*/} KEY DETAIL BODY" >&2
  exit 2
fi

log=$1
stamp_file="/tmp/$log"
count_file="/tmp/$log.txt"

# Sends the alert. Every argument is quoted so that text containing blanks
# reaches mail.py as ONE argument instead of being split into several.
send_mail() {
  /usr/bin/python /usr/local/sbin/mon/mail/mail.py \
    "$1" "trouble continue 10 min $2" "$3"
}

t_s=$(date +%s)
# First call for this key: pretend the previous alert was two hours ago so
# that the mail goes out immediately.
if [[ ! -f "$stamp_file" ]]; then
  date -d "2 hours ago" +%s > "$stamp_file"
fi
t_s2=$(tail -n 1 "$stamp_file" | awk '{print $1}')
echo "$t_s" >> "$stamp_file"
# A damaged state file is treated like "very long ago".
[[ "$t_s2" =~ ^[0-9]+$ ]] || t_s2=0

v=$(( t_s - t_s2 ))
echo "$v"

if (( v > 3600 )); then
  send_mail "$1" "$2" "$3"
  echo "0" > "$count_file"
else
  nu=0
  if [[ -f "$count_file" ]]; then
    nu=$(cat "$count_file")
  fi
  [[ "$nu" =~ ^[0-9]+$ ]] || nu=0
  nu2=$(( nu + 1 ))
  echo "$nu2" > "$count_file"
  if (( nu2 > 10 )); then
    send_mail "$1" "$2" "$3"
    echo "0" > "$count_file"
  fi
fi
//距离上一次告警超过1小时,直接发邮件;不足1小时则只累加计数,计数超过10次(按每分钟执行一次,约10分钟)才发一次邮件并把计数清零
  • 计划每分钟执行/usr/local/sbin/mon/bin/main.sh
# Run the monitor every minute. "&&" makes sure main.sh is only started when
# the change into its bin directory succeeded.
* * * * * cd /usr/local/sbin/mon/bin/ && /usr/bin/bash /usr/local/sbin/mon/bin/main.sh

注意:这里主程序main.sh中无条件调用的子程序只有 load.sh;502.sh 只有在 mon.conf 中把 to_mon_502 设为 1 时才会被调用(示例配置为 0),disk.sh 则没有被调用到。另外,cron 会把脚本的输出每分钟以邮件的形式发给执行该任务的用户(我这里测试用的是root用户,所以发到了root用户的邮箱)

总结:

  • 主程序文件中要对主配置文件定义的内容进行过滤作为条件,再调用子程序的脚本
  • 子程序中也要引用主配置文件中的内容作为自己的条件判断,再调用发邮件脚本
  • 主配置文件相当于一个总开关,集中控制各个监控项是否启用
  • 发邮件的脚本mail.py,注意参数是以空格作为分隔的:参数内容里如果带有空格、传递时又没有用双引号括起来,就会被拆成多个参数。例如先执行 echo "${addr} $(date +%T)-load-is-$load" > ../log/load.tmp,再执行 /bin/bash ../mail/mail.sh "发邮件给谁" "${addr}_load:$load" "$(cat ../log/load.tmp)",邮件正文只会识别到${addr},后面的$(date +%T)-load-is-$load不会被识别;所以脚本里用“-”把各个字段连接起来,避免内容中出现空格
  • 这里用到了环境变量:main.sh 通过 export 把 send、addr、log 传递给子脚本使用