用shell写一个简单的告警系统
- 创建目录结构
# Create the monitor's directory tree in one call: bin, conf, shares, mail, log.
mon_root=/usr/local/sbin/mon
mkdir -p "$mon_root/bin" "$mon_root/conf" "$mon_root/shares" "$mon_root/mail" "$mon_root/log"
mon //主目录
bin //主程序目录
conf //配置文件目录
shares //子程序目录
mail //发邮件目录
log //日志目录
- 主程序入口文件/usr/local/sbin/mon/bin/main.sh
#!/bin/bash
# Entry point of the monitor. Exports the settings shared with the check
# scripts (send, addr, log) and runs the enabled checks.
# Must be started from the bin/ directory: every path below is relative.

#######################################
# Print the value of KEY from a key=value config file, spaces removed.
# Arguments: $1 - key name (must start at column 0 in the file)
#            $2 - path to the config file
# Outputs:   the value of the last "KEY=" line; empty if the key is absent
#######################################
conf_get() {
  awk -F'=' -v key="$1" '
    $1 == key { val = $0; sub(/^[^=]*=/, "", val); gsub(/ /, "", val) }
    END { print val }
  ' "$2"
}

#######################################
# Run the load check and, when enabled in mon.conf, the 502 check.
# Globals:  send, addr, log (exported for the child scripts)
# Returns:  1 when not started from the bin/ directory
#######################################
main() {
  local conf_file

  # Master switch read by the check scripts: 1 = alert mails are enabled.
  export send=1
  # Address printed by ifconfig for ens33; used to label the alerts.
  addr=$(/usr/sbin/ifconfig | grep -A1 'ens33' | awk '/inet/{print $2}')
  export addr

  if [[ "${PWD##*/}" == "bin" ]]; then
    conf_file="../conf/mon.conf"
  else
    echo "you should cd bin dir." >&2
    # Non-zero status so that callers can tell the run was refused.
    return 1
  fi

  # Uncomment to append output to log files instead of printing it.
  #exec 1>>../log/mon.log 2>>../log/err.log
  echo "$(date +'%F %T') load average"
  # The load check always runs; it is executed as a separate script.
  /bin/bash ../shares/load.sh

  # Anchored so that "to_mon_502=10" or a commented-out line does not match.
  if grep -q '^to_mon_502=1[[:space:]]*$' "$conf_file"; then
    log=$(conf_get logfile "$conf_file")
    export log
    /bin/bash ../shares/502.sh
  fi
}

main "$@"
- 主配置文件/usr/local/sbin/mon/conf/mon.conf(自定义变量)
## Options that select what to monitor.
## main.sh runs the 502 check only when to_mon_502 is set to 1; the other
## to_mon_* switches presumably follow the same 1 = on / 0 = off convention
## (no script shown here reads them - confirm before relying on it).
## MySQL: server address, port, user and password.
## NOTE(review): the password is stored in plain text - confirm that the
## permissions of this file are restrictive.
to_mon_cdb=0
db_ip=192.168.221.10
db_port=3306
db_user=username
db_pass=passwd
## httpd
to_mon_httpd=0
to_mon_php_socket=0
## http_code_502: switch for the 502 check, followed by the access log whose
## path main.sh exports to 502.sh.
to_mon_502=0
logfile=/data/log/xxx.xxx.com/access.log
## request_count: access-log path and domain name.
to_mon_request_count=0
req_log=/data/log/www.discuz.net/access.log
domainname=www.discuz.net
- 监控的脚本
监控系统负载的脚本/usr/local/sbin/mon/shares/load.sh
#!/bin/bash
# Checks the 1-minute load average and mails an alert through mail.sh.
# Reads from the environment: send (1 = mailing enabled) and addr (label
# for the alert). Paths are relative, so start it from the bin/ directory.

#######################################
# Extract the integer part of the 1-minute load average.
# Inputs:  uptime output on stdin
# Outputs: the integer part (e.g. "0" for "0.42"); empty when the input
#          has no "average:" field
#######################################
load_int() {
  awk -F'average:' '{print $2}' | cut -d, -f1 | sed 's@ @@' | cut -d. -f1
}

#######################################
# Compare the current load with the threshold and send the alert.
# Globals:  send, addr (read)
# Outputs:  one status line on stdout
# Returns:  1 when the load cannot be determined
#######################################
main() {
  local load

  # LC_ALL=C keeps "." as the decimal separator whatever the system locale.
  load=$(LC_ALL=C /usr/bin/uptime | load_int)
  if [[ ! "$load" =~ ^[0-9]+$ ]]; then
    echo "load.sh: cannot determine the load average" >&2
    return 1
  fi

  # "less than 10" fires on an idle machine and is meant for testing only;
  # a real deployment would alert when the load is greater than a limit.
  if (( 10#$load < 10 )) && [[ "${send:-0}" -eq 1 ]]; then
    echo "${addr}-$(date +%T)-load-is-$load" > ../log/load.tmp
    # First argument is a placeholder for the recipient; put a real address
    # there. "${addr}_load" is used because "\_" inside double quotes would
    # leave a literal backslash in the subject.
    /bin/bash ../mail/mail.sh "发邮件给谁" "${addr}_load:$load" "$(cat ../log/load.tmp)"
  fi
  echo "$(date +%T) load is $load"
}

main "$@"
监控磁盘的脚本/usr/local/sbin/mon/shares/disk.sh
#!/bin/bash
# Checks every mounted filesystem and mails ONE alert through mail.sh when
# any of them is more than 90% full.
# Reads from the environment: send (1 = mailing enabled) and addr (label
# for the alert). Paths are relative, so start it from the bin/ directory.

#######################################
# Print the usage percentage of each filesystem, one number per line.
# Inputs:  `df -P` output on stdin
# Outputs: the fifth column without its "%" sign; the header and rows whose
#          fifth column is not a percentage are skipped
#######################################
disk_usages() {
  awk 'NR > 1 && $5 ~ /^[0-9]+%$/ { sub(/%$/, "", $5); print $5 }'
}

#######################################
# Collect the filesystems above the threshold and send a single alert.
# Globals:  send, addr (read)
# Outputs:  one status line on stdout
#######################################
main() {
  local tmp_file=../log/disk.tmp
  local usage
  local max_usage=0

  rm -f "$tmp_file"
  # df -P prints one line per filesystem and a fixed header, so the parsing
  # does not depend on the locale or on long device names wrapping.
  while IFS= read -r usage; do
    if (( usage > 90 )) && [[ "${send:-0}" -eq 1 ]]; then
      echo "$addr $(date +%T) disk useage is $usage" >> "$tmp_file"
      if (( usage > max_usage )); then
        max_usage=$usage
      fi
    fi
  done < <(df -P | disk_usages)

  # Decided once, after the loop: checking inside the loop would mail again
  # for every filesystem that follows the first one over the threshold.
  if [[ -f "$tmp_file" ]]; then
    df -h >> "$tmp_file"
    # Hands the alert to mail.sh; the second argument is the highest usage.
    /bin/bash ../mail/mail.sh "${addr}_disk" "$max_usage" "$tmp_file"
    echo "$(date +%T) disk useage is Alert!!!"
  else
    echo "$(date +%T) disk useage is ok"
  fi
}

main "$@"
监控网站出现502的脚本/usr/local/sbin/mon/shares/502.sh
#!/bin/bash
# Counts the "502" entries written to the access log during the previous
# minute and mails an alert through mail.sh when there are more than 10.
# Reads from the environment: log (access-log path), send (1 = mailing
# enabled) and addr (label for the alert).
# Paths are relative, so start it from the bin/ directory.

#######################################
# Count the log lines of one minute that contain "502".
# Arguments: $1 - minute as HH:MM, $2 - path to the access log
# Outputs:   the number of matching lines
# NOTE(review): "502" is matched anywhere on the line, not only in the
# status field - confirm this is acceptable for the log format in use.
#######################################
count_502() {
  grep -- ":$1:" "$2" | grep -c '502'
}

#######################################
# Compare the count of the previous minute with the limit and alert.
# Globals:  log, send, addr (read)
# Outputs:  one status line on stdout
# Returns:  1 when the access log is not set or not readable
#######################################
main() {
  local minute count

  # Without this guard an empty $log would make grep wait on stdin.
  if [[ -z "${log:-}" || ! -r "$log" ]]; then
    echo "502.sh: access log '${log:-}' is not set or not readable" >&2
    return 1
  fi

  minute=$(date -d '-1 min' +%H:%M)
  count=$(count_502 "$minute" "$log")
  if (( count > 10 )) && [[ "${send:-0}" -eq 1 ]]; then
    echo "$addr $minute 502 count is $count" > ../log/502.tmp
    /bin/bash ../mail/mail.sh "${addr}_502" "$count" ../log/502.tmp
  fi
  echo "$(date +%T) 502 $count"
}

main "$@"
- 发邮件的脚本/usr/local/sbin/mon/mail/mail.py
http://blog.51cto.com/13480443/2084118 /usr/lib/zabbix/alertscripts/mail.py //这个地址的这个文件
- 告警收敛脚本/usr/local/sbin/mon/mail/mail.sh
#!/bin/bash
# Alert convergence in front of mail.py.
#
# Usage: mail.sh KEY SUBJECT BODY
#   KEY     - first argument for mail.py; also names the state files
#             <state dir>/KEY (timestamps) and <state dir>/KEY.txt (counter)
#   SUBJECT - appended to "trouble continue 10 min " as the second argument
#   BODY    - third argument for mail.py, passed as ONE argument
#
# A mail is sent when the previous call for KEY was more than 3600 seconds
# ago; otherwise calls are counted and a mail is sent on every 11th call.
#
# Optional environment overrides (defaults keep the original behaviour):
#   MON_STATE_DIR - directory of the state files   (default /tmp)
#   MON_PYTHON    - interpreter used for mail.py   (default /usr/bin/python)
#   MON_MAIL_PY   - path of mail.py  (default /usr/local/sbin/mon/mail/mail.py)

#######################################
# Call mail.py with every argument quoted so that a subject or body that
# contains spaces is not split into several arguments.
# Arguments: $1 - key/recipient, $2 - subject suffix, $3 - body
#######################################
send_mail() {
  "${MON_PYTHON:-/usr/bin/python}" \
    "${MON_MAIL_PY:-/usr/local/sbin/mon/mail/mail.py}" \
    "$1" "trouble continue 10 min $2" "$3"
}

#######################################
# Apply the convergence rule for one alert and send the mail if it is due.
# Arguments: $1 - key, $2 - subject suffix, $3 - body
# Outputs:   seconds elapsed since the previous call for the key
# Returns:   2 when the key is missing
#######################################
main() {
  local key=${1:-}
  local state_dir=${MON_STATE_DIR:-/tmp}
  local stamp_file count_file now last elapsed count

  # An empty key would turn the state-file path into the directory itself.
  if [[ -z "$key" ]]; then
    echo "usage: mail.sh KEY SUBJECT BODY" >&2
    return 2
  fi
  stamp_file="$state_dir/$key"
  count_file="$state_dir/$key.txt"

  now=$(date +%s)
  # First alert for this key: record "2 hours ago" as the previous call so
  # that the mail goes out immediately.
  if [[ ! -f "$stamp_file" ]]; then
    echo "$(( now - 7200 ))" > "$stamp_file"
  fi
  last=$(tail -1 "$stamp_file" | awk '{print $1}')
  echo "$now" >> "$stamp_file"
  elapsed=$(( now - ${last:-0} ))
  echo "$elapsed"

  if (( elapsed > 3600 )); then
    send_mail "$key" "${2:-}" "${3:-}"
    echo "0" > "$count_file"
  else
    if [[ ! -f "$count_file" ]]; then
      echo "0" > "$count_file"
    fi
    count=$(( $(<"$count_file") + 1 ))
    echo "$count" > "$count_file"
    if (( count > 10 )); then
      send_mail "$key" "${2:-}" "${3:-}"
      echo "0" > "$count_file"
    fi
  fi
}

main "$@"
//异常间隔大于1小时,直接发邮件,异常小于1小时时则每隔10分钟发一次邮件
- 计划每分钟执行/usr/local/sbin/mon/bin/main.sh
# Run the monitor every minute; "&&" keeps main.sh from starting when the cd fails.
* * * * * cd /usr/local/sbin/mon/bin/ && /usr/bin/bash /usr/local/sbin/mon/bin/main.sh
注意:这里主程序main.sh中默认只调用了子程序load.sh;502.sh只有在mon.conf中设置to_mon_502=1时才会被调用,disk.sh则没有被调用。由于计划任务没有重定向输出,cron会每分钟把脚本的输出以邮件的形式发给执行该任务的用户(我这里测试用的是root用户)
总结:
- 主程序文件中要对主配置文件定义的内容进行过滤作为条件,再调用子程序的脚本
- 子程序中也要引用主配置文件中的内容作为自己的条件判断,再调用发邮件脚本
- 主配置文件相当于一个总的开头
- 发邮件的脚本mail.py,注意参数是以空格作为分隔的,所以含有空格的内容必须用双引号括起来作为一个整体传递。例如先执行 echo "${addr} $(date +%T)-load-is-$load" > ../log/load.tmp,再执行 /bin/bash ../mail/mail.sh "发邮件给谁" "${addr}_load:$load" "$(cat ../log/load.tmp)";如果传递过程中有一处没有加引号,邮件内容就只会识别空格前的${addr},不会识别后面的$(date +%T)-load-is-$load
- 这里用到了环境变量