公司的服务器越来越多了,原来是用一个脚本去检测的,现在改用 Nagios。
Ubuntu 客户端(NRPE)安装脚本:
#!/bin/bash
#
# Installs the Nagios NRPE agent (nrpe 2.15) and nagios-plugins 2.0.1 from
# source on an Ubuntu client, writes /usr/local/nagios/etc/nrpe.cfg, registers
# the daemon in /etc/rc.local and starts it. Must be run as root.

# Abort on the first failing step instead of building/installing from a
# half-prepared tree.
set -euo pipefail

# Address of the Nagios server that is allowed to query this client.
nagios_ser="192.168.1.3"

nrpe_cfg=/usr/local/nagios/etc/nrpe.cfg
nrpe_cmd=(/usr/local/nagios/bin/nrpe -c "${nrpe_cfg}" -d)
rc_local=/etc/rc.local

# Private, unpredictable build directory; removed on every exit path.
tmp_dir=$(mktemp -d /tmp/nagios.XXXXXX)
cleanup() {
  cd /
  rm -rf -- "${tmp_dir:?}"
}
trap cleanup EXIT

# Create the service account only when it is missing so the script can be
# re-run on the same host.
getent group nagios >/dev/null || groupadd nagios
id -u nagios >/dev/null 2>&1 || useradd -g nagios -s /sbin/nologin nagios

cd "${tmp_dir}"
wget http://downloads.sourceforge.net/project/nagios/nrpe-2.x/nrpe-2.15/nrpe-2.15.tar.gz
wget http://nagios-plugins.org/download/nagios-plugins-2.0.1.tar.gz

#---- install
apt-get -y --force-yes install openssl ruby1.9.1 build-essential
apt-get -y --force-yes install libssl-dev lm-sensors

tar xvf nagios-plugins-2.0.1.tar.gz
(
  cd nagios-plugins-2.0.1
  ./configure --with-nagios-user=nagios --with-nagios-group=nagios
  make
  make install
)

tar xvf nrpe-2.15.tar.gz
(
  cd nrpe-2.15
  # The multiarch library path is hard-coded for x86_64 hosts.
  ./configure --with-ssl-lib=/usr/lib/x86_64-linux-gnu
  make all
  make install-plugin
  make install-daemon
  make install-daemon-config
)

#mv ./check_* /usr/local/nagios/libexec
#chmod 755 -R /usr/local/nagios/libexec
chown -R nagios:nagios /usr/local/nagios/

# The here-document delimiter is deliberately unquoted: $nagios_ser must be
# expanded into allowed_hosts.
cat >"${nrpe_cfg}" <<EOF
log_facility=daemon
pid_file=/var/run/nrpe.pid
server_port=5666
nrpe_user=nagios
nrpe_group=nagios
allowed_hosts=127.0.0.1,$nagios_ser
dont_blame_nrpe=0
allow_bash_command_substitution=0
debug=0
command_timeout=60
connection_timeout=300
command[check_users]=/usr/local/nagios/libexec/check_users -w 5 -c 10
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_zombie_procs]=/usr/local/nagios/libexec/check_procs -w 5 -c 10 -s Z
command[check_total_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_procs]=/usr/local/nagios/libexec/check_procs -w 150 -c 200
command[check_alldisk]=/usr/local/nagios/libexec/check_alldisk -w 90 -c 95
command[check_http]=/usr/local/nagios/libexec/check_http -H 127.0.0.1 -w 5 -c 10
command[check_ping]=/usr/local/nagios/libexec/check_ping -H 127.0.0.1 -w 3000.0,80% -c 5000.0,100% -p 5
command[check_ssh]=/usr/local/nagios/libexec/check_ssh -4 127.0.0.1
command[check_swap]=/usr/local/nagios/libexec/check_swap -w 30% -c 10%
command[check_sensors]=/usr/local/nagios/libexec/check_sensors
command[check_mdadm]=/usr/local/nagios/libexec/check_mdadm
command[check_smart]=/usr/local/nagios/libexec/check_smart
command[check_drbd]=/usr/local/nagios/libexec/check_drbd
EOF

# Register the daemon for start at boot. Ubuntu's stock rc.local ends with
# "exit 0", so a plain append would place the command where it is never
# reached; insert it before the first top-level "exit 0" instead, and skip the
# step entirely when the command is already registered.
if [[ ! -f "${rc_local}" ]]; then
  printf '%s\n' "${nrpe_cmd[*]}" >>"${rc_local}"
elif ! grep -qxF -- "${nrpe_cmd[*]}" "${rc_local}"; then
  rc_new="${tmp_dir}/rc.local.new"
  awk -v cmd="${nrpe_cmd[*]}" '
    !done && /^exit 0$/ { print cmd; done = 1 }
    { print }
    END { if (!done) print cmd }
  ' "${rc_local}" >"${rc_new}"
  # Overwrite in place (rather than mv) to keep rc.local's mode and owner.
  cat "${rc_new}" >"${rc_local}"
fi

"${nrpe_cmd[@]}"
下面是自己折腾的几个 Ruby 检测脚本:
1:check_smart 磁盘状态检测
#!/usr/bin/env ruby #0 ok; 1 warning; 2 critical; 3 unknown #echo "nagios ALL=NOPASSWD:/usr/sbin/smartctl" >>/etc/sudoers #CentOS sed -i "s:Defaults requiretty:Defaults:nagios !requiretty:" /etc/sudoers #调用 check_nrpe!check_smart health = "" `ls -1 /dev/sd[a-z]* | grep [a-z]$`.split.each do |hdd| status = `sudo /usr/sbin/smartctl -H #{hdd} | grep result | awk -F: '{print $2}'` if status.match(/PASSED/) health = health + hdd + " OK\n" else health = health + hdd + " Fail\n" end end if health.include? "Fail" puts health exit 2 end puts health exit 0
2:check_mdadm 软阵列检测
#!/usr/bin/env ruby
# check_mdadm - Nagios/NRPE plugin that checks Linux software RAID (md)
# health from /proc/mdstat.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown

# Returns true when the given /proc/mdstat text shows no problem.
#
# An array is considered unhealthy when it is listed as "inactive" or when
# its member map (the "[UU_]" field, one character per member) contains "_".
# Reading the member map instead of counting "U" characters makes the check
# independent of how many members each array has. Text without any member
# map (for example empty input) is treated as healthy.
def mdstat_healthy?(mdstat)
  return false if mdstat =~ /^md\S*\s*:\s*inactive\b/

  member_maps = mdstat.scan(/\[([U_]+)\]/).flatten
  member_maps.none? { |map| map.include?('_') }
end

if __FILE__ == $PROGRAM_NAME
  if mdstat_healthy?(`cat /proc/mdstat`)
    puts "Soft Raid OK"
    exit 0
  else
    puts "Soft Raid Fail"
    exit 2
  end
end
3:check_drbd DRBD检测
#!/usr/bin/ruby
# check_drbd - Nagios/NRPE plugin for DRBD.
#
# Exit codes: 0 ok; 1 warning; 2 critical; 3 unknown
#
# The check passes when /proc/drbd contains exactly two "UpToDate" strings
# for every drbd block device listed in /dev (presumably the local and the
# peer disk state of each resource - confirm against your DRBD version).

uptodate_total = `cat /proc/drbd`.scan("UpToDate").count
drbd_devices   = `ls -la /dev/ | grep ^b | grep drbd | wc -l`.to_i

unless uptodate_total == drbd_devices * 2
  puts "DRBD Critical"
  exit 2
end

puts "DRBD OK"
exit 0
4:check_alldisk 检测磁盘空间
#!/usr/bin/env ruby #ARGV[1] min ,ARGV[3] max # -w 90 -c 95 #0 ok; 1 warning; 2 critical; 3 unknown space = '' status = `df -hl -x tmpfs -x devtmpfs | grep -v ^Filesystem`.split if status.size < 6 #unkown puts "UNKOWN" exit 3 end (status.size / 6).times do |x| current_use, min_use, max_use = status[4 + x * 6][0..-2].to_i, ARGV[1].to_i, ARGV[3].to_i if current_use > max_use #critical space = space + status[x * 6] + " " + status[4 + x * 6] + " " + status[5 + x * 6] +" Critical\n" elsif current_use > min_use and current_use <= max_use #warning space = space + status[x * 6] + " " + status[4 + x * 6] + " " + status[5 + x * 6] + " Warning\n" elsif current_use <= min_use #ok space = space + status[x * 6] + " " + status[4 + x * 6] + " " + status[5 + x * 6] + " OK\n" end end if space.include?("Crtitical") puts space exit 2 elsif space.include?("Warning") puts space exit 1 else puts space exit 0 end