服务器健康状态监控:监控内存、硬盘、CPU、进程等,并据此生成状态报告。
#!/bin/bash
#
# Server health monitor.
#
# Periodically checks memory, disk and CPU usage. Whenever one of them is
# above its quota a line describing the problem is added to a report; when the
# CPU is over quota the busiest processes are appended as well. A non-empty
# report is printed at the end of every cycle.
#
# Environment:
#   hd_device  block device checked by watch_hd (default: /dev/sda2)

# Unset variables are bugs here; -e is deliberately NOT enabled because the
# watch_* functions use non-zero return codes as ordinary results and a
# transient tool failure must not kill a long-running monitor.
set -u

# Maximum memory usage, in percent.
mem_quota=80
# Maximum disk usage, in percent.
hd_quota=80
# Maximum CPU usage, in percent.
cpu_quota=80
# Seconds between the two CPU samples taken by watch_cpu.
time_gap=60
# Seconds per monitoring cycle (a report every 10 minutes).
runtime_gap=600
# Block device whose usage is checked by watch_hd.
hd_device=${hd_device:-/dev/sda2}

#######################################
# Memory check.
# Globals:   mem_quota (read); mem_usage, mem_message (written)
# Arguments: none
# Returns:   0 if usage is within quota, 1 if it is above quota,
#            2 if /proc/meminfo could not be interpreted
#######################################
watch_memory() {
  local mem_total mem_free

  mem_total=$(awk '$1 == "MemTotal:" {print $2; exit}' /proc/meminfo)
  # Prefer MemAvailable: MemFree counts reclaimable page cache as "used" and
  # therefore raises false alarms. Fall back to MemFree where the kernel does
  # not export MemAvailable.
  mem_free=$(awk '$1 == "MemAvailable:" {print $2; exit}' /proc/meminfo)
  if [[ -z "$mem_free" ]]; then
    mem_free=$(awk '$1 == "MemFree:" {print $2; exit}' /proc/meminfo)
  fi

  if [[ ! "$mem_total" =~ ^[0-9]+$ || ! "$mem_free" =~ ^[0-9]+$ ]] \
    || (( mem_total == 0 )); then
    printf 'watch_memory: cannot parse /proc/meminfo\n' >&2
    return 2
  fi

  mem_usage=$(( 100 - mem_free * 100 / mem_total ))
  if (( mem_usage > mem_quota )); then
    mem_message="the memory usage is $mem_usage%!!!"
    return 1
  fi
  return 0
}

#######################################
# Disk check.
# Globals:   hd_quota, hd_device (read); hd_usage, hd_message (written)
# Arguments: $1 - device to check (optional, defaults to $hd_device)
# Returns:   0 if usage is within quota, 1 if it is above quota,
#            2 if the device is not listed by df
#######################################
watch_hd() {
  local device=${1:-$hd_device}

  # -P keeps every filesystem on a single line; the device is matched
  # exactly so that e.g. /dev/sda2 does not also select /dev/sda20.
  hd_usage=$(df -P | awk -v dev="$device" \
    '$1 == dev { sub(/%/, "", $5); print $5; exit }')

  if [[ ! "$hd_usage" =~ ^[0-9]+$ ]]; then
    printf 'watch_hd: cannot determine usage of %s\n' "$device" >&2
    return 2
  fi

  if (( hd_usage > hd_quota )); then
    hd_message="the hard disk usage is $hd_usage%!!!"
    return 1
  fi
  return 0
}

#######################################
# Reads the per-CPU lines of /proc/stat and sums the counters accumulated
# since boot.
# Arguments: $1 - stat file to read (optional, defaults to /proc/stat)
# Outputs:   "<used> <unused>" on stdout, where used is the sum of fields
#            2-4 and unused the sum of fields 5-8 of every cpuN line
#######################################
get_cpu_info() {
  local stat_file=${1:-/proc/stat}

  # %.0f instead of a plain print: the sums can become large and some awk
  # implementations would otherwise render them in exponent notation.
  awk '/^cpu[0-9]+/ { used += $2 + $3 + $4; unused += $5 + $6 + $7 + $8 }
       END { printf "%.0f %.0f\n", used, unused }' "$stat_file"
}

#######################################
# CPU check: samples the counters twice, time_gap seconds apart, and derives
# the usage percentage over that interval.
# Globals:   time_gap, cpu_quota (read); cpu_usage, cpu_message (written)
# Arguments: none
# Outputs:   the integer usage percentage on stdout
# Returns:   0 if usage is within quota, 1 if it is above quota,
#            2 if no usable percentage could be computed
#######################################
watch_cpu() {
  local sample_1 sample_2

  sample_1=$(get_cpu_info)
  sleep "$time_gap"
  sample_2=$(get_cpu_info)

  # Fields: $1/$2 = used/unused at the first sample, $3/$4 at the second.
  # The result is truncated to an integer so the shell can compare it, and a
  # zero-length interval yields 0 instead of a division by zero.
  cpu_usage=$(printf '%s %s\n' "$sample_1" "$sample_2" | awk '{
    used = $3 - $1
    total = ($3 + $4) - ($1 + $2)
    if (total <= 0) { print 0 } else { printf "%d\n", used * 100 / total }
  }')

  if [[ ! "$cpu_usage" =~ ^[0-9]+$ ]]; then
    printf 'watch_cpu: cannot compute cpu usage\n' >&2
    return 2
  fi

  echo "$cpu_usage"
  if (( cpu_usage > cpu_quota )); then
    cpu_message="the cpu usage is over $cpu_usage!!!"
    return 1
  fi
  return 0
}

#######################################
# Collects the processes that currently use the most CPU.
# Globals:   proc_busiest (written) - first 10 lines of `ps aux` sorted by
#            the %CPU column, highest first
# Arguments: none
#######################################
proc_cpu_top10() {
  proc_busiest=$(ps aux | sort -k3,3nr | head -n 10)
}

#######################################
# Monitoring loop: runs all checks, prints a report when anything is over
# quota, then sleeps for the remainder of the cycle. Never returns.
# Globals:   runtime_gap, time_gap (read); the *_message variables and
#            proc_busiest as written by the checks
#######################################
main() {
  local report rc pause

  # watch_cpu already consumed time_gap seconds of the cycle.
  pause=$(( runtime_gap - time_gap ))
  if (( pause < 0 )); then
    pause=0
  fi

  while true; do
    report=""

    # The checks are called directly (not via command substitution) so that
    # the message variables they set are visible here; only status 1 means
    # "over quota", status 2 has already been reported on stderr.
    rc=0
    watch_memory || rc=$?
    if (( rc == 1 )); then
      report+=$'\n'"$mem_message"
    fi

    rc=0
    watch_hd || rc=$?
    if (( rc == 1 )); then
      report+=$'\n'"$hd_message"
    fi

    rc=0
    watch_cpu >/dev/null || rc=$?
    if (( rc == 1 )); then
      report+=$'\n'"$cpu_message"
      proc_cpu_top10
      report+=$'\n'"$proc_busiest"
    fi

    if [[ -n "$report" ]]; then
      # Placeholder for the real delivery command: it only prints what would
      # be sent.
      printf 'sendmessage phonenumber %s\n' "$report"
    fi

    sleep "$pause"
  done
}

# Start monitoring only when executed, so the functions can also be sourced.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi