场景:
在基于 CentOS 6.3 的二次封装系统上,安装 Hortonworks 的 Hadoop 发行版,并按照其官方文档配置高可用;但由于没有 fence 设备支持,在断网和断电测试时,备用 HA 节点无法得到资源释放的通知。
因此,笔者简单写了一个脚本,让HA的备节点在主节点断网和断电时能够通过简单测试得知并获取资源,以实现真正意义的HA。
思考:
如何判断网络的连通性?
如何判断VIP的可用性?
如何判断自身的故障?
如何尽可能防止脑裂?
实现:
1、当本节点网卡上的 VIP 与物理 IP 均已丢失(即本机网络已失效)时,使本节点重启
2、当检测自身网络正常,且VIP不在线,且日志指出对方HA节点故障时获取资源
3、ganglia的VIP连接修复(不在本文主要思考范围)
代码如下:
#!/bin/bash
#
# HA watchdog for a Hortonworks/HDP cluster built on RHCS (cman/rgmanager)
# without real fence hardware.  Responsibilities:
#   1. self-fence: reboot this node when the monitored NIC carries neither
#      the VIP nor its own configured physical IP,
#   2. takeover: when the VIP is unreachable but the surrounding network is
#      fine and the cluster logs show the peer node down, manually ack the
#      fence so rgmanager migrates the resources here,
#   3. keep ganglia's gmond bound to the VIP.
# Runs as an endless loop; a single instance is enforced.

# ---- single-instance guard --------------------------------------------------
STD=$$
Cron_if=$(ps aux | grep "$0" | grep -vE "grep|$STD" | /usr/bin/wc -l)
[[ "$Cron_if" -ge 2 ]] && exit 2

# Give the cluster stack time to settle after boot / cron start.
sleep 30

# ---- defaults (overridable via /etc/sysconfig/hdp.conf) ---------------------
VIP=${VIP:-192.168.1.198}   # service VIP watched for takeover
RE_PRD=${RE_PRD:-10}        # main-loop polling period, seconds
SAFE_TIME=${SAFE_TIME:-60}  # retry delay while cluster services recover
NMK=${NMK:-8}               # netmask prefix; kept for config compatibility (unused below)
NUL=/dev/null
# BUGFIX: the original wrote to $Mlog before sourcing the config that defines
# it, producing an "ambiguous redirect".  Provide a sane default first; the
# sourced config may still override it.
Mlog=${Mlog:-/var/log/hdp-ha-watch.log}
Date="/bin/date +%k:%M:%S/%Y-%m-%d"

# ---- sanity checks ----------------------------------------------------------
if [ ! -f /etc/sysconfig/hdp.conf ]; then
    echo "$($Date) Error: No such config file." >> "$Mlog"   # BUGFIX: "fil." typo
    exit 1
fi
# Source early so Mlog / Ne / VIP overrides take effect for everything below.
. /etc/sysconfig/hdp.conf
if [ ! -f /etc/init.d/cman ] || [ ! -f /etc/init.d/rgmanager ]; then
    echo "$($Date) War: $(uname -n) Invalid HA node." >> "$Mlog"
    exit 2
fi

# ---- wait for the core cluster services -------------------------------------
# cman + rgmanager must be running before we dare act; ricci/modclusterd are
# restarted best-effort.  BUGFIX: the original used "2> $Mlog", truncating the
# log on every status probe; append instead.
while :; do
    RQE1=$(/etc/init.d/rgmanager status 2>> "$Mlog" | grep "is running.")
    RQE2=$(/etc/init.d/cman status 2>> "$Mlog" | grep "is running.")
    RQE3=$(/etc/init.d/ricci status 2>> "$Mlog" | grep "is running.")
    RQE4=$(/etc/init.d/modclusterd status 2>> "$Mlog" | grep "is running.")
    [ -z "$RQE2" ] && /etc/init.d/cman start &> $NUL
    [ -z "$RQE1" ] && /etc/init.d/rgmanager start &> $NUL
    [ -z "$RQE3" ] && /etc/init.d/ricci start &> $NUL
    [ -z "$RQE4" ] && /etc/init.d/modclusterd start &> $NUL
    if [[ -n "$RQE1" && -n "$RQE2" ]]; then
        break
    fi
    sleep "$SAFE_TIME"
done

# ---- identify local vs. remote cluster node ---------------------------------
# Node names come from cluster.conf; the one whose /etc/hosts IP appears in
# `ip a` output is us (N_NAME), the other is the peer (R_NAME).
NODE=($(grep clusternode /etc/cluster/cluster.conf | grep nodeid | awk -F\" '{print $2}'))
for i in "${NODE[@]}"; do
    NODE_IP=$(grep "$i" /etc/hosts | awk '{print $1}')
    if /sbin/ip a | grep -q "$NODE_IP"; then
        N_NAME=$i
    else
        R_NAME=$i
    fi
done

# Self-fence: if the monitored NIC ($Ne — assumed to be set by hdp.conf to the
# cluster interface name; TODO confirm) carries neither the VIP nor the
# physical IP from its ifcfg file, this node is network-dead: kill the cluster
# daemons so the peer can claim the resources, then reboot.
Node_Reboot() {
    W_VIP=$(/sbin/ip a show "$Ne" | grep "$VIP")
    Nic_File=/etc/sysconfig/network-scripts/ifcfg-$Ne
    PHY_IP_FILE=$(grep IPADDR "$Nic_File" | awk -F\= '{print $2}')
    IP_FILE_IF=$(/sbin/ifconfig "$Ne" | grep "$PHY_IP_FILE")
    if [[ -z "$W_VIP" && -z "$IP_FILE_IF" ]]; then
        # kill -9 is deliberate: the stack is presumed wedged at this point.
        KILL_PID=($(ps aux | grep -E "rgmanager|fenced|dlm_controld|gfs_controld|corosync" \
            | grep -v grep | awk '{print $2}'))
        for p in "${KILL_PID[@]}"; do
            kill -9 "$p" &> $NUL
        done
        /etc/init.d/rgmanager stop &> $NUL
        /etc/init.d/cman stop &> $NUL
        /sbin/reboot &> $NUL
    fi
}

# Takeover: the VIP is unreachable, yet the default gateway / peer / NFS
# server still answer — so our own network is fine.  Only when the last lines
# of rgmanager.log AND fenced.log both report the peer down/failed do we
# manually acknowledge the fence, letting rgmanager migrate the resources.
FREE_RESOURCE() {
    # BUGFIX: several routes can match $Ne; a multi-line value broke ping.
    DFGW=$(/sbin/route -n | grep "$Ne" | awk '{print $2}' | grep -v "0.0.0.0" | head -1)
    NFS_IP=$(grep netfs /etc/cluster/cluster.conf | awk -F\" '{print $8}' | grep -v "^$")
    P_CMD="/bin/ping -c 3 -W 1"
    if ! $P_CMD "$VIP" &> $NUL; then
        if $P_CMD "$DFGW" &> $NUL || $P_CMD "$R_NAME" &> $NUL || $P_CMD "$NFS_IP" &> $NUL; then
            # Re-test the VIP to narrow the split-brain window.
            if ! $P_CMD "$VIP" &> $NUL; then
                DOWN_LOG=$(/usr/bin/tail -1 /var/log/cluster/rgmanager.log | grep "$R_NAME DOWN")
                FENCE_LOG=$(tail -1 /var/log/cluster/fenced.log | grep "$R_NAME failed")
                if [[ -n "$DOWN_LOG" && -n "$FENCE_LOG" ]]; then
                    echo absolutely | /usr/sbin/fence_ack_manual "$R_NAME"
                fi
            fi
        else
            echo "$($Date) Gateway:$DFGW and HA_node:$R_NAME and Nfs:$NFS_IP Offline!!" >> "$Mlog"
        fi
    fi
}

# Keep ganglia's gmond bound to the VIP; 4 VIP sockets in netstat means healthy.
GGA() {
    RE=$(netstat -anup | grep gmond | grep -c "$VIP")
    [ "$RE" -eq 4 ] && return 0
    MGF=/etc/ganglia/gmond.conf
    # NOTE(review): the check greps udp_send_channel but the fix inserts under
    # udp_accept_channel — kept exactly as the original; verify which channel
    # is intended against the gmond.conf layout.
    RE=$(grep -A 2 -E "udp_send_channel|tcp_accept_channel" $MGF | grep -c "$VIP")
    if [ "$RE" -ne 2 ]; then
        sed -i "/^udp_accept_channel/a\ \ bind = $VIP" $MGF
        sed -i "/^tcp_accept_channel/a\ \ bind = $VIP" $MGF
    fi
    GFL=($(find /etc/ganglia/hdp -name "gmond.slave.conf"))
    for g in "${GFL[@]}"; do
        grep "bind = $VIP" "$g" &> $NUL && continue
        sed -i "/\<host\>/i\ \ bind = $VIP" "$g"
    done
    /etc/init.d/gmond restart &> $NUL
    /etc/init.d/hdp-gmond restart &> $NUL
}

# ---- main loop --------------------------------------------------------------
while :; do
    GGA
    FREE_RESOURCE
    Node_Reboot
    sleep "$RE_PRD"
done