1.背景

  最近公司有个比较差的游戏项目,简直快突破运维底线,环境条件组合极多。为了快速完成更新脚本,采用了shell来实现;由于长时间没有写过代码,因为一个概念性问题闹出了一个大大的乌龙。

 

2.环境以及排错过程

  这里还是先画个图吧。

  

关于shell脚本返回值,函数的一个乌龙_redis

  以下代码是问题修改完成后的代码,这里还是贴出来,有问题的代码会在排错思路中标识出来。

1 管理端tw-hxh
2 [root@tw-hxh-ope infra]# cat /data/infra/scripts/cbt1/login/tw_hxh
3 #!/bin/bash
4
5 Action=$1
6 Package=$2
7 PackageMd5=$3
8
9 Host="cbt-login"
10 ProgPrex="server_global"
11 GameRoot=/data/s8/stress_test/
12
13 WorkDirectory=/data/infra/package/
14 WorkDirectoryBackup=/data/infra/package/backup/
15
#######################################
# Ensure the update package is available locally and verify its MD5 checksum.
# Globals:
#   WorkDirectory        (read)    directory holding the package (trailing slash)
#   WorkDirectoryBackup  (read)    backup directory, created when missing
#   Package              (read)    package file name
#   PackageMd5           (read)    expected MD5 checksum
#   DownPackMd5          (written) checksum computed from the local file
# Outputs:
#   Progress messages on stdout.
# Exits:
#   1 when the package is missing locally and the download fails,
#   2 when the expected checksum is empty or does not match.
#######################################
function PackageHandle {
  # Removing the old sot directory and unpacking the archive are disabled in
  # this function; it only fetches and verifies the package.

  # Check whether the package already exists locally
  echo -e "\033[31m 判断包是否存在 \033[0m"
  [ -d "${WorkDirectoryBackup}" ] || mkdir -p "${WorkDirectoryBackup}"
  if [ ! -f "${WorkDirectory}${Package}" ]; then
    if ! coscmd download "${Package}" "${WorkDirectory}"; then
      echo -e "\033[31m 云上包不存在,更新程序退出 \033[0m"
      exit 1
    fi
  else
    echo -e "\033[31m 文件已存在本地,继续更新流程 \033[0m"
  fi

  # Verify the package checksum
  echo -e "\033[31m 判断包的Md5是否正常 \033[0m"
  DownPackMd5=$(md5sum "${WorkDirectory}${Package}" | awk '{print $1}')
  # An empty expected checksum must count as a failure: with unquoted operands
  # the test itself errors out and control would fall into the success branch.
  if [ -z "${PackageMd5}" ] || [ "${DownPackMd5}" != "${PackageMd5}" ]; then
    echo -e "\033[31m 包md5校验失败,更新程序退出 \033[0m"
    exit 2
  fi
  echo -e "\033[32m 包校验成功 \033[0m"
}
47
48 #备份时长10分钟,逻辑取消
49 #function BackupDir {
50 # DATE1="`date +'%Y%m%d_%H%M'`"
51 # tar zcf /data/infra/backup/game.tar.gz server_global --exclude core* --exclude config.json --exclude log
52 # ansible ${BackupHost} -m shell -a "mkdir -p /data/infra/backup/; cd ${GameRoot} && zip -rq -x "core*" -x "config.json" -x "log/*" /data/infra/backup/game.zip_${DATE1} ${BackupDir}" -o
53 # ansible ${BackupHost} -m synchronize -a "src=/data/infra/backup/game.zip_${DATE1} dest=/data/infra/backup/server_global mode=pull" -o
54 #}
55
#######################################
# Move the processed package into the local backup directory and into the
# backup folder of the COS bucket.
# Globals:
#   WorkDirectory, WorkDirectoryBackup, Package (all read)
# Outputs:
#   Progress messages plus the listing of the backed-up file and bucket object.
# Returns:
#   0 on success, 1 when either the local move or the bucket move fails.
#######################################
function BackupPackage {
  echo -e "\033[31m 备份包至文件夹以及存储桶 \033[0m"
  # mv has no -r option; "mv -rf" is rejected and nothing gets moved.
  if ! mv -f "${WorkDirectory}${Package}" "${WorkDirectoryBackup}"; then
    echo -e "\033[31m 备份至本地文件夹失败 \033[0m"
    return 1
  fi

  # Move the package into the backup folder of the bucket
  if ! coscmd -b tw-hxh-package-1301763669 -r ap-taipei move \
    "tw-hxh-package-1301763669.cos.ap-taipei.myqcloud.com/${Package}" package/; then
    echo -e "\033[31m 备份至存储桶失败 \033[0m"
    return 1
  fi

  echo -e "\033[32m 备份成功 \033[0m"
  ls "${WorkDirectoryBackup}${Package}"
  coscmd list "package/${Package}"
}
66
#######################################
# Report the exit status of the command that ran immediately before the call.
# Must be invoked directly after the command being checked, because it reads $?.
# NOTE(review): success/failure are not defined or sourced in this script;
# presumably they come from /etc/init.d/functions — confirm before use.
# Globals:
#   base (read) label used in the message
#######################################
CheckStat(){
  local rc=$?  # must stay the first statement so $? is still the caller's
  # if/else instead of "a && b || c": the latter also runs failure whenever
  # success itself returns non-zero.
  if [ "${rc}" -eq 0 ]; then
    success $"$base startup"
  else
    failure $"$base startup"
  fi
}
70
#######################################
# Compare the version encoded in the package name with the version files on
# the game hosts.
# The package name is expected to look like <name>_<binver>_<cfgver>.t...
# Globals:
#   Action, Host, GameRoot, ProgPrex, Package (read)
#   RegionNum, PackageBinVerNum, PackageCfgVerNum,
#   GameBinVerNum, GameCfgVerNum (written)
# Outputs:
#   Result messages; on mismatch the remote lines that do not carry the
#   expected version.
# Exits:
#   3 on a hot-update (cfg) mismatch, 4 on a cold-update (bin) mismatch.
#   For any other Action the version type and number are read interactively
#   and the number of matching lines is printed.
#######################################
function UpdateCheck {
  local node remote_games g pkg_name ver vernum

  # Count the game instances over every host whose hosts entry matches Host
  RegionNum=0
  for node in $(grep -E "${Host}" /etc/hosts | awk '{print $2}'); do
    remote_games=$(ssh "${node}" "cd ${GameRoot};ls -D |grep ${ProgPrex}")
    for g in ${remote_games}; do
      RegionNum=$((RegionNum + 1))
    done
  done

  # Parse the versions from the file name itself rather than from ls output
  pkg_name=${Package##*/}
  PackageBinVerNum=$(printf '%s\n' "${pkg_name}" | awk -F_ '{print $2}')
  PackageCfgVerNum=$(printf '%s\n' "${pkg_name}" | awk -F_ '{print $3}' | awk -F.t '{print $1}')

  if [ "${Action}" == "hupdate" ]; then
    # -F: a version such as 1.0 must match literally, not as a regex
    GameCfgVerNum=$(ansible "${Host}" -m shell -a "cat ${GameRoot}${ProgPrex}*/cfg_ver" \
      | grep -cF -- "${PackageCfgVerNum}")
    if [ -n "${PackageCfgVerNum}" ] && [ "${GameCfgVerNum}" == "${RegionNum}" ]; then
      echo -e "\033[32m 热更版本号匹配成功,进入md5校验 \033[0m"
    else
      echo -e "\033[31m 热更版本号不匹配,以下主机版本号不匹配当前更新版本,程序退出 \033[0m"
      ansible "${Host}" -m shell -a "cat ${GameRoot}${ProgPrex}*/cfg_ver" \
        | grep -vF -- "${PackageCfgVerNum}"
      # Exit unconditionally: chaining with && skipped the exit whenever grep
      # printed nothing.
      exit 3
    fi
  elif [ "${Action}" == "cupdate" ]; then
    GameBinVerNum=$(ansible "${Host}" -m shell -a "cat ${GameRoot}${ProgPrex}*/bin_ver" \
      | grep -cF -- "${PackageBinVerNum}")
    if [ -n "${PackageBinVerNum}" ] && [ "${GameBinVerNum}" == "${RegionNum}" ]; then
      echo -e "\033[32m 停服更新版本号匹配成功,进入md5校验 \033[0m"
    else
      echo -e "\033[31m 停服更新版本号不匹配,以下主机版本号不匹配当前更新版本,程序退出 \033[0m"
      ansible "${Host}" -m shell -a "cat ${GameRoot}${ProgPrex}*/bin_ver" \
        | grep -vF -- "${PackageBinVerNum}"
      exit 4
    fi
  else
    read -r -p "输入需要验证的版号类别" ver
    read -r -p "输入需要验证的版本号" vernum
    if [ "${ver}" == 'bin' ]; then
      ansible "${Host}" -m shell -a "cat ${GameRoot}${ProgPrex}*/bin_ver" | grep -F -- "${vernum}" | wc -l
    elif [ "${ver}" == 'cfg' ]; then
      ansible "${Host}" -m shell -a "cat ${GameRoot}${ProgPrex}*/cfg_ver" | grep -F -- "${vernum}" | wc -l
    fi
  fi
}
109
#######################################
# Run an action of the game-side tw_hxh script on every host of the group.
# The branch is selected by the global Action, while $1/$2 are what is passed
# to the remote script.
# Globals:
#   Action, Host (read)
# Arguments:
#   $1 - action forwarded to the remote script
#   $2 - optional extra argument (e.g. the package name)
# Outputs:
#   For status: a summary line, or the failing lines.
# Exits:
#   5 when any remote status line reports Fail.
# Returns:
#   For the pass-through actions, the exit status of ansible.
#######################################
function ActionHandle {
  local report hostnum hoststatus

  case "${Action}" in
    status)
      # Query the hosts once and analyse that single snapshot; repeated runs
      # could disagree with each other.
      report=$(ansible "${Host}" -m shell -a "sh /data/infra/scripts/tw_hxh $1")
      if ! printf '%s\n' "${report}" | grep -Eq "Fail"; then
        hostnum=$(printf '%s\n' "${report}" | grep -E "Running|Stopping" \
          | awk '{print $7}' | uniq -c | awk '{print $1}')
        hoststatus=$(printf '%s\n' "${report}" | grep -E "Running|Stopping" \
          | awk '{print $7}' | uniq -c | awk '{print $2}')
        echo -e "\033[32m 所有服务器状态正常 $hoststatus 的主机有 $hostnum 台 \033[0m "
      else
        echo -e "\033[31m 以下服务器状态异常 \033[0m"
        printf '%s\n' "${report}" | grep -E "Fail"
        exit 5
      fi
      ;;
    # restart is listed too: the dispatcher calls "ActionHandle stop/start"
    # while Action is still "restart", which previously matched no branch.
    start | stop | restart | hupdate | cupdate | wupdate)
      ansible "${Host}" -m shell -a "sh /data/infra/scripts/tw_hxh $1 $2" -f 50
      ;;
  esac
}
125
126
# Entry point: dispatch on the action given as the first script argument.
# Unknown actions print the usage text and exit non-zero so that callers
# relying on the exit status do not treat a typo as success.
case "${Action}" in
  start)
    echo -e "\033[32m**********************************************************************\033[0m start all server \033[32m**********************************************************************\033[0m"
    ActionHandle start
    ;;
  stop)
    echo -e "\033[32m**********************************************************************\033[0m stop all server \033[32m**********************************************************************\033[0m"
    ActionHandle stop
    ;;
  status)
    ActionHandle status
    ;;
  restart)
    ActionHandle stop
    sleep 5
    ActionHandle start
    ;;
  check)
    UpdateCheck
    ;;
  hupdate)
    echo -e "\033[32m**********************************************************************\033[0m hupdate server \033[32m**********************************************************************\033[0m"
    PackageHandle
    # Stop here when the code sync fails; otherwise the status of the later
    # UpdateCheck would replace the failure.
    ActionHandle rsynccode "${Package}" || exit $?
    UpdateCheck
    ;;
  cupdate)
    echo -e "\033[32m**********************************************************************\033[0m cupdte server \033[32m**********************************************************************\033[0m"
    PackageHandle
    ActionHandle rsynccode "${Package}" || exit $?
    UpdateCheck
    ;;
  wupdate)
    echo -e "\033[32m**********************************************************************\033[0m web tools update \033[32m**********************************************************************\033[0m"
    ActionHandle wupdate
    ;;
  *)
    echo "USAGE:check|start|stop|restart|status|wupdate|hupdate:热更代码(不停服)|cupdate:停服下更新代码."
    exit 1
    ;;
esac

管理端tw_hxh

 


1 #游戏服执行端tw-hxh
2 [root@tw-hxh-cbt-game01 hunter_server1]# cat /data/infra/scripts/tw_hxh
3 #!/bin/bash
4 # version 2.0 by san at 2022-06-21
5
6
7 # 公共库函数
8 . /etc/init.d/functions
9 # 本机游戏根目录
10 GameRoot=/data/s8/stress_test/
11
12 Package=$2
13
14
15 # 本机游戏前缀(一机多开)
16 ProgPrex="hunter_server"
17 #游戏的进程
18 Process="http_server|manager_server|log_server|data_server|scene_server|game_server|gate_server|center_server"
19 #Process="cross_server"
20 #正常游戏启动后的进程数量
21 ProcessNum=18
22 Games=$(ls -D $GameRoot |grep ${ProgPrex}|egrep -v "disable")
23
#######################################
# Report, for every game directory, whether its server processes are running.
# A process belongs to a game when its working directory lies inside that
# game's directory under GameRoot.
# Globals:
#   GameRoot, Games, Process, ProcessNum (read)
#   Execs (written) PIDs of the processes matching Process
# Outputs:
#   One status line per game: Running (count equals ProcessNum),
#   Stopping (count is 0) or Fail (anything else).
# Exits:
#   10 at the first game whose process count is neither 0 nor ProcessNum.
#######################################
function CheckManagerProcess {
  local game pid cwd rel root num

  # /proc/<pid>/cwd is a fully resolved path, so compare against the resolved
  # game root as well.
  root=$(readlink -f -- "${GameRoot}" 2>/dev/null) || root=${GameRoot}
  root=${root%/}

  Execs=$(ps -ef | grep -Ev grep | grep -E "${Process}" | awk '{print $2}')
  for game in $Games; do
    num=0
    for pid in $Execs; do
      # Read the link directly; cutting "ls -l" output on "-" breaks for any
      # path that contains a dash and assumes a fixed directory depth.
      cwd=$(readlink "/proc/${pid}/cwd" 2>/dev/null) || continue
      rel=${cwd#"${root}/"}
      [ "${rel}" != "${cwd}" ] || continue # cwd is not below the game root
      if [ "${rel%%/*}" == "${game}" ]; then
        num=$((num + 1))
      fi
    done
    if [ "$num" == "$ProcessNum" ]; then
      echo -e "\033[32m $game \033[0m Service is \033[32m Running \033[0m"
    elif [ "$num" == 0 ]; then
      echo -e "\033[32m $game \033[0m Service is \033[32m Stopping \033[0m"
    else
      echo -e "\033[32m $game \033[0m Service is \033[31m Fail \033[0m ,PorcessNum is $num"
      exit 10
    fi
  done
}
44
#######################################
# Report the exit status of the command that ran immediately before the call,
# using the success/failure helpers from the sourced init functions library.
# Must be invoked directly after the command being checked, because it reads $?.
# Globals:
#   base (read) label used in the message
#######################################
CheckStat(){
  local rc=$?  # must stay the first statement so $? is still the caller's
  # if/else instead of "a && b || c": the latter also runs failure whenever
  # success itself returns non-zero.
  if [ "${rc}" -eq 0 ]; then
    success $"$base startup"
  else
    failure $"$base startup"
  fi
}
48
49 #function RsyncCode {
50 # for g in $Games;
51 # do
52 # echo -e "\e[1;31m ---> $g <--- \e[0m"
53 # rsync -azP root@ope::tw_hxh_update ${GameRoot}${g}/ > /dev/null 2>&1
54 # CheckStat
55 # done
56 #}
57
#######################################
# Fetch the update package from the management host, unpack it under /tmp and
# sync the extracted sot/ tree into every game directory.
# Globals:
#   Package, GameRoot, Games (read)
# Outputs:
#   One result message per step.
# Exits:
#   20 when the package name is empty or the download fails,
#   30 when extraction fails, 40 when syncing into a game directory fails.
#######################################
function RsyncCode {
  local g

  # An empty name would turn the final cleanup into "rm -rf /tmp/".
  if [ -z "${Package}" ]; then
    echo -e "\033[31m 获取安装包失败 \033[0m"
    exit 20
  fi

  # The exit must sit outside the quoted message; inside the quotes it was
  # only printed and the script carried on.
  if rsync -azP "root@ope::tw_hxh_update/${Package}" /tmp/ > /dev/null; then
    echo -e "\033[32m 获取安装包正常 \033[0m"
  else
    echo -e "\033[31m 获取安装包失败 \033[0m"
    exit 20
  fi

  # Drop leftovers of an earlier aborted run so stale files are not synced.
  rm -rf -- /tmp/sot/
  if tar zxf "/tmp/${Package}" -C /tmp/; then
    echo -e "\033[32m 解压安装包正常 \033[0m"
  else
    echo -e "\033[31m 解压安装包失败 \033[0m"
    exit 30
  fi

  for g in $Games; do
    echo -e "\e[1;31m ---> $g <--- \e[0m"
    if rsync -azP /tmp/sot/ "${GameRoot}${g}/" > /dev/null; then
      echo -e "\033[32m $g,同步代码正常 \033[0m"
    else
      echo -e "\033[31m $g,同步代码失败 \033[0m"
      exit 40
    fi
  done

  rm -rf -- /tmp/sot/
  rm -rf -- "/tmp/${Package:?}"
}
72
#######################################
# Check that every local game directory carries the expected version.
# Globals:
#   GameRoot, Games (read)
# Arguments:
#   $1 - version type: "bin" (bin_ver file) or "cfg" (cfg_ver file);
#        any other value makes the function a no-op
#   $2 - expected version string
# Outputs:
#   One result line per game.
# Exits:
#   50 at the first game with a wrong bin_ver, 60 for a wrong cfg_ver.
#######################################
function UpdateCheck {
  local kind=$1 expected=$2
  local g ver_file fail_code matches

  case "${kind}" in
    bin) ver_file=bin_ver fail_code=50 ;;
    cfg) ver_file=cfg_ver fail_code=60 ;;
    *) return 0 ;;
  esac

  for g in ${Games}; do
    # Inspect this game's own version file. Globbing over all game directories
    # counted every instance, so the count was never 1 with several games.
    matches=0
    if [ -n "${expected}" ]; then
      matches=$(grep -cF -- "${expected}" "${GameRoot}${g}/${ver_file}" 2>/dev/null)
    fi
    if [ "${matches:-0}" == 1 ]; then
      echo -e "\033[32m $g ${ver_file}正常 \033[0m"
    else
      echo -e "\033[31m $g ${ver_file}不正常 \033[0m"
      exit "${fail_code}"
    fi
  done
}
92
93 # 定义循环管理服务器函数
#######################################
# Run a per-game script inside every game directory.
# Every game is processed even if an earlier one fails, but a failure is no
# longer hidden by a later success.
# Globals:
#   GameRoot, Games (read)
# Arguments:
#   $1 - script to run with sh inside each game directory (e.g. start.sh)
# Returns:
#   0 when every game succeeded, otherwise the status of the first failure
#   (1 when a game directory could not be entered).
#######################################
Game(){
  local g status rc=0

  cd "${GameRoot}" || return 1
  for g in $Games; do
    echo -e "\e[1;31m ---> $g <--- \e[0m"
    # Never run the script from the wrong directory.
    if ! cd "${GameRoot}/$g"; then
      [ "${rc}" -ne 0 ] || rc=1
      continue
    fi
    sh "$1"
    status=$?
    # A loop's exit status is that of its last iteration only, so remember
    # the first failure explicitly.
    if [ "${status}" -ne 0 ] && [ "${rc}" -eq 0 ]; then
      rc=${status}
    fi
  done
  return "${rc}"
}
103
104
105 # 定义webtools管理服务器函数
#######################################
# Run a Python script inside the s8_web_tools directory of every game.
# Every game is processed even if an earlier one fails, but a failure is no
# longer hidden by a later success.
# Globals:
#   GameRoot, Games (read)
# Arguments:
#   $1 - Python script to run with python3 (e.g. update.py)
# Returns:
#   0 when every game succeeded, otherwise the status of the first failure
#   (1 when a web tools directory could not be entered).
#######################################
WebTools(){
  local g status rc=0

  # Enable the Python 3.6 software collection once, and only when present
  # (the same guard start.sh uses).
  if [ -e /opt/rh/rh-python36/enable ]; then
    source /opt/rh/rh-python36/enable
  fi

  cd "${GameRoot}" || return 1
  for g in $Games; do
    echo -e "\e[1;31m ---> $g <--- \e[0m"
    if ! cd "${GameRoot}/$g/s8_web_tools/"; then
      [ "${rc}" -ne 0 ] || rc=1
      continue
    fi
    python3 "$1"
    status=$?
    if [ "${status}" -ne 0 ] && [ "${rc}" -eq 0 ]; then
      rc=${status}
    fi
  done
  return "${rc}"
}
116
# Entry point: dispatch on the action given as the first script argument.
# Unknown actions print the usage text and exit non-zero so that callers
# relying on the exit status do not treat a typo as success.
case "$1" in
  start)
    echo "Starting games ..."
    Game start.sh
    ;;
  stop)
    echo "Stopping games ..."
    Game close.sh
    ;;
  status)
    CheckManagerProcess
    ;;
  restart)
    echo -n "Stopping games ..."
    Game close.sh
    sleep 5
    echo -n "Starting games ..."
    Game start.sh
    ;;
  rsynccode)
    echo " rsync code update ..."
    RsyncCode
    ;;
  cupdate)
    echo "Cold Code version update ..."
    Game update_db.sh
    ;;
  hupdate)
    echo "Hot Code version update ..."
    Game reload_data.sh
    ;;
  wupdate)
    echo "web tools update ..."
    WebTools update.py
    ;;
  check)
    echo "Code Check ..."
    UpdateCheck "$2" "$3"
    ;;
  *)
    echo "USAGE:start|stop|status|restart|rsynccode|cupdate|hupdate|wupdate|check"
    exit 1
    ;;
esac

游戏服执行端tw-hxh

 


1 #游戏服启动脚本start.sh
2 [root@tw-hxh-cbt-game01 hunter_server1]# cat start.sh
3 set -ex
4 if [ -e /opt/rh/rh-python36/enable ]
5 then
6 source /opt/rh/rh-python36/enable
7 fi
8 python3 start.py
9
10 #cd sot_rank_service
11 #python3 restart_rank_service.py
12 #cd ..
13
14 sleep 20
15
16 sh python3.sh monitor.py --start
17
18 if service zabbix-agent start
19 then
20 :
21 fi

游戏服启动脚本start.sh

 

1 #游戏服启动脚本start.py
2 [root@tw-hxh-cbt-game01 hunter_server1]# cat start.py
3 #! /usr/bin/env python
4 # encoding=utf-8
5 from __future__ import absolute_import
6 from __future__ import division
7 from __future__ import print_function
8 from __future__ import unicode_literals
9
10 import codecs
11 import json
12 import os
13 import time
14 import logging
15 import psutil
16 import sys
17 import argparse
18 import typing
19
20 import requests
21
22
23 __author__ = '1661'
24
25
26 def init_logger(module_name, filename=None, stdout=True, stdout_level=logging.INFO, file_log_level=logging.DEBUG):
27 if filename is None:
28 filename = module_name + u".log"
29
30 # print u'module_name(%s)' % module_name
31 logger = logging.getLogger()
32
33 if file_log_level < stdout_level:
34 logger_level = file_log_level
35 else:
36 logger_level = stdout_level
37
38 # logging.Logger()
39 # logging.setLoggerClass()
40 #
41 logger.setLevel(logger_level)
42
43 if True:
44 file_handler = logging.FileHandler(filename)
45 file_handler.setLevel(file_log_level)
46 formatter = logging.Formatter(
47 u'[%(asctime)s] [%(process)d] [%(name)s] [%(filename)s:%(lineno)d] [%(levelname)s] %(message)s'
48 )
49 file_handler.setFormatter(formatter)
50
51 logger.addHandler(file_handler)
52
53 # logger.fatal(u'test1')
54
55 if stdout:
56 #################################################################################################
57 # 定义一个StreamHandler,将INFO级别或更高的日志信息打印到标准错误,并将其添加到当前的日志处理对象#
58 console = logging.StreamHandler()
59 # console.setLevel(logging.DEBUG)
60 console.setLevel(stdout_level)
61 formatter = logging.Formatter(
62 u'[%(asctime)s] [%(name)s] [%(process)d] [%(filename)s:%(lineno)d] [%(levelname)s] %(message)s'
63 )
64 console.setFormatter(formatter)
65
66 logger.addHandler(console)
67
68 # logger.fatal(u'test2')
69 # print logger
70 # print id(logger)
71 return logging.getLogger(module_name)
72
73
class ShellFailError(BaseException):
    """Raised when an external shell command exits with a non-zero status.

    Attributes:
        returncode: exit status of the command.
        out: captured standard output.
        err: captured standard error.
    """

    def __init__(self, returncode, out, err):
        # super() must name this class; naming BaseException skips
        # BaseException's own initialiser and leaves ``args`` empty.
        super(ShellFailError, self).__init__(returncode, out, err)
        self.returncode = returncode
        self.out = out
        self.err = err

    def __str__(self):
        return 'returncode(%s) out(%s) err(%s)' % (self.returncode, self.out, self.err)
83
84
def get_shell_cmd_output(cmd, exit_on_error=True, print_fun=None):
    """Run ``cmd`` through the shell and capture its output.

    :param cmd: command line, executed with ``shell=True``.
    :param exit_on_error: when True, a non-zero exit status terminates the
        process with status 1; when False, ShellFailError is raised instead.
    :param print_fun: callable used for log messages; defaults to ``print``.
    :return: tuple ``(stdout, stderr, pid)``, with both streams decoded as text.
    :raises ShellFailError: the command failed and ``exit_on_error`` is False.
    """
    import subprocess
    import sys

    if print_fun is None:
        print_fun = print
    print_fun('调用外部命令(%s)' % cmd)
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    out, err = p.communicate()
    ret_code = p.returncode
    if ret_code != 0:
        print_fun('cmd(%s) fail, ret_code(%d), stdout(%s), stderr(%s)' % (cmd, ret_code, out, err))
        if exit_on_error:
            # sys.exit rather than the builtin exit(): the latter is added by
            # the site module and is not guaranteed to exist.
            sys.exit(1)
        raise ShellFailError(ret_code, out, err)
    return out, err, p.pid
102
103
def set_limit_on_linux():
    """Raise the core-dump and open-file limits of the current process.

    Requests an unlimited core size and 4096 open files (soft and hard).
    Does nothing on platforms without the ``resource`` module. When a request
    is rejected (for example because the hard limit is already lower and the
    process may not raise it), the soft limit is raised to the current hard
    limit instead of aborting the start-up.
    """
    try:
        import resource
    except ImportError:
        return

    wanted = (
        (resource.RLIMIT_CORE, resource.RLIM_INFINITY),
        (resource.RLIMIT_NOFILE, 4096),
    )
    for res, value in wanted:
        try:
            resource.setrlimit(res, (value, value))
        except (ValueError, OSError):
            _, hard = resource.getrlimit(res)
            resource.setrlimit(res, (hard, hard))
112
113
114 def start_server_group(exe_name, server_index_list, pid_file, is_restart_on_crash=False):
115 logger = logging.getLogger(__name__)
116
117 set_limit_on_linux()
118
119 all_server_list = list()
120 for server_index in server_index_list:
121 log_tag = '%s %s' % (exe_name, server_index)
122
123 use_heap_profile = False
124 use_heap_check = False
125
126 # if (exe_name, server_index) == ('scene_server', 27):
127 # use_heap_profile = True
128
129 env = dict(
130 ASAN_OPTIONS='abort_on_error=1:detect_leaks=0:disable_coredump=0',
131 LD_PRELOAD='/data/gperftools_gcc720/lib/libprofiler.so',
132 CPUPROFILESIGNAL=12,
133 CPUPROFILE='gperf.out/{program_name}.{param}'.format(
134 program_name=exe_name,
135 param=server_index
136 )
137 )
138
139 if use_heap_check:
140 env.update(
141 PPROF_PATH='/data/gperftools_gcc720/bin/pprof',
142 LD_PRELOAD='/data/gperftools_gcc720/lib/libtcmalloc.so',
143 HEAPCHECK='normal',
144 )
145
146 if use_heap_profile:
147 # 非嵌入式heap_profiler
148 env.update(dict(
149 LD_PRELOAD='/data/gperftools_gcc720/lib/libtcmalloc.so',
150 # # Dump heap profiling information each time the specified number of bytes has been allocated by the
151 # # program. default 1G
152 # HEAP_PROFILE_ALLOCATION_INTERVAL=10 * 1024 * 1024,
153 # # Dump heap profiling information whenever the high-water memory usage mark increases by the
154 # # specified number of bytes. default 100M
155 # HEAP_PROFILE_INUSE_INTERVAL=1 * 1024 * 1024,
156 # # Dump heap profiling information each time the specified number of seconds has elapsed. default 0
157 # HEAP_PROFILE_TIME_INTERVAL=10,
158 # Dump heap profiling information whenever the specified signal is sent to the process. default disabled
159 # HEAP_PROFILE_MMAP=1
160 HEAPPROFILESIGNAL=13,
161 HEAPPROFILE='./heap_prof.{program_name}'.format(program_name=exe_name),
162 ))
163
164 # # 嵌入式heap_profiler, 需要在编译时cmake -DPPROF_SERVER=1
165 # env.update(dict(
166 # TCMALLOC_SAMPLE_PARAMETER=524288,
167 # # 进程启动时, 开启pprof_server(供pprof remote server使用的一个http_server)
168 # PPROF_ON_BOOTUP=1,
169 # ))
170 # del env['LD_PRELOAD']
171
172 optional_kv_params = dict()
173 optional_params = list()
174
175 if is_restart_on_crash:
176 optional_params.append('-crash')
177 optional_params.append('1') # 游戏服需要强制加上一个value才能正常运行
178
179 if exe_name == 'manager_server':
180 server_numbers = get_server_number_of_types()
181 # optional_kv_params['-total_server_num'] = 43
182 optional_kv_params['-total_server_num'] = server_numbers['all']
183
184 cmd = '{env} setsid ./{exe_name} {server_index} {optional_params} {optional_kv_params} >> ./log/{exe_name}_{server_index}.clog 2>&1 & echo $!'.format(
185 env=' '.join('%s=%s' % (a, b) for a, b in env.items()),
186 exe_name=exe_name,
187 server_index=server_index,
188 optional_params=' '.join('%s' % x for x in optional_params),
189 # optional_kv_params=' '.join('%s=%s' % (a, b) for a, b in optional_kv_params.items()),
190 # 目前程序不支持标准的k=v方式
191 optional_kv_params=' '.join('%s %s' % (str(a), str(b)) for a, b in optional_kv_params.items()),
192 )
193
194 try:
195 out, err, _ = get_shell_cmd_output(cmd, exit_on_error=False, print_fun=logger.debug)
196 except ShellFailError as e:
197 logger.warning('%s launch fail' % log_tag)
198 return False
199 else:
200 pid = int(out)
201 logger.info('启动 %s, pid(%s)' % (log_tag, pid))
202 try:
203 sub = psutil.Process(pid)
204 except:
205 logger.error("启动失败 %s" % log_tag)
206 continue
207
208 pid_file.pid_dict[ExeTag.pack(exe_name, server_index)] = pid
209 pid_file.save_to_file()
210 all_server_list.append(dict(
211 exe_name=exe_name,
212 server_index=server_index,
213 pid=pid,
214 status='not listen',
215 ))
216
217 def check_pid_listened(pid):
218 try:
219 p = psutil.Process(pid)
220 except:
221 return 'no pid'
222 try:
223 for sock in p.connections('tcp'):
224 if sock.status == 'LISTEN':
225 return 'listen'
226 except psutil.AccessDenied as e:
227 logger.debug('''access deny for pid(%s) maybe it's lost''' % pid)
228 try:
229 p2 = psutil.Process(pid)
230 except:
231 return 'no pid'
232 else:
233 logger.fatal('进程(%s)存在但是无法获取socket信息' % pid)
234 raise e
235 return 'not listen'
236
237 ts = time.time()
238 while True:
239 now = time.time()
240 if now > ts + 10:
241 logger.info('等待server_index %s 建立端口监听' % (', '.join(
242 str(x['server_index']) for x in all_server_list if x['status'] != 'listen'),
243 ))
244 ts = now
245 for server in all_server_list:
246 if server['status'] == 'listen':
247 continue
248 pid = server['pid']
249 result = check_pid_listened(pid)
250 if result == 'no pid':
251 logger.error('pid(%s)已丢失, server_index(%s)' % (pid, server['server_index']))
252 server['status'] = result
253 return False
254 elif result == 'listen':
255 server['status'] = result
256 elif result == 'not listen':
257 pass
258 else:
259 logger.fatal('logic error, result(%s)' % result)
260 raise ValueError(result)
261
262 if not [x for x in all_server_list if x['status'] != 'listen']:
263 break
264 time.sleep(0.1)
265
266 return True
267
268
269 class ExeTag(object):
270 @staticmethod
271 def pack(exe, index):
272 return '%s_%s' % (exe, index)
273
274 @staticmethod
275 def unpack(exe_tag):
276 try:
277 exe, index = exe_tag.rsplit('_', 1)
278 index = int(index)
279 return exe, index
280 except (IndexError, ValueError) as e:
281 raise ValueError('invalid format exe_tag(%s)' % exe_tag)
282
283
284 class PidFile(object):
285 def __init__(self):
286 self.file_name = 'start.pid'
287 self.js = dict()
288
289 def read_from_file(self):
290 try:
291 with codecs.open(self.file_name, encoding='utf8', mode='rb') as f:
292 js = json.load(f)
293 except IOError:
294 return False
295 except Exception as e:
296 logging.getLogger(__name__).error('json parse fail, while reading file(%s)' % self.file_name)
297 raise e
298
299 self.js.clear()
300 self.js.update(js)
301 return True
302
303 def check_pid_running(self):
304 modify = False
305 for k, v in self.pid_dict.items():
306 if v is not None:
307 this_modify = False
308 try:
309 p = psutil.Process(v)
310 except psutil.NoSuchProcess:
311 this_modify = True
312 except psutil.AccessDenied as e:
313 logging.getLogger(__name__).error('no acess to read process(%s)(%s)' % (k, v))
314 raise e
315 else:
316 cmdline = p.cmdline()
317 exe_name, _ = ExeTag.unpack(k)
318 if not cmdline or cmdline[0].find(exe_name) == -1:
319 this_modify = True
320
321 if this_modify:
322 self.pid_dict[k] = None
323 modify = True
324
325 if modify:
326 self.save_to_file()
327
328 def save_to_file(self):
329 with codecs.open(self.file_name, encoding='utf8', mode='wb') as f:
330 json.dump(self.js, f, ensure_ascii=False, sort_keys=True, indent=4)
331
332 @property
333 def pid_dict(self):
334 return self.js
335
336
337 def get_server_number_of_types():
338 scene_server_num = len(get_ex_config('multi_scene_server_type'))
339 gate_server_num = 4
340 is_battle_field = get_server_config()['server_config']['zone_id'] in get_server_config()['server_config']['battle_field_zone']
341 if not is_battle_field:
342 server_name_list = [
343 'http_server',
344 'manager_server',
345 'log_server',
346 'data_server',
347 'center_server',
348 'scene_server',
349 'game_server',
350 'gate_server',
351 ]
352 else:
353 server_name_list = [
354 'http_server',
355 'manager_server',
356 'log_server',
357 'data_server',
358 # 'center_server',
359 'cross_center_server',
360 'scene_server',
361 # 'game_server',
362 'gate_server',
363 ]
364 all_server_num = len(server_name_list) - 2 + scene_server_num + gate_server_num
365 # all_server_num = scene_server_num + gate_server_num + 6
366 return dict(
367 all=all_server_num,
368 gate_server=gate_server_num,
369 scene_server=scene_server_num,
370 server_name_list=server_name_list,
371 )
372
373
374 def start_in_order():
375 logger = logging.getLogger(__name__)
376
377 pid_file = PidFile()
378 pid_file.read_from_file()
379 pid_file.check_pid_running()
380 ok = True
381 for k, v in pid_file.pid_dict.items():
382 if v is not None:
383 logger.error('发现残留进程(%s)(%s)' % (k, v))
384 ok = False
385 if not ok:
386 return False
387 pid_file.pid_dict.clear()
388
389 server_numbers = get_server_number_of_types()
390 server_name_list = server_numbers['server_name_list']
391 for server_name in server_name_list:
392 n = server_numbers.get(server_name, 1)
393
394 logger.info('启动%s' % server_name)
395 if not start_server_group(server_name, list(range(n)), pid_file):
396 logger.error('启动失败')
397 return False
398
399 return True
400
401
402 _ex_config_default = dict()
403 _ex_config_user = dict()
404
405
406 def get_ex_config(key):
407 global _ex_config_default
408 global _ex_config_user
409 if not _ex_config_default:
410 with codecs.open('ex_config_default.json', encoding='utf8', mode='rb') as f:
411 _ex_config_default = json.load(f)
412
413 try:
414 f = codecs.open('ex_config_user.json', encoding='utf8', mode='rb')
415 except:
416 pass
417 else:
418 _ex_config_user = json.load(f)
419 f.close()
420
421 try:
422 return _ex_config_user[key]
423 except KeyError:
424 return _ex_config_default[key]
425
426
427 _server_config = None # type: typing.Union[None, typing.Dict]
428
429
430 def get_server_config():
431 global _server_config
432 if _server_config is None:
433 with open('config.json', mode='r', encoding='utf8') as f:
434 _server_config = json.load(f)
435
436 return _server_config
437
438
439 def get_redis_heart_heat_time(exe, index):
440 """
441 获取进程在redis中写入的心跳时间, 如果没有找到, 返回None
442 :param exe:
443 :param index:
444 :return:
445 """
446 global _db_redis
447 global _zone_id
448 if _db_redis is None:
449 with open('config.json', 'r', encoding='utf8') as f:
450 config = json.load(f)
451 redis_config = config['server_config']['common_redis']
452 _zone_id = config['server_config']['zone_id']
453 _db_redis = redis.Redis(
454 host=redis_config['ip'], password=redis_config['password'], port=redis_config['port'],
455 db=redis_config['db']
456 )
457
458 #key = '%s_heart_beat' % (sot_server_type.to_enum_enum(exe) + index,)
459 key = '%s_zone_%s_heart_beat' % (_zone_id,sot_server_type.to_enum_enum(exe) + index,)
460 # logging.getLogger(__name__).info(key)
461 ts = _db_redis.get(key)
462 if ts is None:
463 logging.getLogger(__name__).info('no find')
464 return ts
465 ts = int(ts)
466 return ts
467
468
469 def wait_for_manager_server_run():
470 try:
471 import monitor
472 except ImportError:
473 from . import monitor
474 pid_file = PidFile()
475 pid_file.read_from_file()
476 pid_file.check_pid_running()
477 pid = pid_file.pid_dict.get('manager_server_0', None)
478 if pid is None:
479 logging.getLogger(__name__).error('manager_server进程丢失')
480 return False
481 else:
482 return monitor.monitor_server_launching('mn', 0, pid)
483
484
485 def main():
486 # ap = argparse.ArgumentParser()
487 # ap.add_argument('cmd', choices=['start', 'monitor'])
488 # args = ap.parse_args()
489 init_logger(sys.argv[0])
490 # if args.cmd == 'start':
491 if True:
492 if not start_in_order():
493 exit(1)
494 result = wait_for_manager_server_run()
495 try:
496 out, _, _ = get_shell_cmd_output('python3 info.py', exit_on_error=False)
497 except ShellFailError:
498 logging.getLogger(__name__).warning('info.py调用失败')
499 else:
500 for line in out.split('\n'):
501 logging.getLogger(__name__).info(line)
502 if not result:
503 exit(1)
504
505
506 if __name__ == '__main__':
507 main()

游戏服启动脚本start.py

 

  以上图片和代码可以看出,里面调用一个脚本执行任务调用了4层。

  最初的目的是:在2、3、4层调用的脚本或者命令执行失败时,马上终止任务并给出非0的返回值,自动化运维平台会根据返回值来判断任务走向,是失败还是正常。然而结果并非如此:不管怎么执行报错,第1层的ansible执行永远返回0,自动化运维平台的执行结果永远是成功。所以有了以下搞笑的排错事件。

  1.刚开始start.sh执行报错,却没有得到非0的返回值。由于前端1、2层使用的是ansible执行,于是没想到是1、2层出现的问题,以为可能是3层的start.sh没有正确返回,于是排查:在start.sh中的python3 start.py后面加上echo $?,各种测试,结果发现是能正常返回1的。

  2.于是怀疑是start.py没有正确的返回值,最后看到start.py的最后几行,执行失败返回的是固定的1,所以start.py是正常的

  3.如果3、4层的start.sh和start.py都是正常的,那么问题只可能出在原本认为不可能的1、2层了。于是从第1层开始查找,看到了管理端tw-hxh的代码

elif [ ${Action} == 'start' ] || [ ${Action} == 'stop' ] || [ ${Action} == 'hupdate' ] || [ ${Action} == 'cupdate' ] || [ ${Action} == 'wupdate' ];then
ansible ${Host} -m shell -a "sh /data/infra/scripts/tw_hxh $1 $2" -f 50
fi

  这么一看,ansible是2、3、4层和自动化平台之间的中间纽带,所以猜测可能是ansible执行的时候,不管后面2、3、4层执行成功还是失败,都会返回0——因为是2、3、4层出的错,又不是第1层出的错。后面发现并非如此:用其它脚本测试,被ansible调用的脚本出错了,ansible同样会返回非0值。

  4.于是查到了第二层,游戏服的tw-hxh,那么最关键的代码就是下面的

# 定义循环管理服务器函数
#修改前
Game(){
cd $GameRoot
for g in $Games;
do
(echo -e "\e[1;31m ---> $g <--- \e[0m"
cd ${GameRoot}/$g
sh $1
echo)
wait #注意这里
done
}

#修改后,正常的代码
Game(){
cd $GameRoot
for g in $Games;
do
echo -e "\e[1;31m ---> $g <--- \e[0m"
cd ${GameRoot}/$g
sh $1
done
}

  上面的代码先不用关注,等会用到。刚开始以为是函数的问题,以为函数没有返回值,即使函数里面报错了,命令行执行函数的脚本都会是0。于是做了以下测试。

[root@tw-hxh-ope tmp]# cat test.sh 
#!/bin/bash

function test {
(
ls sdlkfjldksjflksdfj
echo )

wait
}

test
[root@tw-hxh-ope tmp]# sh test.sh
ls: cannot access sdlkfjldksjflksdfj: No such file or directory

[root@tw-hxh-ope tmp]# echo $?
0

  好像结果确实如此,但是这时候觉得代码太多了,无法准确判断问题,于是代码改成

[root@tw-hxh-ope tmp]# cat test.sh 
#!/bin/bash

function test {
ls sdlkfjldksjflksdfj

}

test
[root@tw-hxh-ope tmp]# sh test.sh
ls: cannot access sdlkfjldksjflksdfj: No such file or directory
[root@tw-hxh-ope tmp]# echo $?
2

  然后结果正常了,看来问题真的出现在这里。但是这里到底是因为代码放入后台所以没有错误的返回值还是什么原因?于是再次更改代码

[root@tw-hxh-ope tmp]# cat test.sh 
#!/bin/bash

function test {
ls sdlkfjldksjflksdfj
wait
}

test
[root@tw-hxh-ope tmp]# sh test.sh
ls: cannot access sdlkfjldksjflksdfj: No such file or directory
[root@tw-hxh-ope tmp]# echo $?
0

  添加了wait后,又即使出错,返回值又变成0了,还是不对,于是再次更改

[root@tw-hxh-ope tmp]# cat test.sh 
#!/bin/bash

function test {
(ls sdlkfjldksjflksdfj
echo)

}

test
[root@tw-hxh-ope tmp]# sh test.sh
ls: cannot access sdlkfjldksjflksdfj: No such file or directory

[root@tw-hxh-ope tmp]# echo $?
0

  看到这里就能发现了,其实跟后台执行、函数执行都没有关系。原因是在报错的命令后面还有一条正常执行的命令,要么是wait,要么是echo。本人之前把它们当作一种功能性的语法了,但不管是echo还是wait,其实都是命令,它们都执行成功并返回0,所以覆盖了ls sdlkfjldksjflksdfj的错误返回值:函数(以及括号里的子shell)的返回值就是其中最后一条命令(wait或echo)的返回值,所以一直都是0。因为shell即使某条命令报错,只要没有明确让它退出(例如exit或set -e),它还是会继续执行剩下的命令,所以wait就被执行了。

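  结果把wait和echo取消后,返回值正常,自动化运维平台正常工作。需要注意的是,修改后的Game函数返回的仍然只是for循环中最后一个游戏目录的执行结果,前面目录的失败同样会被后面目录的成功覆盖;如果需要任何一个目录失败都返回非0,还要在循环里把失败的返回值记录下来。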

 

3.总结

  因为太久没写脚本导致这次很简单事情的乌龙,刚开始还以为4层脚本调用,每一层都需要显式的明确的返回信号值给上一层才可以实现正确的信号值传递,其实脚本早就返回了正确的信号值,只是陷入了误区,以为需要特殊操作,其实是多余的没有注意到的正确的命令的结果覆盖了出错的结果。

  所以在shell里面,如果出现这种情况,看看出错的命令后面还有没有执行成功的命令。比如执行管理端的sh tw_hxh start,假设ActionHandle start一定会报错,但是ActionHandle status是正确的,那么sh tw_hxh start的返回值是什么?

case $Action in 
start)
echo -e "\033[32m**********************************************************************\033[0m start all server \033[32m**********************************************************************\033[0m"
ActionHandle start
ActionHandle status
;;

    结果一定返回的是 0,因为对于 sh tw_hxh start来说,最后一条命令是ActionHandle status。这条命令的返回结果是正确的,那么sh tw_hxh start返回的结果一定也是0

 

 

 

作者:​​小家电维修​​

转世燕还故榻,为你衔来二月的花。