系统环境:
SUSE Linux Enterprise Server 10 SP1 (x86_64)
问题背景:
由于线上系统环境下的crontab任务比较多,cron守护进程在进行日常任务调度时经常会异常挂掉,从而影响业务的正常使用,因此结合C和Shell写了一个简单的crontab健康检测功能。
处理思路:
修改syslog的配置参数,把crontab调度日志单独抽取出来,同时在crontab项里添加检测标记,通过后台守护进程定期检测状态标记来判断当前crontab调度是否正常,同时为了避免日志文件过大而影响性能,会定期对日志文件做切割和清理处理。
#--------------------------------------------------------------------------------------------------------------------------------------------
1、相关目录创建
# mkdir -p /data/logs/crontab
# mkdir -p /data/scripts
# mkdir -p /data/backup/crontab
#--------------------------------------------------------------------------------------------------------------------------------------------
2、crontab健康检测C代码
# cd /data/scripts
# vim check_cron_process.h
/*
 * check_cron_process.h - shared constants and prototypes for the cron
 * health-check daemon.
 */
#ifndef CHECK_CRON_PROCESS_H
#define CHECK_CRON_PROCESS_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h> /* S_IRUSR & friends, needed to expand LOCKMODE */

/* Size of the general-purpose command/output buffer. */
#define BUFFSIZE1 1024
/* Size of the timestamp buffer used for log lines. */
#define BUFFSIZE2 32

/* PID/lock file used to make sure only one daemon instance runs. */
#define LOCKFILE "/var/run/check_cron_process.pid"
/* File the daemon appends its own status messages to. */
#define LOGFILE "/var/log/check_cron_process.log"
/* Permissions of the lock file: rw-r--r-- (0644). */
#define LOCKMODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)

/* Prints a line when a process whose command line contains the word "cron" exists. */
#define SYSCMD1 "ps aux | grep -w cron | grep -v grep"
/* Prints a line when such a process is listed as defunct (zombie). */
#define SYSCMD2 "ps aux | grep -w cron | grep -v grep | grep defunct"
/* Prints a line when the last 6 cron log lines contain the "cd ..." marker job. */
#define SYSCMD3 "tail -6 /data/logs/crontab/cron.log | grep '(root) CMD (cd'"
/* Force-kills every cron process. */
#define SYSCMD4 "killall -9 cron >/dev/null 2>&1"
/* Starts the cron service. */
#define SYSCMD5 "/sbin/service cron start >/dev/null 2>&1"

/* Exits the process when another instance already holds the lock file. */
void already_running(void);

/* Detaches the calling process and turns it into a background daemon. */
void init_daemon(void);

/*
 * Runs `syscmd` through the shell.
 * Returns 1 when the command printed at least one line, 0 when it printed
 * nothing (or could not be started), and -1 when `syscmd` is NULL.
 */
int run_system_cmd(const char *syscmd);

#endif /* CHECK_CRON_PROCESS_H */
# vim check_cron_process.c
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/param.h> #include <sys/stat.h> #include <syslog.h> #include <fcntl.h> #include <errno.h> #include <time.h> #include "check_cron_process.h" static char buffer[BUFFSIZE1] = {0}; static char datetime[BUFFSIZE2] = {0}; /* 获得当前系统时间 */ int get_curr_date(char *strtime, unsigned int ustrlen) { struct tm *pt = NULL; time_t timer; if (!strtime) { return -1; } time(&timer); strtime[0] = '\0'; pt = localtime(&timer); if (!pt) { return -1; } memset(strtime, 0, ustrlen); sprintf(strtime, "%04d-%02d-%02d-%02d:%02d:%02d", pt->tm_year + 1900, pt->tm_mon + 1, pt->tm_mday, pt->tm_hour, pt->tm_min, pt->tm_sec); return 0; } /* 将信息写入日志文件 */ int writelog(const char *pLoginfo) { FILE *fp = NULL; unsigned int ustrlen = 0; if (pLoginfo == NULL) { return -1; } ustrlen = strlen(pLoginfo); if (ustrlen > 256) { return -1; } if ((fp = fopen(LOGFILE, "a+")) == NULL) { return -1; } memset(datetime, 0, BUFFSIZE2); get_curr_date(datetime, BUFFSIZE2); fprintf(fp, "%s %s", datetime, pLoginfo); fclose(fp); return 0; } int LockFile(int fd) { struct flock fl; fl.l_type = F_WRLCK; fl.l_start = 0; fl.l_whence = SEEK_SET; fl.l_len = 0; return (fcntl(fd, F_SETLK, &fl)); } /* 只允许一个副本运行 */ void already_running(void) { int fd = -1; char buf[16] = {0}; fd = open(LOCKFILE, O_RDWR | O_CREAT, LOCKMODE); if (fd < 0) { syslog(LOG_ERR, "can't open %s: %s", LOCKFILE, strerror(errno)); exit(1); } if (LockFile(fd) < 0) { if (errno == EACCES || errno == EAGAIN) { close(fd); exit(1); } syslog(LOG_ERR, "can't lock %s: %s", LOCKFILE, strerror(errno)); exit(1); } ftruncate(fd, 0); sprintf(buf, "%d", getpid()); write(fd, buf, strlen(buf)); close(fd); } /* 作为守护进程运行 */ void init_daemon(void) { int pid = -1; if ((pid = fork())) { exit(0); } else if (pid < 0) { exit(1); } setsid(); if ((pid = fork())) { exit(0); } else if (pid < 0) { exit(1); } chdir("/tmp"); umask(0); return; } /* 运行系统命令 */ int run_system_cmd(const char *syscmd) { FILE 
*fp = NULL; if (syscmd == NULL) { return -1; } memset(buffer, 0, BUFFSIZE1); snprintf(buffer, BUFFSIZE1, syscmd); fp = popen(buffer, "r"); if (!fp) { return 0; } memset(buffer, 0, BUFFSIZE1); if (!fgets(buffer, BUFFSIZE1, fp)) { pclose(fp); return 0; } if (!strncasecmp(buffer, "", BUFFSIZE1)) { pclose(fp); return 0; } pclose(fp); return 1; } int main(int argc, char *argv[]) { int ret = 0; init_daemon(); already_running(); openlog(NULL, LOG_CONS | LOG_PID, LOG_LOCAL1); while(1) { /* 1.检查cron进程是否已经运行 */ ret = run_system_cmd(SYSCMD1); if (!ret) { writelog("The cron process is not running, now start it! \n"); sleep(1); system(SYSCMD5); goto CHECK_CRON; } /* 2.如果已经运行,查看是否有僵尸进程 */ ret = run_system_cmd(SYSCMD2); if (ret) { writelog("The cron process is defunct, now restart it! \n"); sleep(1); system(SYSCMD4); sleep(1); system(SYSCMD5); goto CHECK_CRON; } /* 3.如果进程运行正常,检查任务是否正常调度 */ ret = run_system_cmd(SYSCMD3); if (!ret) { writelog("The cron work is down, now restart it! \n"); sleep(1); system(SYSCMD4); sleep(1); system(SYSCMD5); goto CHECK_CRON; } /* 4.crontab进程运行正常 */ writelog("The cron process is ok! \n"); CHECK_CRON: /* 休眠5分钟后,继续检测 */ sleep(300); } closelog(); return 0; }
# vim Makefile
# Build file for the cron health-check daemon.
CC      = gcc
CXX     = g++
BINARY  = check_cron_process
OBJS    = check_cron_process.o
HEADERS = check_cron_process.h

# -O2 is a compile-time flag: it has no effect when given only at link time.
CFLAGS  += -O2 -I/usr/include -I/usr/local/include -Wall -Wno-unused-variable
LDFLAGS  = -static

# These targets do not produce files of the same name.
.PHONY: all clean

all: $(BINARY)

$(BINARY): $(OBJS)
	$(CC) $(LDFLAGS) -o $(BINARY) $(OBJS)

# Rebuild the object when the header changes; $< is the .c file only.
$(OBJS): %.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -f $(BINARY) $(OBJS)
# make
# /data/scripts/check_cron_process
#--------------------------------------------------------------------------------------------------------------------------------------------
3、crontab任务调度日志配置【新增】
# vim /etc/syslog.conf
## check_crontab_start.tag.1
# Write every message of the cron facility, at any priority, to a dedicated file.
cron.* /data/logs/crontab/cron.log
## check_crontab_end.tag.1
# vim /etc/syslog-ng/syslog-ng.conf
## check_crontab_start.tag.1
# Dedicated destination file for cron messages.
destination dst_cron { file("/data/logs/crontab/cron.log"); };
# NOTE(review): source "src" and filter "f_cron" are not defined in this
# snippet; presumably they come from the distribution's stock
# syslog-ng.conf - confirm before applying.
log { source(src); filter(f_cron); destination(dst_cron); };
## check_crontab_end.tag.1
# /sbin/service syslog restart
#--------------------------------------------------------------------------------------------------------------------------------------------
4、crontab任务调度日志处理
(1)、日志切割与清理
# vim /data/scripts/cut_cron_log.sh
#!/bin/bash
# Rotate the dedicated cron log once a day and drop the archive that is
# 10 days old. Intended to run from cron at 00:00, hence the "yesterday"
# suffix on the archived file.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

## MY CRON LOG PATH
LOGPATH="/data/logs/crontab"
CURRENT_LOG="${LOGPATH}/cron.log"
ARCHIVE_LOG="${LOGPATH}/cron_$(date -d "yesterday" +"%Y-%m-%d").log"

# Nothing to rotate when cron itself is not running.
retval=$(ps aux | grep sbin/cron | grep -v grep | wc -l)
if [ "${retval}" -eq 0 ]; then
    echo "The cron process is not running ^_^"
    exit 1
fi

## cut crontab's log
if [ -f "${CURRENT_LOG}" ]; then
    if [ -e "${ARCHIVE_LOG}" ]; then
        # The script already ran today: append instead of overwriting the
        # archive, otherwise the earlier rotation would be lost.
        cat "${CURRENT_LOG}" >> "${ARCHIVE_LOG}" && rm -f "${CURRENT_LOG}" || exit 1
    else
        mv "${CURRENT_LOG}" "${ARCHIVE_LOG}" || exit 1
    fi
    # Make syslog reopen cron.log; only needed when the file was moved away.
    /sbin/service syslog restart
fi

## clear 10 days ago's crontab logs
rm -f "${LOGPATH}/cron_$(date -d "10 days ago" +"%Y-%m-%d").log"
(2)、crontab信息备份
# vim /data/scripts/backup_crontab.sh
#!/bin/bash
# Save a dated copy of root's crontab and remove backups that are
# KEEP_DAYS days old or older.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin

CRONTAB_BACKUP_DIR="/data/backup/crontab"
KEEP_DAYS=10

# Back up the crontab content.
mkdir -p "${CRONTAB_BACKUP_DIR}" || exit 1

target="${CRONTAB_BACKUP_DIR}/crontab_$(date +%F)"
# Write to a temporary file first so that a failing "crontab -l" never
# leaves an empty or truncated backup behind.
if crontab -uroot -l > "${target}.tmp"; then
    mv -f "${target}.tmp" "${target}"
else
    rm -f "${target}.tmp"
    echo "crontab -l failed; existing backups left untouched" >&2
    exit 1
fi

# Remove backups dated KEEP_DAYS days ago or earlier. Comparing the date in
# the file name (YYYY-MM-DD sorts lexicographically) also catches files that
# an earlier, skipped run failed to clean up.
cutoff=$(date -d "${KEEP_DAYS} days ago" +"%Y-%m-%d")
for old in "${CRONTAB_BACKUP_DIR}"/crontab_[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]; do
    # When nothing matches, the unexpanded pattern is not a regular file.
    [ -f "${old}" ] || continue
    stamp="${old##*/crontab_}"
    if [[ ! "${stamp}" > "${cutoff}" ]]; then
        rm -f "${old}"
    fi
done
(3)、crontab垃圾头信息清理
# vim /data/scripts/clean_crontab_trash.sh
#!/bin/bash
# Strip the auto-generated header comment lines from root's crontab spool file.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

## The crontab's spool file
CRONFILE="/var/spool/cron/tabs/root"

# One in-place pass; each expression deletes the lines matching one header pattern.
sed -i \
    -e '/# DO NOT EDIT THIS FILE/d' \
    -e '/# (\/data\/crontab.tmp/d' \
    -e '/# (\/tmp\/crontab/d' \
    -e '/# (Cron version/d' \
    -e '/# (- installed on/d' \
    -e '/# (\/usr\/local\/agenttools/d' \
    -e '/# (\/tmp\/cron.tmp/d' \
    -e '/# (tmp2 installed/d' \
    -e '/# (crontab.tmp/d' \
    -e '/# (\/data\/crontab_/d' \
    "${CRONFILE}"
(4)、crontab设置
# crontab -e
## Rotate and clean up the cron log (daily at 00:00)
00 00 * * * /data/scripts/cut_cron_log.sh >/dev/null 2>&1
## Health-check marker: runs every minute; the daemon's SYSCMD3 greps the cron log for the "(root) CMD (cd" line this entry produces
*/1 * * * * cd /usr/local; echo >/dev/null 2>&1
## Back up the crontab (daily at 08:30)
30 08 * * * /data/scripts/backup_crontab.sh >/dev/null 2>&1
## Remove junk header lines from the crontab spool file (every 30 minutes)
*/30 * * * * /data/scripts/clean_crontab_trash.sh >/dev/null 2>&1