系统环境:

SUSE Linux Enterprise Server 10 SP1 (x86_64)


问题背景:

由于线上系统的crontab任务条目比较多,cron服务在日常任务调度过程中经常会异常挂掉,影响业务的正常使用,因此结合C和Shell编写了一个简单的crontab健康检测工具。


处理思路:
修改syslog的配置参数,把crontab调度日志单独抽取出来,同时在crontab项里添加检测标记,通过后台守护进程定期检测状态标记来判断当前crontab调度是否正常,同时为了避免日志文件过大而影响性能,会定期对日志文件做切割和清理处理。


#--------------------------------------------------------------------------------------------------------------------------------------------

1、相关目录创建

# mkdir -p /data/logs/crontab

# mkdir -p /data/scripts

# mkdir -p /data/backup/crontab

#--------------------------------------------------------------------------------------------------------------------------------------------

2、crontab健康检测C代码

# cd /data/scripts

# vim check_cron_process.h

/*
 * check_cron_process.h - constants and prototypes for the cron watchdog.
 *
 * The guard name avoids the double-underscore form, which is reserved for
 * the implementation.
 */
#ifndef CHECK_CRON_PROCESS_H
#define CHECK_CRON_PROCESS_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>   /* S_IRUSR, S_IWUSR, S_IRGRP, S_IROTH used by LOCKMODE */

/* Size of the scratch buffer that receives command output. */
#define BUFFSIZE1 1024
/* Size of the buffer that holds a formatted timestamp. */
#define BUFFSIZE2 32

/* PID/lock file used to allow a single running instance. */
#define LOCKFILE "/var/run/check_cron_process.pid"
/* Log file the watchdog appends its status lines to. */
#define LOGFILE  "/var/log/check_cron_process.log"

/* Creation mode of the lock file: rw-r--r--. */
#define LOCKMODE (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)

/* Lists processes whose command line contains the word "cron". */
#define SYSCMD1 "ps aux | grep -w cron | grep -v grep"
/* Same listing, restricted to entries marked "defunct". */
#define SYSCMD2 "ps aux | grep -w cron | grep -v grep | grep defunct"
/* Looks for a "(root) CMD (cd" entry in the last 6 lines of the cron log. */
#define SYSCMD3 "tail -6 /data/logs/crontab/cron.log | grep '(root) CMD (cd'"
/* Force-kills every cron process. */
#define SYSCMD4 "killall -9 cron >/dev/null 2>&1"
/* Starts the cron service. */
#define SYSCMD5 "/sbin/service cron start >/dev/null 2>&1"

/* Exits the process unless the lock on LOCKFILE can be taken. */
void already_running(void);
/* Detaches the process so it runs as a daemon. */
void init_daemon(void);
/* Runs syscmd through the shell; 1 if it printed output, 0 if not, -1 on NULL. */
int run_system_cmd(const char *syscmd);

#endif /* CHECK_CRON_PROCESS_H */

# vim check_cron_process.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <syslog.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include "check_cron_process.h"
                                                                            
/* Scratch buffer shared by run_system_cmd() for the command and its first output line. */
static char buffer[BUFFSIZE1] = {0};
/* Timestamp buffer filled by writelog() before each log line is written. */
static char datetime[BUFFSIZE2] = {0};
                                                                            
/*
 * Format the current local time as "YYYY-MM-DD-HH:MM:SS" into strtime.
 *
 * strtime  destination buffer; must not be NULL
 * ustrlen  size of strtime in bytes, including room for the terminator
 *
 * Returns 0 on success. Returns -1 when strtime is NULL, ustrlen is 0,
 * the local time cannot be obtained, or the buffer is too small to hold
 * the whole timestamp; in the last two cases strtime is left as an empty
 * string.
 */
int get_curr_date(char *strtime, unsigned int ustrlen)
{
    struct tm *pt = NULL;
    time_t timer;
    int written = 0;

    if (!strtime || ustrlen == 0) {
        return -1;
    }

    memset(strtime, 0, ustrlen);

    time(&timer);
    pt = localtime(&timer);
    if (!pt) {
        return -1;
    }

    /* Bounded by ustrlen so a short buffer is never overrun. */
    written = snprintf(strtime, ustrlen, "%04d-%02d-%02d-%02d:%02d:%02d",
        pt->tm_year + 1900, pt->tm_mon + 1, pt->tm_mday, pt->tm_hour, pt->tm_min, pt->tm_sec);
    if (written < 0 || (unsigned int)written >= ustrlen) {
        /* A truncated timestamp is useless to callers; report failure. */
        strtime[0] = '\0';
        return -1;
    }

    return 0;
}
                                                                            
/*
 * Append one entry, prefixed with the current timestamp, to LOGFILE.
 *
 * pLoginfo  text to log; must not be NULL and must not exceed 256 bytes.
 *           The caller supplies the trailing newline.
 *
 * Returns 0 when the entry was written and the file closed cleanly,
 * -1 on invalid input, when LOGFILE cannot be opened, or when writing
 * or closing the file fails.
 */
int writelog(const char *pLoginfo)
{
    FILE *fp = NULL;
    int rc = 0;

    if (pLoginfo == NULL) {
        return -1;
    }

    if (strlen(pLoginfo) > 256) {
        return -1;
    }

    fp = fopen(LOGFILE, "a+");
    if (fp == NULL) {
        return -1;
    }

    memset(datetime, 0, BUFFSIZE2);
    if (get_curr_date(datetime, BUFFSIZE2) != 0) {
        /* Still log the message, just without a timestamp. */
        datetime[0] = '\0';
    }

    if (fprintf(fp, "%s   %s", datetime, pLoginfo) < 0) {
        rc = -1;
    }

    /* fclose() flushes buffered data, so its failure means the entry may be lost. */
    if (fclose(fp) != 0) {
        rc = -1;
    }

    return rc;
}
                                                                            
/*
 * Request a write lock on the file behind fd without waiting (F_SETLK).
 * The lock starts at offset 0 from the beginning of the file with a
 * length of 0.
 *
 * Returns whatever fcntl() returns: a negative value on failure, with
 * errno describing the reason.
 */
int LockFile(int fd)
{
    struct flock request = {
        .l_type   = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start  = 0,
        .l_len    = 0,
    };
    int rc = fcntl(fd, F_SETLK, &request);

    return rc;
}
                                                                            
/*
 * Allow only one running copy of the daemon.
 *
 * Opens (creating if needed) LOCKFILE and tries to write-lock it. If the
 * lock is already held by another process the function exits the process
 * with status 1. On success the file is truncated and the current PID is
 * written into it.
 *
 * The descriptor is intentionally left open for the lifetime of the
 * process: an fcntl() record lock is dropped as soon as the owning
 * process closes any descriptor for the file, so closing it here would
 * release the lock and let a second instance start.
 */
void already_running(void)
{
    int fd = -1;
    int len = 0;
    char buf[16] = {0};

    fd = open(LOCKFILE, O_RDWR | O_CREAT, LOCKMODE);
    if (fd < 0) {
        syslog(LOG_ERR, "can't open %s: %s", LOCKFILE, strerror(errno));
        exit(1);
    }

    if (LockFile(fd) < 0) {
        if (errno == EACCES || errno == EAGAIN) {
            /* Another instance holds the lock. */
            close(fd);
            exit(1);
        }

        syslog(LOG_ERR, "can't lock %s: %s", LOCKFILE, strerror(errno));
        exit(1);
    }

    /* Recording the PID is best effort; the lock itself is what matters. */
    if (ftruncate(fd, 0) < 0) {
        syslog(LOG_WARNING, "can't truncate %s: %s", LOCKFILE, strerror(errno));
    }

    len = snprintf(buf, sizeof buf, "%ld", (long)getpid());
    if (len > 0 && (size_t)len < sizeof buf) {
        if (write(fd, buf, (size_t)len) != (ssize_t)len) {
            syslog(LOG_WARNING, "can't write pid to %s: %s", LOCKFILE, strerror(errno));
        }
    }

    /* fd is deliberately NOT closed here; see the function comment. */
}
                                                                            
/*
 * Turn the calling process into a daemon.
 *
 * Forks twice with a setsid() in between: the first parent and the
 * intermediate child exit with status 0, and only the grandchild returns
 * from this function. The working directory is changed to /tmp and the
 * umask is cleared.
 *
 * If either fork() or setsid() fails the process exits with status 1.
 * The fork() result is tested for failure first, because -1 is non-zero
 * and would otherwise be mistaken for "I am the parent".
 */
void init_daemon(void)
{
    pid_t pid;

    pid = fork();
    if (pid < 0) {
        exit(1);
    }
    if (pid > 0) {
        exit(0);
    }

    if (setsid() < 0) {
        exit(1);
    }

    pid = fork();
    if (pid < 0) {
        exit(1);
    }
    if (pid > 0) {
        exit(0);
    }

    if (chdir("/tmp") < 0) {
        /* Non-fatal: the daemon only uses absolute paths. */
    }

    /*
     * NOTE(review): with a umask of 0, files created later (such as the
     * log opened by writelog()) presumably end up world-writable;
     * consider umask(022) - confirm against deployment requirements.
     */
    umask(0);

    return;
}
                                                                            
/*
 * Run a shell command and report whether it printed anything.
 *
 * syscmd  command line handed to popen(); must not be NULL
 *
 * Returns -1 when syscmd is NULL, 1 when the command produced at least
 * one non-empty line on stdout, and 0 otherwise (no output, or popen()
 * failed).
 *
 * The command string is passed to popen() as-is. It must never be used
 * as a printf-style format, otherwise a '%' in the command would be
 * interpreted as a conversion specifier.
 */
int run_system_cmd(const char *syscmd)
{
    FILE *fp = NULL;
    int has_output = 0;

    if (syscmd == NULL) {
        return -1;
    }

    fp = popen(syscmd, "r");
    if (!fp) {
        return 0;
    }

    /* Only the first line matters: any output at all means "found". */
    memset(buffer, 0, BUFFSIZE1);
    if (fgets(buffer, BUFFSIZE1, fp) != NULL && buffer[0] != '\0') {
        has_output = 1;
    }

    pclose(fp);

    return has_output;
}
                                                                            
/*
 * Entry point of the cron watchdog daemon.
 *
 * Daemonizes, makes sure no other copy is running, then checks cron
 * every 5 minutes:
 *   1. no cron process listed          -> start the service;
 *   2. a defunct cron process listed   -> kill and restart the service;
 *   3. no recent marker in the cron log -> kill and restart the service;
 *   4. otherwise                        -> log that cron is healthy.
 *
 * Command-line arguments are not used. The loop never terminates.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    init_daemon();

    /*
     * Configure syslog before already_running(), which reports lock-file
     * errors through syslog(); otherwise those messages would be sent
     * with the default ident and facility.
     */
    openlog(NULL, LOG_CONS | LOG_PID, LOG_LOCAL1);

    already_running();

    for (;;) {
        if (!run_system_cmd(SYSCMD1)) {
            /* 1. The cron process is not running at all. */
            writelog("The cron process is not running, now start it! \n");
            sleep(1);
            system(SYSCMD5);
        } else if (run_system_cmd(SYSCMD2)) {
            /* 2. It is listed, but as a defunct (zombie) process. */
            writelog("The cron process is defunct, now restart it! \n");
            sleep(1);
            system(SYSCMD4);
            sleep(1);
            system(SYSCMD5);
        } else if (!run_system_cmd(SYSCMD3)) {
            /* 3. The process looks fine but jobs are not being scheduled. */
            writelog("The cron work is down, now restart it! \n");
            sleep(1);
            system(SYSCMD4);
            sleep(1);
            system(SYSCMD5);
        } else {
            /* 4. Everything is healthy. */
            writelog("The cron process is ok! \n");
        }

        /* Sleep 5 minutes before the next check. */
        sleep(300);
    }

    /* Not reached. */
    closelog();

    return 0;
}

# vim Makefile

# Builds the check_cron_process daemon from its single C source file.
# NOTE: recipe lines must begin with a TAB character, not spaces.
CC = gcc
CXX = g++

BINARY = check_cron_process
OBJS = check_cron_process.o
HEADERS = check_cron_process.h

# -O2 is a compile-time option, so it belongs in CFLAGS; in LDFLAGS it is
# only seen by the link step and the object file is built unoptimised.
CFLAGS += -O2 -I/usr/include -I/usr/local/include -Wall -Wno-unused-variable
LDFLAGS = -static

.PHONY: all clean

all: $(BINARY)

$(BINARY): $(OBJS)
	$(CC) $(LDFLAGS) -o $(BINARY) $(OBJS)

# Rebuild the object when the header changes; $< is the .c file only.
$(OBJS): %.o: %.c $(HEADERS)
	$(CC) $(CFLAGS) -c $< -o $@

clean:
	rm -f $(BINARY) $(OBJS)

# make

# /data/scripts/check_cron_process


#--------------------------------------------------------------------------------------------------------------------------------------------

3、crontab任务调度日志配置【新增】

# vim /etc/syslog.conf

## check_crontab_start.tag.1
# Write messages of every priority from the cron facility to a dedicated log file.
cron.*         /data/logs/crontab/cron.log
## check_crontab_end.tag.1

# vim /etc/syslog-ng/syslog-ng.conf

## check_crontab_start.tag.1
# Destination: the dedicated cron log file.
destination dst_cron {
    file("/data/logs/crontab/cron.log");
};
                                                                  
# Route messages from source "src" that pass filter "f_cron" to dst_cron.
# NOTE(review): "src" and "f_cron" are not defined in this snippet; they are
# assumed to exist already elsewhere in syslog-ng.conf - confirm.
log {
    source(src);
    filter(f_cron);
    destination(dst_cron);
};
## check_crontab_end.tag.1

# /sbin/service syslog restart


#--------------------------------------------------------------------------------------------------------------------------------------------

4、crontab任务调度日志处理

(1)、日志切割与清理

# vim /data/scripts/cut_cron_log.sh

#!/bin/bash
# Rotates the dedicated cron log (meant to run right after midnight, so the
# archive is named after yesterday) and removes old archives.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

## MY CRON LOG PATH
LOGPATH="/data/logs/crontab"

## Do nothing while cron itself is not running.
retval=$(ps aux | grep sbin/cron | grep -v grep | wc -l)
if [ "${retval}" -eq 0 ]; then
    echo "The cron process is not running ^_^"
    exit 1
fi

## cut crontab's log
ARCHIVE="${LOGPATH}/cron_$(date -d "yesterday" +"%Y-%m-%d").log"
if [ -f "${LOGPATH}/cron.log" ]; then
    if [ -e "${ARCHIVE}" ]; then
        ## A second run on the same day must not overwrite the archive: append.
        cat "${LOGPATH}/cron.log" >> "${ARCHIVE}" && rm -f "${LOGPATH}/cron.log"
    else
        mv "${LOGPATH}/cron.log" "${ARCHIVE}"
    fi
fi
## Restart syslog so it reopens (recreates) cron.log.
/sbin/service syslog restart

## clear 10 days ago's crontab logs
rm -f "${LOGPATH}/cron_$(date -d "10 days ago" +"%Y-%m-%d").log"
## Also purge older archives left behind when a daily run was missed.
find "${LOGPATH}" -type f -name 'cron_*.log' -mtime +10 -exec rm -f {} \;

(2)、crontab信息备份

# vim /data/scripts/backup_crontab.sh

#!/bin/bash
# Saves root's crontab to a dated file and removes the backup from 10 days ago.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin

CRONTAB_BACKUP_DIR="/data/backup/crontab"
STATUS=0

# Back up the crontab contents.
mkdir -p "${CRONTAB_BACKUP_DIR}" || exit 1
BACKUP_FILE="${CRONTAB_BACKUP_DIR}/crontab_$(date +%F)"
TMP_FILE="${BACKUP_FILE}.tmp.$$"
# Dump to a temporary file first so that a failing "crontab -l" cannot
# replace an existing good backup with an empty file.
if crontab -uroot -l > "${TMP_FILE}"; then
    mv -f "${TMP_FILE}" "${BACKUP_FILE}" || STATUS=1
else
    rm -f "${TMP_FILE}"
    STATUS=1
fi

# Remove the backup taken 10 days ago.
CRONBAK="crontab_$(date -d "10 days ago" +"%Y-%m-%d")"
find "${CRONTAB_BACKUP_DIR}" -type f -name "${CRONBAK}" -exec rm -f {} \;

exit ${STATUS}

(3)、crontab垃圾头信息清理

# vim /data/scripts/clean_crontab_trash.sh

#!/bin/bash
# Strips auto-generated header comment lines from root's crontab spool file.
PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/bin:/usr/local/sbin

## The crontab's spool file
CRONFILE="/var/spool/cron/tabs/root"

if [ ! -f "${CRONFILE}" ]; then
    echo "${CRONFILE} not found" >&2
    exit 1
fi

## One sed pass instead of ten, so the spool file is rewritten only once.
sed -i \
    -e '/# DO NOT EDIT THIS FILE/d' \
    -e '/# (\/data\/crontab.tmp/d' \
    -e '/# (\/tmp\/crontab/d' \
    -e '/# (Cron version/d' \
    -e '/# (- installed on/d' \
    -e '/# (\/usr\/local\/agenttools/d' \
    -e '/# (\/tmp\/cron.tmp/d' \
    -e '/# (tmp2 installed/d' \
    -e '/# (crontab.tmp/d' \
    -e '/# (\/data\/crontab_/d' \
    "${CRONFILE}"

(4)、crontab设置

# crontab -e

## Rotate and clean up the cron log (daily at 00:00)

00 00 * * * /data/scripts/cut_cron_log.sh >/dev/null 2>&1


## Health-check marker (every minute). The watchdog greps the cron log for
## "(root) CMD (cd"; this entry is presumably what produces those lines.

*/1 * * * * cd /usr/local; echo >/dev/null 2>&1


## Back up the crontab (daily at 08:30)

30 08 * * * /data/scripts/backup_crontab.sh >/dev/null 2>&1


## Remove junk header lines from the crontab (every 30 minutes)

*/30 * * * * /data/scripts/clean_crontab_trash.sh >/dev/null 2>&1