本文为项目开发总结的原创文档。
本项目,添加一个watchdog守护进程,用来监控环境的三大进程mozart、bitbox、mplayer,任何一个进程出现故障,整个环境进行重启。
首先,在板级驱动目录 /arch/mips/xburst/soc-x1000/common 下打开 reset.c(vim reset.c),
找到与看门狗有关的code[同事发现,牛!];因此主要是将核心代码从内核空间搬移到用户空间,及如何监控应用层的进程。
整体实现思路:
1.创建一个进程作为守护进程:watchdog
进程的添加:
在configs下添加watchdog.mak;
在src下添加watchdog包,用于加入watchdog相关的code;Makefile;
进程的启动:在app.c中的startall中调用mozart_system("watchdog -b");
2.调整进程优先级:
1)如何查看进程的优先级
2)如何修改进程的优先级;
3.watchdog守护进程如何监控mozart和bitbox和mplayer
我们知道内核会通过/proc虚拟文件系统导出系统中正在运行的进程信息,每个进程都有一个/proc/<pid>目录。因此我们可以将检测进程是否存在转换为检测/proc/<pid>目录是否存在,这样就简单多了。
进程检测由下文详细代码中的 processExists 完成;需要注意的是,它实际采用的是通过 popen 执行 ps | grep 并统计匹配行数的方式,而不是直接检查 /proc/<pid> 目录。
实现难点突破:
用户空间和内核空间操作的都是虚拟地址。
1)如果是拿到的是物理地址,用户空间可以通过mmap的方式将物理地址转成虚拟地址(每一次的地址值都不一样),可以直接对这个虚拟地址赋值。
如下:
static int dev_fd;
dev_fd = open("/dev/mem", O_RDWR | O_NDELAY);
if (dev_fd < 0) {
printf("open(/dev/mem) failed.");
return 0;
}
unsigned char *map_base = (unsigned char *)mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, dev_fd, WDT_IOBASE);
*( unsigned long*)(map_base + TCU_TSCR) = (1 << 16);
close(dev_fd);
2)而内核空间,从物理地址转成虚拟地址,一般是固定的,例如 0x10002000 → 0xb0002000;
3)同一个物理地址转成虚拟地址,用户空间和内核空间是不相同的。
关于用户空间和内核空间:
物理地址在内核空间和用户空间映射地址不一样~~~
看门狗实际上就是一个定时器,其硬件内部维护了一个计数的寄存器。每当时钟信号到来时,计数寄存器减1。如果减到0,则重启系统。
如果减到0之前,系统又设置计数寄存器到一个较大的值,则系统永远不会重启。
watchdog的基本实现原理是:
用户空间程序打开 /dev/mem设备(俗称“开门放狗”),
就会导致在内核中启动一个定时器(本项目 wdt_start_count 的入参是20000ms即20s),此后,用户空间程序需要保证在20秒之内向这个设备写入数据(俗称“定期喂狗”),每次写操作会导致重新设定定时器(本项目是每sleep 10s重新去设定)。如果用户空间程序在20秒之内没有写操作,定时器到期会导致一次系统Reboot操作(“狗咬人了”)。
watchdog.c 内容如下:
#include <errno.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <execinfo.h>
#include <fcntl.h>
#include <pthread.h>
#include <pwd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <linux/input.h>
#define WDT_IOBASE (0x10002000)
#define MAP_SIZE 0xFF
#define JZ_EXTAL_RTC 32768 /* RTC extal freq: 32.768 KHz */
#define TCU_IOBASE 0x10002000
#define TCU_TSCR (0x3C) /* Timer Stop Clear Register */
#define WDT_TCSR (0x0c) /* rw, 32, 0x???????? */
#define WDT_TCER (0x04) /* rw, 32, 0x???????? */
#define WDT_TDR (0x00) /* rw, 32, 0x???????? */
#define WDT_TCNT (0x08) /* rw, 32, 0x???????? */
#define TCU_TSSR (0x2C) /* Timer Stop Set Register */
/*
 * Arm (or re-arm) the hardware watchdog so that it expires after roughly
 * `msecs` milliseconds.
 *
 * The registers are reached from user space by mapping their physical
 * address through /dev/mem.  This is the user-space port of the kernel
 * sequence
 *     outl(1 << 16,          TCU_IOBASE + TCU_TSCR);
 *     outl(0,                WDT_IOBASE + WDT_TCNT);   counter
 *     outl(time,             WDT_IOBASE + WDT_TDR);    data
 *     outl((3<<3 | 1<<1),    WDT_IOBASE + WDT_TCSR);
 *     outl(0,                WDT_IOBASE + WDT_TCER);
 *     outl(1,                WDT_IOBASE + WDT_TCER);
 * where the only real change is that the physical address must first be
 * converted to a user-space virtual address with mmap().
 *
 * msecs: requested timeout in milliseconds.  It is converted to ticks of
 *        JZ_EXTAL_RTC / 64 Hz and clamped to the range 0..65535.
 *
 * Nothing is returned.  If /dev/mem cannot be opened or mapped a message is
 * printed and the watchdog registers are left untouched.
 */
static void wdt_start_count(int msecs)
{
    int dev_fd = open("/dev/mem", O_RDWR | O_NDELAY);
    if (dev_fd < 0) {
        printf("open(/dev/mem) failed.");
        return;
    }

    void *mapping = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
                         dev_fd, WDT_IOBASE);
    /* The mapping stays valid after the descriptor is closed. */
    close(dev_fd);
    if (mapping == MAP_FAILED) {
        printf("mmap(/dev/mem) failed.\n");
        return;
    }

    /* Widen before multiplying so a large msecs cannot overflow int. */
    long long ticks = (long long)(JZ_EXTAL_RTC / 64) * msecs / 1000;
    if (ticks > 65535) {
        ticks = 65535;
    }
    if (ticks < 0) {
        ticks = 0;
    }

    /*
     * volatile: these are device registers, so every store must reach the
     * hardware in program order (in particular the 0 -> 1 sequence on
     * WDT_TCER must not be merged into a single store).
     * uint32_t: matches the 32-bit outl() accesses of the kernel code
     * regardless of the width of `unsigned long` on the build ABI.
     */
    volatile unsigned char *map_base = mapping;

    *(volatile uint32_t *)(map_base + TCU_TSCR) = (uint32_t)1 << 16;
    *(volatile uint32_t *)(map_base + WDT_TCNT) = 0;               /* counter */
    *(volatile uint32_t *)(map_base + WDT_TDR) = (uint32_t)ticks;  /* data */
    /* NOTE(review): presumably selects the RTC clock with a /64 prescaler,
     * matching the tick computation above - confirm in the SoC manual. */
    *(volatile uint32_t *)(map_base + WDT_TCSR) = (3 << 3 | 1 << 1);
    *(volatile uint32_t *)(map_base + WDT_TCER) = 0;
    *(volatile uint32_t *)(map_base + WDT_TCER) = 1;

    /* This function is called every few seconds; without the unmap each
     * call would leak one mapping. */
    munmap(mapping, MAP_SIZE);
}
/*
 * Stop the hardware watchdog.
 *
 * User-space port of the kernel sequence
 *     outl(1 << 16, TCU_IOBASE + TCU_TSCR);
 *     outl(0,       WDT_IOBASE + WDT_TCNT);   counter
 *     outl(65535,   WDT_IOBASE + WDT_TDR);    data
 *     outl(1 << 16, TCU_IOBASE + TCU_TSSR);
 * performed through a /dev/mem mapping of the register block.
 *
 * Nothing is returned.  If /dev/mem cannot be opened or mapped a message is
 * printed and the registers are left untouched.
 *
 * NOTE(review): this function is called from sig_handler() and uses printf(),
 * which is not async-signal-safe.
 */
static void wdt_stop_count(void)
{
    int dev_fd = open("/dev/mem", O_RDWR | O_NDELAY);
    if (dev_fd < 0) {
        printf("open(/dev/mem) failed.");
        return;
    }

    void *mapping = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
                         dev_fd, WDT_IOBASE);
    /* The mapping stays valid after the descriptor is closed. */
    close(dev_fd);
    if (mapping == MAP_FAILED) {
        printf("mmap(/dev/mem) failed.\n");
        return;
    }
    printf("\033[1;33mwdt_stop_count begin~~~. map_base = %p\n\033[m", mapping);

    /* volatile + fixed 32-bit width: device registers, see the kernel
     * outl() sequence in the header comment. */
    volatile unsigned char *map_base = mapping;

    *(volatile uint32_t *)(map_base + TCU_TSCR) = (uint32_t)1 << 16;
    *(volatile uint32_t *)(map_base + WDT_TCNT) = 0;      /* counter */
    *(volatile uint32_t *)(map_base + WDT_TDR) = 65535;   /* data */
    *(volatile uint32_t *)(map_base + TCU_TSSR) = (uint32_t)1 << 16;

    munmap(mapping, MAP_SIZE);
    printf("wdt_stop_count end.\n");
}
/*
 * Print a banner and then feed the watchdog forever: every 10 seconds the
 * watchdog is re-armed with a 20000 ms timeout.  This function never returns.
 *
 * The original trailing "check wdt." loop could never be reached because the
 * feeding loop has no exit, so it has been removed.
 *
 * NOTE(review): despite the name and the banner text, this code does not
 * restart the system - it keeps the watchdog from firing.  Confirm whether a
 * short timeout followed by a wait was intended.
 */
void jz_wdt_restart(void)
{
    printf("Restarting after 4 ms\n");
    for (;;) {
        wdt_start_count(20000);
        sleep(10);
    }
}
/*
 * Check whether a process is running.
 *
 * Runs the shell pipeline
 *     ps | grep <process_name> | grep -v grep | wc -l
 * through popen() and reports whether the resulting line count is >= 1.
 * Because grep does a substring match on each line of the ps output, any
 * line containing process_name counts as a hit.
 *
 * process_name: name to look for.  It is pasted into a shell command
 *               unescaped, so it must come from trusted code only.
 *
 * Returns true when at least one matching line was found; false when there
 * is none, when process_name is NULL or empty, when the command does not fit
 * the buffer, or when the pipeline cannot be started or produces no output.
 */
bool processExists(const char *process_name)
{
    enum { COUNT_BUF_SIZE = 32, CMD_BUF_SIZE = 128 };
    char count_text[COUNT_BUF_SIZE];
    char cmd[CMD_BUF_SIZE];

    /* An empty pattern would leave grep without an argument. */
    if (process_name == NULL || process_name[0] == '\0') {
        return false;
    }

    int len = snprintf(cmd, sizeof(cmd), "ps | grep %s |grep -v grep| wc -l",
                       process_name);
    if (len < 0 || (size_t)len >= sizeof(cmd)) {
        /* A truncated pipeline would not produce a usable count. */
        printf("process name %s is too long to check\n", process_name);
        return false;
    }

    FILE *pipe = popen(cmd, "r");
    if (pipe == NULL) {
        printf("Current process %s is not Exist!!!!\n", process_name);
        return false;
    }

    /* Only parse the buffer when fgets() actually filled it. */
    long count = 0;
    if (fgets(count_text, sizeof(count_text), pipe) != NULL) {
        count = strtol(count_text, NULL, 10);
    }
    pclose(pipe);

    return count >= 1;
}
/*
 * Signal number -> signal name lookup table, indexed directly by the signal
 * number.  Entries that are not listed (0, 32, 33) are NULL.
 *
 * In this file the table is only referenced by the disabled (#if 0) crash
 * report in sig_handler().
 *
 * NOTE(review): the indices are hard-coded and follow the numbering used on
 * x86/ARM Linux.  Other architectures (MIPS among them) assign different
 * numbers to several signals - verify against <signal.h> of the target
 * before relying on these names.
 */
static char *signal_str[] = {
[1] = "SIGHUP", [2] = "SIGINT", [3] = "SIGQUIT", [4] = "SIGILL", [5] = "SIGTRAP",
[6] = "SIGABRT", [7] = "SIGBUS", [8] = "SIGFPE", [9] = "SIGKILL", [10] = "SIGUSR1",
[11] = "SIGSEGV", [12] = "SIGUSR2", [13] = "SIGPIPE", [14] = "SIGALRM", [15] = "SIGTERM",
[16] = "SIGSTKFLT", [17] = "SIGCHLD", [18] = "SIGCONT", [19] = "SIGSTOP", [20] = "SIGTSTP",
[21] = "SIGTTIN", [22] = "SIGTTOU", [23] = "SIGURG", [24] = "SIGXCPU", [25] = "SIGXFSZ",
[26] = "SIGVTALRM", [27] = "SIGPROF", [28] = "SIGWINCH", [29] = "SIGIO", [30] = "SIGPWR",
[31] = "SIGSYS", [34] = "SIGRTMIN", [35] = "SIGRTMIN+1", [36] = "SIGRTMIN+2", [37] = "SIGRTMIN+3",
[38] = "SIGRTMIN+4", [39] = "SIGRTMIN+5", [40] = "SIGRTMIN+6", [41] = "SIGRTMIN+7", [42] = "SIGRTMIN+8",
[43] = "SIGRTMIN+9", [44] = "SIGRTMIN+10", [45] = "SIGRTMIN+11", [46] = "SIGRTMIN+12", [47] = "SIGRTMIN+13",
[48] = "SIGRTMIN+14", [49] = "SIGRTMIN+15", [50] = "SIGRTMAX-14", [51] = "SIGRTMAX-13", [52] = "SIGRTMAX-12",
[53] = "SIGRTMAX-11", [54] = "SIGRTMAX-10", [55] = "SIGRTMAX-9", [56] = "SIGRTMAX-8", [57] = "SIGRTMAX-7",
[58] = "SIGRTMAX-6", [59] = "SIGRTMAX-5", [60] = "SIGRTMAX-4", [61] = "SIGRTMAX-3", [62] = "SIGRTMAX-2",
[63] = "SIGRTMAX-1", [64] = "SIGRTMAX",
};
/*
 * Write the command-line synopsis to stdout.
 *
 * app_name: program name shown at the start of the first line.
 */
static void usage(const char *app_name)
{
    /* First line carries the program name, the rest is fixed text. */
    printf("%s [-f file] -h\n", app_name);
    fputs(" -h help (show this usage text)\n"
          " -f file\n",
          stdout);
}
/*
 * Signal handler installed by main() for SIGINT, SIGTERM, SIGBUS, SIGSEGV
 * and SIGABRT.
 *
 * Active behaviour: stop the hardware watchdog and terminate the process
 * with exit(-1), so that the watchdog daemon going away on one of these
 * signals does not leave the timer running.
 *
 * The #if 0 branch is a disabled alternative that prints a backtrace (and,
 * for crash signals, the process memory map) instead of stopping the
 * watchdog.  Its local variables live inside that branch so the active build
 * carries no unused locals.
 *
 * signo: number of the delivered signal (only used by the disabled branch).
 *
 * NOTE(review): wdt_stop_count() and exit() use stdio and are not
 * async-signal-safe.
 */
void sig_handler(int signo)
{
#if 0
    char cmd[64] = {0};
    void *array[10];
    int size = 0;
    char **strings = NULL;
    int i = 0;
    const int known = (int)(sizeof(signal_str) / sizeof(signal_str[0]));
    /* Never index the name table with a number it does not cover. */
    const char *name = (signo > 0 && signo < known && signal_str[signo])
                           ? signal_str[signo] : "UNKNOWN";

    printf("\n\n[%s: %d] bitbox crashed by signal %s.\n", __func__, __LINE__, name);
    printf("Call Trace:\n");
    size = backtrace(array, 10);
    strings = backtrace_symbols(array, size);
    if (strings) {
        for (i = 0; i < size; i++)
            printf (" %s\n", strings[i]);
        free (strings);
    } else {
        printf("Not Found\n\n");
    }
    if (signo == SIGSEGV || signo == SIGBUS ||
        signo == SIGTRAP || signo == SIGABRT) {
        snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
        printf("Process maps:\n");
        system(cmd);
    }
#else
    (void)signo;
    wdt_stop_count();
#endif
    exit(-1);
}
/*
 * Entry point of the watchdog daemon.
 *
 * Options:
 *   -b / -B   detach and run in the background (daemon(0, 1))
 *   -f file   accepted but currently ignored
 *   -h        print the usage text and exit with 0
 * Any other option prints the usage text and exits with -1.
 *
 * Main loop: as long as "mozart", "bitbox" and "mplayer" are all reported
 * running by processExists(), the hardware watchdog is re-armed with a
 * 20000 ms timeout and the daemon sleeps 10 s.  As soon as one of them is
 * missing, the state of all three is printed and the loop ends without
 * feeding the watchdog again, so a previously armed watchdog will expire.
 *
 * NOTE(review): if a process is already missing on the very first pass, the
 * watchdog has never been armed by this program and the daemon simply exits;
 * confirm whether a reboot is expected in that case.
 *
 * Returns 0 after the monitoring loop ends or after -h, -1 on an invalid
 * option or when daemon() fails.
 */
int main(int argc, char **argv)
{
    enum { WATCHDOG_TIMEOUT_MS = 20000, FEED_INTERVAL_SEC = 10 };
    int c = -1;
    int daemonize = 0;

    printf("watchdog V1.7 start!!!!@_@\n");

    signal(SIGPIPE, SIG_IGN);
    signal(SIGINT, sig_handler);
    signal(SIGTERM, sig_handler);
    signal(SIGBUS, sig_handler);
    signal(SIGSEGV, sig_handler);
    signal(SIGABRT, sig_handler);

    while ((c = getopt(argc, argv, "bBf:h")) >= 0) {
        switch (c) {
        case 'b':
        case 'B':
            daemonize = 1;
            break;
        case 'f':
            /* Accepted for compatibility; the argument is not used. */
            break;
        case 'h':
            usage(argv[0]);
            return 0;
        default:
            usage(argv[0]);
            return -1;
        }
    }

    /* run in the background */
    if (daemonize) {
        if (daemon(0, 1)) {
            perror("daemon");
            return -1;
        }
    }

    for (;;) {
        /* Probe each process exactly once per pass so that the decision and
         * the diagnostic output below are based on the same observation. */
        bool mozart_up = processExists("mozart");
        bool bitbox_up = processExists("bitbox");
        bool mplayer_up = processExists("mplayer");

        if (!(mozart_up && bitbox_up && mplayer_up)) {
            printf(" mozart or bitbox or mplayer is not exist, Reboot!!!!! \n");
            printf("Mozart process exist ???: %d\n", mozart_up);
            printf("BitBox process exist ???: %d\n", bitbox_up);
            printf("Mplayer process exist ???: %d\n", mplayer_up);
            /* Stop feeding: an armed watchdog now runs out and resets the
             * system. */
            break;
        }

        wdt_start_count(WATCHDOG_TIMEOUT_MS);
        sleep(FEED_INTERVAL_SEC);
    }

    return 0;
}
程序运行起来后,通过ps可查看到:
S 0 241 1 7868 716 0:0 13:16 00:00:01 watchdog -b