linux kernel panic之后重启

panic_timeout

//linux-xxx/kernel/panic.c
/* Exposes the variable panic_timeout as the int parameter "panic" (mode 0644). */
core_param(panic, panic_timeout, int, 0644);

/*
 * Excerpt of panic(): only the "reboot after panic" tail is shown; the
 * standalone "..." lines mark code omitted from this listing.
 *
 * Behaviour of the visible part, by value of panic_timeout:
 *   > 0 : busy-wait for panic_timeout seconds, then emergency_restart()
 *   < 0 : skip the delay and call emergency_restart() right away
 *   = 0 : neither branch runs, so no restart is triggered here
 */
void panic(const char *fmt, ...)
{
...
if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
/* i counts elapsed milliseconds; each pass waits PANIC_TIMER_STEP ms via mdelay(). */
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
/* Called on every step of the wait. */
touch_nmi_watchdog();
if (i >= i_next) {
/*
 * Toggle the blink state. The return value of panic_blink() is added
 * to the elapsed counter (presumably the time spent blinking, in ms;
 * confirm against the panic_blink implementation).
 */
i += panic_blink(state ^= 1);
i_next = i + 3600 / PANIC_BLINK_SPD;
}
mdelay(PANIC_TIMER_STEP);
}
}
/* Note: "!= 0", so a negative panic_timeout also reaches emergency_restart(). */
if (panic_timeout != 0) {
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
emergency_restart();
}
...
}

只要把参数panic(即panic_timeout)设置为非0值,panic之后就会调用emergency_restart重启:大于0时先延时panic_timeout秒再重启,小于0时不延时、立即重启;等于0时不会重启
core_param(panic, panic_timeout, int, 0644);
通过core_param定义的参数在/sys/module/kernel/parameters目录下
通过module_param定义的参数在/sys/module/xxx/parameters目录下


emergency_restart

emergency_restart
machine_emergency_restart //include/asm-generic/emergency_restart.h
machine_restart //arch/rlx/kernel/reset.c
_machine_restart <==> bsp_machine_restart //bsp/setup.c
reboot_by_wdt //drivers/watchdog/rtsx_wdt.c

最终调用了watchdog中的函数来复位整个系统。


reboot命令

reboot来自busybox,看看reboot的一步步调用流程是怎样的
reboot时打印消息:

The system is going down NOW !!  
Sending SIGTERM to all processes.
Sending SIGKILL to all processes.
Please stand by while rebooting the system.
Restarting system.

用户空间调用流程

在busybox代码init/init.c中有这么一段,

/*
 * Excerpt of busybox init's init_main(): only the signal-handler setup
 * relevant to halt/reboot/poweroff is shown; the "..." lines mark omitted code
 * (the rest of the function, including its closing brace, is not listed here).
 */
int init_main(int argc UNUSED_PARAM, char **argv)
{
...
/*
 * A single handler, halt_reboot_pwoff, is installed for the three signals
 * whose bits are set in the mask below; the handler tells them apart by the
 * signal number it receives.
 */
bb_signals(0
+ (1 << SIGUSR1) /* halt */
+ (1 << SIGTERM) /* reboot */
+ (1 << SIGUSR2) /* poweroff */
, halt_reboot_pwoff);
signal(SIGQUIT, restart_handler); /* re-exec another init */
...

单独拿出halt_reboot_pwoff和restart_handler

/*
 * Common handler for the halt / reboot / poweroff signals.
 *
 * sig selects the action: SIGTERM requests a reboot, SIGUSR2 a poweroff,
 * and any other signal delivered here results in a halt.
 * This function does not return.
 */
static void halt_reboot_pwoff(int sig)
{
	const char *action_name;
	unsigned reboot_magic;

	/* run() may be called below and it unmasks signals, including the
	 * one that is masked while this handler executes. Without the reset,
	 * "while true; do reboot; done" would start several reboot scripts
	 * at once, so drop the handlers before doing anything else.
	 */
	reset_sighandlers_and_unblock_sigs();
	run_shutdown_and_kill_processes();

	switch (sig) {
	case SIGTERM:
		action_name = "reboot";
		reboot_magic = RB_AUTOBOOT;
		break;
	case SIGUSR2:
		action_name = "poweroff";
		reboot_magic = RB_POWER_OFF;
		break;
	default:
		action_name = "halt";
		reboot_magic = RB_HALT_SYSTEM;
		break;
	}

	message(L_CONSOLE, "Requesting system %s", action_name);
	pause_and_low_level_reboot(reboot_magic);
	/* not reached */
}

/*
 * Handler for QUIT - exec the "restart" action,
 * else (no such action defined) do nothing.
 *
 * Walks init_action_list and acts on the first entry whose action_type has
 * the RESTART bit set. From that point on the function does not return: it
 * either execs the action's command or ends in a low-level halt.
 */
static void restart_handler(int sig UNUSED_PARAM)
{
	struct init_action *a;

	for (a = init_action_list; a; a = a->next) {
		if (!(a->action_type & RESTART))
			continue;

		/* Starting from here, we won't return.
		 * Thus don't need to worry about preserving errno
		 * and such.
		 */

		/* Done exactly once: reset handlers, then run the shutdown
		 * actions and kill all other processes.
		 */
		reset_sighandlers_and_unblock_sigs();
		run_shutdown_and_kill_processes();

#ifdef RB_ENABLE_CAD
		/* Allow Ctrl-Alt-Del to reboot the system.
		 * This is how kernel sets it up for init, we follow suit.
		 */
		reboot(RB_ENABLE_CAD); /* misnomer */
#endif

		if (open_stdio_to_tty(a->terminal)) {
			dbg_message(L_CONSOLE, "Trying to re-exec %s", a->command);
			/* Waiting for all children here (the disabled loop
			 * below) theoretically should be safe.
			 * But in practice, kernel bugs may leave
			 * unkillable processes, and wait() may block forever.
			 * Oh well. Hoping "new" init won't be too surprised
			 * by having children it didn't create.
			 */
			//while (wait(NULL) > 0)
			//	continue;
			init_exec(a->command);
		}
		/* Open or exec failed */
		pause_and_low_level_reboot(RB_HALT_SYSTEM);
		/* not reached */
	}
}

可以看到,这两个函数都会调用reset_sighandlers_and_unblock_sigs()和run_shutdown_and_kill_processes(),我们重点关注后者:

/*
 * Shutdown sequence used before halt/reboot/poweroff/restart:
 *   1. run the SHUTDOWN actions,
 *   2. send SIGTERM to all processes, sync, wait one second,
 *   3. send SIGKILL to all processes, sync.
 * No pause follows the final sync; callers are expected to pause themselves.
 */
static void run_shutdown_and_kill_processes(void)
{
/* Run everything to be run at "shutdown". This is done _prior_
* to killing everything, in case people wish to use scripts to
* shut things down gracefully... */
run_actions(SHUTDOWN);

message(L_CONSOLE | L_LOG, "The system is going down NOW!");

/* Send signals to every process _except_ pid 1 */
kill(-1, SIGTERM);
message(L_CONSOLE | L_LOG, "Sent SIG%s to all processes", "TERM");
sync();
/* One-second pause between SIGTERM and the SIGKILL that follows */
sleep(1);

kill(-1, SIGKILL);
/* This message goes to L_CONSOLE only (no L_LOG), unlike the two above */
message(L_CONSOLE, "Sent SIG%s to all processes", "KILL");
sync();
/*sleep(1); - callers take care about making a pause */
}

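终于看到了与上面打印信息对应的代码:The system is going down NOW! 以及 Sent SIGTERM/SIGKILL to all processes(这段代码中的字符串与上面实际打印的 "The system is going down NOW !!"、"Sending SIGTERM to all processes." 措辞略有不同)。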
同时在上面的halt_reboot_pwoff和restart_handler中都会调用这样一个函数:

/*
 * Pause briefly, then ask the kernel to perform the action selected by
 * magic (one of the RB_* values passed in by the callers).
 * This function never returns.
 */
static void pause_and_low_level_reboot(unsigned magic)
{
	pid_t child;

	/* Give the last message time to reach the serial console, etc */
	sleep(1);

	/* reboot() is issued from a forked child, not from init itself:
	 * the kernel calls do_exit(EXIT_SUCCESS) in linux/kernel/sys.c,
	 * and the machine can panic when the init process exits.
	 */
	child = vfork();
	if (child == 0) {
		/* child */
		reboot(magic);
		_exit(EXIT_SUCCESS);
	}

	/* Everything else just idles forever */
	for (;;)
		sleep(1);
}

这里最终调用了内核提供的reboot系统调用。


linux内核空间调用流程

// The reboot system call (excerpt: only the restart case is shown; the "..."
// line marks code omitted from this listing).
SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
void __user *, arg)
{
/* Dispatch on the requested command. */
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
/* Restart request: hand over to kernel_restart() with a NULL argument. */
kernel_restart(NULL);
...
}

kernel_restart
kernel_restart_prepare
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
syscore_shutdown
ops->shutdown()
machine_restart //arch/rlx/kernel/reset.c
_machine_restart <==> bsp_machine_restart //bsp/setup.c
reboot_by_wdt //drivers/watchdog/rtsx_wdt.c

驱动可以通过register_reboot_notifier向reboot_notifier_list注册回调函数,这样在系统reboot时该回调函数就会被调用
驱动也可以实现syscore_ops中的shutdown回调,它在系统reboot时同样会被调用(见上面的syscore_shutdown -> ops->shutdown())
当调用到machine_restart就和上面发生panic时调用的流程一样了,最终通过watchdog提供的函数接口复位整个系统。


参考文章

  1. 基于Linux与Busybox的Reboot命令流程分析