[CRIU] [PATCH 3/3] core: Print stack trace in case of SIGSEGV

Andrei Vagin avagin at virtuozzo.com
Tue May 30 15:08:15 PDT 2017


On Mon, May 29, 2017 at 05:02:00PM +0300, Kirill Tkhai wrote:
> Register a SIGSEGV handler and print the call trace in case of error.
> This should make debugging easier. Compare the error messages before
> and after.
> 
> 1)Before:
> 
> (01.227591)pie: 1: Task 86  exited, status= 11
> (01.241495) Error (criu/cr-restore.c:1114): 489968 exited, status=1
> 
> and nobody knows where and why.
> 
> 2)After:
> 
> (01.090136)     86: Error (criu/mount.c:346): mnt: stack 86#0: criu(print_stack_trace+0x1e) [0x459aee]
> (01.090143)     86: Error (criu/mount.c:346): mnt: stack 86#1: criu(print_stack_and_exit+0x3e) [0x459b8e]
> (01.090145)     86: Error (criu/mount.c:346): mnt: stack 86#2: /lib64/libc.so.6(+0x35250) [0x7fbbf16fb250]
> (01.090147)     86: Error (criu/mount.c:346): mnt: stack 86#3: criu(phys_stat_resolve_dev+0x63) [0x459c03]
> (01.090149)     86: Error (criu/mount.c:346): mnt: stack 86#4: criu(__open_mountpoint+0x4d) [0x45ca7d]
> (01.090151)     86: Error (criu/mount.c:346): mnt: stack 86#5: criu() [0x44cf8e]
> (01.090153)     86: Error (criu/mount.c:346): mnt: stack 86#6: criu() [0x44d7d3]
> (01.090155)     86: Error (criu/mount.c:346): mnt: stack 86#7: criu() [0x44db6e]
> (01.090157)     86: Error (criu/mount.c:346): mnt: stack 86#8: criu(prepare_fds+0x477) [0x44b077]
> (01.090159)     86: Error (criu/mount.c:346): mnt: stack 86#9: criu() [0x43cbef]
> (01.090471)pie: 1: Task 86  exited, status= 255
> (01.103910) Error (criu/cr-restore.c:1114): 492906 exited, status=1
> 
> and everything is clear.
> 
> This will help developers with debugging.
> 
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
>  criu/cr-dump.c      |    6 ++++++
>  criu/cr-restore.c   |   11 +++++++++--
>  criu/include/util.h |    1 +
>  criu/util.c         |   10 ++++++++++
>  4 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 1d661e7b1..acb90d9b5 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1759,6 +1759,12 @@ int cr_dump_tasks(pid_t pid)
>  	pr_info("Dumping processes (pid: %d)\n", pid);
>  	pr_info("========================================\n");
>  
> +#ifdef __GLIBC__
> +	if (signal(SIGSEGV, print_stack_and_exit) == SIG_ERR) {
> +		pr_perror("signal() failed");
> +		goto err;
> +	}
> +#endif
>  	if (opts.remote && push_snapshot_id() < 0) {
>  		pr_err("Failed to push image namespace.\n");
>  		goto err;
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index a420f77af..8e9c04272 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1429,7 +1429,7 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
>  	futex_abort_and_wake(&task_entries->nr_in_progress);
>  }
>  
> -int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
> +int criu_signals_setup(void (*sigchld_handler)(int, siginfo_t *, void *))
>  {
>  	int ret;
>  	struct sigaction act;
> @@ -1442,7 +1442,7 @@ int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
>  	}
>  
>  	act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART;
> -	act.sa_sigaction = handler;
> +	act.sa_sigaction = sigchld_handler;
>  	sigemptyset(&act.sa_mask);
>  	sigaddset(&act.sa_mask, SIGCHLD);
>  
> @@ -1452,6 +1452,12 @@ int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
>  		return -1;
>  	}
>  
> +#ifdef __GLIBC__
> +	if (signal(SIGSEGV, print_stack_and_exit) == SIG_ERR) {
> +		pr_perror("signal() failed");
> +		return -1;
> +	}
> +#endif
>  	/*
>  	 * The block mask will be restored in sigreturn.
>  	 *
> @@ -1459,6 +1465,7 @@ int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
>  	 */
>  	sigfillset(&blockmask);
>  	sigdelset(&blockmask, SIGCHLD);
> +	sigdelset(&blockmask, SIGSEGV);
>  
>  	/*
>  	 * Here we use SIG_SETMASK instead of SIG_BLOCK to avoid the case where
> diff --git a/criu/include/util.h b/criu/include/util.h
> index c1dd66676..b5fb77455 100644
> --- a/criu/include/util.h
> +++ b/criu/include/util.h
> @@ -311,6 +311,7 @@ extern int open_fd_of_real_pid(pid_t pid, int fd, int flags);
>  
>  extern int call_in_child_process(int (*fn)(void *), void *arg);
>  extern void print_stack_trace(pid_t pid);
> +extern void print_stack_and_exit(int signum);
>  
>  #define block_sigmask(saved_mask, sig_mask)	({					\
>  		sigset_t ___blocked_mask;						\
> diff --git a/criu/util.c b/criu/util.c
> index fef3ce7c8..cdec4b63b 100644
> --- a/criu/util.c
> +++ b/criu/util.c
> @@ -45,6 +45,7 @@
>  #include "vma.h"
>  #include "mem.h"
>  #include "namespaces.h"
> +#include "pstree.h"
>  #include "criu-log.h"
>  
>  #include "clone-noasan.h"
> @@ -1413,6 +1414,15 @@ void print_stack_trace(pid_t pid)
>  
>  	free(strings);
>  }
> +
> +void print_stack_and_exit(int signum)
> +{
> +	pid_t pid = current ? vpid(current) : -1;
> +
> +	pr_err("OOPS: task with vpid=%d got SIGSEGV:\n", pid);
> +	print_stack_trace(pid);
> +	exit(-1);

All modern Linux distributions have handlers for core dumps:

[avagin at laptop kubernetes-incubator]$ cat /proc/sys/kernel/core_pattern 
|/usr/libexec/abrt-hook-ccpp %s %c %p %u %g %t %P %I

and with their help, we can get much more information than just a
backtrace. But with this patch, a process exits with -1 without
generating a core file. We need to find a way to avoid affecting core
file generation. We can try to set a SIGSEGV handler, and when a process
enters it for the first time, do our work, restore the default handler,
and return. The process will then re-execute the faulting instruction and
get SIGSEGV a second time, this time with the default handler.

> +}
>  #endif
>  
>  /*
> 


More information about the CRIU mailing list