[CRIU] [PATCH 3/3] core: Print stack trace in case of SIGSEGV
Andrei Vagin
avagin at virtuozzo.com
Tue May 30 15:08:15 PDT 2017
On Mon, May 29, 2017 at 05:02:00PM +0300, Kirill Tkhai wrote:
> Register a SIGSEGV handler and print the call trace in case of an error.
> This should make debugging easier. Compare the error messages before
> and after.
>
> 1)Before:
>
> (01.227591)pie: 1: Task 86 exited, status= 11
> (01.241495) Error (criu/cr-restore.c:1114): 489968 exited, status=1
>
> and nobody knows where and why.
>
> 2)After:
>
> (01.090136) 86: Error (criu/mount.c:346): mnt: stack 86#0: criu(print_stack_trace+0x1e) [0x459aee]
> (01.090143) 86: Error (criu/mount.c:346): mnt: stack 86#1: criu(print_stack_and_exit+0x3e) [0x459b8e]
> (01.090145) 86: Error (criu/mount.c:346): mnt: stack 86#2: /lib64/libc.so.6(+0x35250) [0x7fbbf16fb250]
> (01.090147) 86: Error (criu/mount.c:346): mnt: stack 86#3: criu(phys_stat_resolve_dev+0x63) [0x459c03]
> (01.090149) 86: Error (criu/mount.c:346): mnt: stack 86#4: criu(__open_mountpoint+0x4d) [0x45ca7d]
> (01.090151) 86: Error (criu/mount.c:346): mnt: stack 86#5: criu() [0x44cf8e]
> (01.090153) 86: Error (criu/mount.c:346): mnt: stack 86#6: criu() [0x44d7d3]
> (01.090155) 86: Error (criu/mount.c:346): mnt: stack 86#7: criu() [0x44db6e]
> (01.090157) 86: Error (criu/mount.c:346): mnt: stack 86#8: criu(prepare_fds+0x477) [0x44b077]
> (01.090159) 86: Error (criu/mount.c:346): mnt: stack 86#9: criu() [0x43cbef]
> (01.090471)pie: 1: Task 86 exited, status= 255
> (01.103910) Error (criu/cr-restore.c:1114): 492906 exited, status=1
>
> and everything is clear.
>
> This will help developers with debugging.
>
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> criu/cr-dump.c | 6 ++++++
> criu/cr-restore.c | 11 +++++++++--
> criu/include/util.h | 1 +
> criu/util.c | 10 ++++++++++
> 4 files changed, 26 insertions(+), 2 deletions(-)
>
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 1d661e7b1..acb90d9b5 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1759,6 +1759,12 @@ int cr_dump_tasks(pid_t pid)
> pr_info("Dumping processes (pid: %d)\n", pid);
> pr_info("========================================\n");
>
> +#ifdef __GLIBC__
> + if (signal(SIGSEGV, print_stack_and_exit) == SIG_ERR) {
> + pr_perror("signal() failed");
> + goto err;
> + }
> +#endif
> if (opts.remote && push_snapshot_id() < 0) {
> pr_err("Failed to push image namespace.\n");
> goto err;
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index a420f77af..8e9c04272 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1429,7 +1429,7 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
> futex_abort_and_wake(&task_entries->nr_in_progress);
> }
>
> -int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
> +int criu_signals_setup(void (*sigchld_handler)(int, siginfo_t *, void *))
> {
> int ret;
> struct sigaction act;
> @@ -1442,7 +1442,7 @@ int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
> }
>
> act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART;
> - act.sa_sigaction = handler;
> + act.sa_sigaction = sigchld_handler;
> sigemptyset(&act.sa_mask);
> sigaddset(&act.sa_mask, SIGCHLD);
>
> @@ -1452,6 +1452,12 @@ int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
> return -1;
> }
>
> +#ifdef __GLIBC__
> + if (signal(SIGSEGV, print_stack_and_exit) == SIG_ERR) {
> + pr_perror("signal() failed");
> + return -1;
> + }
> +#endif
> /*
> * The block mask will be restored in sigreturn.
> *
> @@ -1459,6 +1465,7 @@ int criu_signals_setup(void (*handler)(int, siginfo_t *, void *))
> */
> sigfillset(&blockmask);
> sigdelset(&blockmask, SIGCHLD);
> + sigdelset(&blockmask, SIGSEGV);
>
> /*
> * Here we use SIG_SETMASK instead of SIG_BLOCK to avoid the case where
> diff --git a/criu/include/util.h b/criu/include/util.h
> index c1dd66676..b5fb77455 100644
> --- a/criu/include/util.h
> +++ b/criu/include/util.h
> @@ -311,6 +311,7 @@ extern int open_fd_of_real_pid(pid_t pid, int fd, int flags);
>
> extern int call_in_child_process(int (*fn)(void *), void *arg);
> extern void print_stack_trace(pid_t pid);
> +extern void print_stack_and_exit(int signum);
>
> #define block_sigmask(saved_mask, sig_mask) ({ \
> sigset_t ___blocked_mask; \
> diff --git a/criu/util.c b/criu/util.c
> index fef3ce7c8..cdec4b63b 100644
> --- a/criu/util.c
> +++ b/criu/util.c
> @@ -45,6 +45,7 @@
> #include "vma.h"
> #include "mem.h"
> #include "namespaces.h"
> +#include "pstree.h"
> #include "criu-log.h"
>
> #include "clone-noasan.h"
> @@ -1413,6 +1414,15 @@ void print_stack_trace(pid_t pid)
>
> free(strings);
> }
> +
> +void print_stack_and_exit(int signum)
> +{
> + pid_t pid = current ? vpid(current) : -1;
> +
> + pr_err("OOPS: task with vpid=%d got SIGSEGV:\n", pid);
> + print_stack_trace(pid);
> + exit(-1);
All modern Linux distributions have handlers for core dumps:
[avagin at laptop kubernetes-incubator]$ cat /proc/sys/kernel/core_pattern
|/usr/libexec/abrt-hook-ccpp %s %c %p %u %g %t %P %I
and with their help, we can get much more information than just a
backtrace. But with this patch, a process exits with -1 without
generating a core file. We need to find a way to avoid affecting core
file generation. We can try to set a SIGSEGV handler, and when a process
enters it for the first time, do our work, restore the default handler, and
return. The process will then repeat the faulting instruction and get SIGSEGV
a second time, now with the default handler.
> +}
> #endif
>
> /*
>
More information about the CRIU
mailing list