[CRIU] [PATCH v2 1/3] seccomp: add initial support for SECCOMP_MODE_STRICT

Thu Jun 4 04:10:04 PDT 2015

> diff --git a/cr-restore.c b/cr-restore.c
> index aa00dc2..d8331a4 100644
> --- a/cr-restore.c
> +++ b/cr-restore.c
> @@ -24,6 +24,8 @@
>  
>  #include <sys/sendfile.h>
>  
> +#include <linux/seccomp.h>
> +
>  #include "ptrace.h"
>  #include "compiler.h"
>  #include "asm/types.h"
> @@ -1135,7 +1137,6 @@ static inline int fork_with_pid(struct pstree_item *item)
>  		goto err_unlock;
>  	}
>  
> -
>  	if (item == root_item) {
>  		item->pid.real = ret;
>  		pr_debug("PID: real %d virt %d\n",
> @@ -1558,6 +1559,7 @@ static inline int stage_participants(int next_stage)
>  	case CR_STATE_RESTORE_SIGCHLD:
>  		return task_entries->nr_threads;
>  	case CR_STATE_RESTORE_CREDS:
> +	case CR_STATE_SECCOMP_SUSPENDED:
>  		return task_entries->nr_threads;

Threads do not switch this stage, so on threaded app parent will get stuck in
waiting for its completion.

>  	}
>  
> @@ -1632,6 +1634,9 @@ static int attach_to_tasks(bool root_seized, enum trace_flags *flag)
>  				return -1;
>  			}
>  
> +			if (suspend_seccomp(pid) < 0)
> +				return -1;

BTW, why do we need to suspend seccomp on restore? The sigreturn is allowed in strict
mode so if we just turn one ON before it nothing bad will (should) happen.

> +
>  			ret = ptrace_stop_pie(pid, rsti(item)->breakpoint, flag);
>  			if (ret < 0)
>  				return -1;

> diff --git a/pie/restorer.c b/pie/restorer.c
> index 8713c6a..9ef93ef 100644
> --- a/pie/restorer.c
> +++ b/pie/restorer.c
> @@ -1214,6 +1252,18 @@ long __export_restore_task(struct task_restore_args *args)
>  
>  	restore_posix_timers(args);
>  
> +	/*
> +	 * Finally, restore seccomp just before the final sigreturn. A slight
> +	 * abuse of the stage mecahnism here: usually we wait for all the
> +	 * chidlren to be done, but in this case we're waiting for the parent
> +	 * to switch to the SECCOMP_SUSPENDED stage to indicate that it is
> +	 * safe for us to restore seccomp.
> +	 */
> +	futex_wait_while(&task_entries->start, CR_STATE_RESTORE_CREDS);

But this line will never wait for anything -- the call to RESTORE_CREDS stage
completion above has done exactly this.

> +	prepare_seccomp(sys_getpid(), args->seccomp_mode);
> +
> +	log_set_fd(-1);
> +
>  	sys_munmap(args->rst_mem, args->rst_mem_size);
>  
>  	/*

> diff --git a/ptrace.c b/ptrace.c
> index be6b67b..4448a26 100644
> --- a/ptrace.c
> +++ b/ptrace.c
> @@ -30,14 +33,32 @@ int unseize_task(pid_t pid, int orig_st, int st)
>  	else if (st == TASK_STOPPED) {
>  		if (orig_st == TASK_ALIVE)
>  			kill(pid, SIGSTOP);
> -	} else if (st == TASK_ALIVE)
> +	} else if (st == TASK_ALIVE) {
>  		/* do nothing */ ;
> -	else
> +	} else
>  		pr_err("Unknown final state %d\n", st);
>  
>  	return ptrace(PTRACE_DETACH, pid, NULL, NULL);
>  }
>  
> +#ifdef CONFIG_HAS_SUSPEND_SECCOMP
> +int suspend_seccomp(pid_t pid)
> +{
> +	if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
> +		pr_perror("suspending seccomp failed");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +#else
> +int suspend_seccomp(pid_t pid)
> +{
> +	pr_err("seccomp enabled and seccomp suspending not supported\n");
> +	return -1;

You call suspend_seccomp() unconditionally and abort on error, so secoomp-less
compilation will stop working.

> +}
> +#endif
> +
>  /*
>   * This routine seizes task putting it into a special
>   * state where we can manipulate the task via ptrace

-- Pavel