[CRIU] [PATCH 1/3] seccomp: add initial support for SECCOMP_MODE_STRICT

Tycho Andersen tycho.andersen at canonical.com
Tue Jun 2 06:38:04 PDT 2015


On Tue, Jun 02, 2015 at 03:40:24PM +0300, Pavel Emelyanov wrote:
> Cool :) And can you elaborate on why exactly the filtered mode cannot be
> supported?

Yes, simply because we can't read the filters out of the kernel
(they're not exposed to userland at all). I've mailed the seccomp
maintainer about how to expose these, but he just had a baby and so is
not very responsive right now. Hopefully I'll hear back about how to
do this soon :)

> More comments inline.
> 
> > diff --git a/cr-dump.c b/cr-dump.c
> > index f865967..72533f1 100644
> > --- a/cr-dump.c
> > +++ b/cr-dump.c
> > @@ -19,6 +19,8 @@
> >  #include <sched.h>
> >  #include <sys/resource.h>
> >  
> > +#include <linux/seccomp.h>
> > +
> >  #include "protobuf.h"
> >  #include "protobuf/fdinfo.pb-c.h"
> >  #include "protobuf/fs.pb-c.h"
> > @@ -551,6 +553,22 @@ err:
> >  	return ret;
> >  }
> >  
> > +static int get_seccomp_state(pid_t pid, TaskCoreEntry *tc)
> > +{
> > +	struct proc_status_creds cr;
> > +	int ret;
> > +
> > +	ret = parse_pid_status(pid, &cr);
> > +	if (ret < 0)
> > +		return -1;
> 
> Going to read proc files slows things down. Can we take the status
> parse result from the seize_task() here?

Yep, actually I forgot about this double parse, thanks :)

> > +	if (cr.seccomp_mode != SECCOMP_MODE_DISABLED) {
> > +		tc->has_seccomp_mode = true;
> > +		tc->seccomp_mode = cr.seccomp_mode;
> > +	}
> > +	return 0;
> > +}
> > +
> >  static DECLARE_KCMP_TREE(vm_tree, KCMP_VM);
> >  static DECLARE_KCMP_TREE(fs_tree, KCMP_FS);
> >  static DECLARE_KCMP_TREE(files_tree, KCMP_FILES);
> > @@ -672,6 +690,10 @@ static int dump_task_core_all(struct pstree_item *item,
> >  	if (ret < 0)
> >  		goto err;
> >  
> > +	ret = get_seccomp_state(pid, core->tc);
> > +	if (ret < 0)
> > +		goto err;
> > +
> >  	strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
> >  	core->tc->flags = stat->flags;
> >  	core->tc->task_state = item->state;
> > diff --git a/cr-restore.c b/cr-restore.c
> > index aa00dc2..ea9d8d2 100644
> > --- a/cr-restore.c
> > +++ b/cr-restore.c
> > @@ -24,6 +24,8 @@
> >  
> >  #include <sys/sendfile.h>
> >  
> > +#include <linux/seccomp.h>
> > +
> >  #include "ptrace.h"
> >  #include "compiler.h"
> >  #include "asm/types.h"
> > @@ -756,6 +758,36 @@ static int collect_helper_pids()
> >  	return 0;
> >  }
> >  
> > +/*
> > + * Prepare the seccomp state for the process. Note that seccomp is restored in
> > + * suspended mode, and then unsuspended after the restorer blob completes in
> > + * finalize_restore().
> > + */
> 
> I'm not sure I get this. The prctl below will just turn the unsuspended strict mode
> on. The ptrace-s in fork_with_pid() will try to suspend one, but this looks racy.
> What prevents the prctl from being called _before_ the mentioned ptrace-s?

The ptraces in fork_with_pid happen during CR_STATE_FORKING (this
state is finished in restore_task_with_children, but after
create_children_and_session is called, which itself does more forking
and ptracing in fork_with_pid), but the prctl here happen in
CR_STATE_RESTORE. I think it should be ok.

> > +static int prepare_seccomp(pid_t pid, CoreEntry *core)
> > +{
> > +	if (core->tc->seccomp_mode == SECCOMP_MODE_DISABLED)
> > +		return 0;
> > +
> > +	pr_info("restoring seccomp mode %d for %d\n", core->tc->seccomp_mode, pid);
> > +
> > +	switch (core->tc->seccomp_mode) {
> > +	case SECCOMP_MODE_STRICT:
> > +		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0)) {
> > +			pr_perror("setting seccomp failed");
> > +			return -1;
> > +		}
> > +		break;
> > +	case SECCOMP_MODE_FILTER:
> > +		pr_err("seccomp mode 2 not supported\n");
> > +		return -1;
> > +	default:
> > +		pr_err("unknown seccomp mode %d\n", core->tc->seccomp_mode);
> > +		return -1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int open_cores(int pid, CoreEntry *leader_core)
> >  {
> >  	int i, tpid;
> > @@ -826,6 +858,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
> >  	if (collect_helper_pids() < 0)
> >  		return -1;
> >  
> > +	if (prepare_seccomp(pid, core) < 0)
> > +		return -1;
> > +
> >  	if (inherit_fd_fini() < 0)
> >  		return -1;
> >  
> > @@ -1135,13 +1170,57 @@ static inline int fork_with_pid(struct pstree_item *item)
> >  		goto err_unlock;
> >  	}
> >  
> > -
> >  	if (item == root_item) {
> >  		item->pid.real = ret;
> >  		pr_debug("PID: real %d virt %d\n",
> >  				item->pid.real, item->pid.virt);
> >  	}
> >  
> > +	if (ca.core->tc->seccomp_mode != SECCOMP_MODE_DISABLED) {
> > +#ifdef CONFIG_HAS_SUSPEND_SECCOMP
> 
> Can we avoid $ifdef-s in the middle of the code, please?

Sure, what's the usual pattern for doing this without them? I could
look for EIO from PTRACE_SUSPEND_SECCOMP (indicating a bad ptrace
command), unless you have some better way in mind?

> > +		int status;
> > +
> > +		/*
> > +		 * Since the process is going to restore seccomp state, we
> > +		 * need to temporarily suspend seccomp before we restore this
> > +		 * state, otherwise subsequent restore syscalls might be
> > +		 * blocked. seccomp is resumed at the end in finalize_restore()
> > +		 */
> > +		if (ptrace(PTRACE_ATTACH, ret, NULL, NULL) < 0) {
> > +			pr_perror("ptrace attach failed");
> > +			kill(ret, SIGKILL);
> > +			goto err_unlock;
> > +		}
> > +
> > +		if (waitpid(ret, &status, 0) < 0) {
> > +			pr_perror("waidpid failed");
> > +			kill(ret, SIGKILL);
> > +			goto err_unlock;
> > +		}
> > +
> > +		if (!WIFSTOPPED(status)) {
> > +			pr_err("not stopped after ptrace attach?\n");
> > +			kill(ret, SIGKILL);
> > +			goto err_unlock;
> > +		}
> > +
> > +		if (ptrace(PTRACE_SUSPEND_SECCOMP, ret, NULL, 1) < 0) {
> > +			pr_perror("seccomp suspend failed %d", ret);
> > +			kill(ret, SIGKILL);
> > +			goto err_unlock;
> > +		}
> > +
> > +		if (ptrace(PTRACE_DETACH, ret, NULL, NULL) < 0) {
> > +			pr_perror("ptrace detach failed");
> > +			kill(ret, SIGKILL);
> > +			goto err_unlock;
> > +		}
> > +#else
> > +		pr_err("trying to restore seccomp on a kernel without seccomp suspend\n");
> > +		goto err_unlock;
> > +#endif
> > +	}
> > +
> >  	if (opts.pidfile && root_item == item) {
> >  		int pid;
> >  
> > @@ -1690,6 +1769,11 @@ detach:
> >  				break;
> >  			}
> >  
> > +#ifdef CONFIG_HAS_SUSPEND_SECCOMP
> > +			if (ptrace(PTRACE_SUSPEND_SECCOMP, pid, NULL, 0) < 0)
> > +				pr_perror("failed resuming seccomp");
> > +#endif
> > +
> >  			if (ptrace(PTRACE_DETACH, pid, NULL, 0))
> >  				pr_perror("Unable to execute %d", pid);
> >  		}
> > diff --git a/include/prctl.h b/include/prctl.h
> > index 441135e..bfede51 100644
> > --- a/include/prctl.h
> > +++ b/include/prctl.h
> > @@ -9,6 +9,9 @@
> >  #ifndef PR_GET_NAME
> >  # define PR_GET_NAME		16
> >  #endif
> > +#ifndef PR_SET_SECCOMP
> > +# define PR_SET_SECCOMP		22
> > +#endif
> >  #ifndef PR_CAPBSET_READ
> >  # define PR_CAPBSET_READ	23
> >  #endif
> > diff --git a/include/proc_parse.h b/include/proc_parse.h
> > index ebb5351..3daf9a2 100644
> > --- a/include/proc_parse.h
> > +++ b/include/proc_parse.h
> > @@ -84,6 +84,8 @@ struct proc_status_creds {
> >  
> >  	char			state;
> >  	int			ppid;
> > +
> > +	int			seccomp_mode;
> >  };
> >  
> >  struct mount_info;
> > diff --git a/include/ptrace.h b/include/ptrace.h
> > index 0d89788..2803afb 100644
> > --- a/include/ptrace.h
> > +++ b/include/ptrace.h
> > @@ -11,6 +11,10 @@
> >  # define PTRACE_SEIZE		0x4206
> >  #endif
> >  
> > +#ifndef PTRACE_SUSPEND_SECCOMP
> > +# define PTRACE_SUSPEND_SECCOMP     10
> > +#endif
> > +
> >  #ifndef PTRACE_INTERRUPT
> >  # define PTRACE_INTERRUPT	0x4207
> >  #endif
> > diff --git a/pie/restorer.c b/pie/restorer.c
> > index 8713c6a..c07988d 100644
> > --- a/pie/restorer.c
> > +++ b/pie/restorer.c
> > @@ -40,6 +40,18 @@
> >  #define PR_SET_PDEATHSIG 1
> >  #endif
> >  
> > +#ifndef SECCOMP_MODE_DISABLED
> > +#define SECCOMP_MODE_DISABLED 0
> > +#endif
> > +
> > +#ifndef SECCOMP_MODE_STRICT
> > +#define SECCOMP_MODE_STRICT 1
> > +#endif
> > +
> > +#ifndef SECCOMP_MODE_FILTER
> > +#define SECCOMP_MODE_FILTER 2
> > +#endif
> > +
> >  #define sys_prctl_safe(opcode, val1, val2, val3)			\
> >  	({								\
> >  		long __ret = sys_prctl(opcode, val1, val2, val3, 0);	\
> > diff --git a/proc_parse.c b/proc_parse.c
> > index f5c870e..8ebd2d0 100644
> > --- a/proc_parse.c
> > +++ b/proc_parse.c
> > @@ -9,6 +9,7 @@
> >  #include <string.h>
> >  #include <ctype.h>
> >  #include <linux/fs.h>
> > +#include <linux/seccomp.h>
> >  
> >  #include "asm/types.h"
> >  #include "list.h"
> > @@ -763,7 +764,7 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
> >  	if (bfdopenr(&f))
> >  		return -1;
> >  
> > -	while (done < 8) {
> > +	while (done < 9) {
> >  		str = breadline(&f);
> >  		if (str == NULL)
> >  			break;
> > @@ -824,9 +825,22 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
> >  
> >  			done++;
> >  		}
> > +
> > +		if (!strncmp(str, "Seccomp:", 8)) {
> > +			if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) {
> > +				goto err_parse;
> > +			}
> > +
> > +			if (cr->seccomp_mode == SECCOMP_MODE_FILTER) {
> > +				pr_err("SECCOMP_MODE_FILTER not currently supported\n");
> > +				goto err_parse;
> > +			}
> > +
> > +			done++;
> > +		}
> >  	}
> >  
> > -	if (done == 8)
> > +	if (done == 9)
> >  		ret = 0;
> >  
> >  err_parse:
> > diff --git a/protobuf/core.proto b/protobuf/core.proto
> > index 9f70da9..bbccee5 100644
> > --- a/protobuf/core.proto
> > +++ b/protobuf/core.proto
> > @@ -26,6 +26,8 @@ message task_core_entry {
> >  	optional uint32			cg_set		= 9;
> >  
> >  	optional signal_queue_entry	signals_s	= 10;
> > +
> > +	optional int32			seccomp_mode	= 11;
> 
> Should this rather be a protobuf enum?

Yes, I can do that (although I'll align it with the values from
linux/seccomp.h, so we can still use those macros too).

> >  }
> >  
> >  message task_kobj_ids_entry {
> > diff --git a/ptrace.c b/ptrace.c
> > index be6b67b..6f7bfbe 100644
> > --- a/ptrace.c
> > +++ b/ptrace.c
> > @@ -8,11 +8,14 @@
> >  #include <limits.h>
> >  #include <signal.h>
> >  
> > +#include <sys/ptrace.h>
> >  #include <sys/types.h>
> >  #include <sys/time.h>
> >  #include <sys/resource.h>
> >  #include <sys/wait.h>
> >  
> > +#include <linux/seccomp.h>
> > +
> >  #include "compiler.h"
> >  #include "asm/types.h"
> >  #include "util.h"
> > @@ -30,9 +33,20 @@ int unseize_task(pid_t pid, int orig_st, int st)
> >  	else if (st == TASK_STOPPED) {
> >  		if (orig_st == TASK_ALIVE)
> >  			kill(pid, SIGSTOP);
> > -	} else if (st == TASK_ALIVE)
> > -		/* do nothing */ ;
> > -	else
> > +	} else if (st == TASK_ALIVE) {
> > +
> > +#ifdef CONFIG_HAS_SUSPEND_SECCOMP
> > +		/*
> > +		 * we can always do this, it is a no-op if the task did not have any
> > +		 * seccomp mode enabled
> > +		 */
> > +		if (ptrace(PTRACE_SUSPEND_SECCOMP, pid, NULL, 0) < 0) {
> > +			pr_perror("failed resuming seccomp");
> > +			return -1;
> > +		}
> > +#endif
> 
> I guess we should _always_ restore the seccomp it, even if the
> target state is TASK_STOPPED. Actually, if in the kernel we make
> seccomp suspend turn off automatically upon ptrace detach, this
> hunk can be just dropped. I think that doing auto-seccomp-resume
> in the kernel is the correct way to go.

Yes, that's a good idea, thanks.

> > +
> > +	} else
> >  		pr_err("Unknown final state %d\n", st);
> >  
> >  	return ptrace(PTRACE_DETACH, pid, NULL, NULL);
> > @@ -125,6 +139,19 @@ try_again:
> >  		goto err;
> >  	}
> >  
> > +	if (cr.seccomp_mode != SECCOMP_MODE_DISABLED) {
> > +#ifdef CONFIG_HAS_SUSPEND_SECCOMP
> > +		if (ptrace(PTRACE_SUSPEND_SECCOMP, pid, NULL, 1) < 0) {
> > +			pr_perror("suspending seccomp failed");
> > +			goto err_stop;
> > +		}
> > +#else
> > +		pr_err("seccomp enabled and seccomp suspending not supported\n");
> > +		goto err_stop;
> > +#endif
> > +	}
> > +
> > +
> 
> I'm not sure this is the correct place for seccomp suspend as if
> we get the non _STOP event (below) we call PTRACE_CONT and continue
> running task w/o seccomp turned on.

Ah, right, it should be after the stop event check below.

Tycho

> >  	if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
> >  		/*
> >  		 * Kernel notifies us about the task being seized received some
> > diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak
> > index 519eb52..4c6f0d1 100644
> > --- a/scripts/feature-tests.mak
> > +++ b/scripts/feature-tests.mak
> > @@ -92,3 +92,14 @@ int main(int argc, char *argv[], char *envp[])
> >  }
> >  
> >  endef
> > +
> > +define PTRACE_SUSPEND_SECCOMP_TEST
> > +
> > +#include <linux/ptrace.h>
> > +
> > +int main(void)
> > +{
> > +	return PTRACE_SUSPEND_SECCOMP;
> > +}
> > +
> > +endef
> > 
> 


More information about the CRIU mailing list