[CRIU] [PATCH 1/6] seccomp: add initial support for SECCOMP_MODE_STRICT

Tycho Andersen tycho.andersen at canonical.com
Thu Jun 18 10:59:16 PDT 2015


Unfortunately, SECCOMP_MODE_FILTER is not currently exposed to userspace,
so we can't checkpoint that. In any case, this is what we need to do for
SECCOMP_MODE_STRICT, so let's do it.

This patch works by first disabling seccomp for any processes who are going
to have seccomp filters restored, then restoring the process (including the
seccomp filters), and finally resuming the seccomp filters before detaching
from the process.

v2 changes:

* update for kernel patch v2
* use protobuf enum for seccomp type
* don't parse /proc/pid/status twice

v3 changes:

* get rid of extra CR_STAGE_SECCOMP_SUSPEND stage
* only suspend seccomp in finalize_restore(), just before the unmap
* restore the (same) seccomp state in threads too; also add a note about
  how this is slightly wrong, and that we should at least check for a
  mismatch

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 Makefile.config           |  3 +++
 cr-dump.c                 | 21 ++++++++++++++++---
 cr-exec.c                 |  2 +-
 cr-restore.c              | 16 ++++++++++++++
 include/prctl.h           |  3 +++
 include/proc_parse.h      |  2 ++
 include/pstree.h          |  8 +++++++
 include/ptrace.h          |  7 ++++++-
 include/restorer.h        |  2 ++
 pie/restorer.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++
 proc_parse.c              | 18 ++++++++++++++--
 protobuf/core.proto       | 11 ++++++++++
 ptrace.c                  | 33 ++++++++++++++++++++++++++---
 scripts/feature-tests.mak | 11 ++++++++++
 14 files changed, 180 insertions(+), 10 deletions(-)

diff --git a/Makefile.config b/Makefile.config
index e1d2a3b..544e6ee 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -35,6 +35,9 @@ endif
 ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),),y)
 	$(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@
 endif
+ifeq ($(call try-cc,$(PTRACE_SUSPEND_SECCOMP_TEST),),y)
+	$(Q) @echo '#define CONFIG_HAS_SUSPEND_SECCOMP' >> $@
+endif
 ifeq ($(VDSO),y)
 	$(Q) @echo '#define CONFIG_VDSO' >> $@
 endif
diff --git a/cr-dump.c b/cr-dump.c
index f865967..ad78998 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -19,6 +19,8 @@
 #include <sched.h>
 #include <sys/resource.h>
 
+#include <linux/seccomp.h>
+
 #include "protobuf.h"
 #include "protobuf/fdinfo.pb-c.h"
 #include "protobuf/fs.pb-c.h"
@@ -672,6 +674,12 @@ static int dump_task_core_all(struct pstree_item *item,
 	if (ret < 0)
 		goto err;
 
+	if (item->seccomp_mode != SECCOMP_MODE_DISABLED) {
+		pr_info("got seccomp mode %d for %d\n", item->seccomp_mode, item->pid.virt);
+		core->tc->has_seccomp_mode = true;
+		core->tc->seccomp_mode = item->seccomp_mode;
+	}
+
 	strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
 	core->tc->flags = stat->flags;
 	core->tc->task_state = item->state;
@@ -801,7 +809,7 @@ static int collect_children(struct pstree_item *item)
 			goto free;
 		}
 
-		ret = seize_task(pid, item->pid.real);
+		ret = seize_task(pid, item->pid.real, &c->seccomp_mode);
 		if (ret < 0) {
 			/*
 			 * Here is a race window between parse_children() and seize(),
@@ -913,7 +921,14 @@ static int collect_threads(struct pstree_item *item)
 		pr_info("\tSeizing %d's %d thread\n",
 				item->pid.real, pid);
 
-		ret = seize_task(pid, item_ppid(item));
+		/*
+		 * FIXME: The NULL here is wrong; we really want to get the
+		 * seccomp state of this thread, but we have nowhere to put it,
+		 * so for now we ignore it. We should at least check to see
+		 * that the seccomp state is the same for all threads; we'll do
+		 * this in a future series.
+		 */
+		ret = seize_task(pid, item_ppid(item), NULL);
 		if (ret < 0) {
 			/*
 			 * Here is a race window between parse_threads() and seize(),
@@ -1063,7 +1078,7 @@ static int collect_pstree(pid_t pid)
 		return -1;
 
 	root_item->pid.real = pid;
-	ret = seize_task(pid, -1);
+	ret = seize_task(pid, -1, &root_item->seccomp_mode);
 	if (ret < 0)
 		goto err;
 	pr_info("Seized task %d, state %d\n", pid, ret);
diff --git a/cr-exec.c b/cr-exec.c
index 9f6ebfe..9d7162a 100644
--- a/cr-exec.c
+++ b/cr-exec.c
@@ -129,7 +129,7 @@ int cr_exec(int pid, char **opt)
 		goto out;
 	}
 
-	prev_state = ret = seize_task(pid, -1);
+	prev_state = ret = seize_task(pid, -1, NULL);
 	if (ret < 0) {
 		pr_err("Can't seize task %d\n", pid);
 		goto out;
diff --git a/cr-restore.c b/cr-restore.c
index 418c4ea..f93c814 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -24,6 +24,8 @@
 
 #include <sys/sendfile.h>
 
+#include <linux/seccomp.h>
+
 #include "ptrace.h"
 #include "compiler.h"
 #include "asm/types.h"
@@ -1060,6 +1062,7 @@ static inline int fork_with_pid(struct pstree_item *item)
 
 		item->state = ca.core->tc->task_state;
 		rsti(item)->cg_set = ca.core->tc->cg_set;
+		item->seccomp_mode = ca.core->tc->seccomp_mode;
 
 		if (item->state == TASK_DEAD)
 			rsti(item->parent)->nr_zombies++;
@@ -1682,6 +1685,17 @@ static void finalize_restore(int status)
 			goto detach;
 
 		/* Unmap the restorer blob */
+
+		/*
+		 * Suspend seccomp if necessary. We need to do this because
+		 * although seccomp is restored at the very end of the
+		 * restorer blob (and the final sigreturn is ok), here we're
+		 * doing an munmap in the process, which may be blocked by
+		 * seccomp and cause the task to be killed.
+		 */
+		if (item->seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
+			pr_err("failed to suspend seccomp, restore will probably fail...\n");
+
 		ctl = parasite_prep_ctl(pid, NULL);
 		if (ctl == NULL)
 			goto detach;
@@ -2886,6 +2900,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 
 #undef remap_array
 
+	task_args->seccomp_mode = core->tc->seccomp_mode;
+
 	if (lsm)
 		task_args->creds.lsm_profile = rst_mem_remap_ptr(lsm_pos, RM_PRIVATE);
 	else
diff --git a/include/prctl.h b/include/prctl.h
index 441135e..bfede51 100644
--- a/include/prctl.h
+++ b/include/prctl.h
@@ -9,6 +9,9 @@
 #ifndef PR_GET_NAME
 # define PR_GET_NAME		16
 #endif
+#ifndef PR_SET_SECCOMP
+# define PR_SET_SECCOMP		22
+#endif
 #ifndef PR_CAPBSET_READ
 # define PR_CAPBSET_READ	23
 #endif
diff --git a/include/proc_parse.h b/include/proc_parse.h
index ebb5351..3daf9a2 100644
--- a/include/proc_parse.h
+++ b/include/proc_parse.h
@@ -84,6 +84,8 @@ struct proc_status_creds {
 
 	char			state;
 	int			ppid;
+
+	int			seccomp_mode;
 };
 
 struct mount_info;
diff --git a/include/pstree.h b/include/pstree.h
index c0fdac6..f0cd53c 100644
--- a/include/pstree.h
+++ b/include/pstree.h
@@ -24,6 +24,14 @@ struct pstree_item {
 
 	int			state;		/* TASK_XXX constants */
 
+	/*
+	 * We keep the seccomp mode here temporarily between seizing and
+	 * dumping the task to avoid parsing /proc/pid/status twice. We also
+	 * use it on restore to hold the seccomp mode so that we don't have to
+	 * keep track of each task's core entry in the main criu process.
+	 */
+	int			seccomp_mode;
+
 	int			nr_threads;	/* number of threads */
 	struct pid		*threads;	/* array of threads */
 	CoreEntry		**core;
diff --git a/include/ptrace.h b/include/ptrace.h
index 0d89788..bb4411d 100644
--- a/include/ptrace.h
+++ b/include/ptrace.h
@@ -11,6 +11,10 @@
 # define PTRACE_SEIZE		0x4206
 #endif
 
+#ifndef PTRACE_O_SUSPEND_SECCOMP
+# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
+#endif
+
 #ifndef PTRACE_INTERRUPT
 # define PTRACE_INTERRUPT	0x4207
 #endif
@@ -62,7 +66,8 @@ struct ptrace_peeksiginfo_args {
 
 #define SI_EVENT(_si_code)	(((_si_code) & 0xFFFF) >> 8)
 
-extern int seize_task(pid_t pid, pid_t ppid);
+extern int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode);
+extern int suspend_seccomp(pid_t pid);
 extern int unseize_task(pid_t pid, int orig_state, int state);
 extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
 extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
diff --git a/include/restorer.h b/include/restorer.h
index ab7d39d..dcfa175 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -165,6 +165,8 @@ struct task_restore_args {
 	 */
 	int				proc_fd;
 
+	int				seccomp_mode;
+
 #ifdef CONFIG_VDSO
 	unsigned long			vdso_rt_size;
 	struct vdso_symtable		vdso_sym_rt;		/* runtime vdso symbols */
diff --git a/pie/restorer.c b/pie/restorer.c
index fd860ca..4150b49 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -40,6 +40,18 @@
 #define PR_SET_PDEATHSIG 1
 #endif
 
+#ifndef SECCOMP_MODE_DISABLED
+#define SECCOMP_MODE_DISABLED 0
+#endif
+
+#ifndef SECCOMP_MODE_STRICT
+#define SECCOMP_MODE_STRICT 1
+#endif
+
+#ifndef SECCOMP_MODE_FILTER
+#define SECCOMP_MODE_FILTER 2
+#endif
+
 #define sys_prctl_safe(opcode, val1, val2, val3)			\
 	({								\
 		long __ret = sys_prctl(opcode, val1, val2, val3, 0);	\
@@ -339,6 +351,32 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
 	return 0;
 }
 
+static void restore_seccomp(int seccomp_mode)
+{
+	switch (seccomp_mode) {
+	case SECCOMP_MODE_DISABLED:
+		return;
+	case SECCOMP_MODE_STRICT:
+		if (sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0))
+			goto die;
+		return;
+	case SECCOMP_MODE_FILTER:
+		goto die;
+	default:
+		goto die;
+	}
+
+die:
+	/*
+	 * If preparing any seccomp state failed, we should make sure this
+	 * process doesn't continue so that it can't do things outside the
+	 * sandbox. Unfortunately, the rest of the restore has to continue
+	 * since we're too late in the process to stop it and have unlocked the
+	 * network.
+	 */
+	sys_exit_group(1);
+}
+
 static int restore_thread_common(struct rt_sigframe *sigframe,
 		struct thread_restore_args *args)
 {
@@ -417,9 +455,15 @@ long __export_restore_thread(struct thread_restore_args *args)
 
 	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);
 	restore_pdeath_sig(args);
+
+	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
+		pr_info("restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
+
 	restore_finish_stage(CR_STATE_RESTORE_CREDS);
 	futex_dec_and_wake(&thread_inprogress);
 
+	restore_seccomp(args->ta->seccomp_mode);
+
 	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;
 	rst_sigreturn(new_sp);
 
@@ -1199,6 +1243,13 @@ long __export_restore_task(struct task_restore_args *args)
 
 	futex_set_and_wake(&thread_inprogress, args->nr_threads);
 
+	/*
+	 * We have to close the log before restoring seccomp, because
+	 * SECCOMP_MODE_STRICT blocks close().
+	 */
+	if (args->seccomp_mode != SECCOMP_MODE_DISABLED)
+		pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
+
 	restore_finish_stage(CR_STATE_RESTORE_CREDS);
 
 	if (ret)
@@ -1230,6 +1281,8 @@ long __export_restore_task(struct task_restore_args *args)
 
 	sys_munmap(args->rst_mem, args->rst_mem_size);
 
+	restore_seccomp(args->seccomp_mode);
+
 	/*
 	 * Sigframe stack.
 	 */
diff --git a/proc_parse.c b/proc_parse.c
index 9795689..9c14a27 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -9,6 +9,7 @@
 #include <string.h>
 #include <ctype.h>
 #include <linux/fs.h>
+#include <linux/seccomp.h>
 
 #include "asm/types.h"
 #include "list.h"
@@ -779,7 +780,7 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
 	if (bfdopenr(&f))
 		return -1;
 
-	while (done < 8) {
+	while (done < 9) {
 		str = breadline(&f);
 		if (str == NULL)
 			break;
@@ -840,9 +841,22 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
 
 			done++;
 		}
+
+		if (!strncmp(str, "Seccomp:", 8)) {
+			if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) {
+				goto err_parse;
+			}
+
+			if (cr->seccomp_mode == SECCOMP_MODE_FILTER) {
+				pr_err("SECCOMP_MODE_FILTER not currently supported\n");
+				goto err_parse;
+			}
+
+			done++;
+		}
 	}
 
-	if (done == 8)
+	if (done == 9)
 		ret = 0;
 
 err_parse:
diff --git a/protobuf/core.proto b/protobuf/core.proto
index 9f70da9..fd78f5c 100644
--- a/protobuf/core.proto
+++ b/protobuf/core.proto
@@ -10,6 +10,15 @@ import "siginfo.proto";
 
 import "opts.proto";
 
+/*
+ * These match the SECCOMP_MODE_* flags from <linux/seccomp.h>.
+ */
+enum seccomp_mode {
+	disabled	= 0;
+	strict		= 1;
+	filter		= 2;
+};
+
 message task_core_entry {
 	required uint32			task_state	= 1;
 	required uint32			exit_code	= 2;
@@ -26,6 +35,8 @@ message task_core_entry {
 	optional uint32			cg_set		= 9;
 
 	optional signal_queue_entry	signals_s	= 10;
+
+	optional seccomp_mode		seccomp_mode	= 11;
 }
 
 message task_kobj_ids_entry {
diff --git a/ptrace.c b/ptrace.c
index be6b67b..f572f29 100644
--- a/ptrace.c
+++ b/ptrace.c
@@ -8,11 +8,14 @@
 #include <limits.h>
 #include <signal.h>
 
+#include <sys/ptrace.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/resource.h>
 #include <sys/wait.h>
 
+#include <linux/seccomp.h>
+
 #include "compiler.h"
 #include "asm/types.h"
 #include "util.h"
@@ -30,14 +33,32 @@ int unseize_task(pid_t pid, int orig_st, int st)
 	else if (st == TASK_STOPPED) {
 		if (orig_st == TASK_ALIVE)
 			kill(pid, SIGSTOP);
-	} else if (st == TASK_ALIVE)
+	} else if (st == TASK_ALIVE) {
 		/* do nothing */ ;
-	else
+	} else
 		pr_err("Unknown final state %d\n", st);
 
 	return ptrace(PTRACE_DETACH, pid, NULL, NULL);
 }
 
+#ifdef CONFIG_HAS_SUSPEND_SECCOMP
+int suspend_seccomp(pid_t pid)
+{
+	if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
+		pr_perror("suspending seccomp failed");
+		return -1;
+	}
+
+	return 0;
+}
+#else
+int suspend_seccomp(pid_t pid)
+{
+	pr_err("seccomp enabled and seccomp suspending not supported\n");
+	return -1;
+}
+#endif
+
 /*
  * This routine seizes task putting it into a special
  * state where we can manipulate the task via ptrace
@@ -46,7 +67,7 @@ int unseize_task(pid_t pid, int orig_st, int st)
  * up with someone else.
  */
 
-int seize_task(pid_t pid, pid_t ppid)
+int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode)
 {
 	siginfo_t si;
 	int status;
@@ -90,6 +111,9 @@ try_again:
 	if (ret2)
 		goto err;
 
+	if (seccomp_mode)
+		*seccomp_mode = cr.seccomp_mode;
+
 	if (!may_dump(&cr)) {
 		pr_err("Check uid (pid: %d) failed\n", pid);
 		goto err;
@@ -142,6 +166,9 @@ try_again:
 		goto try_again;
 	}
 
+	if (cr.seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
+		goto err_stop;
+
 	if (si.si_signo == SIGTRAP)
 		return TASK_ALIVE;
 	else if (si.si_signo == SIGSTOP) {
diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak
index 519eb52..ec7972a 100644
--- a/scripts/feature-tests.mak
+++ b/scripts/feature-tests.mak
@@ -92,3 +92,14 @@ int main(int argc, char *argv[], char *envp[])
 }
 
 endef
+
+define PTRACE_SUSPEND_SECCOMP_TEST
+
+#include <linux/ptrace.h>
+
+int main(void)
+{
+	return PTRACE_O_SUSPEND_SECCOMP;
+}
+
+endef
-- 
2.1.4



More information about the CRIU mailing list