[CRIU] [PATCH 3/5] seccomp: add support for SECCOMP_MODE_FILTER

Tycho Andersen tycho.andersen at canonical.com
Fri Sep 4 09:22:13 PDT 2015


This commit adds basic support for dumping and restoring seccomp filters
via the new ptrace interface. There are two current known limitations with
this approach:

1. This approach doesn't support restoring tasks who first do a seccomp()
   and then a setuid(); the test elaborates on this, but I don't think it
   will be difficult to overcome and hopefully I'll have it fixed by the
   time the kernel stuff is worked out.

2. There is an issue with SECCOMP_FILTER_FLAG_TSYNC; when filters are added
   with this flag set, the filter's ancestry is checked to see if the
   theads share an ancestor; if they do, the new filter is added to both
   threads. If they don't, it is only added to the current thread.

   On restore, the current implementation doesn't share
   struct seccomp_filter objects in the kernel (this would essentially
   require restoring seccomp filters before forking, and ptracing the
   entire restore). Instead, we use separate but identical filters when
   restoring things, which makes the behavior of SECCOMP_FILTER_FLAG_TSYNC
   incorrect.

   A potential solution to this is to load the eBPF programs before
   forking, and modify the kernel to use the underlying eBPF pointer for
   ancestry checks instead of the struct seccomp_filter object. On dump, we
   would also need to do some bookkeeping with the eBPF's prog_id about
   which seccomp filters are shared (potentially necessitating a separate
   seccomp.img a. la. cgroup.img) so that we can do this correctly. In any
   case, this is a known limitation with the current set that is still
   being discussed with the seccomp folks.

Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
 cr-dump.c            | 104 +++++++++++++++++++++++++++++++++++++++++++++++++--
 cr-restore.c         |  40 ++++++++++++++++++++
 include/bpf.h        |  36 ++++++++++++++++++
 include/proc_parse.h |   2 +
 include/pstree.h     |   1 -
 include/ptrace.h     |   8 ++++
 include/restorer.h   |   7 ++++
 include/seccomp.h    |   8 ++++
 pie/restorer.c       |  22 ++++++++---
 proc_parse.c         |   9 ++---
 protobuf/core.proto  |   7 ++++
 11 files changed, 228 insertions(+), 16 deletions(-)
 create mode 100644 include/bpf.h

diff --git a/cr-dump.c b/cr-dump.c
index 3af077b..6a1d819 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -80,6 +80,7 @@
 #include "lsm.h"
 #include "seccomp.h"
 #include "seize.h"
+#include "bpf.h"
 
 #include "asm/dump.h"
 
@@ -659,7 +660,80 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
 	return ret;
 }
 
+static int dump_seccomp_filters(pid_t pid, struct proc_status_creds *creds)
+{
+	long fd;
+	int ret = -1;
+
+	creds->n_seccomp_filters = 0;
+	creds->seccomp_filters = NULL;
+
+	fd = ptrace(PTRACE_SECCOMP_GET_FILTER_FD, pid);
+	if (fd < 0) {
+		pr_perror("getting seccomp filter fd failed");
+		return -1;
+	}
+
+	while(1) {
+		bpf_dump_arg attr;
+		SeccompFilter *filter;
+		struct bpf_insn buf[BPF_MAXINSNS];
+
+		attr.prog_fd = fd;
+		attr.dump_insns = (long) buf;
+
+		if (syscall(__NR_bpf, BPF_PROG_DUMP, &attr, sizeof(attr)) < 0) {
+			pr_perror("error dumping seccomp filter");
+			goto out;
+		}
+
+		if (!creds->seccomp_filters) {
+			creds->seccomp_filters = xmalloc(sizeof(SeccompFilter));
+			if (!creds->seccomp_filters)
+				goto out;
+
+			creds->n_seccomp_filters = 1;
+		} else {
+			void *m;
+
+			m = xrealloc(creds->seccomp_filters, (creds->n_seccomp_filters + 1) * sizeof(SeccompFilter));
+			if (!m)
+				goto out;
+
+			creds->seccomp_filters = m;
+			creds->n_seccomp_filters++;
+		}
+
+		filter = &creds->seccomp_filters[creds->n_seccomp_filters - 1];
+		seccomp_filter__init(filter);
+
+		filter->filter.len = attr.dump_insn_cnt * sizeof(struct bpf_insn);
+		filter->filter.data = xmalloc(filter->filter.len);
+		if (!filter->filter.data)
+			goto out;
+
+		memcpy(filter->filter.data, (void *) attr.dump_insns, filter->filter.len);
+
+		filter->is_gpl = attr.gpl_compatible;
+		filter->prog_id = attr.prog_id;
+
+		if (ptrace(PTRACE_SECCOMP_NEXT_FILTER, pid, NULL, fd) < 0) {
+			/* ENOENT just means we reached the end of the list */
+			if (errno == ENOENT)
+				ret = 0;
+			else
+				pr_perror("getting next filter failed");
+			break;
+		}
+	}
+
+out:
+	close(fd);
+	return ret;
+}
+
 static int dump_task_core_all(struct pstree_item *item,
+		struct parasite_ctl *ctl,
 		const struct proc_pid_stat *stat,
 		const struct parasite_dump_misc *misc,
 		const struct cr_imgset *cr_imgset)
@@ -668,6 +742,7 @@ static int dump_task_core_all(struct pstree_item *item,
 	CoreEntry *core = item->core[0];
 	pid_t pid = item->pid.real;
 	int ret = -1;
+	struct proc_status_creds *creds;
 
 	pr_info("\n");
 	pr_info("Dumping core (pid: %d)\n", pid);
@@ -677,10 +752,25 @@ static int dump_task_core_all(struct pstree_item *item,
 	if (ret < 0)
 		goto err;
 
-	if (dmpi(item)->pi_creds->seccomp_mode != SECCOMP_MODE_DISABLED) {
-		pr_info("got seccomp mode %d for %d\n", dmpi(item)->pi_creds->seccomp_mode, item->pid.virt);
+	creds = dmpi(item)->pi_creds;
+	if (creds->seccomp_mode != SECCOMP_MODE_DISABLED) {
+		pr_info("got seccomp mode %d for %d\n", creds->seccomp_mode, item->pid.virt);
 		core->tc->has_seccomp_mode = true;
-		core->tc->seccomp_mode = dmpi(item)->pi_creds->seccomp_mode;
+		core->tc->seccomp_mode = creds->seccomp_mode;
+
+		if (creds->seccomp_mode == SECCOMP_MODE_FILTER) {
+			int i;
+			TaskCoreEntry *tc = core->tc;
+
+			tc->n_seccomp_filters = creds->n_seccomp_filters;
+			tc->seccomp_filters = xmalloc(creds->n_seccomp_filters * sizeof(*tc->seccomp_filters));
+			if (!tc->seccomp_filters)
+				goto err;
+
+			for (i = 0; i < tc->n_seccomp_filters; i++) {
+				tc->seccomp_filters[i] = &creds->seccomp_filters[i];
+			}
+		}
 	}
 
 	strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
@@ -1187,6 +1277,12 @@ static int dump_one_task(struct pstree_item *item)
 		goto err;
 	}
 
+	if (dmpi(item)->pi_creds->seccomp_mode == SECCOMP_MODE_FILTER &&
+			dump_seccomp_filters(pid, dmpi(item)->pi_creds) < 0) {
+		pr_err("Dump %d seccomp filters failed\n", pid);
+		goto err;
+	}
+
 	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
 	if (!parasite_ctl) {
 		pr_err("Can't infect (pid: %d) with parasite\n", pid);
@@ -1279,7 +1375,7 @@ static int dump_one_task(struct pstree_item *item)
 		goto err_cure;
 	}
 
-	ret = dump_task_core_all(item, &pps_buf, &misc, cr_imgset);
+	ret = dump_task_core_all(item, parasite_ctl, &pps_buf, &misc, cr_imgset);
 	if (ret) {
 		pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
 		goto err_cure;
diff --git a/cr-restore.c b/cr-restore.c
index 9b71e36..fff0d91 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -76,6 +76,7 @@
 #include "security.h"
 #include "lsm.h"
 #include "seccomp.h"
+#include "bpf.h"
 
 #include "parasite-syscall.h"
 
@@ -2680,6 +2681,10 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	int lsm_profile_len = 0;
 	unsigned long lsm_pos = 0;
 
+	int *seccomp_filters = NULL;
+	int seccomp_filters_len = 0;
+	unsigned long seccomp_filters_pos;
+
 	struct vm_area_list self_vmas;
 	struct vm_area_list *vmas = &rsti(current)->vmas;
 	int i;
@@ -2786,6 +2791,40 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 
 	}
 
+	seccomp_filters_len = core->tc->n_seccomp_filters;
+	seccomp_filters_pos = rst_mem_cpos(RM_PRIVATE);
+	seccomp_filters = rst_mem_alloc(seccomp_filters_len * sizeof(int), RM_PRIVATE);
+	if (!seccomp_filters)
+		goto err_nv;
+
+	for (i = 0; i < seccomp_filters_len; i++) {
+		SeccompFilter *filter = core->tc->seccomp_filters[i];
+		int fd;
+
+		union bpf_attr attr = {
+			.prog_type = BPF_PROG_TYPE_SECCOMP,
+			.insn_cnt = filter->filter.len / sizeof(struct bpf_insn),
+			.insns = (long) filter->filter.data,
+			.license = (long) (filter->is_gpl ? "GPL" : "Not GPL"),
+			.log_level = 0,
+			.log_size = 0,
+			.log_buf = 0
+		};
+
+		/* assign one field outside of struct init to make sure any
+		 * padding is zero initialized
+		 */
+		attr.kern_version = 0;
+
+		fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+		if (fd < 0) {
+			pr_perror("error loading bpf");
+			goto err_nv;
+		}
+
+		seccomp_filters[i] = fd;
+	}
+
 	rst_mem_size = rst_mem_lock();
 	restore_bootstrap_len = restorer_len + args_len + rst_mem_size;
 
@@ -2908,6 +2947,7 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	remap_array(rlims,	  rlims_nr, rlims_cpos);
 	remap_array(helpers,	  n_helpers, helpers_pos);
 	remap_array(zombies,	  n_zombies, zombies_pos);
+	remap_array(seccomp_filters,	  seccomp_filters_len, seccomp_filters_pos);
 
 #undef remap_array
 
diff --git a/include/bpf.h b/include/bpf.h
new file mode 100644
index 0000000..00858fe
--- /dev/null
+++ b/include/bpf.h
@@ -0,0 +1,36 @@
+#ifndef __CR_BPF_H__
+#define __CR_BPF_H__
+
+#include <linux/unistd.h>
+#include <linux/bpf.h>
+
+#ifndef BPF_PROG_TYPE_SECCOMP
+#define BPF_PROG_TYPE_SECCOMP 5
+#endif
+
+#ifndef BPF_PROG_LOAD
+#define BPF_PROG_LOAD 5
+#endif
+
+#ifndef BPF_PROG_DUMP
+#define BPF_PROG_DUMP 6
+
+/*
+ * The bpf syscall API uses a giant union of anonymous structs as its
+ * interface; if we don't have BPF_PROG_DUMP, we also won't have the members
+ * that are new for it. We define a struct here and use that instead.
+*/
+struct criu_bpf_dump {
+                __u32		prog_fd;
+                __u32		dump_insn_cnt;
+                __aligned_u64	dump_insns;
+                __u8		gpl_compatible;
+		__u64		prog_id;
+} __attribute__((aligned(8)));
+
+typedef struct criu_bpf_dump bpf_dump_arg;
+#else
+typedef union bpf_attr bpf_dump_arg;
+#endif /* BPF_PROG_DUMP */
+
+#endif
diff --git a/include/proc_parse.h b/include/proc_parse.h
index b4fb57d..4ec2425 100644
--- a/include/proc_parse.h
+++ b/include/proc_parse.h
@@ -87,6 +87,8 @@ struct proc_status_creds {
 	int			ppid;
 
 	int			seccomp_mode;
+	int			n_seccomp_filters;
+	SeccompFilter		*seccomp_filters;
 };
 
 bool proc_status_creds_eq(struct proc_status_creds *o1, struct proc_status_creds *o2);
diff --git a/include/pstree.h b/include/pstree.h
index a09e956..9fb6718 100644
--- a/include/pstree.h
+++ b/include/pstree.h
@@ -44,7 +44,6 @@ struct dmp_info {
 	 * threads. Dumping tasks with different creds is not supported.
 	 */
 	struct proc_status_creds *pi_creds;
-
 };
 
 static inline struct dmp_info *dmpi(struct pstree_item *i)
diff --git a/include/ptrace.h b/include/ptrace.h
index 079ad63..dc2d11c 100644
--- a/include/ptrace.h
+++ b/include/ptrace.h
@@ -7,6 +7,14 @@
 #include "config.h"
 #include "proc_parse.h"
 
+#ifndef PTRACE_SECCOMP_GET_FILTER_FD
+#define PTRACE_SECCOMP_GET_FILTER_FD	40
+#endif
+
+#ifndef PTRACE_SECCOMP_NEXT_FILTER
+#define PTRACE_SECCOMP_NEXT_FILTER	41
+#endif
+
 /* some constants for ptrace */
 #ifndef PTRACE_SEIZE
 # define PTRACE_SEIZE		0x4206
diff --git a/include/restorer.h b/include/restorer.h
index 56b9938..f19b447 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -137,6 +137,13 @@ struct task_restore_args {
 
 	pid_t				*zombies;
 	unsigned int			zombies_n;
+
+	/*
+	 * The bpf fds corresponding to the seccomp filters in the order the
+	 * filters are applied.
+	 */
+	int				*seccomp_filters;
+	unsigned int			seccomp_filters_n;
 	/* * * * * * * * * * * * * * * * * * * * */
 
 	unsigned long			task_size;
diff --git a/include/seccomp.h b/include/seccomp.h
index 017dcd4..8b4a143 100644
--- a/include/seccomp.h
+++ b/include/seccomp.h
@@ -13,4 +13,12 @@
 #define SECCOMP_MODE_FILTER 2
 #endif
 
+#ifndef SECCOMP_SET_MODE_FILTER
+#define SECCOMP_SET_MODE_FILTER 1
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_EBPF
+#define SECCOMP_FILTER_FLAG_EBPF (1 << 1)
+#endif
+
 #endif
diff --git a/pie/restorer.c b/pie/restorer.c
index b5121db..5b8aedc 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -334,17 +334,27 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
 	return 0;
 }
 
-static void restore_seccomp(int seccomp_mode)
+static void restore_seccomp(struct task_restore_args *args)
 {
-	switch (seccomp_mode) {
+	switch (args->seccomp_mode) {
 	case SECCOMP_MODE_DISABLED:
 		return;
 	case SECCOMP_MODE_STRICT:
 		if (sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0))
 			goto die;
 		return;
-	case SECCOMP_MODE_FILTER:
-		goto die;
+	case SECCOMP_MODE_FILTER: {
+		int i;
+
+		for (i = args->seccomp_filters_n - 1; i >= 0; i--) {
+			if (sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_EBPF,
+					(char *) (long) args->seccomp_filters[i]))
+				goto die;
+
+			sys_close(args->seccomp_filters[i]);
+		}
+		return;
+	}
 	default:
 		goto die;
 	}
@@ -445,7 +455,7 @@ long __export_restore_thread(struct thread_restore_args *args)
 	restore_finish_stage(CR_STATE_RESTORE_CREDS);
 	futex_dec_and_wake(&thread_inprogress);
 
-	restore_seccomp(args->ta->seccomp_mode);
+	restore_seccomp(args->ta);
 
 	new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;
 	rst_sigreturn(new_sp);
@@ -1283,7 +1293,7 @@ long __export_restore_task(struct task_restore_args *args)
 
 	restore_posix_timers(args);
 
-	restore_seccomp(args->seccomp_mode);
+	restore_seccomp(args);
 
 	sys_munmap(args->rst_mem, args->rst_mem_size);
 
diff --git a/proc_parse.c b/proc_parse.c
index cf70a92..778340c 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -823,11 +823,6 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
 				goto err_parse;
 			}
 
-			if (cr->seccomp_mode == SECCOMP_MODE_FILTER) {
-				pr_err("SECCOMP_MODE_FILTER not currently supported\n");
-				goto err_parse;
-			}
-
 			done++;
 		}
 	}
@@ -2111,6 +2106,10 @@ int aufs_parse(struct mount_info *new)
 
 bool proc_status_creds_eq(struct proc_status_creds *o1, struct proc_status_creds *o2)
 {
+	/* FIXME: this is a little too strict, we should do semantic comparison
+	 * of seccomp filters instead of forcing them to be exactly identical.
+	 * It's not unsafe, though, so let's be lazy for now.
+	 */
 	return memcmp(o1, o2, sizeof(struct proc_status_creds)) == 0;
 }
 
diff --git a/protobuf/core.proto b/protobuf/core.proto
index fd78f5c..3e0277c 100644
--- a/protobuf/core.proto
+++ b/protobuf/core.proto
@@ -19,6 +19,12 @@ enum seccomp_mode {
 	filter		= 2;
 };
 
+message seccomp_filter {
+	required bytes			filter		= 1;
+	required bool			is_gpl		= 2;
+	required uint64			prog_id		= 3;
+}
+
 message task_core_entry {
 	required uint32			task_state	= 1;
 	required uint32			exit_code	= 2;
@@ -37,6 +43,7 @@ message task_core_entry {
 	optional signal_queue_entry	signals_s	= 10;
 
 	optional seccomp_mode		seccomp_mode	= 11;
+	repeated seccomp_filter		seccomp_filters = 12;
 }
 
 message task_kobj_ids_entry {
-- 
2.5.0



More information about the CRIU mailing list