[CRIU] [PATCH v2 1/3] seccomp: add initial support for SECCOMP_MODE_STRICT
Tycho Andersen
tycho.andersen at canonical.com
Wed Jun 3 16:00:24 PDT 2015
Unfortunately, SECCOMP_MODE_FILTER is not currently exposed to userspace,
so we can't checkpoint that. In any case, this is what we need to do for
SECCOMP_MODE_STRICT, so let's do it.
This patch works by first disabling seccomp for any processes that are going
to have seccomp filters restored, then restoring the process (including the
seccomp filters), and finally resuming the seccomp filters before detaching
from the process.
v2 changes:
* update for kernel patch v2
* use protobuf enum for seccomp type
* don't parse /proc/pid/status twice
Signed-off-by: Tycho Andersen <tycho.andersen at canonical.com>
---
Makefile.config | 3 +++
cr-dump.c | 14 +++++++++---
cr-exec.c | 2 +-
cr-restore.c | 16 ++++++++++----
include/prctl.h | 3 +++
include/proc_parse.h | 2 ++
include/pstree.h | 6 ++++++
include/ptrace.h | 7 +++++-
include/restorer.h | 10 +++++++++
pie/restorer.c | 54 +++++++++++++++++++++++++++++++++++++++++++++--
proc_parse.c | 18 ++++++++++++++--
protobuf/core.proto | 11 ++++++++++
ptrace.c | 33 ++++++++++++++++++++++++++---
scripts/feature-tests.mak | 11 ++++++++++
14 files changed, 174 insertions(+), 16 deletions(-)
diff --git a/Makefile.config b/Makefile.config
index e1d2a3b..544e6ee 100644
--- a/Makefile.config
+++ b/Makefile.config
@@ -35,6 +35,9 @@ endif
ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),),y)
$(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@
endif
+ifeq ($(call try-cc,$(PTRACE_SUSPEND_SECCOMP_TEST),),y)
+ $(Q) @echo '#define CONFIG_HAS_SUSPEND_SECCOMP' >> $@
+endif
ifeq ($(VDSO),y)
$(Q) @echo '#define CONFIG_VDSO' >> $@
endif
diff --git a/cr-dump.c b/cr-dump.c
index f865967..dd94812 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -19,6 +19,8 @@
#include <sched.h>
#include <sys/resource.h>
+#include <linux/seccomp.h>
+
#include "protobuf.h"
#include "protobuf/fdinfo.pb-c.h"
#include "protobuf/fs.pb-c.h"
@@ -672,6 +674,12 @@ static int dump_task_core_all(struct pstree_item *item,
if (ret < 0)
goto err;
+ if (item->seccomp_mode != SECCOMP_MODE_DISABLED) {
+ pr_info("got seccomp mode %d for %d\n", item->seccomp_mode, item->pid.virt);
+ core->tc->has_seccomp_mode = true;
+ core->tc->seccomp_mode = item->seccomp_mode;
+ }
+
strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
core->tc->flags = stat->flags;
core->tc->task_state = item->state;
@@ -801,7 +809,7 @@ static int collect_children(struct pstree_item *item)
goto free;
}
- ret = seize_task(pid, item->pid.real);
+ ret = seize_task(pid, item->pid.real, &c->seccomp_mode);
if (ret < 0) {
/*
* Here is a race window between parse_children() and seize(),
@@ -913,7 +921,7 @@ static int collect_threads(struct pstree_item *item)
pr_info("\tSeizing %d's %d thread\n",
item->pid.real, pid);
- ret = seize_task(pid, item_ppid(item));
+ ret = seize_task(pid, item_ppid(item), NULL);
if (ret < 0) {
/*
* Here is a race window between parse_threads() and seize(),
@@ -1063,7 +1071,7 @@ static int collect_pstree(pid_t pid)
return -1;
root_item->pid.real = pid;
- ret = seize_task(pid, -1);
+ ret = seize_task(pid, -1, &root_item->seccomp_mode);
if (ret < 0)
goto err;
pr_info("Seized task %d, state %d\n", pid, ret);
diff --git a/cr-exec.c b/cr-exec.c
index 9f6ebfe..9d7162a 100644
--- a/cr-exec.c
+++ b/cr-exec.c
@@ -129,7 +129,7 @@ int cr_exec(int pid, char **opt)
goto out;
}
- prev_state = ret = seize_task(pid, -1);
+ prev_state = ret = seize_task(pid, -1, NULL);
if (ret < 0) {
pr_err("Can't seize task %d\n", pid);
goto out;
diff --git a/cr-restore.c b/cr-restore.c
index aa00dc2..d8331a4 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -24,6 +24,8 @@
#include <sys/sendfile.h>
+#include <linux/seccomp.h>
+
#include "ptrace.h"
#include "compiler.h"
#include "asm/types.h"
@@ -1135,7 +1137,6 @@ static inline int fork_with_pid(struct pstree_item *item)
goto err_unlock;
}
-
if (item == root_item) {
item->pid.real = ret;
pr_debug("PID: real %d virt %d\n",
@@ -1558,6 +1559,7 @@ static inline int stage_participants(int next_stage)
case CR_STATE_RESTORE_SIGCHLD:
return task_entries->nr_threads;
case CR_STATE_RESTORE_CREDS:
+ case CR_STATE_SECCOMP_SUSPENDED:
return task_entries->nr_threads;
}
@@ -1632,6 +1634,9 @@ static int attach_to_tasks(bool root_seized, enum trace_flags *flag)
return -1;
}
+ if (suspend_seccomp(pid) < 0)
+ return -1;
+
ret = ptrace_stop_pie(pid, rsti(item)->breakpoint, flag);
if (ret < 0)
return -1;
@@ -1847,14 +1852,15 @@ static int restore_root_task(struct pstree_item *init)
timing_stop(TIME_RESTORE);
ret = attach_to_tasks(root_as_sibling, &flag);
-
- pr_info("Restore finished successfully. Resuming tasks.\n");
- futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE);
+ futex_set_and_wake(&task_entries->start, CR_STATE_SECCOMP_SUSPENDED);
if (ret == 0)
ret = parasite_stop_on_syscall(task_entries->nr_threads,
__NR_rt_sigreturn, flag);
+ pr_info("Restore finished successfully. Resuming tasks.\n");
+ futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE);
+
if (clear_breakpoints())
pr_err("Unable to flush breakpoints\n");
@@ -2873,6 +2879,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
task_args->nr_rings = mm->n_aios;
task_args->rings = rst_mem_remap_ptr(aio_rings, RM_PRIVATE);
+ task_args->seccomp_mode = core->tc->seccomp_mode;
+
task_args->n_helpers = n_helpers;
if (n_helpers > 0)
task_args->helpers = rst_mem_remap_ptr(helpers_pos, RM_PRIVATE);
diff --git a/include/prctl.h b/include/prctl.h
index 441135e..bfede51 100644
--- a/include/prctl.h
+++ b/include/prctl.h
@@ -9,6 +9,9 @@
#ifndef PR_GET_NAME
# define PR_GET_NAME 16
#endif
+#ifndef PR_SET_SECCOMP
+# define PR_SET_SECCOMP 22
+#endif
#ifndef PR_CAPBSET_READ
# define PR_CAPBSET_READ 23
#endif
diff --git a/include/proc_parse.h b/include/proc_parse.h
index ebb5351..3daf9a2 100644
--- a/include/proc_parse.h
+++ b/include/proc_parse.h
@@ -84,6 +84,8 @@ struct proc_status_creds {
char state;
int ppid;
+
+ int seccomp_mode;
};
struct mount_info;
diff --git a/include/pstree.h b/include/pstree.h
index c0fdac6..e0a74dc 100644
--- a/include/pstree.h
+++ b/include/pstree.h
@@ -24,6 +24,12 @@ struct pstree_item {
int state; /* TASK_XXX constants */
+ /*
+ * We keep the seccomp mode here temporarily between seizing and
+ * dumping the task to avoid parsing /proc/pid/status twice.
+ */
+ int seccomp_mode;
+
int nr_threads; /* number of threads */
struct pid *threads; /* array of threads */
CoreEntry **core;
diff --git a/include/ptrace.h b/include/ptrace.h
index 0d89788..bb4411d 100644
--- a/include/ptrace.h
+++ b/include/ptrace.h
@@ -11,6 +11,10 @@
# define PTRACE_SEIZE 0x4206
#endif
+#ifndef PTRACE_O_SUSPEND_SECCOMP
+# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
+#endif
+
#ifndef PTRACE_INTERRUPT
# define PTRACE_INTERRUPT 0x4207
#endif
@@ -62,7 +66,8 @@ struct ptrace_peeksiginfo_args {
#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
-extern int seize_task(pid_t pid, pid_t ppid);
+extern int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode);
+extern int suspend_seccomp(pid_t pid);
extern int unseize_task(pid_t pid, int orig_state, int state);
extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
diff --git a/include/restorer.h b/include/restorer.h
index 34396e3..6332215 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -158,6 +158,8 @@ struct task_restore_args {
char *lsm_profile;
int lsm_profile_len;
+ int seccomp_mode;
+
#ifdef CONFIG_VDSO
unsigned long vdso_rt_size;
struct vdso_symtable vdso_sym_rt; /* runtime vdso symbols */
@@ -188,6 +190,14 @@ enum {
* some code.
*/
CR_STATE_RESTORE_CREDS,
+ /*
+ * We need to restore seccomp after all the tasks have been ptraced
+ * and seccomp temporarily suspended. Also, once ptrace detaches,
+ * seccomp is restored and any additional syscalls made by CRIU might
+ * be killed, so this should be the very last step before the final
+ * sigreturn.
+ */
+ CR_STATE_SECCOMP_SUSPENDED,
CR_STATE_COMPLETE
};
diff --git a/pie/restorer.c b/pie/restorer.c
index 8713c6a..9ef93ef 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -40,6 +40,18 @@
#define PR_SET_PDEATHSIG 1
#endif
+#ifndef SECCOMP_MODE_DISABLED
+#define SECCOMP_MODE_DISABLED 0
+#endif
+
+#ifndef SECCOMP_MODE_STRICT
+#define SECCOMP_MODE_STRICT 1
+#endif
+
+#ifndef SECCOMP_MODE_FILTER
+#define SECCOMP_MODE_FILTER 2
+#endif
+
#define sys_prctl_safe(opcode, val1, val2, val3) \
({ \
long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
@@ -759,6 +771,34 @@ static int lsm_set_label(struct task_restore_args *args)
return ret;
}
+/*
+ * Re-apply the checkpointed seccomp mode to the calling task.
+ *
+ * @pid is used only for logging. Returns 0 when seccomp was disabled or
+ * strict mode was set successfully, and -1 when the prctl call fails or
+ * the mode cannot be restored (SECCOMP_MODE_FILTER or an unknown value).
+ */
+static int prepare_seccomp(pid_t pid, int seccomp_mode)
+{
+	int ret;
+
+	if (seccomp_mode == SECCOMP_MODE_DISABLED)
+		return 0;
+
+	pr_info("restoring seccomp mode %d for %d\n", seccomp_mode, pid);
+
+	switch (seccomp_mode) {
+	case SECCOMP_MODE_STRICT:
+		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+		if (ret) {
+			/* Newline-terminate so the message does not run into the next log line. */
+			pr_err("setting seccomp failed %d\n", ret);
+			return -1;
+		}
+		break;
+	case SECCOMP_MODE_FILTER:
+		pr_err("seccomp mode 2 not supported\n");
+		return -1;
+	default:
+		pr_err("unknown seccomp mode %d\n", seccomp_mode);
+		return -1;
+	}
+
+	return 0;
+}
+
+
/*
* The main routine to restore task via sigreturn.
* This one is very special, we never return there
@@ -1194,8 +1234,6 @@ long __export_restore_task(struct task_restore_args *args)
/* Wait until children stop to use args->task_entries */
futex_wait_while_gt(&thread_inprogress, 1);
- log_set_fd(-1);
-
/*
* The code that prepared the itimers makes shure the
* code below doesn't fail due to bad timing values.
@@ -1214,6 +1252,18 @@ long __export_restore_task(struct task_restore_args *args)
restore_posix_timers(args);
+ /*
+ * Finally, restore seccomp just before the final sigreturn. A slight
+ * abuse of the stage mechanism here: usually we wait for all the
+ * children to be done, but in this case we're waiting for the parent
+ * to switch to the SECCOMP_SUSPENDED stage to indicate that it is
+ * safe for us to restore seccomp.
+ */
+ futex_wait_while(&task_entries->start, CR_STATE_RESTORE_CREDS);
+ prepare_seccomp(sys_getpid(), args->seccomp_mode);
+
+ log_set_fd(-1);
+
sys_munmap(args->rst_mem, args->rst_mem_size);
/*
diff --git a/proc_parse.c b/proc_parse.c
index e6ea957..769ac3c 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -9,6 +9,7 @@
#include <string.h>
#include <ctype.h>
#include <linux/fs.h>
+#include <linux/seccomp.h>
#include "asm/types.h"
#include "list.h"
@@ -763,7 +764,7 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
if (bfdopenr(&f))
return -1;
- while (done < 8) {
+ while (done < 9) {
str = breadline(&f);
if (str == NULL)
break;
@@ -824,9 +825,22 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
done++;
}
+
+ if (!strncmp(str, "Seccomp:", 8)) {
+ if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) {
+ goto err_parse;
+ }
+
+ if (cr->seccomp_mode == SECCOMP_MODE_FILTER) {
+ pr_err("SECCOMP_MODE_FILTER not currently supported\n");
+ goto err_parse;
+ }
+
+ done++;
+ }
}
- if (done == 8)
+ if (done == 9)
ret = 0;
err_parse:
diff --git a/protobuf/core.proto b/protobuf/core.proto
index 9f70da9..fd78f5c 100644
--- a/protobuf/core.proto
+++ b/protobuf/core.proto
@@ -10,6 +10,15 @@ import "siginfo.proto";
import "opts.proto";
+/*
+ * These match the SECCOMP_MODE_* flags from <linux/seccomp.h>.
+ */
+enum seccomp_mode {
+ disabled = 0;
+ strict = 1;
+ filter = 2;
+};
+
message task_core_entry {
required uint32 task_state = 1;
required uint32 exit_code = 2;
@@ -26,6 +35,8 @@ message task_core_entry {
optional uint32 cg_set = 9;
optional signal_queue_entry signals_s = 10;
+
+ optional seccomp_mode seccomp_mode = 11;
}
message task_kobj_ids_entry {
diff --git a/ptrace.c b/ptrace.c
index be6b67b..4448a26 100644
--- a/ptrace.c
+++ b/ptrace.c
@@ -8,11 +8,14 @@
#include <limits.h>
#include <signal.h>
+#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
+#include <linux/seccomp.h>
+
#include "compiler.h"
#include "asm/types.h"
#include "util.h"
@@ -30,14 +33,32 @@ int unseize_task(pid_t pid, int orig_st, int st)
else if (st == TASK_STOPPED) {
if (orig_st == TASK_ALIVE)
kill(pid, SIGSTOP);
- } else if (st == TASK_ALIVE)
+ } else if (st == TASK_ALIVE) {
/* do nothing */ ;
- else
+ } else
pr_err("Unknown final state %d\n", st);
return ptrace(PTRACE_DETACH, pid, NULL, NULL);
}
+/*
+ * Suspend seccomp for the traced task @pid by setting the
+ * PTRACE_O_SUSPEND_SECCOMP option through PTRACE_SETOPTIONS.
+ * Returns 0 on success and -1 (after logging) on failure.
+ */
+#ifdef CONFIG_HAS_SUSPEND_SECCOMP
+int suspend_seccomp(pid_t pid)
+{
+ if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
+ pr_perror("suspending seccomp failed");
+ return -1;
+ }
+
+ return 0;
+}
+#else
+/*
+ * Fallback used when CONFIG_HAS_SUSPEND_SECCOMP is not defined: logs an
+ * error and always returns -1.
+ * NOTE(review): the callers in this patch invoke suspend_seccomp()
+ * unconditionally, so this variant appears to fail every seize/attach
+ * even for tasks that do not use seccomp -- confirm this is intended.
+ */
+int suspend_seccomp(pid_t pid)
+{
+ pr_err("seccomp enabled and seccomp suspending not supported\n");
+ return -1;
+}
+#endif
+
/*
* This routine seizes task putting it into a special
* state where we can manipulate the task via ptrace
@@ -46,7 +67,7 @@ int unseize_task(pid_t pid, int orig_st, int st)
* up with someone else.
*/
-int seize_task(pid_t pid, pid_t ppid)
+int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode)
{
siginfo_t si;
int status;
@@ -90,6 +111,9 @@ try_again:
if (ret2)
goto err;
+ if (seccomp_mode)
+ *seccomp_mode = cr.seccomp_mode;
+
if (!may_dump(&cr)) {
pr_err("Check uid (pid: %d) failed\n", pid);
goto err;
@@ -142,6 +166,9 @@ try_again:
goto try_again;
}
+ if (suspend_seccomp(pid))
+ goto err_stop;
+
if (si.si_signo == SIGTRAP)
return TASK_ALIVE;
else if (si.si_signo == SIGSTOP) {
diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak
index 519eb52..ec7972a 100644
--- a/scripts/feature-tests.mak
+++ b/scripts/feature-tests.mak
@@ -92,3 +92,14 @@ int main(int argc, char *argv[], char *envp[])
}
endef
+
+define PTRACE_SUSPEND_SECCOMP_TEST
+
+#include <linux/ptrace.h>
+
+int main(void)
+{
+ return PTRACE_O_SUSPEND_SECCOMP;
+}
+
+endef
--
2.1.4
More information about the CRIU
mailing list