[CRIU] [PATCH 19/28] seccomp: Add restore of per-thread filters
Cyrill Gorcunov
gorcunov at gmail.com
Wed Mar 21 00:43:04 MSK 2018
From: Cyrill Gorcunov <gorcunov at virtuozzo.com>
https://jira.sw.ru/browse/PSBM-78762
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
---
criu/cr-restore.c | 10 ++--
criu/include/restorer.h | 15 ++++-
criu/include/seccomp.h | 9 ++-
criu/pie/restorer.c | 106 +++++++++++++++++++++-------------
criu/seccomp.c | 149 ++++++++++++++++++++++++++++++++++--------------
5 files changed, 195 insertions(+), 94 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index b9f8de5e82b1..3025ec4032c2 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -331,7 +331,7 @@ static int root_prepare_shared(void)
if (prepare_remaps())
return -1;
- if (prepare_seccomp_filters())
+ if (seccomp_read_image())
return -1;
if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
@@ -1031,7 +1031,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
if (prepare_timerfds(ta))
return -1;
- if (seccomp_filters_get_rst_pos(core, ta) < 0)
+ if (seccomp_prepare_threads(current, ta) < 0)
return -1;
if (prepare_itimers(pid, ta, core) < 0)
@@ -3668,12 +3668,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
RST_MEM_FIXUP_PPTR(task_args->rlims);
RST_MEM_FIXUP_PPTR(task_args->helpers);
RST_MEM_FIXUP_PPTR(task_args->zombies);
- RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
RST_MEM_FIXUP_PPTR(task_args->vma_ios);
- if (core->thread_core->has_seccomp_mode)
- task_args->seccomp_mode = core->thread_core->seccomp_mode;
-
task_args->compatible_mode = core_is_compat(core);
if (opts.check_only)
@@ -3763,6 +3759,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
if (ret)
goto err;
+ seccomp_rst_reloc(&thread_args[i]);
+
thread_args[i].mz = mz + i;
sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 15307d9c0701..3767d9d25088 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -4,6 +4,7 @@
#include <signal.h>
#include <limits.h>
#include <sys/resource.h>
+#include <linux/filter.h>
#include "common/config.h"
#include "types.h"
@@ -76,6 +77,11 @@ struct thread_creds_args {
unsigned long mem_pos_next;
};
+struct thread_seccomp_filter {
+ struct sock_fprog sock_fprog;
+ unsigned int flags;
+};
+
struct thread_restore_args {
struct restore_mem_zone *mz;
@@ -100,6 +106,12 @@ struct thread_restore_args {
bool check_only;
struct thread_creds_args *creds_args;
+
+ int seccomp_mode;
+ unsigned long seccomp_filters_pos;
+ struct thread_seccomp_filter *seccomp_filters;
+ void *seccomp_filters_data;
+ unsigned int seccomp_filters_n;
} __aligned(64);
typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
@@ -163,9 +175,6 @@ struct task_restore_args {
pid_t *zombies;
unsigned int zombies_n;
- struct sock_fprog *seccomp_filters;
- unsigned int seccomp_filters_n;
-
/* * * * * * * * * * * * * * * * * * * * */
unsigned long task_size;
diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
index 47c24c9719c1..1808e3d610c3 100644
--- a/criu/include/seccomp.h
+++ b/criu/include/seccomp.h
@@ -27,6 +27,8 @@
#define SECCOMP_FILTER_FLAG_TSYNC 1
#endif
+struct thread_restore_args;
+struct task_restore_args;
struct pstree_item;
struct rb_node;
@@ -69,7 +71,8 @@ extern void seccomp_free_entries(void);
extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
extern int seccomp_collect_dump_filters(void);
-extern int prepare_seccomp_filters(void);
-struct task_restore_args;
-extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
+extern int seccomp_read_image(void);
+extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
+extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
+
#endif
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 091026103805..5ede206c55ef 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -395,54 +395,82 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
return 0;
}
-static int restore_seccomp(struct task_restore_args *args)
+static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
{
+ size_t i;
int ret;
- switch (args->seccomp_mode) {
- case SECCOMP_MODE_DISABLED:
- return 0;
- case SECCOMP_MODE_STRICT:
- ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+ for (i = 0; i < args->seccomp_filters_n; i++) {
+ struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
+
+ ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags, (void *)&filter->sock_fprog);
if (ret < 0) {
- pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
- goto die;
+ if (ret == -ENOSYS) {
+ pr_debug("seccomp: sys_seccomp is not supported in kernel, "
+ "switching to prctl interface\n");
+ ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+ (long)(void *)&filter->sock_fprog, 0, 0);
+ if (ret) {
+ pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
+ ret, tid);
+ return -1;
+ }
+ } else {
+ pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
+ ret, tid);
+ return -1;
+ }
}
- return 0;
- case SECCOMP_MODE_FILTER: {
- int i;
- void *filter_data;
-
- filter_data = &args->seccomp_filters[args->seccomp_filters_n];
-
- for (i = 0; i < args->seccomp_filters_n; i++) {
- struct sock_fprog *fprog = &args->seccomp_filters[i];
+ }
- fprog->filter = filter_data;
+ return 0;
+}
- /* We always TSYNC here, since we require that the
- * creds for all threads be the same; this means we
- * don't have to restore_seccomp() in threads, and that
- * future TSYNC behavior will be correct.
- */
- ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
- if (ret < 0) {
- pr_err("sys_seccomp() returned %d\n", ret);
- goto die;
- }
+static int restore_seccomp(struct thread_restore_args *args)
+{
+ pid_t tid = 0;
+ int ret, i;
- filter_data += fprog->len * sizeof(struct sock_filter);
+ for (i = 0; i < MAX_NS_NESTING; i++) {
+ if (args->pid[i] == 0) {
+ tid = args->pid[i - 1];
+ break;
}
+ }
- return 0;
+ if (tid != sys_gettid()) {
+ pr_err("seccomp: Unexpected tid %d != %d\n",
+ tid, (pid_t)sys_gettid());
+ return -1;
}
+
+ switch (args->seccomp_mode) {
+ case SECCOMP_MODE_DISABLED:
+ return 0;
+ break;
+ case SECCOMP_MODE_STRICT:
+ ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+ if (ret < 0) {
+ pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
+ ret, tid);
+ }
+ break;
+ case SECCOMP_MODE_FILTER:
+ ret = restore_seccomp_filter(tid, args);
+ break;
default:
- goto die;
+ pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
+ args->seccomp_mode, tid);
+ ret = -1;
+ break;
}
- return 0;
-die:
- return -1;
+ if (!ret) {
+ pr_debug("seccomp: Restored mode %d on tid %d\n",
+ args->seccomp_mode, tid);
+ }
+
+ return ret;
}
static int restore_robust_futex(struct thread_restore_args *args)
@@ -541,6 +569,10 @@ long __export_restore_thread(struct thread_restore_args *args)
sys_close(fd);
}
+ /* Make sure it's before creds restore */
+ if (restore_seccomp(args))
+ goto core_restore_end;
+
ret = restore_creds(args->creds_args, args->ta->proc_fd);
if (ret)
goto core_restore_end;
@@ -559,9 +591,6 @@ long __export_restore_thread(struct thread_restore_args *args)
restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
restore_pdeath_sig(args);
- if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
- pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
-
restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
futex_dec_and_wake(&thread_inprogress);
@@ -1680,8 +1709,7 @@ long __export_restore_task(struct task_restore_args *args)
/* The kernel restricts setting seccomp to uid 0 in the current user
* ns, so we must do this before restore_creds.
*/
- pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
- if (restore_seccomp(args))
+ if (restore_seccomp(args->t))
goto core_restore_end;
/*
diff --git a/criu/seccomp.c b/criu/seccomp.c
index a4304e645de7..dd608265731a 100644
--- a/criu/seccomp.c
+++ b/criu/seccomp.c
@@ -24,6 +24,8 @@
static struct rb_root seccomp_tid_rb_root = RB_ROOT;
static struct seccomp_entry *seccomp_tid_entry_root;
+static SeccompEntry *seccomp_img_entry;
+
struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory)
{
struct seccomp_entry *entry = NULL;
@@ -293,10 +295,8 @@ int seccomp_collect_dump_filters(void)
return 0;
}
-/* Populated on restore by prepare_seccomp_filters */
-static SeccompEntry *se;
-
-int prepare_seccomp_filters(void)
+/* The seccomp_img_entry will be shared between all children */
+int seccomp_read_image(void)
{
struct cr_img *img;
int ret;
@@ -305,66 +305,129 @@ int prepare_seccomp_filters(void)
if (!img)
return -1;
- ret = pb_read_one_eof(img, &se, PB_SECCOMP);
+ ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
close_image(img);
if (ret <= 0)
return 0; /* there were no filters */
- BUG_ON(!se);
+ BUG_ON(!seccomp_img_entry);
return 0;
}
-int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
+/* seccomp_img_entry will be freed per-children after forking */
+static void free_seccomp_filters(void)
{
- SeccompFilter *sf = NULL;
- struct sock_fprog *arr = NULL;
- void *filter_data = NULL;
- int ret = -1, i, n_filters;
- size_t filter_size = 0;
+ if (seccomp_img_entry) {
+ seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
+ seccomp_img_entry = NULL;
+ }
+}
- ta->seccomp_filters_n = 0;
+void seccomp_rst_reloc(struct thread_restore_args *args)
+{
+ size_t j, off;
- if (!core->thread_core->has_seccomp_filter)
- return 0;
+ if (!args->seccomp_filters_n)
+ return;
- ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
+ args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
+ args->seccomp_filters_data = (void *)args->seccomp_filters +
+ args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
- BUG_ON(core->thread_core->seccomp_filter > se->n_seccomp_filters);
- sf = se->seccomp_filters[core->thread_core->seccomp_filter];
+ for (j = off = 0; j < args->seccomp_filters_n; j++) {
+ struct thread_seccomp_filter *f = &args->seccomp_filters[j];
- while (1) {
- ta->seccomp_filters_n++;
- filter_size += sf->filter.len;
+ f->sock_fprog.filter = args->seccomp_filters_data + off;
+ off += f->sock_fprog.len * sizeof(struct sock_filter);
+ }
+}
- if (!sf->has_prev)
- break;
+int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
+{
+ struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
+ size_t i, j, nr_filters, filters_size, rst_size, off;
- sf = se->seccomp_filters[sf->prev];
- }
+ for (i = 0; i < item->nr_threads; i++) {
+ ThreadCoreEntry *thread_core = item->core[i]->thread_core;
+ struct thread_restore_args *args = &args_array[i];
+ SeccompFilter *sf;
- n_filters = ta->seccomp_filters_n;
- arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
- if (!arr)
- goto out;
+ args->seccomp_mode = SECCOMP_MODE_DISABLED;
+ args->seccomp_filters_pos = 0;
+ args->seccomp_filters_n = 0;
+ args->seccomp_filters = NULL;
+ args->seccomp_filters_data = NULL;
- filter_data = &arr[n_filters];
- sf = se->seccomp_filters[core->thread_core->seccomp_filter];
- for (i = 0; i < n_filters; i++) {
- struct sock_fprog *fprog = &arr[i];
+ if (thread_core->has_seccomp_mode)
+ args->seccomp_mode = thread_core->seccomp_mode;
- BUG_ON(sf->filter.len % sizeof(struct sock_filter));
- fprog->len = sf->filter.len / sizeof(struct sock_filter);
+ if (args->seccomp_mode != SECCOMP_MODE_FILTER)
+ continue;
- memcpy(filter_data, sf->filter.data, sf->filter.len);
+ if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
+ pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+ item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
+ seccomp_img_entry->n_seccomp_filters);
+ return -1;
+ }
- filter_data += sf->filter.len;
- sf = se->seccomp_filters[sf->prev];
- }
+ sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+ if (sf->filter.len % (sizeof(struct sock_filter))) {
+ pr_err("Corrupted filter len on tid %d (index %u)\n",
+ item->threads[i]->ns[0].virt,
+ thread_core->seccomp_filter);
+ return -1;
+ }
+ filters_size = sf->filter.len;
+ nr_filters = 1;
+
+ while (sf->has_prev) {
+ if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
+ pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+ item->threads[i]->ns[0].virt, sf->prev,
+ seccomp_img_entry->n_seccomp_filters);
+ return -1;
+ }
+
+ sf = seccomp_img_entry->seccomp_filters[sf->prev];
+ if (sf->filter.len % (sizeof(struct sock_filter))) {
+ pr_err("Corrupted filter len on tid %d (index %u)\n",
+ item->threads[i]->ns[0].virt, sf->prev);
+ return -1;
+ }
+ filters_size += sf->filter.len;
+ nr_filters++;
+ }
- ret = 0;
+ args->seccomp_filters_n = nr_filters;
-out:
- seccomp_entry__free_unpacked(se, NULL);
- return ret;
+ rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
+ args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
+ args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
+ if (!args->seccomp_filters) {
+ pr_err("Can't allocate %zu bytes for filters on tid %d\n",
+ rst_size, item->threads[i]->ns[0].virt);
+ return -ENOMEM;
+ }
+ args->seccomp_filters_data = (void *)args->seccomp_filters +
+ nr_filters * sizeof(struct thread_seccomp_filter);
+
+ sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+ for (j = off = 0; j < nr_filters; j++) {
+ struct thread_seccomp_filter *f = &args->seccomp_filters[j];
+
+ f->sock_fprog.len = sf->filter.len / sizeof(struct sock_filter);
+ f->sock_fprog.filter = args->seccomp_filters_data + off;
+ f->flags = sf->flags;
+
+ memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
+
+ off += sf->filter.len;
+ sf = seccomp_img_entry->seccomp_filters[sf->prev];
+ }
+ }
+
+ free_seccomp_filters();
+ return 0;
}
--
2.14.3
More information about the CRIU
mailing list