[CRIU] [PATCH 05/10] seccomp: Add engine to restore per-thread seccomp chains
Cyrill Gorcunov
gorcunov at gmail.com
Mon May 7 11:42:45 MSK 2018
Currently we pretend that all threads share seccomp chains,
and at checkpoint time we test the seccomp modes to make sure
this assumption is valid, refusing to dump otherwise.
Still, the kernel tracks seccomp filter chains per thread,
and now we've faced applications (such as java) where per-thread
chains are actively used. Thus we need to add support for handling
filters on a per-thread basis.
In this somewhat intrusive patch the restore engine is reworked
to treat each thread separately. Here is what is done:
- Image core file is modified to keep seccomp filters
inside thread_core_entry. For backward compatibility the
former seccomp_mode and seccomp_filter members in
task_core_entry are renamed to have an old_ prefix, and
on restore we test whether we're dealing with old images.
Since per-thread dump is not yet implemented, the
dumping procedure continues operating with the old_ members.
- In pie restorer code the memory containing filters is addressed
from inside the thread_restore_args structure, which now
contains the seccomp mode itself and the chain attributes
(number of filters, etc.).
Reading of per-thread data is done in the seccomp_prepare_threads
helper -- we take one pstree_item and walk over every thread
inside to allocate pie memory and pin data there.
Because of PIE specifics, before jumping into pie code
we have to relocate this memory to a new place;
seccomp_rst_reloc serves this purpose.
In the restorer itself we check whether thread_restore_args carries
an enabled seccomp mode (strict or filter) and call
restore_seccomp_filter if needed.
- To unify names we start using the seccomp_ prefix for all related
stuff involved in this change (prepare_seccomp_filters is renamed
to seccomp_read_image because it only reads the image and nothing
more; the image handler is renamed to seccomp_img_entry instead
of the too short 'se').
With this change we're now able to start collecting and
dumping seccomp filters per thread, which will be
done in the next patch.
Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
criu/cr-dump.c | 8 +--
criu/cr-restore.c | 33 ++++++++---
criu/include/restorer.h | 16 +++++-
criu/include/rst_info.h | 5 ++
criu/include/seccomp.h | 10 +++-
criu/pie/restorer.c | 119 +++++++++++++++++++++++++-------------
criu/seccomp.c | 149 ++++++++++++++++++++++++++++++++++--------------
images/core.proto | 8 ++-
8 files changed, 245 insertions(+), 103 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index b48a38b8b260..9fcf3b7c5e00 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -762,12 +762,12 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
creds = dmpi(item)->pi_creds;
if (creds->s.seccomp_mode != SECCOMP_MODE_DISABLED) {
pr_info("got seccomp mode %d for %d\n", creds->s.seccomp_mode, vpid(item));
- core->tc->has_seccomp_mode = true;
- core->tc->seccomp_mode = creds->s.seccomp_mode;
+ core->tc->has_old_seccomp_mode = true;
+ core->tc->old_seccomp_mode = creds->s.seccomp_mode;
if (creds->s.seccomp_mode == SECCOMP_MODE_FILTER) {
- core->tc->has_seccomp_filter = true;
- core->tc->seccomp_filter = creds->last_filter;
+ core->tc->has_old_seccomp_filter = true;
+ core->tc->old_seccomp_filter = creds->last_filter;
}
}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 54216b8ea359..a7a232b2e028 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -331,7 +331,7 @@ static int root_prepare_shared(void)
if (prepare_remaps())
return -1;
- if (prepare_seccomp_filters())
+ if (seccomp_read_image())
return -1;
if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
@@ -1031,7 +1031,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
if (prepare_timerfds(ta))
return -1;
- if (seccomp_filters_get_rst_pos(core, ta) < 0)
+ if (seccomp_prepare_threads(current, ta) < 0)
return -1;
if (prepare_itimers(pid, ta, core) < 0)
@@ -1236,6 +1236,21 @@ static int check_core(CoreEntry *core, struct pstree_item *me)
pr_err("Core info data missed for non-zombie\n");
goto out;
}
+
+ /*
+ * Seccomp are moved to per-thread origin,
+ * so for old images we need to move per-task
+ * data into proper place.
+ */
+ if (core->tc->has_old_seccomp_mode) {
+ core->thread_core->has_seccomp_mode = core->tc->has_old_seccomp_mode;
+ core->thread_core->seccomp_mode = core->tc->old_seccomp_mode;
+ }
+ if (core->tc->has_old_seccomp_filter) {
+ core->thread_core->has_seccomp_filter = core->tc->has_old_seccomp_filter;
+ core->thread_core->seccomp_filter = core->tc->old_seccomp_filter;
+ rsti(me)->has_old_seccomp_filter = true;
+ }
}
ret = 0;
@@ -1511,13 +1526,16 @@ static inline int fork_with_pid(struct pstree_item *item)
item->pid->state = ca.core->tc->task_state;
rsti(item)->cg_set = ca.core->tc->cg_set;
- rsti(item)->has_seccomp = ca.core->tc->seccomp_mode != SECCOMP_MODE_DISABLED;
-
if (item->pid->state != TASK_DEAD && !task_alive(item)) {
pr_err("Unknown task state %d\n", item->pid->state);
return -1;
}
+ if (item->pid->state != TASK_DEAD)
+ rsti(item)->has_seccomp = ca.core->thread_core->seccomp_mode != SECCOMP_MODE_DISABLED;
+ else
+ rsti(item)->has_seccomp = false;
+
if (unlikely(item == root_item))
maybe_clone_parent(item, &ca);
} else {
@@ -3660,12 +3678,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
RST_MEM_FIXUP_PPTR(task_args->rlims);
RST_MEM_FIXUP_PPTR(task_args->helpers);
RST_MEM_FIXUP_PPTR(task_args->zombies);
- RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
RST_MEM_FIXUP_PPTR(task_args->vma_ios);
- if (core->tc->has_seccomp_mode)
- task_args->seccomp_mode = core->tc->seccomp_mode;
-
task_args->compatible_mode = core_is_compat(core);
if (opts.check_only)
@@ -3755,6 +3769,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
if (ret)
goto err;
+ seccomp_rst_reloc(&thread_args[i]);
+ thread_args[i].seccomp_force_tsync = rsti(current)->has_old_seccomp_filter;
+
thread_args[i].mz = mz + i;
sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 15307d9c0701..b75d6687ba2d 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -4,6 +4,7 @@
#include <signal.h>
#include <limits.h>
#include <sys/resource.h>
+#include <linux/filter.h>
#include "common/config.h"
#include "types.h"
@@ -76,6 +77,11 @@ struct thread_creds_args {
unsigned long mem_pos_next;
};
+struct thread_seccomp_filter {
+ struct sock_fprog sock_fprog;
+ unsigned int flags;
+};
+
struct thread_restore_args {
struct restore_mem_zone *mz;
@@ -100,6 +106,13 @@ struct thread_restore_args {
bool check_only;
struct thread_creds_args *creds_args;
+
+ int seccomp_mode;
+ unsigned long seccomp_filters_pos;
+ struct thread_seccomp_filter *seccomp_filters;
+ void *seccomp_filters_data;
+ unsigned int seccomp_filters_n;
+ bool seccomp_force_tsync;
} __aligned(64);
typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
@@ -163,9 +176,6 @@ struct task_restore_args {
pid_t *zombies;
unsigned int zombies_n;
- struct sock_fprog *seccomp_filters;
- unsigned int seccomp_filters_n;
-
/* * * * * * * * * * * * * * * * * * * * */
unsigned long task_size;
diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
index f9840d1681ff..07c634f4adfd 100644
--- a/criu/include/rst_info.h
+++ b/criu/include/rst_info.h
@@ -61,6 +61,11 @@ struct rst_info {
* restorer blob.
*/
bool has_seccomp;
+ /*
+ * To be compatible with old images where filters
+ * are bound to group leader and we need to use tsync flag.
+ */
+ bool has_old_seccomp_filter;
bool has_thp_enabled;
diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
index b50ea34e20bb..0791597fefd6 100644
--- a/criu/include/seccomp.h
+++ b/criu/include/seccomp.h
@@ -27,6 +27,9 @@
#define SECCOMP_FILTER_FLAG_TSYNC 1
#endif
+struct thread_restore_args;
+struct task_restore_args;
+
struct seccomp_info {
struct seccomp_info *prev;
int id;
@@ -35,6 +38,9 @@ struct seccomp_info {
extern int collect_seccomp_filters(void);
extern int prepare_seccomp_filters(void);
-struct task_restore_args;
-extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
+
+extern int seccomp_read_image(void);
+extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
+extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
+
#endif
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index ab0c0d713216..2ba0bcf2f72f 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -395,54 +395,87 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
return 0;
}
-static int restore_seccomp(struct task_restore_args *args)
+static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
{
+ unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
+ size_t i;
int ret;
- switch (args->seccomp_mode) {
- case SECCOMP_MODE_DISABLED:
- return 0;
- case SECCOMP_MODE_STRICT:
- ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
- if (ret < 0) {
- pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
- goto die;
- }
- return 0;
- case SECCOMP_MODE_FILTER: {
- int i;
- void *filter_data;
+ for (i = 0; i < args->seccomp_filters_n; i++) {
+ struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
- filter_data = &args->seccomp_filters[args->seccomp_filters_n];
+ pr_debug("seccomp: Restoring mode %d flags %x on tid %d filter %d\n",
+ SECCOMP_SET_MODE_FILTER, (filter->flags | flags), tid, (int)i);
- for (i = 0; i < args->seccomp_filters_n; i++) {
- struct sock_fprog *fprog = &args->seccomp_filters[i];
+ ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags | flags, (void *)&filter->sock_fprog);
+ if (ret < 0) {
+ if (ret == -ENOSYS) {
+ pr_debug("seccomp: sys_seccomp is not supported in kernel, "
+ "switching to prctl interface\n");
+ ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+ (long)(void *)&filter->sock_fprog, 0, 0);
+ if (ret) {
+ pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
+ ret, tid);
+ return -1;
+ }
+ } else {
+ pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
+ ret, tid);
+ return -1;
+ }
+ }
+ }
- fprog->filter = filter_data;
+ return 0;
+}
- /* We always TSYNC here, since we require that the
- * creds for all threads be the same; this means we
- * don't have to restore_seccomp() in threads, and that
- * future TSYNC behavior will be correct.
- */
- ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
- if (ret < 0) {
- pr_err("sys_seccomp() returned %d\n", ret);
- goto die;
- }
+static int restore_seccomp(struct thread_restore_args *args)
+{
+ pid_t tid = 0;
+ int ret, i;
- filter_data += fprog->len * sizeof(struct sock_filter);
+ for (i = 0; i < MAX_NS_NESTING; i++) {
+ if (args->pid[i] == 0) {
+ tid = args->pid[i - 1];
+ break;
}
+ }
- return 0;
+ if (tid != sys_gettid()) {
+ pr_err("seccomp: Unexpected tid %d != %d\n",
+ tid, (pid_t)sys_gettid());
+ return -1;
}
+
+ switch (args->seccomp_mode) {
+ case SECCOMP_MODE_DISABLED:
+ pr_debug("seccomp: mode %d on tid %d\n", SECCOMP_MODE_DISABLED, tid);
+ return 0;
+ break;
+ case SECCOMP_MODE_STRICT:
+ ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+ if (ret < 0) {
+ pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
+ ret, tid);
+ }
+ break;
+ case SECCOMP_MODE_FILTER:
+ ret = restore_seccomp_filter(tid, args);
+ break;
default:
- goto die;
+ pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
+ args->seccomp_mode, tid);
+ ret = -1;
+ break;
}
- return 0;
-die:
- return -1;
+ if (!ret) {
+ pr_debug("seccomp: Restored mode %d on tid %d\n",
+ args->seccomp_mode, tid);
+ }
+
+ return ret;
}
static int restore_robust_futex(struct thread_restore_args *args)
@@ -541,6 +574,13 @@ long __export_restore_thread(struct thread_restore_args *args)
sys_close(fd);
}
+ /*
+ * Make sure it's before creds, since it's privileged
+ * operation bound to uid 0 in current user ns.
+ */
+ if (restore_seccomp(args))
+ goto core_restore_end;
+
ret = restore_creds(args->creds_args, args->ta->proc_fd);
if (ret)
goto core_restore_end;
@@ -559,9 +599,6 @@ long __export_restore_thread(struct thread_restore_args *args)
restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
restore_pdeath_sig(args);
- if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
- pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
-
restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
futex_dec_and_wake(&thread_inprogress);
@@ -1677,11 +1714,11 @@ long __export_restore_task(struct task_restore_args *args)
sys_close(fd);
}
- /* The kernel restricts setting seccomp to uid 0 in the current user
- * ns, so we must do this before restore_creds.
+ /*
+ * Make sure it's before creds, since it's privileged
+ * operation bound to uid 0 in current user ns.
*/
- pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
- if (restore_seccomp(args))
+ if (restore_seccomp(args->t))
goto core_restore_end;
/*
diff --git a/criu/seccomp.c b/criu/seccomp.c
index c8cd35f9ae46..8da5a2932e83 100644
--- a/criu/seccomp.c
+++ b/criu/seccomp.c
@@ -21,6 +21,8 @@
#undef LOG_PREFIX
#define LOG_PREFIX "seccomp: "
+static SeccompEntry *seccomp_img_entry;
+
/* populated on dump during collect_seccomp_filters() */
static int next_filter_id = 0;
static struct seccomp_info **filters = NULL;
@@ -233,10 +235,8 @@ int collect_seccomp_filters(void)
return 0;
}
-/* Populated on restore by prepare_seccomp_filters */
-static SeccompEntry *se;
-
-int prepare_seccomp_filters(void)
+/* The seccomp_img_entry will be shared between all children */
+int seccomp_read_image(void)
{
struct cr_img *img;
int ret;
@@ -245,66 +245,129 @@ int prepare_seccomp_filters(void)
if (!img)
return -1;
- ret = pb_read_one_eof(img, &se, PB_SECCOMP);
+ ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
close_image(img);
if (ret <= 0)
return 0; /* there were no filters */
- BUG_ON(!se);
+ BUG_ON(!seccomp_img_entry);
return 0;
}
-int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
+/* seccomp_img_entry will be freed per-children after forking */
+static void free_seccomp_filters(void)
{
- SeccompFilter *sf = NULL;
- struct sock_fprog *arr = NULL;
- void *filter_data = NULL;
- int ret = -1, i, n_filters;
- size_t filter_size = 0;
+ if (seccomp_img_entry) {
+ seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
+ seccomp_img_entry = NULL;
+ }
+}
- ta->seccomp_filters_n = 0;
+void seccomp_rst_reloc(struct thread_restore_args *args)
+{
+ size_t j, off;
- if (!core->tc->has_seccomp_filter)
- return 0;
+ if (!args->seccomp_filters_n)
+ return;
- ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
+ args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
+ args->seccomp_filters_data = (void *)args->seccomp_filters +
+ args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
- BUG_ON(core->tc->seccomp_filter > se->n_seccomp_filters);
- sf = se->seccomp_filters[core->tc->seccomp_filter];
+ for (j = off = 0; j < args->seccomp_filters_n; j++) {
+ struct thread_seccomp_filter *f = &args->seccomp_filters[j];
- while (1) {
- ta->seccomp_filters_n++;
- filter_size += sf->filter.len;
+ f->sock_fprog.filter = args->seccomp_filters_data + off;
+ off += f->sock_fprog.len * sizeof(struct sock_filter);
+ }
+}
- if (!sf->has_prev)
- break;
+int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
+{
+ struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
+ size_t i, j, nr_filters, filters_size, rst_size, off;
- sf = se->seccomp_filters[sf->prev];
- }
+ for (i = 0; i < item->nr_threads; i++) {
+ ThreadCoreEntry *thread_core = item->core[i]->thread_core;
+ struct thread_restore_args *args = &args_array[i];
+ SeccompFilter *sf;
- n_filters = ta->seccomp_filters_n;
- arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
- if (!arr)
- goto out;
+ args->seccomp_mode = SECCOMP_MODE_DISABLED;
+ args->seccomp_filters_pos = 0;
+ args->seccomp_filters_n = 0;
+ args->seccomp_filters = NULL;
+ args->seccomp_filters_data = NULL;
- filter_data = &arr[n_filters];
- sf = se->seccomp_filters[core->tc->seccomp_filter];
- for (i = 0; i < n_filters; i++) {
- struct sock_fprog *fprog = &arr[i];
+ if (thread_core->has_seccomp_mode)
+ args->seccomp_mode = thread_core->seccomp_mode;
- BUG_ON(sf->filter.len % sizeof(struct sock_filter));
- fprog->len = sf->filter.len / sizeof(struct sock_filter);
+ if (args->seccomp_mode != SECCOMP_MODE_FILTER)
+ continue;
- memcpy(filter_data, sf->filter.data, sf->filter.len);
+ if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
+ pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+ item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
+ seccomp_img_entry->n_seccomp_filters);
+ return -1;
+ }
- filter_data += sf->filter.len;
- sf = se->seccomp_filters[sf->prev];
- }
+ sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+ if (sf->filter.len % (sizeof(struct sock_filter))) {
+ pr_err("Corrupted filter len on tid %d (index %u)\n",
+ item->threads[i]->ns[0].virt,
+ thread_core->seccomp_filter);
+ return -1;
+ }
+ filters_size = sf->filter.len;
+ nr_filters = 1;
+
+ while (sf->has_prev) {
+ if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
+ pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
+ item->threads[i]->ns[0].virt, sf->prev,
+ seccomp_img_entry->n_seccomp_filters);
+ return -1;
+ }
- ret = 0;
+ sf = seccomp_img_entry->seccomp_filters[sf->prev];
+ if (sf->filter.len % (sizeof(struct sock_filter))) {
+ pr_err("Corrupted filter len on tid %d (index %u)\n",
+ item->threads[i]->ns[0].virt, sf->prev);
+ return -1;
+ }
+ filters_size += sf->filter.len;
+ nr_filters++;
+ }
-out:
- seccomp_entry__free_unpacked(se, NULL);
- return ret;
+ args->seccomp_filters_n = nr_filters;
+
+ rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
+ args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
+ args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
+ if (!args->seccomp_filters) {
+ pr_err("Can't allocate %zu bytes for filters on tid %d\n",
+ rst_size, item->threads[i]->ns[0].virt);
+ return -ENOMEM;
+ }
+ args->seccomp_filters_data = (void *)args->seccomp_filters +
+ nr_filters * sizeof(struct thread_seccomp_filter);
+
+ sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+ for (j = off = 0; j < nr_filters; j++) {
+ struct thread_seccomp_filter *f = &args->seccomp_filters[j];
+
+ f->sock_fprog.len = sf->filter.len / sizeof(struct sock_filter);
+ f->sock_fprog.filter = args->seccomp_filters_data + off;
+ f->flags = sf->flags;
+
+ memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
+
+ off += sf->filter.len;
+ sf = seccomp_img_entry->seccomp_filters[sf->prev];
+ }
+ }
+
+ free_seccomp_filters();
+ return 0;
}
diff --git a/images/core.proto b/images/core.proto
index 0291fae68ea8..726803646444 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -40,8 +40,9 @@ message task_core_entry {
optional signal_queue_entry signals_s = 10;
- optional seccomp_mode seccomp_mode = 11;
- optional uint32 seccomp_filter = 12;
+ /* These two are deprecated, should be per-thread */
+ optional seccomp_mode old_seccomp_mode = 11;
+ optional uint32 old_seccomp_filter = 12;
optional uint32 loginuid = 13;
@@ -87,6 +88,9 @@ message thread_core_entry {
optional signal_queue_entry signals_p = 9;
optional creds_entry creds = 10;
+
+ optional seccomp_mode seccomp_mode = 11;
+ optional uint32 seccomp_filter = 12;
}
message task_rlimits_entry {
--
2.14.3
More information about the CRIU
mailing list