[CRIU] [PATCH 19/28] seccomp: Add restore of per-thread filters
Andrey Vagin
avagin at virtuozzo.com
Fri Mar 23 01:13:24 MSK 2018
On Wed, Mar 21, 2018 at 12:43:04AM +0300, Cyrill Gorcunov wrote:
> From: Cyrill Gorcunov <gorcunov at virtuozzo.com>
>
> https://jira.sw.ru/browse/PSBM-78762
>
> Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
> ---
> criu/cr-restore.c | 10 ++--
> criu/include/restorer.h | 15 ++++-
> criu/include/seccomp.h | 9 ++-
> criu/pie/restorer.c | 106 +++++++++++++++++++++-------------
> criu/seccomp.c | 149 ++++++++++++++++++++++++++++++++++--------------
> 5 files changed, 195 insertions(+), 94 deletions(-)
200 lines of code!!! Pls, write what is going on here in a commit
message.
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index b9f8de5e82b1..3025ec4032c2 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -331,7 +331,7 @@ static int root_prepare_shared(void)
> if (prepare_remaps())
> return -1;
>
> - if (prepare_seccomp_filters())
> + if (seccomp_read_image())
> return -1;
>
> if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
> @@ -1031,7 +1031,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
> if (prepare_timerfds(ta))
> return -1;
>
> - if (seccomp_filters_get_rst_pos(core, ta) < 0)
> + if (seccomp_prepare_threads(current, ta) < 0)
> return -1;
>
> if (prepare_itimers(pid, ta, core) < 0)
> @@ -3668,12 +3668,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> RST_MEM_FIXUP_PPTR(task_args->rlims);
> RST_MEM_FIXUP_PPTR(task_args->helpers);
> RST_MEM_FIXUP_PPTR(task_args->zombies);
> - RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
> RST_MEM_FIXUP_PPTR(task_args->vma_ios);
>
> - if (core->thread_core->has_seccomp_mode)
> - task_args->seccomp_mode = core->thread_core->seccomp_mode;
> -
> task_args->compatible_mode = core_is_compat(core);
>
> if (opts.check_only)
> @@ -3763,6 +3759,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> if (ret)
> goto err;
>
> + seccomp_rst_reloc(&thread_args[i]);
> +
> thread_args[i].mz = mz + i;
> sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
>
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 15307d9c0701..3767d9d25088 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -4,6 +4,7 @@
> #include <signal.h>
> #include <limits.h>
> #include <sys/resource.h>
> +#include <linux/filter.h>
>
> #include "common/config.h"
> #include "types.h"
> @@ -76,6 +77,11 @@ struct thread_creds_args {
> unsigned long mem_pos_next;
> };
>
> +struct thread_seccomp_filter {
> + struct sock_fprog sock_fprog;
> + unsigned int flags;
> +};
> +
> struct thread_restore_args {
> struct restore_mem_zone *mz;
>
> @@ -100,6 +106,12 @@ struct thread_restore_args {
>
> bool check_only;
> struct thread_creds_args *creds_args;
> +
> + int seccomp_mode;
> + unsigned long seccomp_filters_pos;
> + struct thread_seccomp_filter *seccomp_filters;
> + void *seccomp_filters_data;
> + unsigned int seccomp_filters_n;
> } __aligned(64);
>
> typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
> @@ -163,9 +175,6 @@ struct task_restore_args {
> pid_t *zombies;
> unsigned int zombies_n;
>
> - struct sock_fprog *seccomp_filters;
> - unsigned int seccomp_filters_n;
> -
> /* * * * * * * * * * * * * * * * * * * * */
>
> unsigned long task_size;
> diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
> index 47c24c9719c1..1808e3d610c3 100644
> --- a/criu/include/seccomp.h
> +++ b/criu/include/seccomp.h
> @@ -27,6 +27,8 @@
> #define SECCOMP_FILTER_FLAG_TSYNC 1
> #endif
>
> +struct thread_restore_args;
> +struct task_restore_args;
> struct pstree_item;
> struct rb_node;
>
> @@ -69,7 +71,8 @@ extern void seccomp_free_entries(void);
> extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
> extern int seccomp_collect_dump_filters(void);
>
> -extern int prepare_seccomp_filters(void);
> -struct task_restore_args;
> -extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
> +extern int seccomp_read_image(void);
> +extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
> +extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
> +
> #endif
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 091026103805..5ede206c55ef 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -395,54 +395,82 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
> return 0;
> }
>
> -static int restore_seccomp(struct task_restore_args *args)
> +static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
> {
> + size_t i;
> int ret;
>
> - switch (args->seccomp_mode) {
> - case SECCOMP_MODE_DISABLED:
> - return 0;
> - case SECCOMP_MODE_STRICT:
> - ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
> + for (i = 0; i < args->seccomp_filters_n; i++) {
> + struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
> +
> + ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags, (void *)&filter->sock_fprog);
> if (ret < 0) {
> - pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
> - goto die;
> + if (ret == -ENOSYS) {
> + pr_debug("seccomp: sys_seccomp is not supported in kernel, "
> + "switching to prctl interface\n");
> + ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
> + (long)(void *)&filter->sock_fprog, 0, 0);
> + if (ret) {
> + pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
> + ret, tid);
> + return -1;
> + }
> + } else {
> + pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
> + ret, tid);
> + return -1;
> + }
> }
> - return 0;
> - case SECCOMP_MODE_FILTER: {
> - int i;
> - void *filter_data;
> -
> - filter_data = &args->seccomp_filters[args->seccomp_filters_n];
> -
> - for (i = 0; i < args->seccomp_filters_n; i++) {
> - struct sock_fprog *fprog = &args->seccomp_filters[i];
> + }
>
> - fprog->filter = filter_data;
> + return 0;
> +}
>
> - /* We always TSYNC here, since we require that the
> - * creds for all threads be the same; this means we
> - * don't have to restore_seccomp() in threads, and that
> - * future TSYNC behavior will be correct.
> - */
> - ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
> - if (ret < 0) {
> - pr_err("sys_seccomp() returned %d\n", ret);
> - goto die;
> - }
> +static int restore_seccomp(struct thread_restore_args *args)
> +{
> + pid_t tid = 0;
> + int ret, i;
>
> - filter_data += fprog->len * sizeof(struct sock_filter);
> + for (i = 0; i < MAX_NS_NESTING; i++) {
> + if (args->pid[i] == 0) {
> + tid = args->pid[i - 1];
> + break;
> }
> + }
>
> - return 0;
> + if (tid != sys_gettid()) {
> + pr_err("seccomp: Unexpected tid %d != %d\n",
> + tid, (pid_t)sys_gettid());
> + return -1;
> }
> +
> + switch (args->seccomp_mode) {
> + case SECCOMP_MODE_DISABLED:
> + return 0;
> + break;
> + case SECCOMP_MODE_STRICT:
> + ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
> + if (ret < 0) {
> + pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
> + ret, tid);
> + }
> + break;
> + case SECCOMP_MODE_FILTER:
> + ret = restore_seccomp_filter(tid, args);
> + break;
> default:
> - goto die;
> + pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
> + args->seccomp_mode, tid);
> + ret = -1;
> + break;
> }
>
> - return 0;
> -die:
> - return -1;
> + if (!ret) {
> + pr_debug("seccomp: Restored mode %d on tid %d\n",
> + args->seccomp_mode, tid);
> + }
> +
> + return ret;
> }
>
> static int restore_robust_futex(struct thread_restore_args *args)
> @@ -541,6 +569,10 @@ long __export_restore_thread(struct thread_restore_args *args)
> sys_close(fd);
> }
>
> + /* Make sure it's before creds restore */
> + if (restore_seccomp(args))
> + goto core_restore_end;
> +
> ret = restore_creds(args->creds_args, args->ta->proc_fd);
> if (ret)
> goto core_restore_end;
> @@ -559,9 +591,6 @@ long __export_restore_thread(struct thread_restore_args *args)
> restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
> restore_pdeath_sig(args);
>
> - if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
> - pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
> -
> restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
>
> futex_dec_and_wake(&thread_inprogress);
> @@ -1680,8 +1709,7 @@ long __export_restore_task(struct task_restore_args *args)
> /* The kernel restricts setting seccomp to uid 0 in the current user
> * ns, so we must do this before restore_creds.
> */
> - pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
> - if (restore_seccomp(args))
> + if (restore_seccomp(args->t))
> goto core_restore_end;
>
> /*
> diff --git a/criu/seccomp.c b/criu/seccomp.c
> index a4304e645de7..dd608265731a 100644
> --- a/criu/seccomp.c
> +++ b/criu/seccomp.c
> @@ -24,6 +24,8 @@
> static struct rb_root seccomp_tid_rb_root = RB_ROOT;
> static struct seccomp_entry *seccomp_tid_entry_root;
>
> +static SeccompEntry *seccomp_img_entry;
> +
> struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory)
> {
> struct seccomp_entry *entry = NULL;
> @@ -293,10 +295,8 @@ int seccomp_collect_dump_filters(void)
> return 0;
> }
>
> -/* Populated on restore by prepare_seccomp_filters */
> -static SeccompEntry *se;
> -
> -int prepare_seccomp_filters(void)
> +/* The seccomp_img_entry will be shared between all children */
> +int seccomp_read_image(void)
> {
> struct cr_img *img;
> int ret;
> @@ -305,66 +305,129 @@ int prepare_seccomp_filters(void)
> if (!img)
> return -1;
>
> - ret = pb_read_one_eof(img, &se, PB_SECCOMP);
> + ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
> close_image(img);
> if (ret <= 0)
> return 0; /* there were no filters */
>
> - BUG_ON(!se);
> + BUG_ON(!seccomp_img_entry);
>
> return 0;
> }
>
> -int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
> +/* seccomp_img_entry will be freed per-children after forking */
> +static void free_seccomp_filters(void)
> {
> - SeccompFilter *sf = NULL;
> - struct sock_fprog *arr = NULL;
> - void *filter_data = NULL;
> - int ret = -1, i, n_filters;
> - size_t filter_size = 0;
> + if (seccomp_img_entry) {
> + seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
> + seccomp_img_entry = NULL;
> + }
> +}
>
> - ta->seccomp_filters_n = 0;
> +void seccomp_rst_reloc(struct thread_restore_args *args)
> +{
> + size_t j, off;
>
> - if (!core->thread_core->has_seccomp_filter)
> - return 0;
> + if (!args->seccomp_filters_n)
> + return;
>
> - ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
> + args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
> + args->seccomp_filters_data = (void *)args->seccomp_filters +
> + args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
>
> - BUG_ON(core->thread_core->seccomp_filter > se->n_seccomp_filters);
> - sf = se->seccomp_filters[core->thread_core->seccomp_filter];
> + for (j = off = 0; j < args->seccomp_filters_n; j++) {
> + struct thread_seccomp_filter *f = &args->seccomp_filters[j];
>
> - while (1) {
> - ta->seccomp_filters_n++;
> - filter_size += sf->filter.len;
> + f->sock_fprog.filter = args->seccomp_filters_data + off;
> + off += f->sock_fprog.len * sizeof(struct sock_filter);
> + }
> +}
>
> - if (!sf->has_prev)
> - break;
> +int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
> +{
> + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
> + size_t i, j, nr_filters, filters_size, rst_size, off;
>
> - sf = se->seccomp_filters[sf->prev];
> - }
> + for (i = 0; i < item->nr_threads; i++) {
> + ThreadCoreEntry *thread_core = item->core[i]->thread_core;
> + struct thread_restore_args *args = &args_array[i];
> + SeccompFilter *sf;
>
> - n_filters = ta->seccomp_filters_n;
> - arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
> - if (!arr)
> - goto out;
> + args->seccomp_mode = SECCOMP_MODE_DISABLED;
> + args->seccomp_filters_pos = 0;
> + args->seccomp_filters_n = 0;
> + args->seccomp_filters = NULL;
> + args->seccomp_filters_data = NULL;
>
> - filter_data = &arr[n_filters];
> - sf = se->seccomp_filters[core->thread_core->seccomp_filter];
> - for (i = 0; i < n_filters; i++) {
> - struct sock_fprog *fprog = &arr[i];
> + if (thread_core->has_seccomp_mode)
> + args->seccomp_mode = thread_core->seccomp_mode;
>
> - BUG_ON(sf->filter.len % sizeof(struct sock_filter));
> - fprog->len = sf->filter.len / sizeof(struct sock_filter);
> + if (args->seccomp_mode != SECCOMP_MODE_FILTER)
> + continue;
>
> - memcpy(filter_data, sf->filter.data, sf->filter.len);
> + if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
> + pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
> + item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
> + seccomp_img_entry->n_seccomp_filters);
> + return -1;
> + }
>
> - filter_data += sf->filter.len;
> - sf = se->seccomp_filters[sf->prev];
> - }
> + sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
> + if (sf->filter.len % (sizeof(struct sock_filter))) {
> + pr_err("Corrupted filter len on tid %d (index %u)\n",
> + item->threads[i]->ns[0].virt,
> + thread_core->seccomp_filter);
> + return -1;
> + }
> + filters_size = sf->filter.len;
> + nr_filters = 1;
> +
> + while (sf->has_prev) {
> + if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
> + pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
> + item->threads[i]->ns[0].virt, sf->prev,
> + seccomp_img_entry->n_seccomp_filters);
> + return -1;
> + }
> +
> + sf = seccomp_img_entry->seccomp_filters[sf->prev];
> + if (sf->filter.len % (sizeof(struct sock_filter))) {
> + pr_err("Corrupted filter len on tid %d (index %u)\n",
> + item->threads[i]->ns[0].virt, sf->prev);
> + return -1;
> + }
> + filters_size += sf->filter.len;
> + nr_filters++;
> + }
>
> - ret = 0;
> + args->seccomp_filters_n = nr_filters;
>
> -out:
> - seccomp_entry__free_unpacked(se, NULL);
> - return ret;
> + rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
> + args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
> + args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
> + if (!args->seccomp_filters) {
> + pr_err("Can't allocate %zu bytes for filters on tid %d\n",
> + rst_size, item->threads[i]->ns[0].virt);
> + return -ENOMEM;
> + }
> + args->seccomp_filters_data = (void *)args->seccomp_filters +
> + nr_filters * sizeof(struct thread_seccomp_filter);
> +
> + sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
> + for (j = off = 0; j < nr_filters; j++) {
> + struct thread_seccomp_filter *f = &args->seccomp_filters[j];
> +
> + f->sock_fprog.len = sf->filter.len / sizeof(struct sock_filter);
> + f->sock_fprog.filter = args->seccomp_filters_data + off;
> + f->flags = sf->flags;
> +
> + memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
> +
> + off += sf->filter.len;
> + sf = seccomp_img_entry->seccomp_filters[sf->prev];
> + }
> + }
> +
> + free_seccomp_filters();
> + return 0;
> }
> --
> 2.14.3
>
More information about the CRIU
mailing list