[CRIU] [PATCH 19/28] seccomp: Add restore of per-thread filters

Andrey Vagin avagin at virtuozzo.com
Fri Mar 23 01:13:24 MSK 2018


On Wed, Mar 21, 2018 at 12:43:04AM +0300, Cyrill Gorcunov wrote:
> From: Cyrill Gorcunov <gorcunov at virtuozzo.com>
> 
> https://jira.sw.ru/browse/PSBM-78762
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
> ---
>  criu/cr-restore.c       |  10 ++--
>  criu/include/restorer.h |  15 ++++-
>  criu/include/seccomp.h  |   9 ++-
>  criu/pie/restorer.c     | 106 +++++++++++++++++++++-------------
>  criu/seccomp.c          | 149 ++++++++++++++++++++++++++++++++++--------------
>  5 files changed, 195 insertions(+), 94 deletions(-)


200 lines of code!!! Please describe what is going on here in the
commit message.

> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index b9f8de5e82b1..3025ec4032c2 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -331,7 +331,7 @@ static int root_prepare_shared(void)
>  	if (prepare_remaps())
>  		return -1;
>  
> -	if (prepare_seccomp_filters())
> +	if (seccomp_read_image())
>  		return -1;
>  
>  	if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
> @@ -1031,7 +1031,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
>  	if (prepare_timerfds(ta))
>  		return -1;
>  
> -	if (seccomp_filters_get_rst_pos(core, ta) < 0)
> +	if (seccomp_prepare_threads(current, ta) < 0)
>  		return -1;
>  
>  	if (prepare_itimers(pid, ta, core) < 0)
> @@ -3668,12 +3668,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  	RST_MEM_FIXUP_PPTR(task_args->rlims);
>  	RST_MEM_FIXUP_PPTR(task_args->helpers);
>  	RST_MEM_FIXUP_PPTR(task_args->zombies);
> -	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
>  	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
>  
> -	if (core->thread_core->has_seccomp_mode)
> -		task_args->seccomp_mode = core->thread_core->seccomp_mode;
> -
>  	task_args->compatible_mode = core_is_compat(core);
>  
>  	if (opts.check_only)
> @@ -3763,6 +3759,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  		if (ret)
>  			goto err;
>  
> +		seccomp_rst_reloc(&thread_args[i]);
> +
>  		thread_args[i].mz = mz + i;
>  		sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
>  
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 15307d9c0701..3767d9d25088 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -4,6 +4,7 @@
>  #include <signal.h>
>  #include <limits.h>
>  #include <sys/resource.h>
> +#include <linux/filter.h>
>  
>  #include "common/config.h"
>  #include "types.h"
> @@ -76,6 +77,11 @@ struct thread_creds_args {
>  	unsigned long			mem_pos_next;
>  };
>  
> +struct thread_seccomp_filter {
> +	struct sock_fprog		sock_fprog;
> +	unsigned int			flags;
> +};
> +
>  struct thread_restore_args {
>  	struct restore_mem_zone		*mz;
>  
> @@ -100,6 +106,12 @@ struct thread_restore_args {
>  
>  	bool				check_only;
>  	struct thread_creds_args	*creds_args;
> +
> +	int				seccomp_mode;
> +	unsigned long			seccomp_filters_pos;
> +	struct thread_seccomp_filter	*seccomp_filters;
> +	void				*seccomp_filters_data;
> +	unsigned int			seccomp_filters_n;
>  } __aligned(64);
>  
>  typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
> @@ -163,9 +175,6 @@ struct task_restore_args {
>  	pid_t				*zombies;
>  	unsigned int			zombies_n;
>  
> -	struct sock_fprog		*seccomp_filters;
> -	unsigned int			seccomp_filters_n;
> -
>  	/* * * * * * * * * * * * * * * * * * * * */
>  
>  	unsigned long			task_size;
> diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
> index 47c24c9719c1..1808e3d610c3 100644
> --- a/criu/include/seccomp.h
> +++ b/criu/include/seccomp.h
> @@ -27,6 +27,8 @@
>  #define SECCOMP_FILTER_FLAG_TSYNC 1
>  #endif
>  
> +struct thread_restore_args;
> +struct task_restore_args;
>  struct pstree_item;
>  struct rb_node;
>  
> @@ -69,7 +71,8 @@ extern void seccomp_free_entries(void);
>  extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
>  extern int seccomp_collect_dump_filters(void);
>  
> -extern int prepare_seccomp_filters(void);
> -struct task_restore_args;
> -extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
> +extern int seccomp_read_image(void);
> +extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
> +extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
> +
>  #endif
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 091026103805..5ede206c55ef 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -395,54 +395,82 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
>  	return 0;
>  }
>  
> -static int restore_seccomp(struct task_restore_args *args)
> +static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
>  {
> +	size_t i;
>  	int ret;
>  
> -	switch (args->seccomp_mode) {
> -	case SECCOMP_MODE_DISABLED:
> -		return 0;
> -	case SECCOMP_MODE_STRICT:
> -		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
> +	for (i = 0; i < args->seccomp_filters_n; i++) {
> +		struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
> +
> +		ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags, (void *)&filter->sock_fprog);
>  		if (ret < 0) {
> -			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
> -			goto die;
> +			if (ret == -ENOSYS) {
> +				pr_debug("seccomp: sys_seccomp is not supported in kernel, "
> +					 "switching to prctl interface\n");
> +				ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
> +						(long)(void *)&filter->sock_fprog, 0, 0);
> +				if (ret) {
> +					pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
> +					       ret, tid);
> +					return -1;
> +				}
> +			} else {
> +				pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
> +				       ret, tid);
> +				return -1;
> +			}
>  		}
> -		return 0;
> -	case SECCOMP_MODE_FILTER: {
> -		int i;
> -		void *filter_data;
> -
> -		filter_data = &args->seccomp_filters[args->seccomp_filters_n];
> -
> -		for (i = 0; i < args->seccomp_filters_n; i++) {
> -			struct sock_fprog *fprog = &args->seccomp_filters[i];
> +	}
>  
> -			fprog->filter = filter_data;
> +	return 0;
> +}
>  
> -			/* We always TSYNC here, since we require that the
> -			 * creds for all threads be the same; this means we
> -			 * don't have to restore_seccomp() in threads, and that
> -			 * future TSYNC behavior will be correct.
> -			 */
> -			ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
> -			if (ret < 0) {
> -				pr_err("sys_seccomp() returned %d\n", ret);
> -				goto die;
> -			}
> +static int restore_seccomp(struct thread_restore_args *args)
> +{
> +	pid_t tid = 0;
> +	int ret, i;
>  
> -			filter_data += fprog->len * sizeof(struct sock_filter);
> +	for (i = 0; i < MAX_NS_NESTING; i++) {
> +		if (args->pid[i] == 0) {
> +			tid = args->pid[i - 1];
> +			break;
>  		}
> +	}
>  
> -		return 0;
> +	if (tid != sys_gettid()) {
> +		pr_err("seccomp: Unexpected tid %d != %d\n",
> +		       tid, (pid_t)sys_gettid());
> +		return -1;
>  	}
> +
> +	switch (args->seccomp_mode) {
> +	case SECCOMP_MODE_DISABLED:
> +		return 0;
> +		break;
> +	case SECCOMP_MODE_STRICT:
> +		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
> +		if (ret < 0) {
> +			pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
> +			       ret, tid);
> +		}
> +		break;
> +	case SECCOMP_MODE_FILTER:
> +		ret = restore_seccomp_filter(tid, args);
> +		break;
>  	default:
> -		goto die;
> +		pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
> +		       args->seccomp_mode, tid);
> +		ret = -1;
> +		break;
>  	}
>  
> -	return 0;
> -die:
> -	return -1;
> +	if (!ret) {
> +		pr_debug("seccomp: Restored mode %d on tid %d\n",
> +			 args->seccomp_mode, tid);
> +	}
> +
> +	return ret;
>  }
>  
>  static int restore_robust_futex(struct thread_restore_args *args)
> @@ -541,6 +569,10 @@ long __export_restore_thread(struct thread_restore_args *args)
>  		sys_close(fd);
>  	}
>  
> +	/* Make sure it's before creds restore */
> +	if (restore_seccomp(args))
> +		goto core_restore_end;
> +
>  	ret = restore_creds(args->creds_args, args->ta->proc_fd);
>  	if (ret)
>  		goto core_restore_end;
> @@ -559,9 +591,6 @@ long __export_restore_thread(struct thread_restore_args *args)
>  	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
>  	restore_pdeath_sig(args);
>  
> -	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
> -		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
> -
>  	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
>  
>  	futex_dec_and_wake(&thread_inprogress);
> @@ -1680,8 +1709,7 @@ long __export_restore_task(struct task_restore_args *args)
>  	/* The kernel restricts setting seccomp to uid 0 in the current user
>  	 * ns, so we must do this before restore_creds.
>  	 */
> -	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
> -	if (restore_seccomp(args))
> +	if (restore_seccomp(args->t))
>  		goto core_restore_end;
>  
>  	/*
> diff --git a/criu/seccomp.c b/criu/seccomp.c
> index a4304e645de7..dd608265731a 100644
> --- a/criu/seccomp.c
> +++ b/criu/seccomp.c
> @@ -24,6 +24,8 @@
>  static struct rb_root seccomp_tid_rb_root = RB_ROOT;
>  static struct seccomp_entry *seccomp_tid_entry_root;
>  
> +static SeccompEntry *seccomp_img_entry;
> +
>  struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory)
>  {
>  	struct seccomp_entry *entry = NULL;
> @@ -293,10 +295,8 @@ int seccomp_collect_dump_filters(void)
>  	return 0;
>  }
>  
> -/* Populated on restore by prepare_seccomp_filters */
> -static SeccompEntry *se;
> -
> -int prepare_seccomp_filters(void)
> +/* The seccomp_img_entry will be shared between all children */
> +int seccomp_read_image(void)
>  {
>  	struct cr_img *img;
>  	int ret;
> @@ -305,66 +305,129 @@ int prepare_seccomp_filters(void)
>  	if (!img)
>  		return -1;
>  
> -	ret = pb_read_one_eof(img, &se, PB_SECCOMP);
> +	ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
>  	close_image(img);
>  	if (ret <= 0)
>  		return 0; /* there were no filters */
>  
> -	BUG_ON(!se);
> +	BUG_ON(!seccomp_img_entry);
>  
>  	return 0;
>  }
>  
> -int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
> +/* seccomp_img_entry will be freed per-children after forking */
> +static void free_seccomp_filters(void)
>  {
> -	SeccompFilter *sf = NULL;
> -	struct sock_fprog *arr = NULL;
> -	void *filter_data = NULL;
> -	int ret = -1, i, n_filters;
> -	size_t filter_size = 0;
> +	if (seccomp_img_entry) {
> +		seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
> +		seccomp_img_entry = NULL;
> +	}
> +}
>  
> -	ta->seccomp_filters_n = 0;
> +void seccomp_rst_reloc(struct thread_restore_args *args)
> +{
> +	size_t j, off;
>  
> -	if (!core->thread_core->has_seccomp_filter)
> -		return 0;
> +	if (!args->seccomp_filters_n)
> +		return;
>  
> -	ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
> +	args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
> +	args->seccomp_filters_data = (void *)args->seccomp_filters +
> +			args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
>  
> -	BUG_ON(core->thread_core->seccomp_filter > se->n_seccomp_filters);
> -	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
> +	for (j = off = 0; j < args->seccomp_filters_n; j++) {
> +		struct thread_seccomp_filter *f = &args->seccomp_filters[j];
>  
> -	while (1) {
> -		ta->seccomp_filters_n++;
> -		filter_size += sf->filter.len;
> +		f->sock_fprog.filter = args->seccomp_filters_data + off;
> +		off += f->sock_fprog.len * sizeof(struct sock_filter);
> +	}
> +}
>  
> -		if (!sf->has_prev)
> -			break;
> +int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
> +{
> +	struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
> +	size_t i, j, nr_filters, filters_size, rst_size, off;
>  
> -		sf = se->seccomp_filters[sf->prev];
> -	}
> +	for (i = 0; i < item->nr_threads; i++) {
> +		ThreadCoreEntry *thread_core = item->core[i]->thread_core;
> +		struct thread_restore_args *args = &args_array[i];
> +		SeccompFilter *sf;
>  
> -	n_filters = ta->seccomp_filters_n;
> -	arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
> -	if (!arr)
> -		goto out;
> +		args->seccomp_mode		= SECCOMP_MODE_DISABLED;
> +		args->seccomp_filters_pos	= 0;
> +		args->seccomp_filters_n		= 0;
> +		args->seccomp_filters		= NULL;
> +		args->seccomp_filters_data	= NULL;
>  
> -	filter_data = &arr[n_filters];
> -	sf = se->seccomp_filters[core->thread_core->seccomp_filter];
> -	for (i = 0; i < n_filters; i++) {
> -		struct sock_fprog *fprog = &arr[i];
> +		if (thread_core->has_seccomp_mode)
> +			args->seccomp_mode = thread_core->seccomp_mode;
>  
> -		BUG_ON(sf->filter.len % sizeof(struct sock_filter));
> -		fprog->len = sf->filter.len / sizeof(struct sock_filter);
> +		if (args->seccomp_mode != SECCOMP_MODE_FILTER)
> +			continue;
>  
> -		memcpy(filter_data, sf->filter.data, sf->filter.len);
> +		if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
> +			pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
> +			       item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
> +			       seccomp_img_entry->n_seccomp_filters);
> +			return -1;
> +		}
>  
> -		filter_data += sf->filter.len;
> -		sf = se->seccomp_filters[sf->prev];
> -	}
> +		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
> +		if (sf->filter.len % (sizeof(struct sock_filter))) {
> +			pr_err("Corrupted filter len on tid %d (index %u)\n",
> +			       item->threads[i]->ns[0].virt,
> +			       thread_core->seccomp_filter);
> +			return -1;
> +		}
> +		filters_size = sf->filter.len;
> +		nr_filters = 1;
> +
> +		while (sf->has_prev) {
> +			if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
> +				pr_err("Corrupted filter index on tid %d (%u > %zu)\n",
> +				       item->threads[i]->ns[0].virt, sf->prev,
> +				       seccomp_img_entry->n_seccomp_filters);
> +				return -1;
> +			}
> +
> +			sf = seccomp_img_entry->seccomp_filters[sf->prev];
> +			if (sf->filter.len % (sizeof(struct sock_filter))) {
> +				pr_err("Corrupted filter len on tid %d (index %u)\n",
> +				       item->threads[i]->ns[0].virt, sf->prev);
> +				return -1;
> +			}
> +			filters_size += sf->filter.len;
> +			nr_filters++;
> +		}
>  
> -	ret = 0;
> +		args->seccomp_filters_n = nr_filters;
>  
> -out:
> -	seccomp_entry__free_unpacked(se, NULL);
> -	return ret;
> +		rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
> +		args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
> +		args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
> +		if (!args->seccomp_filters) {
> +			pr_err("Can't allocate %zu bytes for filters on tid %d\n",
> +			       rst_size, item->threads[i]->ns[0].virt);
> +			return -ENOMEM;
> +		}
> +		args->seccomp_filters_data = (void *)args->seccomp_filters +
> +			nr_filters * sizeof(struct thread_seccomp_filter);
> +
> +		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
> +		for (j = off = 0; j < nr_filters; j++) {
> +			struct thread_seccomp_filter *f = &args->seccomp_filters[j];
> +
> +			f->sock_fprog.len	= sf->filter.len / sizeof(struct sock_filter);
> +			f->sock_fprog.filter	= args->seccomp_filters_data + off;
> +			f->flags		= sf->flags;
> +
> +			memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
> +
> +			off += sf->filter.len;
> +			sf = seccomp_img_entry->seccomp_filters[sf->prev];
> +		}
> +	}
> +
> +	free_seccomp_filters();
> +	return 0;
>  }
> -- 
> 2.14.3
> 


More information about the CRIU mailing list