[CRIU] [PATCH 2/3] shmem: rework getting file descriptors for shared memory regions

Fri Oct 10 11:30:48 PDT 2014

On 10/10/2014 08:39 PM, Andrey Vagin wrote:
> /proc/PID/map_files are protected by the global CAP_SYS_ADMIN, so we
> need to avoid using them to support user namespaces.
> 
> We are going to use memfd_create() to get the first file descriptor and
> then all other processes will be able to open it via /proc/PID/fd/X.
> 
> This patch reworks slave processes to not use map_files.
> 
> Signed-off-by: Andrey Vagin <avagin at openvz.org>
> ---
>  include/shmem.h |  2 ++
>  pie/restorer.c  | 10 ----------
>  shmem.c         | 32 ++++++++++++++++++++++++--------
>  3 files changed, 26 insertions(+), 18 deletions(-)
> 
> diff --git a/include/shmem.h b/include/shmem.h
> index 2526e3e..4b482e8 100644
> --- a/include/shmem.h
> +++ b/include/shmem.h
> @@ -19,6 +19,8 @@ struct shmem_info {
>  	int		pid;
>  	int		fd;
>  	futex_t		lock;
> +	int		count;		/* the number of regions */
> +	int		self_count;	/* the number of regions, which belongs to "pid" */
>  };
>  
>  struct _VmaEntry;
> diff --git a/pie/restorer.c b/pie/restorer.c
> index 6c9d0a3..7196a3e 100644
> --- a/pie/restorer.c
> +++ b/pie/restorer.c
> @@ -881,16 +881,6 @@ long __export_restore_task(struct task_restore_args *args)
>  		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
>  			continue;
>  
> -		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
> -			struct shmem_info *entry;
> -
> -			entry = find_shmem(args->shmems, args->nr_shmems,
> -						  vma_entry->shmid);
> -			if (entry && entry->pid == my_pid &&
> -			    entry->start == vma_entry->start)
> -				futex_set_and_wake(&entry->lock, 1);
> -		}
> -
>  		if (vma_entry->prot & PROT_WRITE)
>  			continue;
>  
> diff --git a/shmem.c b/shmem.c
> index 526d9a9..bfe2743 100644
> --- a/shmem.c
> +++ b/shmem.c
> @@ -55,6 +55,7 @@ int collect_shmem(int pid, VmaEntry *vi)
>  
>  		if (si->size < size)
>  			si->size = size;
> +		si->count++;
>  
>  		/*
>  		 * Only the shared mapping with a lowest
> @@ -62,12 +63,17 @@ int collect_shmem(int pid, VmaEntry *vi)
>  		 * will wait until the kernel propagate this mapping
>  		 * into /proc
>  		 */
> -		if (!pid_rst_prio(pid, si->pid))
> +		if (!pid_rst_prio(pid, si->pid)) {
> +			if (si->pid == pid)
> +				si->self_count++;
> +
>  			return 0;
> +		}
>  
>  		si->pid	 = pid;
>  		si->start = vi->start;
>  		si->end	 = vi->end;
> +		si->self_count = 1;
>  
>  		return 0;
>  	}
> @@ -85,6 +91,8 @@ int collect_shmem(int pid, VmaEntry *vi)
>  	si->pid	  = pid;
>  	si->size  = size;
>  	si->fd    = -1;
> +	si->count = 1;
> +	si->self_count = 1;
>  
>  	nr_shmems++;
>  	futex_init(&si->lock);
> @@ -97,17 +105,18 @@ static int shmem_wait_and_open(int pid, struct shmem_info *si)
>  	char path[128];
>  	int ret;
>  
> -	snprintf(path, sizeof(path), "/proc/%d/map_files/%lx-%lx",
> -		si->pid, si->start, si->end);
> +	pr_info("Waiting for the %lx shmem to appear\n", si->shmid);
> +	futex_wait_while(&si->lock, 0);
>  
> -	pr_info("Waiting for [%s] to appear\n", path);
> -	futex_wait_until(&si->lock, 1);

What's the point in changing wait_until(1) into wait_while(0)?

> +	snprintf(path, sizeof(path), "/proc/%d/fd/%d",
> +		si->pid, si->fd);
>  
>  	pr_info("Opening shmem [%s] \n", path);
> -	ret = open_proc_rw(si->pid, "map_files/%lx-%lx", si->start, si->end);
> +	ret = open_proc_rw(si->pid, "fd/%d", si->fd);
>  	if (ret < 0)
>  		pr_perror("     %d: Can't stat shmem at %s",
>  				si->pid, path);
> +	futex_inc_and_wake(&si->lock);

Huh? Slave wakes up the rest?

>  	return ret;
>  }
>  
> @@ -207,10 +216,17 @@ int get_shmem_fd(int pid, VmaEntry *vi)
>  			(unsigned long) addr,
>  			(unsigned long) addr + si->size);
>  	munmap(addr, si->size);
> -	if (f < 0)
> -		return -1;
>  
>  	si->fd = f;
> +
> +	/* Send signal to slaves, that they can open fd for this shmem */
> +	futex_inc_and_wake(&si->lock);
> +	/*
> +	 * All other regions in this process will duplicate
> +	 * the file descriptor, so we don't wait them.
> +	 */
> +	futex_wait_until(&si->lock, si->count - si->self_count + 1);

I don't see any place where count and self_count are used
one without the other. Can we use one counter?

Or even better, can we use just one lock counter? Like this:

* initially lock is -nr (not opened), where nr is the number of tasks
  waiting for fd to appear
* all tasks wait for lock to become non-negative
* master opens fd, gets the nr value and sets it to zero
* every slave opening an fd increments it by one
* master waits for the counter to become nr

?

> +
>  	return f;
>  }
>  
>