[CRIU] [PATCH 2/3] shmem: rework getting file descriptors for shared memory regions

Andrew Vagin avagin at parallels.com
Fri Oct 10 11:39:30 PDT 2014


On Fri, Oct 10, 2014 at 10:30:48PM +0400, Pavel Emelyanov wrote:
> On 10/10/2014 08:39 PM, Andrey Vagin wrote:
> > /proc/PID/map_files are protected by the global CAP_SYS_ADMIN, so we
> > need to avoid using them to support user namespaces.
> > 
> > We are going to use memfd_create() to get the first file descriptor and
> > then all others processes will able to open it via /proc/PID/fd/X.
> > 
> > This patch reworks slave processes to not use map_files.
> > 
> > Signed-off-by: Andrey Vagin <avagin at openvz.org>
> > ---
> >  include/shmem.h |  2 ++
> >  pie/restorer.c  | 10 ----------
> >  shmem.c         | 32 ++++++++++++++++++++++++--------
> >  3 files changed, 26 insertions(+), 18 deletions(-)
> > 
> > diff --git a/include/shmem.h b/include/shmem.h
> > index 2526e3e..4b482e8 100644
> > --- a/include/shmem.h
> > +++ b/include/shmem.h
> > @@ -19,6 +19,8 @@ struct shmem_info {
> >  	int		pid;
> >  	int		fd;
> >  	futex_t		lock;
> > +	int		count;		/* the number of regions */
> > +	int		self_count;	/* the number of regions, which belongs to "pid" */
> >  };
> >  
> >  struct _VmaEntry;
> > diff --git a/pie/restorer.c b/pie/restorer.c
> > index 6c9d0a3..7196a3e 100644
> > --- a/pie/restorer.c
> > +++ b/pie/restorer.c
> > @@ -881,16 +881,6 @@ long __export_restore_task(struct task_restore_args *args)
> >  		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
> >  			continue;
> >  
> > -		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
> > -			struct shmem_info *entry;
> > -
> > -			entry = find_shmem(args->shmems, args->nr_shmems,
> > -						  vma_entry->shmid);
> > -			if (entry && entry->pid == my_pid &&
> > -			    entry->start == vma_entry->start)
> > -				futex_set_and_wake(&entry->lock, 1);
> > -		}
> > -
> >  		if (vma_entry->prot & PROT_WRITE)
> >  			continue;
> >  
> > diff --git a/shmem.c b/shmem.c
> > index 526d9a9..bfe2743 100644
> > --- a/shmem.c
> > +++ b/shmem.c
> > @@ -55,6 +55,7 @@ int collect_shmem(int pid, VmaEntry *vi)
> >  
> >  		if (si->size < size)
> >  			si->size = size;
> > +		si->count++;
> >  
> >  		/*
> >  		 * Only the shared mapping with a lowest
> > @@ -62,12 +63,17 @@ int collect_shmem(int pid, VmaEntry *vi)
> >  		 * will wait until the kernel propagate this mapping
> >  		 * into /proc
> >  		 */
> > -		if (!pid_rst_prio(pid, si->pid))
> > +		if (!pid_rst_prio(pid, si->pid)) {
> > +			if (si->pid == pid)
> > +				si->self_count++;
> > +
> >  			return 0;
> > +		}
> >  
> >  		si->pid	 = pid;
> >  		si->start = vi->start;
> >  		si->end	 = vi->end;
> > +		si->self_count = 1;
> >  
> >  		return 0;
> >  	}
> > @@ -85,6 +91,8 @@ int collect_shmem(int pid, VmaEntry *vi)
> >  	si->pid	  = pid;
> >  	si->size  = size;
> >  	si->fd    = -1;
> > +	si->count = 1;
> > +	si->self_count = 1;
> >  
> >  	nr_shmems++;
> >  	futex_init(&si->lock);
> > @@ -97,17 +105,18 @@ static int shmem_wait_and_open(int pid, struct shmem_info *si)
> >  	char path[128];
> >  	int ret;
> >  
> > -	snprintf(path, sizeof(path), "/proc/%d/map_files/%lx-%lx",
> > -		si->pid, si->start, si->end);
> > +	pr_info("Waiting for the %lx shmem to appear\n", si->shmid);
> > +	futex_wait_while(&si->lock, 0);
> >  
> > -	pr_info("Waiting for [%s] to appear\n", path);
> > -	futex_wait_until(&si->lock, 1);
> 
> What's the point in changing wait_while(0) into wait_until(1)?

* the master opens a descriptor and set the futex to 1
* slaves open their descriptors and increment the futex
* master waits all slaves on this futex

> 
> > +	snprintf(path, sizeof(path), "/proc/%d/fd/%d",
> > +		si->pid, si->fd);
> >  
> >  	pr_info("Opening shmem [%s] \n", path);
> > -	ret = open_proc_rw(si->pid, "map_files/%lx-%lx", si->start, si->end);
> > +	ret = open_proc_rw(si->pid, "fd/%d", si->fd);
> >  	if (ret < 0)
> >  		pr_perror("     %d: Can't stat shmem at %s",
> >  				si->pid, path);
> > +	futex_inc_and_wake(&si->lock);
> 
> Huh? Slave wakes up the rest?

Slaves wake up the master. A master must understand, when all slave
opened descriptors. After that the master can restore the region and close
the descriptor.

> 
> >  	return ret;
> >  }
> >  
> > @@ -207,10 +216,17 @@ int get_shmem_fd(int pid, VmaEntry *vi)
> >  			(unsigned long) addr,
> >  			(unsigned long) addr + si->size);
> >  	munmap(addr, si->size);
> > -	if (f < 0)
> > -		return -1;
> >  
> >  	si->fd = f;
> > +
> > +	/* Send signal to slaves, that they can open fd for this shmem */
> > +	futex_inc_and_wake(&si->lock);
> > +	/*
> > +	 * All other regions in this process will duplicate
> > +	 * the file descriptor, so we don't wait them.
> > +	 */
> > +	futex_wait_until(&si->lock, si->count - si->self_count + 1);
> 
> I don't see any place where count and self_count are used
> one without another. Can we use one counter?

Look at the place where they are calculated. Here is a problem, that we
don't know, which process will restore an region. Each time when we
found a process with a smaller pid, we reset self_count.

> 
> Or even better, can we use just one lock counter? Like this:
> 
> * initially lock is -nr (not opened) where nr is the amount of tasks
>   waiting for fd to appear

initally lock is 0

> * all tasks wait for lock to become non-negative

all tasks except master wait for lock to become 1

> * master opens fd, gets the nr value and set is to zero

and set it to 1

> * every slave opening an fd increments one
> * master waits for the counter to become nr
> 
> ?
> 
> > +
> >  	return f;
> >  }
> >  
> > 
> 


More information about the CRIU mailing list