[CRIU] [PATCH 2/3] shmem: rework getting file descriptors for shared memory regions

Pavel Emelyanov xemul at parallels.com
Fri Oct 10 11:45:38 PDT 2014


On 10/10/2014 10:39 PM, Andrew Vagin wrote:
> On Fri, Oct 10, 2014 at 10:30:48PM +0400, Pavel Emelyanov wrote:
>> On 10/10/2014 08:39 PM, Andrey Vagin wrote:
>>> /proc/PID/map_files are protected by the global CAP_SYS_ADMIN, so we
>>> need to avoid using them to support user namespaces.
>>>
>>> We are going to use memfd_create() to get the first file descriptor and
>>> then all others processes will able to open it via /proc/PID/fd/X.
>>>
>>> This patch reworks slave processes to not use map_files.
>>>
>>> Signed-off-by: Andrey Vagin <avagin at openvz.org>
>>> ---
>>>  include/shmem.h |  2 ++
>>>  pie/restorer.c  | 10 ----------
>>>  shmem.c         | 32 ++++++++++++++++++++++++--------
>>>  3 files changed, 26 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/include/shmem.h b/include/shmem.h
>>> index 2526e3e..4b482e8 100644
>>> --- a/include/shmem.h
>>> +++ b/include/shmem.h
>>> @@ -19,6 +19,8 @@ struct shmem_info {
>>>  	int		pid;
>>>  	int		fd;
>>>  	futex_t		lock;
>>> +	int		count;		/* the number of regions */
>>> +	int		self_count;	/* the number of regions, which belongs to "pid" */
>>>  };
>>>  
>>>  struct _VmaEntry;
>>> diff --git a/pie/restorer.c b/pie/restorer.c
>>> index 6c9d0a3..7196a3e 100644
>>> --- a/pie/restorer.c
>>> +++ b/pie/restorer.c
>>> @@ -881,16 +881,6 @@ long __export_restore_task(struct task_restore_args *args)
>>>  		if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
>>>  			continue;
>>>  
>>> -		if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
>>> -			struct shmem_info *entry;
>>> -
>>> -			entry = find_shmem(args->shmems, args->nr_shmems,
>>> -						  vma_entry->shmid);
>>> -			if (entry && entry->pid == my_pid &&
>>> -			    entry->start == vma_entry->start)
>>> -				futex_set_and_wake(&entry->lock, 1);
>>> -		}
>>> -
>>>  		if (vma_entry->prot & PROT_WRITE)
>>>  			continue;
>>>  
>>> diff --git a/shmem.c b/shmem.c
>>> index 526d9a9..bfe2743 100644
>>> --- a/shmem.c
>>> +++ b/shmem.c
>>> @@ -55,6 +55,7 @@ int collect_shmem(int pid, VmaEntry *vi)
>>>  
>>>  		if (si->size < size)
>>>  			si->size = size;
>>> +		si->count++;
>>>  
>>>  		/*
>>>  		 * Only the shared mapping with a lowest
>>> @@ -62,12 +63,17 @@ int collect_shmem(int pid, VmaEntry *vi)
>>>  		 * will wait until the kernel propagate this mapping
>>>  		 * into /proc
>>>  		 */
>>> -		if (!pid_rst_prio(pid, si->pid))
>>> +		if (!pid_rst_prio(pid, si->pid)) {
>>> +			if (si->pid == pid)
>>> +				si->self_count++;
>>> +
>>>  			return 0;
>>> +		}
>>>  
>>>  		si->pid	 = pid;
>>>  		si->start = vi->start;
>>>  		si->end	 = vi->end;
>>> +		si->self_count = 1;
>>>  
>>>  		return 0;
>>>  	}
>>> @@ -85,6 +91,8 @@ int collect_shmem(int pid, VmaEntry *vi)
>>>  	si->pid	  = pid;
>>>  	si->size  = size;
>>>  	si->fd    = -1;
>>> +	si->count = 1;
>>> +	si->self_count = 1;
>>>  
>>>  	nr_shmems++;
>>>  	futex_init(&si->lock);
>>> @@ -97,17 +105,18 @@ static int shmem_wait_and_open(int pid, struct shmem_info *si)
>>>  	char path[128];
>>>  	int ret;
>>>  
>>> -	snprintf(path, sizeof(path), "/proc/%d/map_files/%lx-%lx",
>>> -		si->pid, si->start, si->end);
>>> +	pr_info("Waiting for the %lx shmem to appear\n", si->shmid);
>>> +	futex_wait_while(&si->lock, 0);
>>>  
>>> -	pr_info("Waiting for [%s] to appear\n", path);
>>> -	futex_wait_until(&si->lock, 1);
>>
>> What's the point in changing wait_while(0) into wait_until(1)?
> 
> * the master opens a descriptor and set the futex to 1
> * slaves open their descriptors and increment the futex
> * master waits all slaves on this futex

Write it in the comment to "lock".

>>
>>> +	snprintf(path, sizeof(path), "/proc/%d/fd/%d",
>>> +		si->pid, si->fd);
>>>  
>>>  	pr_info("Opening shmem [%s] \n", path);
>>> -	ret = open_proc_rw(si->pid, "map_files/%lx-%lx", si->start, si->end);
>>> +	ret = open_proc_rw(si->pid, "fd/%d", si->fd);
>>>  	if (ret < 0)
>>>  		pr_perror("     %d: Can't stat shmem at %s",
>>>  				si->pid, path);
>>> +	futex_inc_and_wake(&si->lock);
>>
>> Huh? Slave wakes up the rest?
> 
> Slaves wake up the master. A master must understand, when all slave
> opened descriptors. After that the master can restore the region and close
> the descriptor.
> 
>>
>>>  	return ret;
>>>  }
>>>  
>>> @@ -207,10 +216,17 @@ int get_shmem_fd(int pid, VmaEntry *vi)
>>>  			(unsigned long) addr,
>>>  			(unsigned long) addr + si->size);
>>>  	munmap(addr, si->size);
>>> -	if (f < 0)
>>> -		return -1;
>>>  
>>>  	si->fd = f;
>>> +
>>> +	/* Send signal to slaves, that they can open fd for this shmem */
>>> +	futex_inc_and_wake(&si->lock);
>>> +	/*
>>> +	 * All other regions in this process will duplicate
>>> +	 * the file descriptor, so we don't wait them.
>>> +	 */
>>> +	futex_wait_until(&si->lock, si->count - si->self_count + 1);
>>
>> I don't see any place where count and self_count are used
>> one without another. Can we use one counter?
> 
> Look at the place where they are calculated. Here is a problem, that we
> don't know, which process will restore an region. Each time when we
> found a process with a smaller pid, we reset self_count.

OK, I see. Add this comment to the sources.

>>
>> Or even better, can we use just one lock counter? Like this:
>>
>> * initially lock is -nr (not opened) where nr is the amount of tasks
>>   waiting for fd to appear
> 
> initally lock is 0
> 
>> * all tasks wait for lock to become non-negative
> 
> all tasks except master wait for lock to become 1
> 
>> * master opens fd, gets the nr value and set is to zero
> 
> and set it to 1
> 
>> * every slave opening an fd increments one
>> * master waits for the counter to become nr
>>
>> ?
>>
>>> +
>>>  	return f;
>>>  }
>>>  
>>>
>>
> .
> 



More information about the CRIU mailing list