[CRIU] [PATCH 2/3] shmem: rework getting file descriptors for shared memory regions
Pavel Emelyanov
xemul at parallels.com
Fri Oct 10 11:30:48 PDT 2014
On 10/10/2014 08:39 PM, Andrey Vagin wrote:
> /proc/PID/map_files are protected by the global CAP_SYS_ADMIN, so we
> need to avoid using them to support user namespaces.
>
> We are going to use memfd_create() to get the first file descriptor and
> then all other processes will be able to open it via /proc/PID/fd/X.
>
> This patch reworks slave processes to not use map_files.
>
> Signed-off-by: Andrey Vagin <avagin at openvz.org>
> ---
> include/shmem.h | 2 ++
> pie/restorer.c | 10 ----------
> shmem.c | 32 ++++++++++++++++++++++++--------
> 3 files changed, 26 insertions(+), 18 deletions(-)
>
> diff --git a/include/shmem.h b/include/shmem.h
> index 2526e3e..4b482e8 100644
> --- a/include/shmem.h
> +++ b/include/shmem.h
> @@ -19,6 +19,8 @@ struct shmem_info {
> int pid;
> int fd;
> futex_t lock;
> + int count; /* the number of regions */
> + int self_count; /* the number of regions, which belongs to "pid" */
> };
>
> struct _VmaEntry;
> diff --git a/pie/restorer.c b/pie/restorer.c
> index 6c9d0a3..7196a3e 100644
> --- a/pie/restorer.c
> +++ b/pie/restorer.c
> @@ -881,16 +881,6 @@ long __export_restore_task(struct task_restore_args *args)
> if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
> continue;
>
> - if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
> - struct shmem_info *entry;
> -
> - entry = find_shmem(args->shmems, args->nr_shmems,
> - vma_entry->shmid);
> - if (entry && entry->pid == my_pid &&
> - entry->start == vma_entry->start)
> - futex_set_and_wake(&entry->lock, 1);
> - }
> -
> if (vma_entry->prot & PROT_WRITE)
> continue;
>
> diff --git a/shmem.c b/shmem.c
> index 526d9a9..bfe2743 100644
> --- a/shmem.c
> +++ b/shmem.c
> @@ -55,6 +55,7 @@ int collect_shmem(int pid, VmaEntry *vi)
>
> if (si->size < size)
> si->size = size;
> + si->count++;
>
> /*
> * Only the shared mapping with a lowest
> @@ -62,12 +63,17 @@ int collect_shmem(int pid, VmaEntry *vi)
> * will wait until the kernel propagate this mapping
> * into /proc
> */
> - if (!pid_rst_prio(pid, si->pid))
> + if (!pid_rst_prio(pid, si->pid)) {
> + if (si->pid == pid)
> + si->self_count++;
> +
> return 0;
> + }
>
> si->pid = pid;
> si->start = vi->start;
> si->end = vi->end;
> + si->self_count = 1;
>
> return 0;
> }
> @@ -85,6 +91,8 @@ int collect_shmem(int pid, VmaEntry *vi)
> si->pid = pid;
> si->size = size;
> si->fd = -1;
> + si->count = 1;
> + si->self_count = 1;
>
> nr_shmems++;
> futex_init(&si->lock);
> @@ -97,17 +105,18 @@ static int shmem_wait_and_open(int pid, struct shmem_info *si)
> char path[128];
> int ret;
>
> - snprintf(path, sizeof(path), "/proc/%d/map_files/%lx-%lx",
> - si->pid, si->start, si->end);
> + pr_info("Waiting for the %lx shmem to appear\n", si->shmid);
> + futex_wait_while(&si->lock, 0);
>
> - pr_info("Waiting for [%s] to appear\n", path);
> - futex_wait_until(&si->lock, 1);
What's the point in changing wait_until(1) into wait_while(0)?
> + snprintf(path, sizeof(path), "/proc/%d/fd/%d",
> + si->pid, si->fd);
>
> pr_info("Opening shmem [%s] \n", path);
> - ret = open_proc_rw(si->pid, "map_files/%lx-%lx", si->start, si->end);
> + ret = open_proc_rw(si->pid, "fd/%d", si->fd);
> if (ret < 0)
> pr_perror(" %d: Can't stat shmem at %s",
> si->pid, path);
> + futex_inc_and_wake(&si->lock);
Huh? Slave wakes up the rest?
> return ret;
> }
>
> @@ -207,10 +216,17 @@ int get_shmem_fd(int pid, VmaEntry *vi)
> (unsigned long) addr,
> (unsigned long) addr + si->size);
> munmap(addr, si->size);
> - if (f < 0)
> - return -1;
>
> si->fd = f;
> +
> + /* Send signal to slaves, that they can open fd for this shmem */
> + futex_inc_and_wake(&si->lock);
> + /*
> + * All other regions in this process will duplicate
> + * the file descriptor, so we don't wait them.
> + */
> + futex_wait_until(&si->lock, si->count - si->self_count + 1);
I don't see any place where count and self_count are used
one without the other. Can we use one counter?
Or even better, can we use just one lock counter? Like this:
* initially lock is -nr (not opened), where nr is the number of tasks
waiting for the fd to appear
* all tasks wait for lock to become non-negative
* master opens the fd, gets the nr value and sets it to zero
* every slave opening the fd increments it by one
* master waits for the counter to become nr
?
> +
> return f;
> }
>
>
More information about the CRIU
mailing list