[CRIU] [PATCH 04/11 v3] criu: lazy-pages: replace page list with IOVs list

Mike Rapoport mike.rapoport at gmail.com
Tue Nov 15 07:25:55 PST 2016


On Tue, Nov 15, 2016 at 5:03 PM, Pavel Emelyanov <xemul at virtuozzo.com> wrote:
> On 11/15/2016 03:07 PM, Mike Rapoport wrote:
>> Instead of tracking the memory handled by userfaultfd on a per-page
>> basis, we can use IOVs for contiguous chunks.
>>
>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
>> ---
>> v3: s/min(/min_t(uint64_t, /; this should make ARM compilation happy
>>
>>  criu/uffd.c | 261 +++++++++++++++++++++++++++++++++---------------------------
>>  1 file changed, 143 insertions(+), 118 deletions(-)
>>
>> diff --git a/criu/uffd.c b/criu/uffd.c
>> index 6250bf1..eae42fd 100644
>> --- a/criu/uffd.c
>> +++ b/criu/uffd.c
>> @@ -46,11 +46,17 @@
>>
>>  static mutex_t *lazy_sock_mutex;
>>
>> +struct lazy_iovec {
>> +     struct list_head l;
>> +     unsigned long base;
>> +     unsigned long len;
>> +};
>> +
>>  struct lazy_pages_info {
>>       int pid;
>>       int uffd;
>>
>> -     struct list_head pages;
>> +     struct list_head iovs;
>>
>>       struct page_read pr;
>>
>> @@ -72,7 +78,7 @@ static struct lazy_pages_info *lpi_init(void)
>>               return NULL;
>>
>>       memset(lpi, 0, sizeof(*lpi));
>> -     INIT_LIST_HEAD(&lpi->pages);
>> +     INIT_LIST_HEAD(&lpi->iovs);
>>       INIT_HLIST_NODE(&lpi->hash);
>>
>>       return lpi;
>> @@ -80,8 +86,12 @@ static struct lazy_pages_info *lpi_init(void)
>>
>>  static void lpi_fini(struct lazy_pages_info *lpi)
>>  {
>> +     struct lazy_iovec *p, *n;
>> +
>>       if (!lpi)
>>               return;
>> +     list_for_each_entry_safe(p, n, &lpi->iovs, l)
>> +             xfree(p);
>>       if (lpi->uffd > 0)
>>               close(lpi->uffd);
>>       if (lpi->pr.close)
>> @@ -320,118 +330,135 @@ out:
>>       return -1;
>>  }
>>
>> -#define UFFD_FLAG_SENT       0x1
>> +static MmEntry *init_mm_entry(struct lazy_pages_info *lpi)
>> +{
>> +     struct cr_img *img;
>> +     MmEntry *mm;
>> +     int ret;
>>
>> -struct uffd_pages_struct {
>> -     struct list_head list;
>> -     unsigned long addr;
>> -     int flags;
>> -};
>> +     img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
>> +     if (!img)
>> +             return NULL;
>> +
>> +     ret = pb_read_one_eof(img, &mm, PB_MM);
>> +     close_image(img);
>> +     if (ret == -1)
>> +             return NULL;
>> +     pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
>> +
>> +     return mm;
>> +}
>>
>> -static int collect_uffd_pages(struct lazy_pages_info *lpi, MmEntry *mm)
>> +static int update_lazy_iovecs(struct lazy_pages_info *lpi, unsigned long addr,
>> +                           int len)
>>  {
>> -     unsigned long base;
>> -     int i, j;
>> -     struct iovec iov;
>> -     unsigned long nr_pages;
>> -     unsigned long ps;
>> -     int rc;
>> -     struct uffd_pages_struct *uffd_pages;
>> -     struct page_read *pr = &lpi->pr;
>> +     struct lazy_iovec *lazy_iov, *n;
>>
>> -     rc = pr->get_pagemap(pr, &iov);
>> -     if (rc <= 0)
>> -             return 0;
>> +     list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l) {
>> +             unsigned long start = lazy_iov->base;
>> +             unsigned long end = start + lazy_iov->len;
>>
>> -     ps = page_size();
>> -     nr_pages = iov.iov_len / ps;
>> -     base = (unsigned long) iov.iov_base;
>> -     pr_debug("iov.iov_base 0x%lx (%ld pages)\n", base, nr_pages);
>> +             if (len <= 0)
>> +                     break;
>>
>> -     for (i = 0; i < nr_pages; i++) {
>> -             bool uffd_page = false;
>> -             base = (unsigned long) iov.iov_base + (i * ps);
>> -             /*
>> -              * Only pages which are MAP_ANONYMOUS and MAP_PRIVATE
>> -              * are relevant for userfaultfd handling.
>> -              * Loop over all VMAs to see if the flags matching.
>> -              */
>> -             for (j = 0; j < mm->n_vmas; j++) {
>> -                     VmaEntry *vma = mm->vmas[j];
>> -                     /*
>> -                      * This loop assumes that base can actually be found
>> -                      * in the VMA list.
>> -                      */
>> -                     if (base >= vma->start && base < vma->end) {
>> -                             if (vma_entry_can_be_lazy(vma)) {
>> -                                     if(!pagemap_in_parent(pr->pe))
>> -                                             uffd_page = true;
>> -                                     break;
>> -                             }
>> +             if (addr < start || addr >= end)
>> +                     continue;
>> +
>> +             if (addr + len < end) {
>> +                     if (addr == start) {
>> +                             lazy_iov->base += len;
>> +                             lazy_iov->len -= len;
>> +                     } else {
>> +                             struct lazy_iovec *new_iov;
>> +
>> +                             lazy_iov->len -= (end - addr);
>> +
>> +                             new_iov = xzalloc(sizeof(*new_iov));
>> +                             if (!new_iov)
>> +                                     return -1;
>> +
>> +                             new_iov->base = addr + len;
>> +                             new_iov->len = end - (addr + len);
>> +
>> +                             list_add(&new_iov->l, &lazy_iov->l);
>>                       }
>> +                     break;
>>               }
>>
>> -             /* This is not a page we are looking for. Move along */
>> -             if (!uffd_page)
>> -                     continue;
>> -
>> -             pr_debug("Adding 0x%lx to our list\n", base);
>> +             if (addr == start) {
>> +                     list_del(&lazy_iov->l);
>> +                     xfree(lazy_iov);
>> +             } else {
>> +                     lazy_iov->len -= (end - addr);
>> +             }
>>
>> -             uffd_pages = xzalloc(sizeof(struct uffd_pages_struct));
>> -             if (!uffd_pages)
>> -                     return -1;
>> -             uffd_pages->addr = base;
>> -             list_add(&uffd_pages->list, &lpi->pages);
>> +             len -= (end - addr);
>> +             addr = end;
>>       }
>>
>> -     return 1;
>> +     return 0;
>>  }
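
BTW, to spell out what update_lazy_iovecs() above does: it punches the
range [addr, addr + len) out of the IOV list, trimming an IOV at its head
or tail and splitting it in two when the handled range falls in the
middle. Here is a standalone sketch of the same logic (a plain
singly-linked list instead of CRIU's list_head, drop_range() is an
illustration-only name, made-up addresses):

#include <stdio.h>
#include <stdlib.h>

struct iov {
	unsigned long base, len;
	struct iov *next;
};

/* punch [addr, addr + len) out of a sorted IOV list */
static int drop_range(struct iov **head, unsigned long addr, unsigned long len)
{
	struct iov **p = head;

	while (*p && len > 0) {
		struct iov *iov = *p;
		unsigned long start = iov->base, end = start + iov->len;

		if (addr < start || addr >= end) {
			p = &iov->next;
			continue;
		}

		if (addr + len < end) {
			if (addr == start) {
				/* chop the head off the IOV */
				iov->base += len;
				iov->len -= len;
			} else {
				/* range in the middle: split the IOV in two */
				struct iov *tail = calloc(1, sizeof(*tail));

				if (!tail)
					return -1;
				tail->base = addr + len;
				tail->len = end - (addr + len);
				tail->next = iov->next;
				iov->len = addr - start;
				iov->next = tail;
			}
			break;
		}

		/* range reaches or crosses the end of this IOV */
		if (addr == start) {
			*p = iov->next;
			free(iov);
		} else {
			iov->len = addr - start;
			p = &iov->next;
		}
		len -= end - addr;
		addr = end;
	}

	return 0;
}

int main(void)
{
	struct iov *head = calloc(1, sizeof(*head));

	head->base = 0x1000;
	head->len = 0x4000;			/* one IOV: [0x1000, 0x5000) */

	drop_range(&head, 0x3000, 0x1000);	/* a fault at 0x3000 was handled */

	for (struct iov *i = head; i; i = i->next)
		printf("[0x%lx, 0x%lx)\n", i->base, i->base + i->len);
	/* prints [0x1000, 0x3000) and [0x4000, 0x5000) */
	return 0;
}
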
>>
>>  /*
>> - *  Setting up criu infrastructure and scan for VMAs.
>> + * Create a list of IOVs that can be handled using userfaultfd. The
>> + * IOVs generally correspond to lazy pagemap entries, except the cases
>> + * when a single pagemap entry covers several VMAs. In those cases
>> + * IOVs are split at VMA boundaries because UFFDIO_COPY may be done
>> + * only inside a single VMA.
>> + * We assume here that pagemaps and VMAs are sorted.
>>   */
>> -static int find_vmas(struct lazy_pages_info *lpi)
>> +static int collect_lazy_iovecs(struct lazy_pages_info *lpi)
>>  {
>> -     struct cr_img *img;
>> -     int ret;
>> +     struct page_read *pr = &lpi->pr;
>> +     struct lazy_iovec *lazy_iov, *n;
>>       MmEntry *mm;
>> -     struct uffd_pages_struct *uffd_pages;
>> +     int nr_pages = 0, n_vma = 0;
>> +     int ret = -1;
>> +     unsigned long start, end, len;
>>
>> -     img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
>> -     if (!img)
>> +     mm = init_mm_entry(lpi);
>> +     if (!mm)
>>               return -1;
>>
>> -     ret = pb_read_one_eof(img, &mm, PB_MM);
>> -     close_image(img);
>> -     if (ret == -1)
>> -             return -1;
>> -     pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
>> +     while (pr->advance(pr)) {
>> +             if (!pagemap_lazy(pr->pe))
>> +                     continue;
>>
>> -     ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
>> -     if (ret <= 0) {
>> -             ret = -1;
>> -             goto out;
>> -     }
>> -     /*
>> -      * This puts all pages which should be handled by userfaultfd
>> -      * in the list uffd_list. This list is later used to detect if
>> -      * a page has already been transferred or if it needs to be
>> -      * pushed into the process using userfaultfd.
>> -      */
>> -     do {
>> -             ret = collect_uffd_pages(lpi, mm);
>> -             if (ret == -1) {
>> -                     goto out;
>> +             start = pr->pe->vaddr;
>> +             end = start + pr->pe->nr_pages * page_size();
>> +             nr_pages += pr->pe->nr_pages;
>> +
>> +             for (; n_vma < mm->n_vmas; n_vma++) {
>> +                     VmaEntry *vma = mm->vmas[n_vma];
>> +
>> +                     if (start >= vma->end)
>> +                             continue;
>> +
>> +                     lazy_iov = xzalloc(sizeof(*lazy_iov));
>> +                     if (!lazy_iov)
>> +                             goto free_iovs;
>> +
>> +                     len = min_t(uint64_t, end, vma->end) - start;
>> +                     lazy_iov->base = start;
>> +                     lazy_iov->len = len;
>> +                     list_add_tail(&lazy_iov->l, &lpi->iovs);
>> +
>> +                     if (end <= vma->end)
>> +                             break;
>> +
>> +                     start = vma->end;
>>               }
>> -     } while (ret);
>> +     }
>>
>> -     /* Count detected pages */
>> -     list_for_each_entry(uffd_pages, &lpi->pages, list)
>> -         ret++;
>> +     ret = nr_pages;
>> +     goto free_mm;
>>
>> -     pr_debug("Found %d pages to be handled by UFFD\n", ret);
>> +free_iovs:
>> +     list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l)
>> +             xfree(lazy_iov);
>> +free_mm:
>> +     mm_entry__free_unpacked(mm, NULL);
>>
>> -out:
>>       return ret;
>>  }
>>
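
To put the comment above in numbers: a single lazy pagemap entry that
spans two VMAs becomes two IOVs, because UFFDIO_COPY may only operate
within one VMA. A toy example (made-up addresses, not the actual CRIU
code):

#include <stdio.h>

int main(void)
{
	unsigned long start = 0x1000, end = 0x6000;	/* one pagemap entry */
	unsigned long vma_end[] = { 0x3000, 0x8000 };	/* two sorted VMAs */

	for (int i = 0; i < 2 && start < end; i++) {
		if (start >= vma_end[i])
			continue;
		unsigned long stop = end < vma_end[i] ? end : vma_end[i];
		printf("IOV [0x%lx, 0x%lx)\n", start, stop);
		start = stop;
	}
	/* prints: IOV [0x1000, 0x3000)
	 *         IOV [0x3000, 0x6000) */
	return 0;
}
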
>> @@ -473,13 +500,22 @@ static int ud_open(int client, struct lazy_pages_info **_lpi)
>>       uffd_flags = fcntl(lpi->uffd, F_GETFD, NULL);
>>       pr_debug("uffd_flags are 0x%x\n", uffd_flags);
>>
>> +     ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
>> +     if (ret <= 0) {
>> +             ret = -1;
>> +             goto out;
>> +     }
>> +
>>       /*
>>        * Find the memory pages belonging to the restored process
>>        * so that it is trackable when all pages have been transferred.
>>        */
>> -     if ((lpi->total_pages = find_vmas(lpi)) == -1)
>> +     lpi->total_pages = collect_lazy_iovecs(lpi);
>> +     if (lpi->total_pages < 0)
>>               goto out;
>>
>> +     pr_debug("Found %ld pages to be handled by UFFD\n", lpi->total_pages);
>> +
>>       hlist_add_head(&lpi->hash, &lpi_hash[lpi->uffd % LPI_HASH_SIZE]);
>>       *_lpi = lpi;
>>
>> @@ -591,33 +627,33 @@ static int uffd_handle_page(struct lazy_pages_info *lpi, __u64 address,
>>
>>  static int handle_remaining_pages(struct lazy_pages_info *lpi, void *dest)
>>  {
>> -     struct uffd_pages_struct *uffd_pages;
>> -     int rc;
>> +     struct lazy_iovec *lazy_iov;
>> +     int nr_pages, i, err;
>> +     unsigned long addr;
>>
>> -     list_for_each_entry(uffd_pages, &lpi->pages, list) {
>> -             pr_debug("Checking remaining pages 0x%lx (flags 0x%x)\n",
>> -                      uffd_pages->addr, uffd_pages->flags);
>> -             if (uffd_pages->flags & UFFD_FLAG_SENT)
>> -                     continue;
>> +     lpi->pr.reset(&lpi->pr);
>>
>> -             rc = uffd_handle_page(lpi, uffd_pages->addr, dest);
>> -             if (rc < 0) {
>> -                     pr_err("Error during UFFD copy\n");
>> -                     return -1;
>> -             }
>> +     list_for_each_entry(lazy_iov, &lpi->iovs, l) {
>> +             nr_pages = lazy_iov->len / PAGE_SIZE;
>> +
>> +             for (i = 0; i < nr_pages; i++) {
>> +                     addr = lazy_iov->base + i * PAGE_SIZE;
>>
>> -             uffd_pages->flags |= UFFD_FLAG_SENT;
>> +                     err = uffd_handle_page(lpi, addr, dest);
>
> This place looks dangerous. The IOVs can be quite large, and requesting the
> whole IOV during a single #PF handling may slow the restored process tree
> down significantly.

We never request more than one page at #PF time. The entire IOVs are
requested only after a 5 second timeout.
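
In other words, the daemon serves single pages while faults keep coming,
and falls back to copying whole IOVs only when the channel has been quiet
for a while. The shape is roughly this (a simplified sketch with a
hypothetical serve_pages() wrapper; handle_user_fault() and
handle_remaining_pages() are the handlers from the patch):

#include <poll.h>

struct lazy_pages_info;	/* opaque here; defined in criu/uffd.c */

/* the two handlers from the patch, declared here for the sketch */
int handle_user_fault(struct lazy_pages_info *lpi, void *dest);
int handle_remaining_pages(struct lazy_pages_info *lpi, void *dest);

/* hypothetical wrapper, only to show the control flow */
static int serve_pages(struct lazy_pages_info *lpi, int uffd, void *dest)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };

	for (;;) {
		int ret = poll(&pfd, 1, 5000);	/* the 5 sec timeout */

		if (ret < 0)
			return -1;
		if (ret == 0)
			/* no faults for a while: push all remaining IOVs */
			return handle_remaining_pages(lpi, dest);
		/* a #PF event: copy exactly one page at the fault address */
		if (handle_user_fault(lpi, dest) < 0)
			return -1;
	}
}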

> Can we either keep the per-page API or make the IOV-based one work on IOV
> fractions if needed?
>
>> +                     if (err < 0) {
>> +                             pr_err("Error during UFFD copy\n");
>> +                             return -1;
>> +                     }
>> +             }
>>       }
>>
>>       return 0;
>>  }
>>
>> -
>>  static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
>>                               __u64 address)
>>  {
>>       int rc;
>> -     struct uffd_pages_struct *uffd_pages;
>>
>>       rc = uffd_handle_page(lpi, address, dest);
>>       if (rc < 0) {
>> @@ -625,14 +661,9 @@ static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
>>               return -1;
>>       }
>>
>> -     /*
>> -      * Mark this page as having been already transferred, so
>> -      * that it has not to be copied again later.
>> -      */
>> -     list_for_each_entry(uffd_pages, &lpi->pages, list) {
>> -             if (uffd_pages->addr == address)
>> -                     uffd_pages->flags |= UFFD_FLAG_SENT;
>> -     }
>> +     rc = update_lazy_iovecs(lpi, address, PAGE_SIZE);
>> +     if (rc < 0)
>> +             return -1;
>>
>>       return 0;
>>  }
>> @@ -642,7 +673,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
>>       struct uffd_msg msg;
>>       __u64 flags;
>>       __u64 address;
>> -     struct uffd_pages_struct *uffd_pages;
>>       int ret;
>>
>>       ret = read(lpi->uffd, &msg, sizeof(msg));
>> @@ -662,11 +692,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
>>       address = msg.arg.pagefault.address & ~(page_size() - 1);
>>       pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
>>
>> -     /* Make sure to not transfer a page twice */
>> -     list_for_each_entry(uffd_pages, &lpi->pages, list)
>> -             if ((uffd_pages->addr == address) && (uffd_pages->flags & UFFD_FLAG_SENT))
>> -                     return 0;
>> -
>>       /* Now handle the pages actually requested. */
>>       flags = msg.arg.pagefault.flags;
>>       pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
>>
>



-- 
Sincerely yours,
Mike.

