[CRIU] [PATCH 04/11 v3] criu: lazy-pages: replace page list with IOVs list
Mike Rapoport
mike.rapoport at gmail.com
Tue Nov 15 07:25:55 PST 2016
On Tue, Nov 15, 2016 at 5:03 PM, Pavel Emelyanov <xemul at virtuozzo.com> wrote:
> On 11/15/2016 03:07 PM, Mike Rapoport wrote:
>> Instead of tracking memory handled by userfaultfd on the page basis we can
>> use IOVs for continuous chunks.
>>
>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
>> ---
>> v3: replaced min() with min_t(uint64_t, ...), which should make ARM compilation happy
>>
>> criu/uffd.c | 261 +++++++++++++++++++++++++++++++++---------------------------
>> 1 file changed, 143 insertions(+), 118 deletions(-)
>>
>> diff --git a/criu/uffd.c b/criu/uffd.c
>> index 6250bf1..eae42fd 100644
>> --- a/criu/uffd.c
>> +++ b/criu/uffd.c
>> @@ -46,11 +46,17 @@
>>
>> static mutex_t *lazy_sock_mutex;
>>
>> +struct lazy_iovec {
>> + struct list_head l;
>> + unsigned long base;
>> + unsigned long len;
>> +};
>> +
>> struct lazy_pages_info {
>> int pid;
>> int uffd;
>>
>> - struct list_head pages;
>> + struct list_head iovs;
>>
>> struct page_read pr;
>>
>> @@ -72,7 +78,7 @@ static struct lazy_pages_info *lpi_init(void)
>> return NULL;
>>
>> memset(lpi, 0, sizeof(*lpi));
>> - INIT_LIST_HEAD(&lpi->pages);
>> + INIT_LIST_HEAD(&lpi->iovs);
>> INIT_HLIST_NODE(&lpi->hash);
>>
>> return lpi;
>> @@ -80,8 +86,12 @@ static struct lazy_pages_info *lpi_init(void)
>>
>> static void lpi_fini(struct lazy_pages_info *lpi)
>> {
>> + struct lazy_iovec *p, *n;
>> +
>> if (!lpi)
>> return;
>> + list_for_each_entry_safe(p, n, &lpi->iovs, l)
>> + xfree(p);
>> if (lpi->uffd > 0)
>> close(lpi->uffd);
>> if (lpi->pr.close)
>> @@ -320,118 +330,135 @@ out:
>> return -1;
>> }
>>
>> -#define UFFD_FLAG_SENT 0x1
>> +static MmEntry *init_mm_entry(struct lazy_pages_info *lpi)
>> +{
>> + struct cr_img *img;
>> + MmEntry *mm;
>> + int ret;
>>
>> -struct uffd_pages_struct {
>> - struct list_head list;
>> - unsigned long addr;
>> - int flags;
>> -};
>> + img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
>> + if (!img)
>> + return NULL;
>> +
>> + ret = pb_read_one_eof(img, &mm, PB_MM);
>> + close_image(img);
>> + if (ret == -1)
>> + return NULL;
>> + pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
>> +
>> + return mm;
>> +}
>>
>> -static int collect_uffd_pages(struct lazy_pages_info *lpi, MmEntry *mm)
>> +static int update_lazy_iovecs(struct lazy_pages_info *lpi, unsigned long addr,
>> + int len)
>> {
>> - unsigned long base;
>> - int i, j;
>> - struct iovec iov;
>> - unsigned long nr_pages;
>> - unsigned long ps;
>> - int rc;
>> - struct uffd_pages_struct *uffd_pages;
>> - struct page_read *pr = &lpi->pr;
>> + struct lazy_iovec *lazy_iov, *n;
>>
>> - rc = pr->get_pagemap(pr, &iov);
>> - if (rc <= 0)
>> - return 0;
>> + list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l) {
>> + unsigned long start = lazy_iov->base;
>> + unsigned long end = start + lazy_iov->len;
>>
>> - ps = page_size();
>> - nr_pages = iov.iov_len / ps;
>> - base = (unsigned long) iov.iov_base;
>> - pr_debug("iov.iov_base 0x%lx (%ld pages)\n", base, nr_pages);
>> + if (len <= 0)
>> + break;
>>
>> - for (i = 0; i < nr_pages; i++) {
>> - bool uffd_page = false;
>> - base = (unsigned long) iov.iov_base + (i * ps);
>> - /*
>> - * Only pages which are MAP_ANONYMOUS and MAP_PRIVATE
>> - * are relevant for userfaultfd handling.
>> - * Loop over all VMAs to see if the flags matching.
>> - */
>> - for (j = 0; j < mm->n_vmas; j++) {
>> - VmaEntry *vma = mm->vmas[j];
>> - /*
>> - * This loop assumes that base can actually be found
>> - * in the VMA list.
>> - */
>> - if (base >= vma->start && base < vma->end) {
>> - if (vma_entry_can_be_lazy(vma)) {
>> - if(!pagemap_in_parent(pr->pe))
>> - uffd_page = true;
>> - break;
>> - }
>> + if (addr < start || addr >= end)
>> + continue;
>> +
>> + if (addr + len < end) {
>> + if (addr == start) {
>> + lazy_iov->base += len;
>> + lazy_iov->len -= len;
>> + } else {
>> + struct lazy_iovec *new_iov;
>> +
>> + lazy_iov->len -= (end - addr);
>> +
>> + new_iov = xzalloc(sizeof(*new_iov));
>> + if (!new_iov)
>> + return -1;
>> +
>> + new_iov->base = addr + len;
>> + new_iov->len = end - (addr + len);
>> +
>> + list_add(&new_iov->l, &lazy_iov->l);
>> }
>> + break;
>> }
>>
>> - /* This is not a page we are looking for. Move along */
>> - if (!uffd_page)
>> - continue;
>> -
>> - pr_debug("Adding 0x%lx to our list\n", base);
>> + if (addr == start) {
>> + list_del(&lazy_iov->l);
>> + xfree(lazy_iov);
>> + } else {
>> + lazy_iov->len -= (end - addr);
>> + }
>>
>> - uffd_pages = xzalloc(sizeof(struct uffd_pages_struct));
>> - if (!uffd_pages)
>> - return -1;
>> - uffd_pages->addr = base;
>> - list_add(&uffd_pages->list, &lpi->pages);
>> + len -= (end - addr);
>> + addr = end;
>> }
>>
>> - return 1;
>> + return 0;
>> }
>>
>> /*
>> - * Setting up criu infrastructure and scan for VMAs.
>> + * Create a list of IOVs that can be handled using userfaultfd. The
>> + * IOVs generally correspond to lazy pagemap entries, except the cases
>> + * when a single pagemap entry covers several VMAs. In those cases
>> + * IOVs are split at VMA boundaries because UFFDIO_COPY may be done
>> + * only inside a single VMA.
>> + * We assume here that pagemaps and VMAs are sorted.
>> */
>> -static int find_vmas(struct lazy_pages_info *lpi)
>> +static int collect_lazy_iovecs(struct lazy_pages_info *lpi)
>> {
>> - struct cr_img *img;
>> - int ret;
>> + struct page_read *pr = &lpi->pr;
>> + struct lazy_iovec *lazy_iov, *n;
>> MmEntry *mm;
>> - struct uffd_pages_struct *uffd_pages;
>> + int nr_pages = 0, n_vma = 0;
>> + int ret = -1;
>> + unsigned long start, end, len;
>>
>> - img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
>> - if (!img)
>> + mm = init_mm_entry(lpi);
>> + if (!mm)
>> return -1;
>>
>> - ret = pb_read_one_eof(img, &mm, PB_MM);
>> - close_image(img);
>> - if (ret == -1)
>> - return -1;
>> - pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
>> + while (pr->advance(pr)) {
>> + if (!pagemap_lazy(pr->pe))
>> + continue;
>>
>> - ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
>> - if (ret <= 0) {
>> - ret = -1;
>> - goto out;
>> - }
>> - /*
>> - * This puts all pages which should be handled by userfaultfd
>> - * in the list uffd_list. This list is later used to detect if
>> - * a page has already been transferred or if it needs to be
>> - * pushed into the process using userfaultfd.
>> - */
>> - do {
>> - ret = collect_uffd_pages(lpi, mm);
>> - if (ret == -1) {
>> - goto out;
>> + start = pr->pe->vaddr;
>> + end = start + pr->pe->nr_pages * page_size();
>> + nr_pages += pr->pe->nr_pages;
>> +
>> + for (; n_vma < mm->n_vmas; n_vma++) {
>> + VmaEntry *vma = mm->vmas[n_vma];
>> +
>> + if (start >= vma->end)
>> + continue;
>> +
>> + lazy_iov = xzalloc(sizeof(*lazy_iov));
>> + if (!lazy_iov)
>> + goto free_iovs;
>> +
>> + len = min_t(uint64_t, end, vma->end) - start;
>> + lazy_iov->base = start;
>> + lazy_iov->len = len;
>> + list_add_tail(&lazy_iov->l, &lpi->iovs);
>> +
>> + if (end <= vma->end)
>> + break;
>> +
>> + start = vma->end;
>> }
>> - } while (ret);
>> + }
>>
>> - /* Count detected pages */
>> - list_for_each_entry(uffd_pages, &lpi->pages, list)
>> - ret++;
>> + ret = nr_pages;
>> + goto free_mm;
>>
>> - pr_debug("Found %d pages to be handled by UFFD\n", ret);
>> +free_iovs:
>> + list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l)
>> + xfree(lazy_iov);
>> +free_mm:
>> + mm_entry__free_unpacked(mm, NULL);
>>
>> -out:
>> return ret;
>> }
>>
>> @@ -473,13 +500,22 @@ static int ud_open(int client, struct lazy_pages_info **_lpi)
>> uffd_flags = fcntl(lpi->uffd, F_GETFD, NULL);
>> pr_debug("uffd_flags are 0x%x\n", uffd_flags);
>>
>> + ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
>> + if (ret <= 0) {
>> + ret = -1;
>> + goto out;
>> + }
>> +
>> /*
>> * Find the memory pages belonging to the restored process
>> * so that it is trackable when all pages have been transferred.
>> */
>> - if ((lpi->total_pages = find_vmas(lpi)) == -1)
>> + lpi->total_pages = collect_lazy_iovecs(lpi);
>> + if (lpi->total_pages < 0)
>> goto out;
>>
>> + pr_debug("Found %ld pages to be handled by UFFD\n", lpi->total_pages);
>> +
>> hlist_add_head(&lpi->hash, &lpi_hash[lpi->uffd % LPI_HASH_SIZE]);
>> *_lpi = lpi;
>>
>> @@ -591,33 +627,33 @@ static int uffd_handle_page(struct lazy_pages_info *lpi, __u64 address,
>>
>> static int handle_remaining_pages(struct lazy_pages_info *lpi, void *dest)
>> {
>> - struct uffd_pages_struct *uffd_pages;
>> - int rc;
>> + struct lazy_iovec *lazy_iov;
>> + int nr_pages, i, err;
>> + unsigned long addr;
>>
>> - list_for_each_entry(uffd_pages, &lpi->pages, list) {
>> - pr_debug("Checking remaining pages 0x%lx (flags 0x%x)\n",
>> - uffd_pages->addr, uffd_pages->flags);
>> - if (uffd_pages->flags & UFFD_FLAG_SENT)
>> - continue;
>> + lpi->pr.reset(&lpi->pr);
>>
>> - rc = uffd_handle_page(lpi, uffd_pages->addr, dest);
>> - if (rc < 0) {
>> - pr_err("Error during UFFD copy\n");
>> - return -1;
>> - }
>> + list_for_each_entry(lazy_iov, &lpi->iovs, l) {
>> + nr_pages = lazy_iov->len / PAGE_SIZE;
>> +
>> + for (i = 0; i < nr_pages; i++) {
>> + addr = lazy_iov->base + i * PAGE_SIZE;
>>
>> - uffd_pages->flags |= UFFD_FLAG_SENT;
>> + err = uffd_handle_page(lpi, addr, dest);
>
> This place looks dangerous. The iovs can be quite large and requesting the whole
> IOV during single #PF handling may slow the restored tree down significantly.
We never request more than one page at #PF time. The entire IOVs are
requested only after a 5-second timeout.
> Can we either keep the per-page API or make IOV-based work on IOV fractions if
> needed?
>
>> + if (err < 0) {
>> + pr_err("Error during UFFD copy\n");
>> + return -1;
>> + }
>> + }
>> }
>>
>> return 0;
>> }
>>
>> -
>> static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
>> __u64 address)
>> {
>> int rc;
>> - struct uffd_pages_struct *uffd_pages;
>>
>> rc = uffd_handle_page(lpi, address, dest);
>> if (rc < 0) {
>> @@ -625,14 +661,9 @@ static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
>> return -1;
>> }
>>
>> - /*
>> - * Mark this page as having been already transferred, so
>> - * that it has not to be copied again later.
>> - */
>> - list_for_each_entry(uffd_pages, &lpi->pages, list) {
>> - if (uffd_pages->addr == address)
>> - uffd_pages->flags |= UFFD_FLAG_SENT;
>> - }
>> + rc = update_lazy_iovecs(lpi, address, PAGE_SIZE);
>> + if (rc < 0)
>> + return -1;
>>
>> return 0;
>> }
>> @@ -642,7 +673,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
>> struct uffd_msg msg;
>> __u64 flags;
>> __u64 address;
>> - struct uffd_pages_struct *uffd_pages;
>> int ret;
>>
>> ret = read(lpi->uffd, &msg, sizeof(msg));
>> @@ -662,11 +692,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
>> address = msg.arg.pagefault.address & ~(page_size() - 1);
>> pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
>>
>> - /* Make sure to not transfer a page twice */
>> - list_for_each_entry(uffd_pages, &lpi->pages, list)
>> - if ((uffd_pages->addr == address) && (uffd_pages->flags & UFFD_FLAG_SENT))
>> - return 0;
>> -
>> /* Now handle the pages actually requested. */
>> flags = msg.arg.pagefault.flags;
>> pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);
>>
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
--
Sincerely yours,
Mike.
More information about the CRIU mailing list