[CRIU] [PATCH 07/11] vma: Move cow decision earlier
Mike Rapoport
rppt at linux.vnet.ibm.com
Sun May 7 03:49:22 PDT 2017
On Fri, May 05, 2017 at 07:03:16PM +0300, Pavel Emelyanov wrote:
> Collect VMAs into COW-groups. This is done by checking each pstree_item's
> VMA list in parallel with the parent one and finding VMAs that have
> chances to get COW pages. The vma->parent pointer is used to tie such
> areas together.
>
> As a nice side effect -- tasks with different exe files are not even
> tried to be COW-ed after this patch.
>
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
> criu/cr-restore.c | 2 +
> criu/include/mem.h | 1 +
> criu/include/vma.h | 11 ++++
> criu/mem.c | 162 ++++++++++++++++++++++++++++++++++-------------------
> 4 files changed, 117 insertions(+), 59 deletions(-)
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index f3f5f67..bbe0dad 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -367,6 +367,8 @@ static int root_prepare_shared(void)
> if (ret < 0)
> goto err;
>
> + prepare_cow_vmas();
> +
> ret = prepare_restorer_blob();
> if (ret)
> goto err;
> diff --git a/criu/include/mem.h b/criu/include/mem.h
> index 220a301..6791bfd 100644
> --- a/criu/include/mem.h
> +++ b/criu/include/mem.h
> @@ -18,6 +18,7 @@ struct mem_dump_ctl {
> extern bool page_is_zero(u64 pme);
> extern bool page_in_parent(bool dirty);
> extern int prepare_mm_pid(struct pstree_item *i);
> +extern void prepare_cow_vmas(void);
> extern int do_task_reset_dirty_track(int pid);
> extern unsigned long dump_pages_args_size(struct vm_area_list *vmas);
> extern int parasite_dump_pages_seized(struct pstree_item *item,
> diff --git a/criu/include/vma.h b/criu/include/vma.h
> index 12f03fb..dcce080 100644
> --- a/criu/include/vma.h
> +++ b/criu/include/vma.h
> @@ -57,6 +57,17 @@ struct vma_area {
> struct vma_area *pvma; /* parent for inherited VMAs */
> unsigned long *page_bitmap; /* existent pages */
> unsigned long premmaped_addr; /* restore only */
> +
> + /*
> + * Some notes about pvma, page_bitmap and premmaped_addr bits
> + * above.
> + *
> + * The pvma is set on prepare_cow_vmas() when resolving which
> + * VMAs _may_ inherit pages from which. The other two are set
> + * in prepare_mappings() when the respective VMAs get mmap-ed
> + * or mremap-ed, and are then inherited on fork_with_pid()-s
> + * called from create_children_and_session().
> + */
Maybe:

* The pvma is set in prepare_cow_vmas() when we resolve which
* VMAs _may_ inherit pages from each other.
* The page_bitmap and premmaped_addr are set in prepare_mappings()
* when the respective VMAs get mmap-ed or mremap-ed.
* These VMAs are then inherited during fork_with_pid()-s
* called from create_children_and_session().
> };
> };
> };
> diff --git a/criu/mem.c b/criu/mem.c
> index 1b805cd..a9e6e08 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -552,15 +552,99 @@ int prepare_mm_pid(struct pstree_item *i)
> return ret;
> }
>
> +static inline bool check_cow_vmas(struct vma_area *vma, struct vma_area *pvma)
> +{
> + /*
> + * VMAs that _may_[1] have COW-ed pages should ...
> + *
> + * [1] I say "may" because whether or not particular pages are
> + * COW-ed is determined later in restore_priv_vma_content() by
> + * memcpy-ing the contents.
memcmp'aring?
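FWIW, the decision the comment refers to boils down to comparing page
contents; a hypothetical sketch of the idea (mine, not the actual
restore_priv_vma_content() code):

	#include <string.h>
	#include <stdbool.h>

	#define PAGE_SIZE 4096

	/*
	 * A premmaped page can stay COW-shared with the parent only if
	 * the contents we are about to restore match what is already
	 * there; writing different contents would un-COW the page.
	 */
	static bool can_stay_cowed(const char *parent_page,
				   const char *image_page)
	{
		return memcmp(parent_page, image_page, PAGE_SIZE) == 0;
	}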
> + */
> +
> + /* ... coincide by start/stop pair (start is checked by caller) */
> + if (vma->e->end != pvma->e->end)
> + return false;
> + /* ... both be private (and thus have space in premmaped area) */
> + if (!vma_area_is_private(vma, kdat.task_size))
> + return false;
> + if (!vma_area_is_private(vma, kdat.task_size))
> + return false;
The check of the same VMA is repeated here; it should be pvma in one of
the cases above, IMHO.
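Presumably the pair should read (untested, just flipping the argument as
said above):

	/* ... both be private (and thus have space in premmaped area) */
	if (!vma_area_is_private(vma, kdat.task_size))
		return false;
	if (!vma_area_is_private(pvma, kdat.task_size))
		return false;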
> + /* ... have growsdown and anon flags coincide */
> + if ((vma->e->flags ^ pvma->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
> + return false;
> + /* ... belong to the same file if being filemap */
> + if (!(vma->e->flags & MAP_ANONYMOUS) && vma->e->shmid != pvma->e->shmid)
> + return false;
> +
> + pr_debug("Found two COW VMAs @%#lx-%#lx\n", vma->e->start, pvma->e->end);
> + return true;
> +}
> +
> +static void prepare_cow_vmas_for(struct vm_area_list *vmas, struct vm_area_list *pvmas)
> +{
> + struct vma_area *vma, *pvma;
> +
> + vma = list_first_entry(&vmas->h, struct vma_area, list);
> + pvma = list_first_entry(&pvmas->h, struct vma_area, list);
> +
> + while (1) {
> + if ((vma->e->start == pvma->e->start) && check_cow_vmas(vma, pvma))
> + vma->pvma = pvma;
> +
> + /* <= here to shift from matching VMAs and ... */
> + while (vma->e->start <= pvma->e->start) {
> + vma = list_entry(vma->list.next, struct vma_area, list);
> + if (&vma->list == &vmas->h)
> + return;
> + }
> +
> + /* ... no == here since we must stop on matching pair */
> + while (pvma->e->start < vma->e->start) {
> + pvma = list_entry(pvma->list.next, struct vma_area, list);
It's worth having a vma_next() helper already by this point ;-)
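E.g. something like this (a sketch; the helper name and its placement in
criu/include/vma.h are my guess):

	static inline struct vma_area *vma_next(struct vma_area *vma)
	{
		return list_entry(vma->list.next, struct vma_area, list);
	}

so that both loops here could read vma = vma_next(vma) and
pvma = vma_next(pvma).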
> + if (&pvma->list == &pvmas->h)
> + return;
> + }
> + }
> +}
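FWIW, the walk relies on both lists being sorted by ->e->start; a toy
stand-alone model of the same two-pointer scheme (plain arrays instead of
the vma lists, all names are mine):

	#include <stdio.h>

	int main(void)
	{
		/* starts of child and parent VMAs, both sorted */
		unsigned long child[]  = { 0x1000, 0x3000, 0x5000 };
		unsigned long parent[] = { 0x1000, 0x2000, 0x5000 };
		int i = 0, j = 0;

		while (i < 3 && j < 3) {
			if (child[i] == parent[j])
				printf("match @%#lx\n", child[i]);
			/* <= advances past a matching pair, as above */
			if (child[i] <= parent[j])
				i++;
			else
				j++;
		}
		return 0;
	}

which pairs 0x1000 and 0x5000 and skips the parent-only 0x2000.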
> +
> +void prepare_cow_vmas(void)
> +{
> + struct pstree_item *pi;
> +
> + for_each_pstree_item(pi) {
> + struct pstree_item *ppi;
> + struct vm_area_list *vmas, *pvmas;
> +
> + ppi = pi->parent;
> + if (!ppi)
> + continue;
> +
> + vmas = &rsti(pi)->vmas;
> + if (vmas->nr == 0) /* Zombie */
> + continue;
> +
> + if (rsti(pi)->mm->exe_file_id != rsti(ppi)->mm->exe_file_id)
> + /*
> + * Tasks running different executables have
> + * close to zero chance of having cow-ed areas
> + * and in fact the kernel never creates such.
> + */
> + continue;
> +
> + pvmas = &rsti(ppi)->vmas;
> + BUG_ON(pvmas->nr == 0); /* zombies cannot have kids */
> +
> + prepare_cow_vmas_for(vmas, pvmas);
> + }
> +}
> +
> /* Map a private vma, if it is not mapped by a parent yet */
> -static int premap_private_vma(struct pstree_item *t,
> - struct vma_area *vma, void **tgt_addr,
> - struct vma_area **pvma, struct list_head *pvma_list)
> +static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void **tgt_addr)
> {
> int ret;
> - void *addr, *paddr = NULL;
> + void *addr;
> unsigned long nr_pages, size;
> - struct vma_area *p = *pvma;
>
> if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
> ret = vma->vm_open(vpid(t), vma);
> @@ -577,51 +661,20 @@ static int premap_private_vma(struct pstree_item *t,
> if (vma->page_bitmap == NULL)
> return -1;
>
> - list_for_each_entry_from(p, pvma_list, list) {
> - if (p->e->start > vma->e->start)
> - break;
> -
> - if (!vma_area_is(p, VMA_PREMMAPED))
> - continue;
> -
> - if (p->e->end != vma->e->end ||
> - p->e->start != vma->e->start)
> - continue;
> -
> - /* Check flags, which must be identical for both vma-s */
> - if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
> - break;
> -
> - if (!(vma->e->flags & MAP_ANONYMOUS) &&
> - vma->e->shmid != p->e->shmid)
> - break;
> -
> - pr_info("COW %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" vma\n",
> - vma->e->start, vma->e->end, vma->e->pgoff);
> - paddr = decode_pointer(p->premmaped_addr);
> -
> - break;
> - }
> -
> /*
> * A grow-down VMA has a guard page, which protects the VMA below it.
> * So one more page is mapped here to restore the content of the first page.
> */
> - if (vma->e->flags & MAP_GROWSDOWN) {
> + if (vma->e->flags & MAP_GROWSDOWN)
> vma->e->start -= PAGE_SIZE;
> - if (paddr)
> - paddr -= PAGE_SIZE;
> - }
>
> size = vma_entry_len(vma->e);
> - if (paddr == NULL) {
> + if (vma->pvma == NULL) {
> int flag = 0;
> /*
> * The respective memory area was NOT found in the parent.
> * Map a new one.
> */
> - pr_info("Map %#016"PRIx64"-%#016"PRIx64" %#016"PRIx64" vma\n",
> - vma->e->start, vma->e->end, vma->e->pgoff);
>
> /*
> * Restore AIO ring buffer content to temporary anonymous area.
> @@ -639,14 +692,19 @@ static int premap_private_vma(struct pstree_item *t,
> pr_perror("Unable to map ANON_VMA");
> return -1;
> }
> -
> - *pvma = p;
> } else {
> + void *paddr;
> +
> /*
> - * This region was found in parent -- remap it to inherit physical
> - * pages (if any) from it (and COW them later if required).
> + * The area in question can be COWed with the parent. Remap the
> + * parent area. Note that it has already been passed through
> + * the restore_priv_vma_content() call and thus may have some
> + * pages in it.
> */
> - vma->pvma = p;
> +
> + paddr = decode_pointer(vma->pvma->premmaped_addr);
> + if (vma->e->flags & MAP_GROWSDOWN)
> + paddr -= PAGE_SIZE;
>
> addr = mremap(paddr, size, size,
> MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
> @@ -654,8 +712,6 @@ static int premap_private_vma(struct pstree_item *t,
> pr_perror("Unable to remap a private vma");
> return -1;
> }
> -
> - *pvma = list_entry(p->list.next, struct vma_area, list);
> }
>
> vma->e->status |= VMA_PREMMAPED;
> @@ -678,23 +734,11 @@ static int premap_private_vma(struct pstree_item *t,
> static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
> void *at, struct page_read *pr)
> {
> - struct list_head *parent_vmas;
> - struct vma_area *pvma, *vma;
> + struct vma_area *vma;
> unsigned long pstart = 0;
> int ret = 0;
> LIST_HEAD(empty);
>
> - /*
> - * Keep parent vmas at hands to check whether we can "inherit" them.
> - * See comments in premap_private_vma.
> - */
> - if (t->parent)
> - parent_vmas = &rsti(t->parent)->vmas.h;
> - else
> - parent_vmas = &empty;
> -
> - pvma = list_first_entry(parent_vmas, struct vma_area, list);
> -
> list_for_each_entry(vma, &vmas->h, list) {
> if (pstart > vma->e->start) {
> ret = -1;
> @@ -706,7 +750,7 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
> if (!vma_area_is_private(vma, kdat.task_size))
> continue;
>
> - ret = premap_private_vma(t, vma, &at, &pvma, parent_vmas);
> + ret = premap_private_vma(t, vma, &at);
> if (ret < 0)
> break;
> }
> --
> 2.5.5
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
>