[CRIU] [PATCH v6 6/9] criu: pagemap: add entries for zero pages

Pavel Emelyanov xemul at virtuozzo.com
Fri Jul 15 11:27:45 PDT 2016


On 07/15/2016 09:13 PM, Mike Rapoport wrote:
> On Fri, Jul 15, 2016 at 8:33 PM, Pavel Emelyanov <xemul at virtuozzo.com> wrote:
>> On 07/14/2016 03:49 PM, Mike Rapoport wrote:
>>> The pages that are mapped to zero_page_pfn are not dumped but information
>>> where are they located is required for lazy restore.
>>> Note that get_pagemap users presumed that zero pages are not a part of the
>>> pagemap and these pages were just silently skipped during memory restore.
>>> At the moment I preserve this semantics and force get_pagemap to skip zero
>>> pages.
>>>
>>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
>>> ---
>>>  criu/include/page-pipe.h |  7 ++++-
>>>  criu/include/page-xfer.h |  2 +-
>>>  criu/include/stats.h     |  1 +
>>>  criu/mem.c               | 28 ++++++++++++--------
>>>  criu/page-pipe.c         | 14 ++++++++--
>>>  criu/page-xfer.c         | 67 ++++++++++++++++++++++++++++++++++--------------
>>>  criu/pagemap.c           | 27 ++++++++++++-------
>>>  criu/stats.c             |  1 +
>>>  images/pagemap.proto     |  1 +
>>>  images/stats.proto       |  2 ++
>>>  10 files changed, 108 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
>>> index 635dd6d..a0c6791 100644
>>> --- a/criu/include/page-pipe.h
>>> +++ b/criu/include/page-pipe.h
>>> @@ -80,6 +80,9 @@ struct page_pipe_buf {
>>>       struct list_head l;     /* links into page_pipe->bufs */
>>>  };
>>>
>>> +#define PP_HOLE_PARENT (1 << 0)
>>> +#define PP_HOLE_ZERO   (1 << 1)
>>> +
>>>  struct page_pipe {
>>>       unsigned int nr_pipes;  /* how many page_pipe_bufs in there */
>>>       struct list_head bufs;  /* list of bufs */
>>> @@ -92,6 +95,7 @@ struct page_pipe {
>>>       unsigned int nr_holes;  /* number of holes allocated */
>>>       unsigned int free_hole; /* number of holes in use */
>>>       struct iovec *holes;    /* holes */
>>> +     unsigned int *hole_flags;
>>>
>>>       unsigned flags;         /* PP_FOO flags below */
>>>  };
>>> @@ -111,7 +115,8 @@ extern struct page_pipe *create_page_pipe(unsigned int nr, struct iovec *, unsig
>>>  extern void destroy_page_pipe(struct page_pipe *p);
>>>  extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr,
>>>                             unsigned int flags);
>>> -extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
>>> +extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
>>> +                           unsigned int flags);
>>>
>>>  extern void debug_show_page_pipe(struct page_pipe *pp);
>>>  void page_pipe_reinit(struct page_pipe *pp);
>>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>>> index d19671b..3ba61ed 100644
>>> --- a/criu/include/page-xfer.h
>>> +++ b/criu/include/page-xfer.h
>>> @@ -16,7 +16,7 @@ struct page_xfer {
>>>       /* transfers pages related to previous pagemap */
>>>       int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
>>>       /* transfers one hole -- vaddr:len entry w/o pages */
>>> -     int (*write_hole)(struct page_xfer *self, struct iovec *iov);
>>> +     int (*write_hole)(struct page_xfer *self, struct iovec *iov, int type);
>>>       void (*close)(struct page_xfer *self);
>>>
>>>       /* private data for every page-xfer engine */
>>> diff --git a/criu/include/stats.h b/criu/include/stats.h
>>> index e417636..c0effa7 100644
>>> --- a/criu/include/stats.h
>>> +++ b/criu/include/stats.h
>>> @@ -25,6 +25,7 @@ enum {
>>>       CNT_PAGES_SCANNED,
>>>       CNT_PAGES_SKIPPED_PARENT,
>>>       CNT_PAGES_WRITTEN,
>>> +     CNT_PAGES_ZERO,
>>>
>>>       DUMP_CNT_NR_STATS,
>>>  };
>>> diff --git a/criu/mem.c b/criu/mem.c
>>> index 5530ad7..455d75f 100644
>>> --- a/criu/mem.c
>>> +++ b/criu/mem.c
>>> @@ -107,14 +107,17 @@ static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
>>>               return false;
>>>       if (vma_entry_is(vmae, VMA_AREA_AIORING))
>>>               return true;
>>> -     if (pme & PME_SWAP)
>>> -             return true;
>>> -     if ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn))
>>> +     if (pme & (PME_PRESENT | PME_SWAP))
>>>               return true;
>>
>> I've just realized one thing I don't understand :)
>>
>> Let's take two PTEs, one is not present (and not swapped) and thus the pagemap
>> entry for it is simply not generated. The other one is mapped with zero pfn
>> and for this the pagemap zero is generated.
>>
>> On regular restore both PTEs will be left uninitialized, that's why we just
>> skip the zero pagemaps in the image. But what would be the difference on
>> lazy restore? If the process #PF-s on non present PTE and on zero PTE what
>> will lazy daemon do in both cases?
> 
> For regular restore, a read #PF will implicitly map the page to
> zero_pfn. So, when a restored process tries to read from those pages,
> it gets zeroes, as it should.

Yes, this is what I mean -- there's not difference between zero pfn maps
and non-present maps in regular restore case.

> For the userfault case, the #PF is delegated to the user space and the
> PTE remains uninitialized until the call to mcopy_atomoc/mzero_atomic.

Yes, that's true.

> Therefore uffd.c has to tell the kernel what would be the contents of
> a faulting page, even if it's zeroes, because otherwise there will be
> no PTE...

Exactly. So what's the difference in uffd daemon behavior when it gets
a #PF into "zero hole" and in the vaddr that doesn't have covering pagemap
entry?

>> -- Pavel
>>
>>>       return false;
>>>  }
>>>
>>> +static inline bool page_is_zero(u64 pme)
>>> +{
>>> +     return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn;
>>> +}
>>> +
>>>  static inline bool page_in_parent(u64 pme)
>>>  {
>>>       /*
>>> @@ -138,7 +141,7 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>>  {
>>>       u64 *at = &map[PAGE_PFN(*off)];
>>>       unsigned long pfn, nr_to_scan;
>>> -     unsigned long pages[2] = {};
>>> +     unsigned long pages[3] = {};
>>>
>>>       nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
>>>
>>> @@ -162,12 +165,15 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>>                * page. The latter would be checked in page-xfer.
>>>                */
>>>
>>> -             if (has_parent && page_in_parent(at[pfn])) {
>>> -                     ret = page_pipe_add_hole(pp, vaddr);
>>> +             if (page_is_zero(at[pfn])) {
>>> +                     ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_ZERO);
>>>                       pages[0]++;
>>> +             } else if (has_parent && page_in_parent(at[pfn])) {
>>> +                     ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
>>> +                     pages[1]++;
>>>               } else {
>>>                       ret = page_pipe_add_page(pp, vaddr, ppb_flags);
>>> -                     pages[1]++;
>>> +                     pages[2]++;
>>>               }
>>>
>>>               if (ret) {
>>> @@ -179,10 +185,12 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>>       *off += pfn * PAGE_SIZE;
>>>
>>>       cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
>>> -     cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
>>> -     cnt_add(CNT_PAGES_WRITTEN, pages[1]);
>>> +     cnt_add(CNT_PAGES_ZERO, pages[0]);
>>> +     cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[1]);
>>> +     cnt_add(CNT_PAGES_WRITTEN, pages[2]);
>>>
>>> -     pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
>>> +     pr_info("Pagemap generated: %lu pages %lu holes %lu zeros\n",
>>> +             pages[2], pages[1], pages[0]);
>>>       return 0;
>>>  }
>>>
>>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>>> index 403af7e..a2a3b8e 100644
>>> --- a/criu/page-pipe.c
>>> +++ b/criu/page-pipe.c
>>> @@ -282,7 +282,8 @@ int page_pipe_add_page(struct page_pipe *pp, unsigned long addr,
>>>
>>>  #define PP_HOLES_BATCH       32
>>>
>>> -int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>>> +int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
>>> +                    unsigned int flags)
>>>  {
>>>       if (pp->free_hole >= pp->nr_holes) {
>>>               pp->holes = xrealloc(pp->holes,
>>> @@ -290,11 +291,17 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>>>               if (!pp->holes)
>>>                       return -1;
>>>
>>> +             pp->hole_flags = xrealloc(pp->hole_flags,
>>> +                                       (pp->nr_holes + PP_HOLES_BATCH) * sizeof(unsigned int));
>>> +             if(!pp->hole_flags)
>>> +                     return -1;
>>> +
>>>               pp->nr_holes += PP_HOLES_BATCH;
>>>       }
>>>
>>>       if (pp->free_hole &&
>>> -                     iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>>> +         pp->hole_flags[pp->free_hole - 1] == flags &&
>>> +         iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>>>               goto out;
>>>
>>>       if (pp->flags & PP_COMPAT) {
>>> @@ -304,6 +311,9 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>>>       } else {
>>>               iov_init(&pp->holes[pp->free_hole++], addr);
>>>       }
>>> +
>>> +     pp->hole_flags[pp->free_hole - 1] = flags;
>>> +
>>>  out:
>>>       return 0;
>>>  }
>>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>>> index fbf087c..5b61a3f 100644
>>> --- a/criu/page-xfer.c
>>> +++ b/criu/page-xfer.c
>>> @@ -37,6 +37,7 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
>>>  #define PS_IOV_OPEN  3
>>>  #define PS_IOV_OPEN2 4
>>>  #define PS_IOV_PARENT        5
>>> +#define PS_IOV_ZERO  6
>>>
>>>  #define PS_IOV_FLUSH         0x1023
>>>  #define PS_IOV_FLUSH_N_CLOSE 0x1024
>>> @@ -104,9 +105,10 @@ static int write_pages_to_server(struct page_xfer *xfer,
>>>       return 0;
>>>  }
>>>
>>> -static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
>>> +static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov,
>>> +                             int type)
>>>  {
>>> -     return send_iov(xfer->sk, PS_IOV_HOLE, xfer->dst_id, iov);
>>> +     return send_iov(xfer->sk, type, xfer->dst_id, iov);
>>>  }
>>>
>>>  static void close_server_xfer(struct page_xfer *xfer)
>>> @@ -223,24 +225,35 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
>>>       }
>>>  }
>>>
>>> -static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
>>> +static int write_hole_loc(struct page_xfer *xfer, struct iovec *iov, int type)
>>>  {
>>>       PagemapEntry pe = PAGEMAP_ENTRY__INIT;
>>>
>>> -     if (xfer->parent != NULL) {
>>> -             int ret;
>>> +     iovec2pagemap(iov, &pe);
>>>
>>> -             ret = check_pagehole_in_parent(xfer->parent, iov);
>>> -             if (ret) {
>>> -                     pr_err("Hole %p/%zu not found in parent\n",
>>> -                                     iov->iov_base, iov->iov_len);
>>> -                     return -1;
>>> +     switch (type) {
>>> +     case PS_IOV_HOLE:
>>> +             if (xfer->parent != NULL) {
>>> +                     int ret;
>>> +
>>> +                     ret = check_pagehole_in_parent(xfer->parent, iov);
>>> +                     if (ret) {
>>> +                             pr_err("Hole %p/%zu not found in parent\n",
>>> +                                    iov->iov_base, iov->iov_len);
>>> +                             return -1;
>>> +                     }
>>>               }
>>> -     }
>>>
>>> -     iovec2pagemap(iov, &pe);
>>> -     pe.has_in_parent = true;
>>> -     pe.in_parent = true;
>>> +             pe.has_in_parent = true;
>>> +             pe.in_parent = true;
>>> +             break;
>>> +     case PS_IOV_ZERO:
>>> +             pe.has_zero = true;
>>> +             pe.zero = true;
>>> +             break;
>>> +     default:
>>> +             return -1;
>>> +     }
>>>
>>>       if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
>>>               return -1;
>>> @@ -307,7 +320,7 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>>>  out:
>>>       xfer->write_pagemap = write_pagemap_loc;
>>>       xfer->write_pages = write_pages_loc;
>>> -     xfer->write_hole = write_pagehole_loc;
>>> +     xfer->write_hole = write_hole_loc;
>>>       xfer->close = close_page_xfer;
>>>       return 0;
>>>  }
>>> @@ -321,14 +334,14 @@ int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
>>>  }
>>>
>>>  static int page_xfer_dump_hole(struct page_xfer *xfer,
>>> -             struct iovec *hole, unsigned long off)
>>> +                            struct iovec *hole, unsigned long off, int type)
>>>  {
>>>       BUG_ON(hole->iov_base < (void *)off);
>>>       hole->iov_base -= off;
>>>       pr_debug("\th %p [%u]\n", hole->iov_base,
>>>                       (unsigned int)(hole->iov_len / PAGE_SIZE));
>>>
>>> -     if (xfer->write_hole(xfer, hole))
>>> +     if (xfer->write_hole(xfer, hole, type))
>>>               return -1;
>>>
>>>       return 0;
>>> @@ -349,6 +362,20 @@ static struct iovec get_iov(struct iovec *iovs, unsigned int n, bool compat)
>>>       }
>>>  }
>>>
>>> +static int get_hole_type(struct page_pipe *pp, int n)
>>> +{
>>> +     unsigned int hole_flags = pp->hole_flags[n];
>>> +
>>> +     if (hole_flags == PP_HOLE_PARENT)
>>> +             return PS_IOV_HOLE;
>>> +     if (hole_flags == PP_HOLE_ZERO)
>>> +             return PS_IOV_ZERO;
>>> +     else
>>> +             BUG();
>>> +
>>> +     return -1;
>>> +}
>>> +
>>>  static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>>>                     unsigned int *cur_hole, void *limit, unsigned long off)
>>>  {
>>> @@ -357,11 +384,12 @@ static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>>>       for (; *cur_hole < pp->free_hole ; (*cur_hole)++) {
>>>               struct iovec hole = get_iov(pp->holes, *cur_hole,
>>>                                           pp->flags & PP_COMPAT);
>>> +             int hole_type = get_hole_type(pp, *cur_hole);
>>>
>>>               if (limit && hole.iov_base >= limit)
>>>                       break;
>>>
>>> -             ret = page_xfer_dump_hole(xfer, &hole, off);
>>> +             ret = page_xfer_dump_hole(xfer, &hole, off, hole_type);
>>>               if (ret)
>>>                       return ret;
>>>       }
>>> @@ -589,7 +617,7 @@ static int page_server_hole(int sk, struct page_server_iov *pi)
>>>               return -1;
>>>
>>>       psi2iovec(pi, &iov);
>>> -     if (lxfer->write_hole(lxfer, &iov))
>>> +     if (lxfer->write_hole(lxfer, &iov, pi->cmd))
>>>               return -1;
>>>
>>>       return 0;
>>> @@ -645,6 +673,7 @@ static int page_server_serve(int sk)
>>>                       ret = page_server_add(sk, &pi);
>>>                       break;
>>>               case PS_IOV_HOLE:
>>> +             case PS_IOV_ZERO:
>>>                       ret = page_server_hole(sk, &pi);
>>>                       break;
>>>               case PS_IOV_FLUSH:
>>> diff --git a/criu/pagemap.c b/criu/pagemap.c
>>> index fded268..2416259 100644
>>> --- a/criu/pagemap.c
>>> +++ b/criu/pagemap.c
>>> @@ -121,14 +121,25 @@ int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
>>>       return 0;
>>>  }
>>>
>>> +static void put_pagemap(struct page_read *pr)
>>> +{
>>> +     pr->curr_pme++;
>>> +}
>>> +
>>>  static int get_pagemap(struct page_read *pr, struct iovec *iov)
>>>  {
>>>       PagemapEntry *pe;
>>>
>>> -     if (pr->curr_pme >= pr->nr_pmes)
>>> -             return 0;
>>> +     for (;;) {
>>> +             if (pr->curr_pme >= pr->nr_pmes)
>>> +                     return 0;
>>> +
>>> +             pe = pr->pmes[pr->curr_pme];
>>>
>>> -     pe = pr->pmes[pr->curr_pme];
>>> +             if (!pe->zero)
>>> +                     break;
>>> +             put_pagemap(pr);
>>> +     }
>>>
>>>       pagemap2iovec(pe, iov);
>>>
>>> @@ -143,18 +154,13 @@ static int get_pagemap(struct page_read *pr, struct iovec *iov)
>>>       return 1;
>>>  }
>>>
>>> -static void put_pagemap(struct page_read *pr)
>>> -{
>>> -     pr->curr_pme++;
>>> -}
>>> -
>>>  static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
>>>  {
>>>       if (!len)
>>>               return;
>>>
>>>       pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
>>> -     if (!pr->pe->in_parent)
>>> +     if (!pr->pe->in_parent && !pr->pe->zero)
>>>               pr->pi_off += len;
>>>       pr->cvaddr += len;
>>>  }
>>> @@ -256,6 +262,9 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
>>>                       vaddr += p_nr * PAGE_SIZE;
>>>                       buf += p_nr * PAGE_SIZE;
>>>               } while (nr);
>>> +     } else if (pr->pe->zero) {
>>> +             /* zero mappings should be skipped by get_pagemap */
>>> +             BUG();
>>>       } else {
>>>               int fd = img_raw_fd(pr->pi);
>>>               off_t current_vaddr = lseek(fd, pr->pi_off, SEEK_SET);
>>> diff --git a/criu/stats.c b/criu/stats.c
>>> index 12b7d05..c01e010 100644
>>> --- a/criu/stats.c
>>> +++ b/criu/stats.c
>>> @@ -122,6 +122,7 @@ void write_stats(int what)
>>>               ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
>>>               ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
>>>               ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
>>> +             ds_entry.pages_zero = dstats->counts[CNT_PAGES_ZERO];
>>>
>>>               name = "dump";
>>>       } else if (what == RESTORE_STATS) {
>>> diff --git a/images/pagemap.proto b/images/pagemap.proto
>>> index e45549c..0008162 100644
>>> --- a/images/pagemap.proto
>>> +++ b/images/pagemap.proto
>>> @@ -10,4 +10,5 @@ message pagemap_entry {
>>>       required uint64 vaddr           = 1 [(criu).hex = true];
>>>       required uint32 nr_pages        = 2;
>>>       optional bool   in_parent       = 3;
>>> +     optional bool   zero            = 4;
>>>  }
>>> diff --git a/images/stats.proto b/images/stats.proto
>>> index 8188766..c099eb1 100644
>>> --- a/images/stats.proto
>>> +++ b/images/stats.proto
>>> @@ -12,6 +12,8 @@ message dump_stats_entry {
>>>       required uint64                 pages_written           = 7;
>>>
>>>       optional uint32                 irmap_resolve           = 8;
>>> +
>>> +     required uint64                 pages_zero              = 9;
>>>  }
>>>
>>>  message restore_stats_entry {
>>>
>>
> 
> 
> 



More information about the CRIU mailing list