[CRIU] [PATCH v6 6/9] criu: pagemap: add entries for zero pages

Mike Rapoport mike.rapoport at gmail.com
Fri Jul 15 11:38:08 PDT 2016


On Fri, Jul 15, 2016 at 9:27 PM, Pavel Emelyanov <xemul at virtuozzo.com> wrote:
> On 07/15/2016 09:13 PM, Mike Rapoport wrote:
>> On Fri, Jul 15, 2016 at 8:33 PM, Pavel Emelyanov <xemul at virtuozzo.com> wrote:
>>> On 07/14/2016 03:49 PM, Mike Rapoport wrote:
>>>> The pages that are mapped to zero_page_pfn are not dumped but information
>>>> where are they located is required for lazy restore.
>>>> Note that get_pagemap users presumed that zero pages are not a part of the
>>>> pagemap and these pages were just silently skipped during memory restore.
>>>> At the moment I preserve this semantics and force get_pagemap to skip zero
>>>> pages.
>>>>
>>>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
>>>> ---
>>>>  criu/include/page-pipe.h |  7 ++++-
>>>>  criu/include/page-xfer.h |  2 +-
>>>>  criu/include/stats.h     |  1 +
>>>>  criu/mem.c               | 28 ++++++++++++--------
>>>>  criu/page-pipe.c         | 14 ++++++++--
>>>>  criu/page-xfer.c         | 67 ++++++++++++++++++++++++++++++++++--------------
>>>>  criu/pagemap.c           | 27 ++++++++++++-------
>>>>  criu/stats.c             |  1 +
>>>>  images/pagemap.proto     |  1 +
>>>>  images/stats.proto       |  2 ++
>>>>  10 files changed, 108 insertions(+), 42 deletions(-)
>>>>
>>>> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
>>>> index 635dd6d..a0c6791 100644
>>>> --- a/criu/include/page-pipe.h
>>>> +++ b/criu/include/page-pipe.h
>>>> @@ -80,6 +80,9 @@ struct page_pipe_buf {
>>>>       struct list_head l;     /* links into page_pipe->bufs */
>>>>  };
>>>>
>>>> +#define PP_HOLE_PARENT (1 << 0)
>>>> +#define PP_HOLE_ZERO   (1 << 1)
>>>> +
>>>>  struct page_pipe {
>>>>       unsigned int nr_pipes;  /* how many page_pipe_bufs in there */
>>>>       struct list_head bufs;  /* list of bufs */
>>>> @@ -92,6 +95,7 @@ struct page_pipe {
>>>>       unsigned int nr_holes;  /* number of holes allocated */
>>>>       unsigned int free_hole; /* number of holes in use */
>>>>       struct iovec *holes;    /* holes */
>>>> +     unsigned int *hole_flags;
>>>>
>>>>       unsigned flags;         /* PP_FOO flags below */
>>>>  };
>>>> @@ -111,7 +115,8 @@ extern struct page_pipe *create_page_pipe(unsigned int nr, struct iovec *, unsig
>>>>  extern void destroy_page_pipe(struct page_pipe *p);
>>>>  extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr,
>>>>                             unsigned int flags);
>>>> -extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
>>>> +extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
>>>> +                           unsigned int flags);
>>>>
>>>>  extern void debug_show_page_pipe(struct page_pipe *pp);
>>>>  void page_pipe_reinit(struct page_pipe *pp);
>>>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>>>> index d19671b..3ba61ed 100644
>>>> --- a/criu/include/page-xfer.h
>>>> +++ b/criu/include/page-xfer.h
>>>> @@ -16,7 +16,7 @@ struct page_xfer {
>>>>       /* transfers pages related to previous pagemap */
>>>>       int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
>>>>       /* transfers one hole -- vaddr:len entry w/o pages */
>>>> -     int (*write_hole)(struct page_xfer *self, struct iovec *iov);
>>>> +     int (*write_hole)(struct page_xfer *self, struct iovec *iov, int type);
>>>>       void (*close)(struct page_xfer *self);
>>>>
>>>>       /* private data for every page-xfer engine */
>>>> diff --git a/criu/include/stats.h b/criu/include/stats.h
>>>> index e417636..c0effa7 100644
>>>> --- a/criu/include/stats.h
>>>> +++ b/criu/include/stats.h
>>>> @@ -25,6 +25,7 @@ enum {
>>>>       CNT_PAGES_SCANNED,
>>>>       CNT_PAGES_SKIPPED_PARENT,
>>>>       CNT_PAGES_WRITTEN,
>>>> +     CNT_PAGES_ZERO,
>>>>
>>>>       DUMP_CNT_NR_STATS,
>>>>  };
>>>> diff --git a/criu/mem.c b/criu/mem.c
>>>> index 5530ad7..455d75f 100644
>>>> --- a/criu/mem.c
>>>> +++ b/criu/mem.c
>>>> @@ -107,14 +107,17 @@ static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
>>>>               return false;
>>>>       if (vma_entry_is(vmae, VMA_AREA_AIORING))
>>>>               return true;
>>>> -     if (pme & PME_SWAP)
>>>> -             return true;
>>>> -     if ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn))
>>>> +     if (pme & (PME_PRESENT | PME_SWAP))
>>>>               return true;
>>>
>>> I've just realized one thing I don't understand :)
>>>
>>> Let's take two PTEs, one is not present (and not swapped) and thus the pagemap
>>> entry for it is simply not generated. The other one is mapped with zero pfn
>>> and for this the pagemap zero is generated.
>>>
>>> On regular restore both PTEs will be left uninitialized, that's why we just
>>> skip the zero pagemaps in the image. But what would be the difference on
>>> lazy restore? If the process #PF-s on non present PTE and on zero PTE what
>>> will lazy daemon do in both cases?
>>
>> For regular restore, a read #PF will implicitly map the page to
>> zero_pfn. So, when a restored process tries to read from those pages,
>> it gets zeroes, as it should.
>
> Yes, this is what I mean -- there's not difference between zero pfn maps
> and non-present maps in regular restore case.
>
>> For the userfault case, the #PF is delegated to the user space and the
>> PTE remains uninitialized until the call to mcopy_atomoc/mzero_atomic.
>
> Yes, that's true.
>
>> Therefore uffd.c has to tell the kernel what would be the contents of
>> a faulting page, even if it's zeroes, because otherwise there will be
>> no PTE...
>
> Exactly. So what's the difference in uffd daemon behavior when it gets
> a #PF into "zero hole" and in the vaddr that doesn't have covering pagemap
> entry?

No real difference :)
With current implmentation of the lazy-pages daemon, we can continue
using "implicit" zero pages.
I think, it's just more convinient to have "zero holes" defined
explicitly. Opens some room for optimization as well. E.g. the uffd
daemon can traverse the pagemap in the background and fill zero pages
"in advance".

>>> -- Pavel
>>>
>>>>       return false;
>>>>  }
>>>>
>>>> +static inline bool page_is_zero(u64 pme)
>>>> +{
>>>> +     return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn;
>>>> +}
>>>> +
>>>>  static inline bool page_in_parent(u64 pme)
>>>>  {
>>>>       /*
>>>> @@ -138,7 +141,7 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>>>  {
>>>>       u64 *at = &map[PAGE_PFN(*off)];
>>>>       unsigned long pfn, nr_to_scan;
>>>> -     unsigned long pages[2] = {};
>>>> +     unsigned long pages[3] = {};
>>>>
>>>>       nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
>>>>
>>>> @@ -162,12 +165,15 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>>>                * page. The latter would be checked in page-xfer.
>>>>                */
>>>>
>>>> -             if (has_parent && page_in_parent(at[pfn])) {
>>>> -                     ret = page_pipe_add_hole(pp, vaddr);
>>>> +             if (page_is_zero(at[pfn])) {
>>>> +                     ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_ZERO);
>>>>                       pages[0]++;
>>>> +             } else if (has_parent && page_in_parent(at[pfn])) {
>>>> +                     ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
>>>> +                     pages[1]++;
>>>>               } else {
>>>>                       ret = page_pipe_add_page(pp, vaddr, ppb_flags);
>>>> -                     pages[1]++;
>>>> +                     pages[2]++;
>>>>               }
>>>>
>>>>               if (ret) {
>>>> @@ -179,10 +185,12 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>>>>       *off += pfn * PAGE_SIZE;
>>>>
>>>>       cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
>>>> -     cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
>>>> -     cnt_add(CNT_PAGES_WRITTEN, pages[1]);
>>>> +     cnt_add(CNT_PAGES_ZERO, pages[0]);
>>>> +     cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[1]);
>>>> +     cnt_add(CNT_PAGES_WRITTEN, pages[2]);
>>>>
>>>> -     pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
>>>> +     pr_info("Pagemap generated: %lu pages %lu holes %lu zeros\n",
>>>> +             pages[2], pages[1], pages[0]);
>>>>       return 0;
>>>>  }
>>>>
>>>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>>>> index 403af7e..a2a3b8e 100644
>>>> --- a/criu/page-pipe.c
>>>> +++ b/criu/page-pipe.c
>>>> @@ -282,7 +282,8 @@ int page_pipe_add_page(struct page_pipe *pp, unsigned long addr,
>>>>
>>>>  #define PP_HOLES_BATCH       32
>>>>
>>>> -int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>>>> +int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
>>>> +                    unsigned int flags)
>>>>  {
>>>>       if (pp->free_hole >= pp->nr_holes) {
>>>>               pp->holes = xrealloc(pp->holes,
>>>> @@ -290,11 +291,17 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>>>>               if (!pp->holes)
>>>>                       return -1;
>>>>
>>>> +             pp->hole_flags = xrealloc(pp->hole_flags,
>>>> +                                       (pp->nr_holes + PP_HOLES_BATCH) * sizeof(unsigned int));
>>>> +             if(!pp->hole_flags)
>>>> +                     return -1;
>>>> +
>>>>               pp->nr_holes += PP_HOLES_BATCH;
>>>>       }
>>>>
>>>>       if (pp->free_hole &&
>>>> -                     iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>>>> +         pp->hole_flags[pp->free_hole - 1] == flags &&
>>>> +         iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>>>>               goto out;
>>>>
>>>>       if (pp->flags & PP_COMPAT) {
>>>> @@ -304,6 +311,9 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>>>>       } else {
>>>>               iov_init(&pp->holes[pp->free_hole++], addr);
>>>>       }
>>>> +
>>>> +     pp->hole_flags[pp->free_hole - 1] = flags;
>>>> +
>>>>  out:
>>>>       return 0;
>>>>  }
>>>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>>>> index fbf087c..5b61a3f 100644
>>>> --- a/criu/page-xfer.c
>>>> +++ b/criu/page-xfer.c
>>>> @@ -37,6 +37,7 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
>>>>  #define PS_IOV_OPEN  3
>>>>  #define PS_IOV_OPEN2 4
>>>>  #define PS_IOV_PARENT        5
>>>> +#define PS_IOV_ZERO  6
>>>>
>>>>  #define PS_IOV_FLUSH         0x1023
>>>>  #define PS_IOV_FLUSH_N_CLOSE 0x1024
>>>> @@ -104,9 +105,10 @@ static int write_pages_to_server(struct page_xfer *xfer,
>>>>       return 0;
>>>>  }
>>>>
>>>> -static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
>>>> +static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov,
>>>> +                             int type)
>>>>  {
>>>> -     return send_iov(xfer->sk, PS_IOV_HOLE, xfer->dst_id, iov);
>>>> +     return send_iov(xfer->sk, type, xfer->dst_id, iov);
>>>>  }
>>>>
>>>>  static void close_server_xfer(struct page_xfer *xfer)
>>>> @@ -223,24 +225,35 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
>>>>       }
>>>>  }
>>>>
>>>> -static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
>>>> +static int write_hole_loc(struct page_xfer *xfer, struct iovec *iov, int type)
>>>>  {
>>>>       PagemapEntry pe = PAGEMAP_ENTRY__INIT;
>>>>
>>>> -     if (xfer->parent != NULL) {
>>>> -             int ret;
>>>> +     iovec2pagemap(iov, &pe);
>>>>
>>>> -             ret = check_pagehole_in_parent(xfer->parent, iov);
>>>> -             if (ret) {
>>>> -                     pr_err("Hole %p/%zu not found in parent\n",
>>>> -                                     iov->iov_base, iov->iov_len);
>>>> -                     return -1;
>>>> +     switch (type) {
>>>> +     case PS_IOV_HOLE:
>>>> +             if (xfer->parent != NULL) {
>>>> +                     int ret;
>>>> +
>>>> +                     ret = check_pagehole_in_parent(xfer->parent, iov);
>>>> +                     if (ret) {
>>>> +                             pr_err("Hole %p/%zu not found in parent\n",
>>>> +                                    iov->iov_base, iov->iov_len);
>>>> +                             return -1;
>>>> +                     }
>>>>               }
>>>> -     }
>>>>
>>>> -     iovec2pagemap(iov, &pe);
>>>> -     pe.has_in_parent = true;
>>>> -     pe.in_parent = true;
>>>> +             pe.has_in_parent = true;
>>>> +             pe.in_parent = true;
>>>> +             break;
>>>> +     case PS_IOV_ZERO:
>>>> +             pe.has_zero = true;
>>>> +             pe.zero = true;
>>>> +             break;
>>>> +     default:
>>>> +             return -1;
>>>> +     }
>>>>
>>>>       if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
>>>>               return -1;
>>>> @@ -307,7 +320,7 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>>>>  out:
>>>>       xfer->write_pagemap = write_pagemap_loc;
>>>>       xfer->write_pages = write_pages_loc;
>>>> -     xfer->write_hole = write_pagehole_loc;
>>>> +     xfer->write_hole = write_hole_loc;
>>>>       xfer->close = close_page_xfer;
>>>>       return 0;
>>>>  }
>>>> @@ -321,14 +334,14 @@ int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
>>>>  }
>>>>
>>>>  static int page_xfer_dump_hole(struct page_xfer *xfer,
>>>> -             struct iovec *hole, unsigned long off)
>>>> +                            struct iovec *hole, unsigned long off, int type)
>>>>  {
>>>>       BUG_ON(hole->iov_base < (void *)off);
>>>>       hole->iov_base -= off;
>>>>       pr_debug("\th %p [%u]\n", hole->iov_base,
>>>>                       (unsigned int)(hole->iov_len / PAGE_SIZE));
>>>>
>>>> -     if (xfer->write_hole(xfer, hole))
>>>> +     if (xfer->write_hole(xfer, hole, type))
>>>>               return -1;
>>>>
>>>>       return 0;
>>>> @@ -349,6 +362,20 @@ static struct iovec get_iov(struct iovec *iovs, unsigned int n, bool compat)
>>>>       }
>>>>  }
>>>>
>>>> +static int get_hole_type(struct page_pipe *pp, int n)
>>>> +{
>>>> +     unsigned int hole_flags = pp->hole_flags[n];
>>>> +
>>>> +     if (hole_flags == PP_HOLE_PARENT)
>>>> +             return PS_IOV_HOLE;
>>>> +     if (hole_flags == PP_HOLE_ZERO)
>>>> +             return PS_IOV_ZERO;
>>>> +     else
>>>> +             BUG();
>>>> +
>>>> +     return -1;
>>>> +}
>>>> +
>>>>  static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>>>>                     unsigned int *cur_hole, void *limit, unsigned long off)
>>>>  {
>>>> @@ -357,11 +384,12 @@ static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>>>>       for (; *cur_hole < pp->free_hole ; (*cur_hole)++) {
>>>>               struct iovec hole = get_iov(pp->holes, *cur_hole,
>>>>                                           pp->flags & PP_COMPAT);
>>>> +             int hole_type = get_hole_type(pp, *cur_hole);
>>>>
>>>>               if (limit && hole.iov_base >= limit)
>>>>                       break;
>>>>
>>>> -             ret = page_xfer_dump_hole(xfer, &hole, off);
>>>> +             ret = page_xfer_dump_hole(xfer, &hole, off, hole_type);
>>>>               if (ret)
>>>>                       return ret;
>>>>       }
>>>> @@ -589,7 +617,7 @@ static int page_server_hole(int sk, struct page_server_iov *pi)
>>>>               return -1;
>>>>
>>>>       psi2iovec(pi, &iov);
>>>> -     if (lxfer->write_hole(lxfer, &iov))
>>>> +     if (lxfer->write_hole(lxfer, &iov, pi->cmd))
>>>>               return -1;
>>>>
>>>>       return 0;
>>>> @@ -645,6 +673,7 @@ static int page_server_serve(int sk)
>>>>                       ret = page_server_add(sk, &pi);
>>>>                       break;
>>>>               case PS_IOV_HOLE:
>>>> +             case PS_IOV_ZERO:
>>>>                       ret = page_server_hole(sk, &pi);
>>>>                       break;
>>>>               case PS_IOV_FLUSH:
>>>> diff --git a/criu/pagemap.c b/criu/pagemap.c
>>>> index fded268..2416259 100644
>>>> --- a/criu/pagemap.c
>>>> +++ b/criu/pagemap.c
>>>> @@ -121,14 +121,25 @@ int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
>>>>       return 0;
>>>>  }
>>>>
>>>> +static void put_pagemap(struct page_read *pr)
>>>> +{
>>>> +     pr->curr_pme++;
>>>> +}
>>>> +
>>>>  static int get_pagemap(struct page_read *pr, struct iovec *iov)
>>>>  {
>>>>       PagemapEntry *pe;
>>>>
>>>> -     if (pr->curr_pme >= pr->nr_pmes)
>>>> -             return 0;
>>>> +     for (;;) {
>>>> +             if (pr->curr_pme >= pr->nr_pmes)
>>>> +                     return 0;
>>>> +
>>>> +             pe = pr->pmes[pr->curr_pme];
>>>>
>>>> -     pe = pr->pmes[pr->curr_pme];
>>>> +             if (!pe->zero)
>>>> +                     break;
>>>> +             put_pagemap(pr);
>>>> +     }
>>>>
>>>>       pagemap2iovec(pe, iov);
>>>>
>>>> @@ -143,18 +154,13 @@ static int get_pagemap(struct page_read *pr, struct iovec *iov)
>>>>       return 1;
>>>>  }
>>>>
>>>> -static void put_pagemap(struct page_read *pr)
>>>> -{
>>>> -     pr->curr_pme++;
>>>> -}
>>>> -
>>>>  static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
>>>>  {
>>>>       if (!len)
>>>>               return;
>>>>
>>>>       pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
>>>> -     if (!pr->pe->in_parent)
>>>> +     if (!pr->pe->in_parent && !pr->pe->zero)
>>>>               pr->pi_off += len;
>>>>       pr->cvaddr += len;
>>>>  }
>>>> @@ -256,6 +262,9 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
>>>>                       vaddr += p_nr * PAGE_SIZE;
>>>>                       buf += p_nr * PAGE_SIZE;
>>>>               } while (nr);
>>>> +     } else if (pr->pe->zero) {
>>>> +             /* zero mappings should be skipped by get_pagemap */
>>>> +             BUG();
>>>>       } else {
>>>>               int fd = img_raw_fd(pr->pi);
>>>>               off_t current_vaddr = lseek(fd, pr->pi_off, SEEK_SET);
>>>> diff --git a/criu/stats.c b/criu/stats.c
>>>> index 12b7d05..c01e010 100644
>>>> --- a/criu/stats.c
>>>> +++ b/criu/stats.c
>>>> @@ -122,6 +122,7 @@ void write_stats(int what)
>>>>               ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
>>>>               ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
>>>>               ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
>>>> +             ds_entry.pages_zero = dstats->counts[CNT_PAGES_ZERO];
>>>>
>>>>               name = "dump";
>>>>       } else if (what == RESTORE_STATS) {
>>>> diff --git a/images/pagemap.proto b/images/pagemap.proto
>>>> index e45549c..0008162 100644
>>>> --- a/images/pagemap.proto
>>>> +++ b/images/pagemap.proto
>>>> @@ -10,4 +10,5 @@ message pagemap_entry {
>>>>       required uint64 vaddr           = 1 [(criu).hex = true];
>>>>       required uint32 nr_pages        = 2;
>>>>       optional bool   in_parent       = 3;
>>>> +     optional bool   zero            = 4;
>>>>  }
>>>> diff --git a/images/stats.proto b/images/stats.proto
>>>> index 8188766..c099eb1 100644
>>>> --- a/images/stats.proto
>>>> +++ b/images/stats.proto
>>>> @@ -12,6 +12,8 @@ message dump_stats_entry {
>>>>       required uint64                 pages_written           = 7;
>>>>
>>>>       optional uint32                 irmap_resolve           = 8;
>>>> +
>>>> +     required uint64                 pages_zero              = 9;
>>>>  }
>>>>
>>>>  message restore_stats_entry {
>>>>
>>>
>>
>>
>>
>



-- 
Sincerely yours,
Mike.


More information about the CRIU mailing list