[CRIU] [PATCH v6 6/9] criu: pagemap: add entries for zero pages
Mike Rapoport
mike.rapoport at gmail.com
Fri Jul 15 11:13:57 PDT 2016
On Fri, Jul 15, 2016 at 8:33 PM, Pavel Emelyanov <xemul at virtuozzo.com> wrote:
> On 07/14/2016 03:49 PM, Mike Rapoport wrote:
>> The pages that are mapped to zero_page_pfn are not dumped but information
>> where are they located is required for lazy restore.
>> Note that get_pagemap users presumed that zero pages are not a part of the
>> pagemap and these pages were just silently skipped during memory restore.
>> At the moment I preserve this semantics and force get_pagemap to skip zero
>> pages.
>>
>> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
>> ---
>> criu/include/page-pipe.h | 7 ++++-
>> criu/include/page-xfer.h | 2 +-
>> criu/include/stats.h | 1 +
>> criu/mem.c | 28 ++++++++++++--------
>> criu/page-pipe.c | 14 ++++++++--
>> criu/page-xfer.c | 67 ++++++++++++++++++++++++++++++++++--------------
>> criu/pagemap.c | 27 ++++++++++++-------
>> criu/stats.c | 1 +
>> images/pagemap.proto | 1 +
>> images/stats.proto | 2 ++
>> 10 files changed, 108 insertions(+), 42 deletions(-)
>>
>> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
>> index 635dd6d..a0c6791 100644
>> --- a/criu/include/page-pipe.h
>> +++ b/criu/include/page-pipe.h
>> @@ -80,6 +80,9 @@ struct page_pipe_buf {
>> struct list_head l; /* links into page_pipe->bufs */
>> };
>>
>> +#define PP_HOLE_PARENT (1 << 0)
>> +#define PP_HOLE_ZERO (1 << 1)
>> +
>> struct page_pipe {
>> unsigned int nr_pipes; /* how many page_pipe_bufs in there */
>> struct list_head bufs; /* list of bufs */
>> @@ -92,6 +95,7 @@ struct page_pipe {
>> unsigned int nr_holes; /* number of holes allocated */
>> unsigned int free_hole; /* number of holes in use */
>> struct iovec *holes; /* holes */
>> + unsigned int *hole_flags;
>>
>> unsigned flags; /* PP_FOO flags below */
>> };
>> @@ -111,7 +115,8 @@ extern struct page_pipe *create_page_pipe(unsigned int nr, struct iovec *, unsig
>> extern void destroy_page_pipe(struct page_pipe *p);
>> extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr,
>> unsigned int flags);
>> -extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
>> +extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
>> + unsigned int flags);
>>
>> extern void debug_show_page_pipe(struct page_pipe *pp);
>> void page_pipe_reinit(struct page_pipe *pp);
>> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
>> index d19671b..3ba61ed 100644
>> --- a/criu/include/page-xfer.h
>> +++ b/criu/include/page-xfer.h
>> @@ -16,7 +16,7 @@ struct page_xfer {
>> /* transfers pages related to previous pagemap */
>> int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
>> /* transfers one hole -- vaddr:len entry w/o pages */
>> - int (*write_hole)(struct page_xfer *self, struct iovec *iov);
>> + int (*write_hole)(struct page_xfer *self, struct iovec *iov, int type);
>> void (*close)(struct page_xfer *self);
>>
>> /* private data for every page-xfer engine */
>> diff --git a/criu/include/stats.h b/criu/include/stats.h
>> index e417636..c0effa7 100644
>> --- a/criu/include/stats.h
>> +++ b/criu/include/stats.h
>> @@ -25,6 +25,7 @@ enum {
>> CNT_PAGES_SCANNED,
>> CNT_PAGES_SKIPPED_PARENT,
>> CNT_PAGES_WRITTEN,
>> + CNT_PAGES_ZERO,
>>
>> DUMP_CNT_NR_STATS,
>> };
>> diff --git a/criu/mem.c b/criu/mem.c
>> index 5530ad7..455d75f 100644
>> --- a/criu/mem.c
>> +++ b/criu/mem.c
>> @@ -107,14 +107,17 @@ static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
>> return false;
>> if (vma_entry_is(vmae, VMA_AREA_AIORING))
>> return true;
>> - if (pme & PME_SWAP)
>> - return true;
>> - if ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn))
>> + if (pme & (PME_PRESENT | PME_SWAP))
>> return true;
>
> I've just realized one thing I don't understand :)
>
> Let's take two PTEs, one is not present (and not swapped) and thus the pagemap
> entry for it is simply not generated. The other one is mapped with zero pfn
> and for this the pagemap zero is generated.
>
> On regular restore both PTEs will be left uninitialized, that's why we just
> skip the zero pagemaps in the image. But what would be the difference on
> lazy restore? If the process #PF-s on non present PTE and on zero PTE what
> will lazy daemon do in both cases?
For regular restore, a read #PF will implicitly map the page to
zero_pfn. So, when a restored process tries to read from those pages,
it gets zeroes, as it should.
For the userfault case, the #PF is delegated to the user space and the
PTE remains uninitialized until the call to mcopy_atomoc/mzero_atomic.
Therefore uffd.c has to tell the kernel what would be the contents of
a faulting page, even if it's zeroes, because otherwise there will be
no PTE...
> -- Pavel
>
>> return false;
>> }
>>
>> +static inline bool page_is_zero(u64 pme)
>> +{
>> + return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn;
>> +}
>> +
>> static inline bool page_in_parent(u64 pme)
>> {
>> /*
>> @@ -138,7 +141,7 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>> {
>> u64 *at = &map[PAGE_PFN(*off)];
>> unsigned long pfn, nr_to_scan;
>> - unsigned long pages[2] = {};
>> + unsigned long pages[3] = {};
>>
>> nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
>>
>> @@ -162,12 +165,15 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>> * page. The latter would be checked in page-xfer.
>> */
>>
>> - if (has_parent && page_in_parent(at[pfn])) {
>> - ret = page_pipe_add_hole(pp, vaddr);
>> + if (page_is_zero(at[pfn])) {
>> + ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_ZERO);
>> pages[0]++;
>> + } else if (has_parent && page_in_parent(at[pfn])) {
>> + ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
>> + pages[1]++;
>> } else {
>> ret = page_pipe_add_page(pp, vaddr, ppb_flags);
>> - pages[1]++;
>> + pages[2]++;
>> }
>>
>> if (ret) {
>> @@ -179,10 +185,12 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>> *off += pfn * PAGE_SIZE;
>>
>> cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
>> - cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
>> - cnt_add(CNT_PAGES_WRITTEN, pages[1]);
>> + cnt_add(CNT_PAGES_ZERO, pages[0]);
>> + cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[1]);
>> + cnt_add(CNT_PAGES_WRITTEN, pages[2]);
>>
>> - pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
>> + pr_info("Pagemap generated: %lu pages %lu holes %lu zeros\n",
>> + pages[2], pages[1], pages[0]);
>> return 0;
>> }
>>
>> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
>> index 403af7e..a2a3b8e 100644
>> --- a/criu/page-pipe.c
>> +++ b/criu/page-pipe.c
>> @@ -282,7 +282,8 @@ int page_pipe_add_page(struct page_pipe *pp, unsigned long addr,
>>
>> #define PP_HOLES_BATCH 32
>>
>> -int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>> +int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
>> + unsigned int flags)
>> {
>> if (pp->free_hole >= pp->nr_holes) {
>> pp->holes = xrealloc(pp->holes,
>> @@ -290,11 +291,17 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>> if (!pp->holes)
>> return -1;
>>
>> + pp->hole_flags = xrealloc(pp->hole_flags,
>> + (pp->nr_holes + PP_HOLES_BATCH) * sizeof(unsigned int));
>> + if(!pp->hole_flags)
>> + return -1;
>> +
>> pp->nr_holes += PP_HOLES_BATCH;
>> }
>>
>> if (pp->free_hole &&
>> - iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>> + pp->hole_flags[pp->free_hole - 1] == flags &&
>> + iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>> goto out;
>>
>> if (pp->flags & PP_COMPAT) {
>> @@ -304,6 +311,9 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>> } else {
>> iov_init(&pp->holes[pp->free_hole++], addr);
>> }
>> +
>> + pp->hole_flags[pp->free_hole - 1] = flags;
>> +
>> out:
>> return 0;
>> }
>> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
>> index fbf087c..5b61a3f 100644
>> --- a/criu/page-xfer.c
>> +++ b/criu/page-xfer.c
>> @@ -37,6 +37,7 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
>> #define PS_IOV_OPEN 3
>> #define PS_IOV_OPEN2 4
>> #define PS_IOV_PARENT 5
>> +#define PS_IOV_ZERO 6
>>
>> #define PS_IOV_FLUSH 0x1023
>> #define PS_IOV_FLUSH_N_CLOSE 0x1024
>> @@ -104,9 +105,10 @@ static int write_pages_to_server(struct page_xfer *xfer,
>> return 0;
>> }
>>
>> -static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
>> +static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov,
>> + int type)
>> {
>> - return send_iov(xfer->sk, PS_IOV_HOLE, xfer->dst_id, iov);
>> + return send_iov(xfer->sk, type, xfer->dst_id, iov);
>> }
>>
>> static void close_server_xfer(struct page_xfer *xfer)
>> @@ -223,24 +225,35 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
>> }
>> }
>>
>> -static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
>> +static int write_hole_loc(struct page_xfer *xfer, struct iovec *iov, int type)
>> {
>> PagemapEntry pe = PAGEMAP_ENTRY__INIT;
>>
>> - if (xfer->parent != NULL) {
>> - int ret;
>> + iovec2pagemap(iov, &pe);
>>
>> - ret = check_pagehole_in_parent(xfer->parent, iov);
>> - if (ret) {
>> - pr_err("Hole %p/%zu not found in parent\n",
>> - iov->iov_base, iov->iov_len);
>> - return -1;
>> + switch (type) {
>> + case PS_IOV_HOLE:
>> + if (xfer->parent != NULL) {
>> + int ret;
>> +
>> + ret = check_pagehole_in_parent(xfer->parent, iov);
>> + if (ret) {
>> + pr_err("Hole %p/%zu not found in parent\n",
>> + iov->iov_base, iov->iov_len);
>> + return -1;
>> + }
>> }
>> - }
>>
>> - iovec2pagemap(iov, &pe);
>> - pe.has_in_parent = true;
>> - pe.in_parent = true;
>> + pe.has_in_parent = true;
>> + pe.in_parent = true;
>> + break;
>> + case PS_IOV_ZERO:
>> + pe.has_zero = true;
>> + pe.zero = true;
>> + break;
>> + default:
>> + return -1;
>> + }
>>
>> if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
>> return -1;
>> @@ -307,7 +320,7 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>> out:
>> xfer->write_pagemap = write_pagemap_loc;
>> xfer->write_pages = write_pages_loc;
>> - xfer->write_hole = write_pagehole_loc;
>> + xfer->write_hole = write_hole_loc;
>> xfer->close = close_page_xfer;
>> return 0;
>> }
>> @@ -321,14 +334,14 @@ int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
>> }
>>
>> static int page_xfer_dump_hole(struct page_xfer *xfer,
>> - struct iovec *hole, unsigned long off)
>> + struct iovec *hole, unsigned long off, int type)
>> {
>> BUG_ON(hole->iov_base < (void *)off);
>> hole->iov_base -= off;
>> pr_debug("\th %p [%u]\n", hole->iov_base,
>> (unsigned int)(hole->iov_len / PAGE_SIZE));
>>
>> - if (xfer->write_hole(xfer, hole))
>> + if (xfer->write_hole(xfer, hole, type))
>> return -1;
>>
>> return 0;
>> @@ -349,6 +362,20 @@ static struct iovec get_iov(struct iovec *iovs, unsigned int n, bool compat)
>> }
>> }
>>
>> +static int get_hole_type(struct page_pipe *pp, int n)
>> +{
>> + unsigned int hole_flags = pp->hole_flags[n];
>> +
>> + if (hole_flags == PP_HOLE_PARENT)
>> + return PS_IOV_HOLE;
>> + if (hole_flags == PP_HOLE_ZERO)
>> + return PS_IOV_ZERO;
>> + else
>> + BUG();
>> +
>> + return -1;
>> +}
>> +
>> static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>> unsigned int *cur_hole, void *limit, unsigned long off)
>> {
>> @@ -357,11 +384,12 @@ static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>> for (; *cur_hole < pp->free_hole ; (*cur_hole)++) {
>> struct iovec hole = get_iov(pp->holes, *cur_hole,
>> pp->flags & PP_COMPAT);
>> + int hole_type = get_hole_type(pp, *cur_hole);
>>
>> if (limit && hole.iov_base >= limit)
>> break;
>>
>> - ret = page_xfer_dump_hole(xfer, &hole, off);
>> + ret = page_xfer_dump_hole(xfer, &hole, off, hole_type);
>> if (ret)
>> return ret;
>> }
>> @@ -589,7 +617,7 @@ static int page_server_hole(int sk, struct page_server_iov *pi)
>> return -1;
>>
>> psi2iovec(pi, &iov);
>> - if (lxfer->write_hole(lxfer, &iov))
>> + if (lxfer->write_hole(lxfer, &iov, pi->cmd))
>> return -1;
>>
>> return 0;
>> @@ -645,6 +673,7 @@ static int page_server_serve(int sk)
>> ret = page_server_add(sk, &pi);
>> break;
>> case PS_IOV_HOLE:
>> + case PS_IOV_ZERO:
>> ret = page_server_hole(sk, &pi);
>> break;
>> case PS_IOV_FLUSH:
>> diff --git a/criu/pagemap.c b/criu/pagemap.c
>> index fded268..2416259 100644
>> --- a/criu/pagemap.c
>> +++ b/criu/pagemap.c
>> @@ -121,14 +121,25 @@ int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
>> return 0;
>> }
>>
>> +static void put_pagemap(struct page_read *pr)
>> +{
>> + pr->curr_pme++;
>> +}
>> +
>> static int get_pagemap(struct page_read *pr, struct iovec *iov)
>> {
>> PagemapEntry *pe;
>>
>> - if (pr->curr_pme >= pr->nr_pmes)
>> - return 0;
>> + for (;;) {
>> + if (pr->curr_pme >= pr->nr_pmes)
>> + return 0;
>> +
>> + pe = pr->pmes[pr->curr_pme];
>>
>> - pe = pr->pmes[pr->curr_pme];
>> + if (!pe->zero)
>> + break;
>> + put_pagemap(pr);
>> + }
>>
>> pagemap2iovec(pe, iov);
>>
>> @@ -143,18 +154,13 @@ static int get_pagemap(struct page_read *pr, struct iovec *iov)
>> return 1;
>> }
>>
>> -static void put_pagemap(struct page_read *pr)
>> -{
>> - pr->curr_pme++;
>> -}
>> -
>> static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
>> {
>> if (!len)
>> return;
>>
>> pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
>> - if (!pr->pe->in_parent)
>> + if (!pr->pe->in_parent && !pr->pe->zero)
>> pr->pi_off += len;
>> pr->cvaddr += len;
>> }
>> @@ -256,6 +262,9 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
>> vaddr += p_nr * PAGE_SIZE;
>> buf += p_nr * PAGE_SIZE;
>> } while (nr);
>> + } else if (pr->pe->zero) {
>> + /* zero mappings should be skipped by get_pagemap */
>> + BUG();
>> } else {
>> int fd = img_raw_fd(pr->pi);
>> off_t current_vaddr = lseek(fd, pr->pi_off, SEEK_SET);
>> diff --git a/criu/stats.c b/criu/stats.c
>> index 12b7d05..c01e010 100644
>> --- a/criu/stats.c
>> +++ b/criu/stats.c
>> @@ -122,6 +122,7 @@ void write_stats(int what)
>> ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
>> ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
>> ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
>> + ds_entry.pages_zero = dstats->counts[CNT_PAGES_ZERO];
>>
>> name = "dump";
>> } else if (what == RESTORE_STATS) {
>> diff --git a/images/pagemap.proto b/images/pagemap.proto
>> index e45549c..0008162 100644
>> --- a/images/pagemap.proto
>> +++ b/images/pagemap.proto
>> @@ -10,4 +10,5 @@ message pagemap_entry {
>> required uint64 vaddr = 1 [(criu).hex = true];
>> required uint32 nr_pages = 2;
>> optional bool in_parent = 3;
>> + optional bool zero = 4;
>> }
>> diff --git a/images/stats.proto b/images/stats.proto
>> index 8188766..c099eb1 100644
>> --- a/images/stats.proto
>> +++ b/images/stats.proto
>> @@ -12,6 +12,8 @@ message dump_stats_entry {
>> required uint64 pages_written = 7;
>>
>> optional uint32 irmap_resolve = 8;
>> +
>> + required uint64 pages_zero = 9;
>> }
>>
>> message restore_stats_entry {
>>
>
--
Sincerely yours,
Mike.
More information about the CRIU
mailing list