[CRIU] [PATCH v6 6/9] criu: pagemap: add entries for zero pages

Pavel Emelyanov xemul at virtuozzo.com
Fri Jul 15 10:33:09 PDT 2016


On 07/14/2016 03:49 PM, Mike Rapoport wrote:
> The pages that are mapped to zero_page_pfn are not dumped, but information
> about where they are located is required for lazy restore.
> Note that get_pagemap users presumed that zero pages are not part of the
> pagemap, so these pages were just silently skipped during memory restore.
> For now I preserve these semantics and force get_pagemap to skip zero
> pages.
> 
> Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> ---
>  criu/include/page-pipe.h |  7 ++++-
>  criu/include/page-xfer.h |  2 +-
>  criu/include/stats.h     |  1 +
>  criu/mem.c               | 28 ++++++++++++--------
>  criu/page-pipe.c         | 14 ++++++++--
>  criu/page-xfer.c         | 67 ++++++++++++++++++++++++++++++++++--------------
>  criu/pagemap.c           | 27 ++++++++++++-------
>  criu/stats.c             |  1 +
>  images/pagemap.proto     |  1 +
>  images/stats.proto       |  2 ++
>  10 files changed, 108 insertions(+), 42 deletions(-)
> 
> diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
> index 635dd6d..a0c6791 100644
> --- a/criu/include/page-pipe.h
> +++ b/criu/include/page-pipe.h
> @@ -80,6 +80,9 @@ struct page_pipe_buf {
>  	struct list_head l;	/* links into page_pipe->bufs */
>  };
>  
> +#define PP_HOLE_PARENT (1 << 0)
> +#define PP_HOLE_ZERO   (1 << 1)
> +
>  struct page_pipe {
>  	unsigned int nr_pipes;	/* how many page_pipe_bufs in there */
>  	struct list_head bufs;	/* list of bufs */
> @@ -92,6 +95,7 @@ struct page_pipe {
>  	unsigned int nr_holes;	/* number of holes allocated */
>  	unsigned int free_hole;	/* number of holes in use */
>  	struct iovec *holes;	/* holes */
> +	unsigned int *hole_flags;
>  
>  	unsigned flags;		/* PP_FOO flags below */
>  };
> @@ -111,7 +115,8 @@ extern struct page_pipe *create_page_pipe(unsigned int nr, struct iovec *, unsig
>  extern void destroy_page_pipe(struct page_pipe *p);
>  extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr,
>  			      unsigned int flags);
> -extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
> +extern int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
> +			      unsigned int flags);
>  
>  extern void debug_show_page_pipe(struct page_pipe *pp);
>  void page_pipe_reinit(struct page_pipe *pp);
> diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
> index d19671b..3ba61ed 100644
> --- a/criu/include/page-xfer.h
> +++ b/criu/include/page-xfer.h
> @@ -16,7 +16,7 @@ struct page_xfer {
>  	/* transfers pages related to previous pagemap */
>  	int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
>  	/* transfers one hole -- vaddr:len entry w/o pages */
> -	int (*write_hole)(struct page_xfer *self, struct iovec *iov);
> +	int (*write_hole)(struct page_xfer *self, struct iovec *iov, int type);
>  	void (*close)(struct page_xfer *self);
>  
>  	/* private data for every page-xfer engine */
> diff --git a/criu/include/stats.h b/criu/include/stats.h
> index e417636..c0effa7 100644
> --- a/criu/include/stats.h
> +++ b/criu/include/stats.h
> @@ -25,6 +25,7 @@ enum {
>  	CNT_PAGES_SCANNED,
>  	CNT_PAGES_SKIPPED_PARENT,
>  	CNT_PAGES_WRITTEN,
> +	CNT_PAGES_ZERO,
>  
>  	DUMP_CNT_NR_STATS,
>  };
> diff --git a/criu/mem.c b/criu/mem.c
> index 5530ad7..455d75f 100644
> --- a/criu/mem.c
> +++ b/criu/mem.c
> @@ -107,14 +107,17 @@ static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
>  		return false;
>  	if (vma_entry_is(vmae, VMA_AREA_AIORING))
>  		return true;
> -	if (pme & PME_SWAP)
> -		return true;
> -	if ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn))
> +	if (pme & (PME_PRESENT | PME_SWAP))
>  		return true;

I've just realized one thing I don't understand :)

Let's take two PTEs. One is not present (and not swapped), and thus the pagemap
entry for it is simply not generated. The other one is mapped to the zero pfn,
and for this one a zero pagemap entry is generated.

On regular restore both PTEs will be left uninitialized, which is why we just
skip the zero pagemaps in the image. But what would be the difference on
lazy restore? If the process takes a page fault (#PF) on the non-present PTE
and on the zero PTE, what will the lazy daemon do in each case?

-- Pavel

>  	return false;
>  }
>  
> +static inline bool page_is_zero(u64 pme)
> +{
> +	return (pme & PME_PFRAME_MASK) == kdat.zero_page_pfn;
> +}
> +
>  static inline bool page_in_parent(u64 pme)
>  {
>  	/*
> @@ -138,7 +141,7 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>  {
>  	u64 *at = &map[PAGE_PFN(*off)];
>  	unsigned long pfn, nr_to_scan;
> -	unsigned long pages[2] = {};
> +	unsigned long pages[3] = {};
>  
>  	nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
>  
> @@ -162,12 +165,15 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>  		 * page. The latter would be checked in page-xfer.
>  		 */
>  
> -		if (has_parent && page_in_parent(at[pfn])) {
> -			ret = page_pipe_add_hole(pp, vaddr);
> +		if (page_is_zero(at[pfn])) {
> +			ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_ZERO);
>  			pages[0]++;
> +		} else if (has_parent && page_in_parent(at[pfn])) {
> +			ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
> +			pages[1]++;
>  		} else {
>  			ret = page_pipe_add_page(pp, vaddr, ppb_flags);
> -			pages[1]++;
> +			pages[2]++;
>  		}
>  
>  		if (ret) {
> @@ -179,10 +185,12 @@ static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u
>  	*off += pfn * PAGE_SIZE;
>  
>  	cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
> -	cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
> -	cnt_add(CNT_PAGES_WRITTEN, pages[1]);
> +	cnt_add(CNT_PAGES_ZERO, pages[0]);
> +	cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[1]);
> +	cnt_add(CNT_PAGES_WRITTEN, pages[2]);
>  
> -	pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
> +	pr_info("Pagemap generated: %lu pages %lu holes %lu zeros\n",
> +		pages[2], pages[1], pages[0]);
>  	return 0;
>  }
>  
> diff --git a/criu/page-pipe.c b/criu/page-pipe.c
> index 403af7e..a2a3b8e 100644
> --- a/criu/page-pipe.c
> +++ b/criu/page-pipe.c
> @@ -282,7 +282,8 @@ int page_pipe_add_page(struct page_pipe *pp, unsigned long addr,
>  
>  #define PP_HOLES_BATCH	32
>  
> -int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
> +int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr,
> +		       unsigned int flags)
>  {
>  	if (pp->free_hole >= pp->nr_holes) {
>  		pp->holes = xrealloc(pp->holes,
> @@ -290,11 +291,17 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>  		if (!pp->holes)
>  			return -1;
>  
> +		pp->hole_flags = xrealloc(pp->hole_flags,
> +					  (pp->nr_holes + PP_HOLES_BATCH) * sizeof(unsigned int));
> +		if(!pp->hole_flags)
> +			return -1;
> +
>  		pp->nr_holes += PP_HOLES_BATCH;
>  	}
>  
>  	if (pp->free_hole &&
> -			iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
> +	    pp->hole_flags[pp->free_hole - 1] == flags &&
> +	    iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
>  		goto out;
>  
>  	if (pp->flags & PP_COMPAT) {
> @@ -304,6 +311,9 @@ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
>  	} else {
>  		iov_init(&pp->holes[pp->free_hole++], addr);
>  	}
> +
> +	pp->hole_flags[pp->free_hole - 1] = flags;
> +
>  out:
>  	return 0;
>  }
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index fbf087c..5b61a3f 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -37,6 +37,7 @@ static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
>  #define PS_IOV_OPEN	3
>  #define PS_IOV_OPEN2	4
>  #define PS_IOV_PARENT	5
> +#define PS_IOV_ZERO	6
>  
>  #define PS_IOV_FLUSH		0x1023
>  #define PS_IOV_FLUSH_N_CLOSE	0x1024
> @@ -104,9 +105,10 @@ static int write_pages_to_server(struct page_xfer *xfer,
>  	return 0;
>  }
>  
> -static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
> +static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov,
> +				int type)
>  {
> -	return send_iov(xfer->sk, PS_IOV_HOLE, xfer->dst_id, iov);
> +	return send_iov(xfer->sk, type, xfer->dst_id, iov);
>  }
>  
>  static void close_server_xfer(struct page_xfer *xfer)
> @@ -223,24 +225,35 @@ static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
>  	}
>  }
>  
> -static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
> +static int write_hole_loc(struct page_xfer *xfer, struct iovec *iov, int type)
>  {
>  	PagemapEntry pe = PAGEMAP_ENTRY__INIT;
>  
> -	if (xfer->parent != NULL) {
> -		int ret;
> +	iovec2pagemap(iov, &pe);
>  
> -		ret = check_pagehole_in_parent(xfer->parent, iov);
> -		if (ret) {
> -			pr_err("Hole %p/%zu not found in parent\n",
> -					iov->iov_base, iov->iov_len);
> -			return -1;
> +	switch (type) {
> +	case PS_IOV_HOLE:
> +		if (xfer->parent != NULL) {
> +			int ret;
> +
> +			ret = check_pagehole_in_parent(xfer->parent, iov);
> +			if (ret) {
> +				pr_err("Hole %p/%zu not found in parent\n",
> +				       iov->iov_base, iov->iov_len);
> +				return -1;
> +			}
>  		}
> -	}
>  
> -	iovec2pagemap(iov, &pe);
> -	pe.has_in_parent = true;
> -	pe.in_parent = true;
> +		pe.has_in_parent = true;
> +		pe.in_parent = true;
> +		break;
> +	case PS_IOV_ZERO:
> +		pe.has_zero = true;
> +		pe.zero = true;
> +		break;
> +	default:
> +		return -1;
> +	}
>  
>  	if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
>  		return -1;
> @@ -307,7 +320,7 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>  out:
>  	xfer->write_pagemap = write_pagemap_loc;
>  	xfer->write_pages = write_pages_loc;
> -	xfer->write_hole = write_pagehole_loc;
> +	xfer->write_hole = write_hole_loc;
>  	xfer->close = close_page_xfer;
>  	return 0;
>  }
> @@ -321,14 +334,14 @@ int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
>  }
>  
>  static int page_xfer_dump_hole(struct page_xfer *xfer,
> -		struct iovec *hole, unsigned long off)
> +			       struct iovec *hole, unsigned long off, int type)
>  {
>  	BUG_ON(hole->iov_base < (void *)off);
>  	hole->iov_base -= off;
>  	pr_debug("\th %p [%u]\n", hole->iov_base,
>  			(unsigned int)(hole->iov_len / PAGE_SIZE));
>  
> -	if (xfer->write_hole(xfer, hole))
> +	if (xfer->write_hole(xfer, hole, type))
>  		return -1;
>  
>  	return 0;
> @@ -349,6 +362,20 @@ static struct iovec get_iov(struct iovec *iovs, unsigned int n, bool compat)
>  	}
>  }
>  
> +static int get_hole_type(struct page_pipe *pp, int n)
> +{
> +	unsigned int hole_flags = pp->hole_flags[n];
> +
> +	if (hole_flags == PP_HOLE_PARENT)
> +		return PS_IOV_HOLE;
> +	if (hole_flags == PP_HOLE_ZERO)
> +		return PS_IOV_ZERO;
> +	else
> +		BUG();
> +
> +	return -1;
> +}
> +
>  static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>  		      unsigned int *cur_hole, void *limit, unsigned long off)
>  {
> @@ -357,11 +384,12 @@ static int dump_holes(struct page_xfer *xfer, struct page_pipe *pp,
>  	for (; *cur_hole < pp->free_hole ; (*cur_hole)++) {
>  		struct iovec hole = get_iov(pp->holes, *cur_hole,
>  					    pp->flags & PP_COMPAT);
> +		int hole_type = get_hole_type(pp, *cur_hole);
>  
>  		if (limit && hole.iov_base >= limit)
>  			break;
>  
> -		ret = page_xfer_dump_hole(xfer, &hole, off);
> +		ret = page_xfer_dump_hole(xfer, &hole, off, hole_type);
>  		if (ret)
>  			return ret;
>  	}
> @@ -589,7 +617,7 @@ static int page_server_hole(int sk, struct page_server_iov *pi)
>  		return -1;
>  
>  	psi2iovec(pi, &iov);
> -	if (lxfer->write_hole(lxfer, &iov))
> +	if (lxfer->write_hole(lxfer, &iov, pi->cmd))
>  		return -1;
>  
>  	return 0;
> @@ -645,6 +673,7 @@ static int page_server_serve(int sk)
>  			ret = page_server_add(sk, &pi);
>  			break;
>  		case PS_IOV_HOLE:
> +		case PS_IOV_ZERO:
>  			ret = page_server_hole(sk, &pi);
>  			break;
>  		case PS_IOV_FLUSH:
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index fded268..2416259 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -121,14 +121,25 @@ int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
>  	return 0;
>  }
>  
> +static void put_pagemap(struct page_read *pr)
> +{
> +	pr->curr_pme++;
> +}
> +
>  static int get_pagemap(struct page_read *pr, struct iovec *iov)
>  {
>  	PagemapEntry *pe;
>  
> -	if (pr->curr_pme >= pr->nr_pmes)
> -		return 0;
> +	for (;;) {
> +		if (pr->curr_pme >= pr->nr_pmes)
> +			return 0;
> +
> +		pe = pr->pmes[pr->curr_pme];
>  
> -	pe = pr->pmes[pr->curr_pme];
> +		if (!pe->zero)
> +			break;
> +		put_pagemap(pr);
> +	}
>  
>  	pagemap2iovec(pe, iov);
>  
> @@ -143,18 +154,13 @@ static int get_pagemap(struct page_read *pr, struct iovec *iov)
>  	return 1;
>  }
>  
> -static void put_pagemap(struct page_read *pr)
> -{
> -	pr->curr_pme++;
> -}
> -
>  static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
>  {
>  	if (!len)
>  		return;
>  
>  	pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
> -	if (!pr->pe->in_parent)
> +	if (!pr->pe->in_parent && !pr->pe->zero)
>  		pr->pi_off += len;
>  	pr->cvaddr += len;
>  }
> @@ -256,6 +262,9 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
>  			vaddr += p_nr * PAGE_SIZE;
>  			buf += p_nr * PAGE_SIZE;
>  		} while (nr);
> +	} else if (pr->pe->zero) {
> +		/* zero mappings should be skipped by get_pagemap */
> +		BUG();
>  	} else {
>  		int fd = img_raw_fd(pr->pi);
>  		off_t current_vaddr = lseek(fd, pr->pi_off, SEEK_SET);
> diff --git a/criu/stats.c b/criu/stats.c
> index 12b7d05..c01e010 100644
> --- a/criu/stats.c
> +++ b/criu/stats.c
> @@ -122,6 +122,7 @@ void write_stats(int what)
>  		ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
>  		ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
>  		ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
> +		ds_entry.pages_zero = dstats->counts[CNT_PAGES_ZERO];
>  
>  		name = "dump";
>  	} else if (what == RESTORE_STATS) {
> diff --git a/images/pagemap.proto b/images/pagemap.proto
> index e45549c..0008162 100644
> --- a/images/pagemap.proto
> +++ b/images/pagemap.proto
> @@ -10,4 +10,5 @@ message pagemap_entry {
>  	required uint64 vaddr		= 1 [(criu).hex = true];
>  	required uint32 nr_pages	= 2;
>  	optional bool	in_parent	= 3;
> +	optional bool	zero		= 4;
>  }
> diff --git a/images/stats.proto b/images/stats.proto
> index 8188766..c099eb1 100644
> --- a/images/stats.proto
> +++ b/images/stats.proto
> @@ -12,6 +12,8 @@ message dump_stats_entry {
>  	required uint64			pages_written		= 7;
>  
>  	optional uint32			irmap_resolve		= 8;
> +
> +	required uint64			pages_zero		= 9;
>  }
>  
>  message restore_stats_entry {
> 



More information about the CRIU mailing list