[CRIU] [PATCH v2 4/7] lazy-pages: fix memory corruption when combining pre-dump with lazy pages

Mike Rapoport rppt at linux.vnet.ibm.com
Wed Jun 21 15:37:56 MSK 2017


On Wed, Jun 21, 2017 at 03:30:23PM +0300, Pavel Emelyanov wrote:
> On 06/21/2017 02:37 PM, Mike Rapoport wrote:
> > When we combine pre-dump with lazy pages, we populate a part of a memory
> > region with data that was saved during the pre-dump. Afterwards, the
> > region is registered with userfaultfd and we expect to get page faults for
> > the parts of the region that were not yet populated. However, khugepaged
> > collapses the pages and the page faults we would expect do not occur.
> > 
> > To mitigate this problem we temporarily disable THP for the restored
> > process, up to the point when we register all the memory regions with
> > userfaultfd.
> > 
> > https://lists.openvz.org/pipermail/criu/2017-May/037728.html
> > 
> > Reported-by: Adrian Reber <areber at redhat.com>
> > Signed-off-by: Mike Rapoport <rppt at linux.vnet.ibm.com>
> > ---
> >  criu/cr-restore.c       |  3 +++
> >  criu/include/pagemap.h  |  5 +++++
> >  criu/include/restorer.h |  1 +
> >  criu/include/rst_info.h |  2 ++
> >  criu/mem.c              | 33 ++++++++++++++++++++++++++++++++-
> >  criu/pie/restorer.c     |  8 ++++++++
> >  6 files changed, 51 insertions(+), 1 deletion(-)
> > 
> > diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> > index 41b5db3..f403127 100644
> > --- a/criu/cr-restore.c
> > +++ b/criu/cr-restore.c
> > @@ -2767,6 +2767,9 @@ static int prepare_mm(pid_t pid, struct task_restore_args *args)
> >  		goto out;
> >  
> >  	args->fd_exe_link = exe_fd;
> > +
> > +	args->has_thp_enabled = rsti(current)->has_thp_enabled;
> > +
> >  	ret = 0;
> >  out:
> >  	return ret;
> > diff --git a/criu/include/pagemap.h b/criu/include/pagemap.h
> > index d0a28b9..8007683 100644
> > --- a/criu/include/pagemap.h
> > +++ b/criu/include/pagemap.h
> > @@ -127,6 +127,11 @@ static inline unsigned long pagemap_len(PagemapEntry *pe)
> >  	return pe->nr_pages * PAGE_SIZE;
> >  }
> >  
> > +static inline bool page_read_has_parent(struct page_read *pr)
> > +{
> > +	return pr->parent != NULL;
> > +}
> > +
> >  /* Pagemap flags */
> >  #define PE_PARENT	(1 << 0)	/* pages are in parent snapshot */
> >  #define PE_LAZY		(1 << 1)	/* pages can be lazily restored */
> > diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> > index 736ba96..9595ad6 100644
> > --- a/criu/include/restorer.h
> > +++ b/criu/include/restorer.h
> > @@ -120,6 +120,7 @@ struct task_restore_args {
> >  	struct timeval			logstart;
> >  
> >  	int				uffd;
> > +	bool				has_thp_enabled;
> >  
> >  	/* threads restoration */
> >  	int				nr_threads;		/* number of threads */
> > diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
> > index 78b1f64..f9840d1 100644
> > --- a/criu/include/rst_info.h
> > +++ b/criu/include/rst_info.h
> > @@ -62,6 +62,8 @@ struct rst_info {
> >  	 */
> >  	bool			has_seccomp;
> >  
> > +	bool			has_thp_enabled;
> > +
> >  	void			*breakpoint;
> >  };
> >  
> > diff --git a/criu/mem.c b/criu/mem.c
> > index 509f668..0b38ae8 100644
> > --- a/criu/mem.c
> > +++ b/criu/mem.c
> > @@ -4,6 +4,7 @@
> >  #include <errno.h>
> >  #include <fcntl.h>
> >  #include <sys/syscall.h>
> > +#include <sys/prctl.h>
> >  
> >  #include "types.h"
> >  #include "cr_options.h"
> > @@ -27,6 +28,7 @@
> >  #include "files-reg.h"
> >  #include "pagemap-cache.h"
> >  #include "fault-injection.h"
> > +#include "prctl.h"
> >  #include <compel/compel.h>
> >  
> >  #include "protobuf.h"
> > @@ -1024,6 +1026,33 @@ err_addr:
> >  	return -1;
> >  }
> >  
> > +static int maybe_disable_thp(struct pstree_item *t, struct page_read *pr)
> > +{
> > +	int ret;
> > +
> > +	if (!(opts.lazy_pages && page_read_has_parent(pr)))
> 
> Why checking for page_read_has_parent()? I mean -- what if page read doesn't
> have parent, why do we keep THP on?

There is no need to disable it if the page read doesn't have parent. In
this case VMA will be empty until userfaultfd_register, so there would be
no pages to collapse. And, once we register the VMA with uffd, khugepaged
will skip it.
 
> > +		return 0;
> > +
> > +	if (!kdat.has_thp_disable)
> > +		pr_warn("Disabling transparent huge pages. "
> > +			"It may affect performance!\n");
> > +
> > +	/*
> > +	 * temporarily disable THP to avoid collapse of pages
> > +	 * in the areas that will be monitored by uffd
> > +	 */
> > +	ret = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0, 0);
> 
> What is get for?

We re-enable THP after the restore, at least I've intended to do so :)
If the process had THP disabled at the dump time we should keep it this
way.
 
> > +	if (unlikely(ret < 0))
> > +		return -1;
> > +	if (!ret && prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) {
> > +		pr_perror("Cannot disable THP");
> > +		return -1;
> > +	}
> > +	rsti(t)->has_thp_enabled = !ret;
> > +
> > +	return 0;
> > +}
> > +
> >  int prepare_mappings(struct pstree_item *t)
> >  {
> >  	int ret = 0;
> > @@ -1055,6 +1084,9 @@ int prepare_mappings(struct pstree_item *t)
> >  	if (ret <= 0)
> >  		return -1;
> >  
> > +	if (maybe_disable_thp(t, &pr))
> > +		return -1;
> > +
> >  	pr.advance(&pr); /* shift to the 1st iovec */
> >  
> >  	ret = premap_priv_vmas(t, vmas, &addr, &pr);
> > @@ -1196,4 +1228,3 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
> >  
> >  	return prepare_vma_ios(t, ta);
> >  }
> > -
> > diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> > index 7b9c052..28a1129 100644
> > --- a/criu/pie/restorer.c
> > +++ b/criu/pie/restorer.c
> > @@ -1272,6 +1272,14 @@ long __export_restore_task(struct task_restore_args *args)
> >  	}
> >  
> >  	if (args->uffd > -1) {
> > +		/* re-enable THP if we disabled it previously */
> > +		if (args->has_thp_enabled) {
> > +			if (sys_prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0)) {

And here should have been
sys_prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0);

;-)

> > +				pr_err("Cannot re-enable THP\n");
> > +				goto core_restore_end;
> > +			}
> > +		}
> > +
> >  		pr_debug("lazy-pages: closing uffd %d\n", args->uffd);
> >  		/*
> >  		 * All userfaultfd configuration has finished at this point.
> > 
> 



More information about the CRIU mailing list