[CRIU] [PATCH 1/2] pagemap: Introduce pagemap cache

Pavel Emelyanov xemul at parallels.com
Wed Feb 12 02:01:43 PST 2014


On 02/11/2014 11:08 PM, Cyrill Gorcunov wrote:
> 
> Pavel reported that in case there is a big number
> of small vmas present in the dumpee, we're reading
> /proc/pid/pagemap too frequently.
> 
> To speed up this procedure we introduce a pagemap cache.
> The idea behind it is simple:
> 
>  - the cache can carry PMEs which cover at least 2M of
>    physical memory (or, if the longest VMA is bigger than
>    2M, we take its size for the cache instead)
>  - when a PME is requested and a cache miss occurs, we walk
>    over all nearby VMAs and pre-read their PMEs
> 
> The interface is:
>  - pmc_init/pmc_fini for cache initialization and freeing
>  - pmc_get_pfn_index to retrieve specific PME from cache
> 
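
For illustration, a minimal usage sketch of the proposed interface
(the caller and the biggest_vma_len value are hypothetical, not part
of the patch):

	pmc_t pmc;
	size_t index;

	/* Size the cache for the longest VMA we expect to see. */
	if (pmc_init(&pmc, pid, PAGEMAP_PFN(biggest_vma_len)))
		return -1;

	/* Fetch the PME index for an address inside @vma. */
	if (!pmc_get_pfn_index(&pmc, vma, addr, &index))
		pr_debug("PME for %lx: %llx\n", addr,
			 (unsigned long long)pmc.map[index].pme);

	pmc_fini(&pmc);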
> Reported-by: Pavel Emelyanov <xemul at parallels.com>
> Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
> ---
>  Makefile.crtools  |   1 +
>  include/pagemap.h |  33 ++++++++++++++
>  pagemap.c         | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 160 insertions(+)
>  create mode 100644 include/pagemap.h
>  create mode 100644 pagemap.c
> 


> diff --git a/Makefile.crtools b/Makefile.crtools
> index 374cd8570289..4aefd7bec278 100644
> --- a/Makefile.crtools
> +++ b/Makefile.crtools
> @@ -53,6 +53,7 @@ obj-y	+= file-lock.o
>  obj-y	+= page-pipe.o
>  obj-y	+= page-xfer.o
>  obj-y	+= page-read.o
> +obj-y	+= pagemap.o

Please name this pagemap-cache.o.

>  obj-y	+= kerndat.o
>  obj-y	+= stats.o
>  obj-y	+= string.o
> diff --git a/include/pagemap.h b/include/pagemap.h
> new file mode 100644
> index 000000000000..d5a581918951
> --- /dev/null
> +++ b/include/pagemap.h
> @@ -0,0 +1,33 @@
> +#ifndef __CR_PAGEMAP_H__
> +#define __CR_PAGEMAP_H__
> +
> +#include <sys/types.h>
> +#include "asm/types.h"
> +
> +struct vma_area;
> +
> +typedef struct {
> +	u64		pme;
> +} pagemap_entry_t;

All the existing code operates with plain u64. Please do the same.
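
For illustration, a sketch of the plain-u64 variant (not the final
patch, just the shape being asked for):

	/* Operate on raw u64 PMEs, no wrapper type. */
	#define PAGEMAP_SIZE(pages)	((pages) * sizeof(u64))
	#define PAGEMAP_PFN_OFF(addr)	(PAGEMAP_PFN(addr) * sizeof(u64))

	typedef struct {
		pid_t		pid;		/* which process it belongs to */
		unsigned long	start;		/* start of cached area */
		unsigned long	end;		/* end of cached area */
		u64		*map;		/* local buffer of raw PMEs */
		size_t		nr_pages;	/* number of PMEs map holds */
		int		fd;		/* file to read PMEs from */
	} pmc_t;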

> +
> +/* To carry up to 2M of physical memory */
> +#define PAGEMAP_MAX_AREA	(2ul << 20)
> +#define	PAGEMAP_PFN(addr)	((addr) / PAGE_SIZE)
> +#define PAGEMAP_MAX_PAGES	PAGEMAP_PFN(PAGEMAP_MAX_AREA)
> +#define PAGEMAP_SIZE(pages)	((pages) * sizeof(pagemap_entry_t))
> +#define PAGEMAP_PFN_OFF(addr)	(PAGEMAP_PFN(addr) * sizeof(pagemap_entry_t))
> +
> +typedef struct {
> +	pid_t		pid;				/* which process it belongs */
> +	unsigned long	start;				/* start of area */
> +	unsigned long	end;				/* end of area */
> +	pagemap_entry_t	*map;				/* local buffer */
> +	size_t		nr_pages;			/* map allocated for */
> +	int		fd;				/* file to read PMs from */
> +} pmc_t;
> +
> +extern int pmc_init(pmc_t *pmc, pid_t pid, unsigned long max_pages);
> +extern int pmc_get_pfn_index(pmc_t *pmc, struct vma_area *vma, unsigned long addr, size_t *index);

The pagemap cache should return the pagemap entry itself, not an index
under which the caller has to look up the desired value somewhere else.
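
For illustration, a sketch of an interface shaped that way (the name
pmc_get_pme is hypothetical):

	/* Hand the PME itself back; 0 on success, -1 on error. */
	extern int pmc_get_pme(pmc_t *pmc, struct vma_area *vma,
			       unsigned long addr, u64 *pme);

A cache hit would then reduce to:

	*pme = pmc->map[PAGEMAP_PFN(addr - pmc->start)].pme;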

> +extern void pmc_fini(pmc_t *pmc);
> +
> +#endif /* __CR_PAGEMAP_H__ */
> diff --git a/pagemap.c b/pagemap.c
> new file mode 100644
> index 000000000000..9f5b646f7ff0
> --- /dev/null
> +++ b/pagemap.c
> @@ -0,0 +1,126 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +
> +#include "compiler.h"
> +#include "xmalloc.h"
> +#include "pagemap.h"
> +#include "util.h"
> +#include "log.h"
> +#include "vma.h"
> +
> +#undef	LOG_PREFIX
> +#define LOG_PREFIX "pagemap: "
> +
> +static inline void pmc_reset(pmc_t *pmc)
> +{
> +	memzero(pmc, sizeof(*pmc));
> +	pmc->fd = -1;
> +}
> +
> +int pmc_init(pmc_t *pmc, pid_t pid, unsigned long max_pages)
> +{
> +	pmc_reset(pmc);
> +
> +	pmc->nr_pages	= max(max_pages, PAGEMAP_MAX_PAGES);

2MB granularity is more than enough. Can you show numbers proving the opposite?
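
I.e., a sketch of the simplification being suggested -- pin the cache
at 2M worth of PMEs so the max_pages argument can go away and the
signature becomes int pmc_init(pmc_t *pmc, pid_t pid):

	/* Always cover exactly PAGEMAP_MAX_AREA (2M);
	 * no max_pages argument needed. */
	pmc->nr_pages = PAGEMAP_MAX_PAGES;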

> +	pmc->fd		= open_proc(pid, "pagemap");
> +	pmc->map	= xmalloc(PAGEMAP_SIZE(pmc->nr_pages));
> +	pmc->pid	= pid;
> +
> +	if (!pmc->map || pmc->fd < 0) {
> +		pr_err("Failed to init pagemap for %d\n", pid);
> +		pmc_fini(pmc);
> +		return -1;
> +	}
> +
> +	pr_debug("init pid %d pages %zu covers %lu bytes\n",
> +		 pid, pmc->nr_pages, pmc->nr_pages * PAGE_SIZE);
> +
> +	return 0;
> +}
> +
> +/* Fill the cache and return the PME index for @addr via @index */
> +int pmc_get_pfn_index(pmc_t *pmc, struct vma_area *vma, unsigned long addr, size_t *index)
> +{
> +	struct vma_area *v, *prev, *head;
> +
> +	unsigned long size_cov = 0;
> +	unsigned long size_gap = 0;
> +	size_t nr_vmas = 0;
> +	size_t len;
> +
> +	pr_debug("Cache lookup %p (pmc %p - %p)\n",
> +		 (void *)addr, (void *)pmc->start, (void *)pmc->end);
> +
> +	/*
> +	 * Cache hit -- the PMs has been read already.
> +	 */
> +	if (likely(pmc->start <= addr && pmc->end > addr)) {
> +		*index = PAGEMAP_PFN(addr - pmc->start);
> +		return 0;
> +	}
> +
> +	/*
> +	 * Cache miss -- fill the cache.
> +	 */
> +	pmc->start	= vma->e->start;
> +	pmc->end	= pmc->start + pmc->nr_pages * PAGE_SIZE;
> +	head		= container_of(vma->list.prev, struct vma_area, list);
> +	prev		= NULL;
> +
> +	pr_debug("Trying to fill cache (new pmc %p - %p)\n",
> +		 (void *)pmc->start, (void *)pmc->end);
> +
> +	list_for_each_entry(v, &head->list, list) {

This should be list_for_each_entry_continue, to avoid the problems
pointed out by Andrey.
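
I.e., a sketch (it assumes the VMA list head is made available to the
function -- here as a hypothetical vma_list_head -- which the current
signature doesn't provide):

	struct vma_area *v = vma;

	/* Account for @vma itself first... */
	size_cov = vma_area_len(vma);
	prev = vma;
	nr_vmas = 1;

	/* ...then continue from the entry following it. This avoids
	 * the container_of() on vma->list.prev above, which is bogus
	 * when @vma is the first entry on the list. */
	list_for_each_entry_continue(v, vma_list_head, list) {
		/* same boundary checks and accounting as above */
	}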

> +		pr_debug("\t%p - %p <- %p - %p\n",
> +			 (void *)pmc->start, (void *)pmc->end,
> +			 (void *)v->e->start, (void *)v->e->end);
> +		if (v->e->start > pmc->end	||
> +		    v->e->start < pmc->start	||
> +		    v->e->end > pmc->end)
> +			break;
> +
> +		size_cov += vma_area_len(v);
> +		if (!prev) {
> +			prev = v;
> +			nr_vmas++;
> +			continue;
> +		}
> +
> +		size_gap += (v->e->start - prev->e->end);
> +
> +		/*
> +		 * Gap is too big, can't continue.
> +		 */
> +		if (size_gap > size_cov) {
> +			size_gap -= (v->e->start - prev->e->end);
> +			size_cov -= vma_area_len(v);
> +			pmc->end = prev->e->end;
> +			break;
> +		}
> +
> +		prev = v;
> +		nr_vmas++;
> +	}
> +
> +	pr_debug("nr_vmas %zu size_cov %lu size_gap %lu (%p %p)\n",
> +		 nr_vmas, size_cov, size_gap, (void *)pmc->start, (void *)pmc->end);
> +	BUG_ON(!nr_vmas || !size_cov);
> +
> +	pr_debug("Fill cache (new %p - %p)\n", (void *)pmc->start, (void *)pmc->end);
> +	len = PAGEMAP_PFN_OFF(pmc->end - pmc->start);
> +	if (pread(pmc->fd, pmc->map, len, PAGEMAP_PFN_OFF(pmc->start)) != len) {
> +		pmc->start = pmc->end = 0;
> +		pr_perror("Can't read %d's pagemap file", pmc->pid);
> +		return -1;
> +	}
> +
> +	pr_debug("Cache filled (%p - %p)\n", (void *)pmc->start, (void *)pmc->end);
> +	return pmc_get_pfn_index(pmc, vma, addr, index);

No implicit one-level-only recursion, please.
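
I.e., once the cache is filled it covers @addr by construction (the
fill starts at vma->e->start and pmc_init sizes the cache to fit the
longest VMA), so a sketch of a non-recursive tail would be:

	pr_debug("Cache filled (%p - %p)\n",
		 (void *)pmc->start, (void *)pmc->end);

	/* Look the index up directly instead of
	 * re-entering the function. */
	*index = PAGEMAP_PFN(addr - pmc->start);
	return 0;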

> +}
> +
> +void pmc_fini(pmc_t *pmc)
> +{
> +	close_safe(&pmc->fd);
> +	xfree(pmc->map);
> +	pmc_reset(pmc);
> +}


