[CRIU] [RFC 1/2] pagemap: Introduce pagemap cache

Pavel Emelyanov xemul at parallels.com
Thu Feb 13 03:40:37 PST 2014


On 02/13/2014 01:26 PM, Cyrill Gorcunov wrote:
> Pavel reported that when a big number of small VMAs
> is present in the dumpee, we end up reading
> /proc/pid/pagemap too frequently.
> 
> To speed up this procedure we introduce a pagemap
> cache. The idea behind it is simple:
> 
>  - the cache carries PMEs covering up to 2M of the
>    task's address space
>  - when a PME is requested and a cache miss occurs,
>    we walk over all nearby VMAs and pre-read their
>    PMEs
> 
> The interface is:
>  - pmc_init/pmc_fini for cache initialization and freeing
>  - pmc_get_pme to retrieve specific PME from cache
> 
> Reported-by: Pavel Emelyanov <xemul at parallels.com>
> Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
> ---
>  Makefile.crtools        |   1 +
>  include/pagemap-cache.h |  41 +++++++++
>  pagemap-cache.c         | 216 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 258 insertions(+)
>  create mode 100644 include/pagemap-cache.h
>  create mode 100644 pagemap-cache.c
> 
> diff --git a/Makefile.crtools b/Makefile.crtools
> index 374cd8570289..6effc3e0958b 100644
> --- a/Makefile.crtools
> +++ b/Makefile.crtools
> @@ -53,6 +53,7 @@ obj-y	+= file-lock.o
>  obj-y	+= page-pipe.o
>  obj-y	+= page-xfer.o
>  obj-y	+= page-read.o
> +obj-y	+= pagemap-cache.o
>  obj-y	+= kerndat.o
>  obj-y	+= stats.o
>  obj-y	+= string.o
> diff --git a/include/pagemap-cache.h b/include/pagemap-cache.h
> new file mode 100644
> index 000000000000..21afa188a44d
> --- /dev/null
> +++ b/include/pagemap-cache.h
> @@ -0,0 +1,41 @@
> +#ifndef __CR_PAGEMAP_CACHE_H__
> +#define __CR_PAGEMAP_CACHE_H__
> +
> +#include <sys/types.h>
> +#include "asm/types.h"
> +
> +#include "list.h"
> +
> +struct vma_area;
> +
> +/* To carry up to 2M of the task's address space */
> +#define PMC_SHIFT		(21)
> +#define PMC_SIZE		(1ul << PMC_SHIFT)
> +#define PMC_MASK		(~(PMC_SIZE - 1))
> +
> +#define PAGEMAP_LEN(pages)	((pages) * sizeof(u64))
> +#define PAGEMAP_PFN(addr)	((addr) / PAGE_SIZE)
> +
> +#define PAGEMAP_MAP_PAGES	PAGEMAP_PFN(PMC_SIZE)
> +#define PAGEMAP_MAP_LEN		PAGEMAP_LEN(PAGEMAP_MAP_PAGES)
> +#define PAGEMAP_MAP_SIZE	(PAGEMAP_MAP_PAGES * PAGE_SIZE)
> +
> +#define PAGEMAP_PFN_OFF(addr)	(PAGEMAP_PFN(addr) * sizeof(u64))
> +
> +#define PAGEMAP_PME_ERR		((u64)-1)
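
For reference, with 4K pages these constants work out as follows (my own
arithmetic, assuming PAGE_SIZE == 4096):

	PMC_SIZE          = 1ul << 21             = 2M
	PAGEMAP_MAP_PAGES = PMC_SIZE / PAGE_SIZE  = 512 pages
	PAGEMAP_MAP_LEN   = 512 * sizeof(u64)     = 4096 bytes

So a single page worth of pagemap entries describes the whole 2M window.
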
> +
> +typedef struct {
> +	pid_t			pid;		/* which process it belongs to */
> +	unsigned long		start;		/* start of area */
> +	unsigned long		end;		/* end of area */
> +	struct list_head	*head;		/* list of VMAs we're serving */
> +	struct vma_area		*last;		/* last vma we've handled */
> +	u64			*map;		/* local buffer */
> +	int			fd;		/* file to read PMEs from */
> +} pmc_t;
> +
> +extern int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head);
> +extern u64 pmc_get_pme(pmc_t *pmc, unsigned long addr);
> +extern void pmc_fini(pmc_t *pmc);
> +
> +#endif /* __CR_PAGEMAP_CACHE_H__ */
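
To make the intended use more concrete, I'd expect the dump code to drive
this interface roughly as in the sketch below (the list head name and the
final PME bit tests are illustrative, not taken from this patch):

	pmc_t pmc;
	struct vma_area *vma;
	unsigned long addr;
	u64 pme;

	if (pmc_init(&pmc, pid, &vma_area_list.h))
		return -1;

	list_for_each_entry(vma, &vma_area_list.h, list) {
		for (addr = vma->e->start; addr < vma->e->end; addr += PAGE_SIZE) {
			pme = pmc_get_pme(&pmc, addr);
			if (pme == PAGEMAP_PME_ERR)
				return -1;
			/* inspect pme bits (present, swap, etc.) here */
		}
	}

	pmc_fini(&pmc);
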
> diff --git a/pagemap-cache.c b/pagemap-cache.c
> new file mode 100644
> index 000000000000..36d880270166
> --- /dev/null
> +++ b/pagemap-cache.c
> @@ -0,0 +1,216 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +
> +#include "pagemap-cache.h"
> +#include "compiler.h"
> +#include "xmalloc.h"
> +#include "util.h"
> +#include "log.h"
> +#include "vma.h"
> +
> +#undef	LOG_PREFIX
> +#define LOG_PREFIX "pagemap-cache: "
> +
> +#ifdef PAGEMAP_CACHE_DEBUG
> +# define pr_trace	pr_debug
> +#else
> +# define pr_trace(fmt, ...)
> +#endif
> +
> +static inline void pmc_reset(pmc_t *pmc)
> +{
> +	memzero(pmc, sizeof(*pmc));
> +	pmc->fd = -1;
> +}
> +
> +static inline void pmc_zap(pmc_t *pmc)
> +{
> +	pmc->start = pmc->end = 0;
> +}
> +
> +int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head)
> +{
> +	pmc_reset(pmc);
> +
> +	BUG_ON(!vma_head);
> +
> +	pmc->pid	= pid;
> +	pmc->fd		= open_proc(pid, "pagemap");
> +	pmc->map	= xmalloc(PAGEMAP_MAP_LEN);
> +	pmc->head	= vma_head;
> +	pmc->last	= list_first_entry(vma_head, struct vma_area, list);
> +
> +	if (!pmc->map || pmc->fd < 0) {
> +		pr_err("Failed to init pagemap for %d\n", pid);
> +		pmc_fini(pmc);
> +		return -1;
> +	}
> +
> +	pr_debug("created pid %d pages %zu covers %lu bytes\n",
> +		 pid, PAGEMAP_MAP_PAGES, PAGEMAP_MAP_SIZE);
> +
> +	return 0;
> +}
> +
> +static u64 __pmc_get_pme(pmc_t *pmc, unsigned long addr)
> +{
> +	if (likely(pmc->start <= addr && pmc->end > addr)) {
> +		pr_trace("\t__pmc_get_pme %lx %lx-%lx index %lu\n",
> +			 addr, pmc->start, pmc->end,
> +			 PAGEMAP_PFN(addr - pmc->start));
> +		return pmc->map[PAGEMAP_PFN(addr - pmc->start)];
> +	}
> +	return PAGEMAP_PME_ERR;
> +}
> +
> +static struct vma_area *pmc_lookup_vma(pmc_t *pmc, unsigned long addr)
> +{
> +	struct vma_area *v = pmc->last;
> +
> +	if (v->e->start <= addr && v->e->end > addr) {
> +		pr_trace("pmc_lookup_vma %lx-%lx -> %lx hit\n",
> +			 v->e->start,  v->e->end, addr);
> +		return v;
> +	} else {
> +		if (v->e->start < addr) {
> +			list_for_each_entry_continue(v, pmc->head, list) {
> +				if (v->e->start <= addr && v->e->end > addr) {
> +					pr_trace("pmc_lookup_vma %lx-%lx %lx slow fwd\n",
> +						 v->e->start,  v->e->end, addr);
> +					return v;
> +				}
> +			}
> +		} else {
> +			v = pmc->last;
> +			list_for_each_entry_continue_reverse(v, pmc->head, list) {
> +				if (v->e->start <= addr && v->e->end > addr) {
> +					pr_trace("pmc_lookup_vma %lx-%lx %lx slow rwd\n",
> +						 v->e->start,  v->e->end, addr);
> +					return v;
> +				}
> +			}
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline void pmc_tag_last_vma(pmc_t *pmc, struct vma_area *v)
> +{
> +	pr_trace("pmc_tag_last_vma %lx-%lx\n", v->e->start, v->e->end);
> +	pmc->last = v;
> +}
> +
> +static int pmc_fill_cache(pmc_t *pmc, unsigned long from)
> +{
> +	size_t size_map, size_chunk, nr_vmas = 0;
> +	size_t size_left = PAGEMAP_MAP_SIZE;
> +	struct vma_area *vma, *prev = NULL;
> +	unsigned long size_cov = 0;
> +	unsigned long size_gap = 0;
> +
> +	pmc->start = from;

The from argument should be a 2MB-aligned value.

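One way to enforce that inside pmc_fill_cache() would be to round from
down with the PMC_MASK already defined in the header, e.g.:

	pmc->start = from & PMC_MASK;
	pmc->end = pmc->start + PAGEMAP_MAP_SIZE;
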
> +	pmc->end = from + PAGEMAP_MAP_SIZE;
> +
> +	pr_trace("pmc_fill_cache %lx-%lx\n", pmc->start, pmc->end);
> +
> +	vma = pmc_lookup_vma(pmc, from);

The vma could be passed down by the caller, and the pmc API would then look like

u64 pagemap_read(unsigned long vaddr, struct vma_area *vma);

where vma is the area from which to continue caching in case of a cache miss.

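Applied to this patch, that would turn pmc_get_pme() into something like the
sketch below, where pmc_fill_cache_from() is a hypothetical variant of
pmc_fill_cache() that starts walking from the vma the caller passed in
instead of calling pmc_lookup_vma():

	u64 pmc_get_pme(pmc_t *pmc, unsigned long addr, struct vma_area *vma)
	{
		u64 pme;

		pme = __pmc_get_pme(pmc, addr);
		if (likely(pme != PAGEMAP_PME_ERR))
			return pme;

		/* On a miss, continue caching from the caller's vma */
		if (pmc_fill_cache_from(pmc, addr, vma))
			return PAGEMAP_PME_ERR;

		return __pmc_get_pme(pmc, addr);
	}
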
> +	if (unlikely(!vma)) {
> +		pr_err("No vma for address %p\n", (void *)from);
> +		pmc_zap(pmc);
> +		return -1;
> +	}
> +
> +	/*
> +	 * In the worst case we are caching a big VMA which
> +	 * doesn't fit into the cache, which in turn forces
> +	 * us to read it in chunks over several attempts.
> +	 */
> +	size_chunk = min(vma->e->end, pmc->end) - from;
> +	size_left -= size_chunk;
> +	size_cov += size_chunk;
> +	prev = vma;
> +	nr_vmas++;
> +
> +	pr_trace("\tinsert %lx-%lx\n", from, from + size_chunk);
> +	pmc_tag_last_vma(pmc, vma);
> +
> +	/*
> +	 * In the best case we can read ahead the following VMAs
> +	 * if they are small enough to continue filling our cache.
> +	 */
> +	if (size_left) {
> +		list_for_each_entry_continue(vma, pmc->head, list) {
> +			pr_trace("\t\tattempt %lx-%lx\n", vma->e->start, vma->e->end);
> +			if (vma->e->start > pmc->end	||
> +			    vma->e->end > pmc->end	||
> +			    vma->e->start < pmc->start)
> +				break;
> +
> +			if (likely(prev))
> +				size_gap += (vma->e->start - prev->e->end);
> +
> +			pr_trace("\tinsert %lx-%lx\n", vma->e->start, vma->e->end);
> +			size_chunk = vma->e->end - vma->e->start;
> +			size_left -= size_chunk;
> +			size_cov += size_chunk;
> +			nr_vmas++;
> +
> +			pmc_tag_last_vma(pmc, vma);
> +
> +			if (!size_left)
> +				break;
> +			prev = vma;
> +		}
> +	}
> +
> +	/* Cache might be partially filled */
> +	pmc->end = from + size_cov;
> +
> +	size_map = PAGEMAP_PFN_OFF(pmc->end - pmc->start);

I don't get the arithmetic above. What is it for? You should calculate the
total length of the VMAs within the 2MB chunk you're about to read, and if
this value is less than X (where X is some estimated constant) then just
read the VMA length. The maths above seem to do more than that.

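For reference, my expansion of the macros gives

	/* PAGEMAP_PFN_OFF(len) == ((len) / PAGE_SIZE) * sizeof(u64) */
	size_map = ((pmc->end - pmc->start) / PAGE_SIZE) * sizeof(u64);

so unless I'm misreading it, this is eight bytes of pagemap data per page
in the start..end window, 4096 bytes for a full 2M window.
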
> +	if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
> +		pmc_zap(pmc);
> +		pr_perror("Can't read %d's pagemap file", pmc->pid);
> +		return -1;
> +	}
> +
> +	pr_debug("nr_vmas %zu size_cov %lu size_gap %lu (%p %p)\n",
> +		 nr_vmas, size_cov, size_gap, (void *)pmc->start,
> +		 (void *)pmc->end);
> +
> +	return 0;
> +}
> +
> +u64 pmc_get_pme(pmc_t *pmc, unsigned long addr)
> +{
> +	u64 pme;
> +
> +	pr_trace("pmc_get_pme %lx\n", addr);
> +
> +	/*
> +	 * Best case -- cache hit.
> +	 */
> +	pme = __pmc_get_pme(pmc, addr);
> +	if (likely(pme != PAGEMAP_PME_ERR))
> +		return pme;
> +
> +	/*
> +	 * Cache miss, refill the cache.
> +	 */
> +	if (pmc_fill_cache(pmc, addr)) {
> +		pr_err("Failed to fill cache for %d (%lx)\n", pmc->pid, addr);
> +		return PAGEMAP_PME_ERR;
> +	}
> +
> +	/*
> +	 * It must be a cache hit.
> +	 */
> +	return __pmc_get_pme(pmc, addr);
> +}
> +
> +void pmc_fini(pmc_t *pmc)
> +{
> +	close_safe(&pmc->fd);
> +	xfree(pmc->map);
> +	pmc_reset(pmc);
> +}
> 