[CRIU] [RFC 1/2] pagemap: Introduce pagemap cache

Cyrill Gorcunov gorcunov at openvz.org
Thu Feb 13 01:26:13 PST 2014


Pavel reported that when the dumpee has a big number
of small VMAs present, we end up reading
/proc/pid/pagemap too frequently.

To speed up this procedure, we introduce a pagemap cache.
The idea behind it is simple:

 - the cache can carry PMEs which cover up to 2M of
   physical memory (see the sizing note below)
 - when a PME is requested and the cache misses, we walk
   over all nearby VMAs and pre-read their PMEs
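
For scale, assuming 4 KiB pages (the common case, not something
the patch depends on being stated here), the 2M window needs only
a small local buffer; the numbers follow from the macros introduced
below:

    PAGEMAP_MAP_PAGES = PMC_SIZE / PAGE_SIZE = 2M / 4K = 512 PMEs
    PAGEMAP_MAP_LEN   = PAGEMAP_MAP_PAGES * sizeof(u64) = 4096 bytes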

The interface is (a short usage sketch follows the list):
 - pmc_init/pmc_fini for cache initialization and freeing
 - pmc_get_pme to retrieve a specific PME from the cache
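
For illustration, a minimal sketch of how a dump loop could drive
the cache; it assumes the caller already holds the dumpee's VMA
list (pid and vma_area_list here are illustrative placeholders,
not names introduced by this patch):

	pmc_t pmc;
	struct vma_area *vma;
	unsigned long addr;
	u64 pme;

	if (pmc_init(&pmc, pid, &vma_area_list) < 0)
		return -1;

	list_for_each_entry(vma, &vma_area_list, list) {
		for (addr = vma->e->start; addr < vma->e->end; addr += PAGE_SIZE) {
			/* Cache hit is cheap; a miss refills the 2M window. */
			pme = pmc_get_pme(&pmc, addr);
			if (pme == PAGEMAP_PME_ERR)
				goto err;
			/* inspect the PME bits to decide how to dump the page */
		}
	}

	pmc_fini(&pmc);
	return 0;
err:
	pmc_fini(&pmc);
	return -1;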

Reported-by: Pavel Emelyanov <xemul at parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 Makefile.crtools        |   1 +
 include/pagemap-cache.h |  41 +++++++++
 pagemap-cache.c         | 216 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 include/pagemap-cache.h
 create mode 100644 pagemap-cache.c

diff --git a/Makefile.crtools b/Makefile.crtools
index 374cd8570289..6effc3e0958b 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -53,6 +53,7 @@ obj-y	+= file-lock.o
 obj-y	+= page-pipe.o
 obj-y	+= page-xfer.o
 obj-y	+= page-read.o
+obj-y	+= pagemap-cache.o
 obj-y	+= kerndat.o
 obj-y	+= stats.o
 obj-y	+= string.o
diff --git a/include/pagemap-cache.h b/include/pagemap-cache.h
new file mode 100644
index 000000000000..21afa188a44d
--- /dev/null
+++ b/include/pagemap-cache.h
@@ -0,0 +1,41 @@
+#ifndef __CR_PAGEMAP_H__
+#define __CR_PAGEMAP_H__
+
+#include <sys/types.h>
+#include "asm/types.h"
+
+#include "list.h"
+
+struct vma_area;
+
+/* To carry up to 2M of physical memory */
+#define PMC_SHIFT		(21)
+#define PMC_SIZE		(1ul << PMC_SHIFT)
+#define PMC_MASK		(~(PMC_SIZE - 1))
+
+#define PAGEMAP_LEN(pages)	((pages) * sizeof(u64))
+#define	PAGEMAP_PFN(addr)	((addr) / PAGE_SIZE)
+
+#define PAGEMAP_MAP_PAGES	PAGEMAP_PFN(PMC_SIZE)
+#define PAGEMAP_MAP_LEN		PAGEMAP_LEN(PAGEMAP_MAP_PAGES)
+#define PAGEMAP_MAP_SIZE	(PAGEMAP_MAP_PAGES * PAGE_SIZE)
+
+#define PAGEMAP_PFN_OFF(addr)	(PAGEMAP_PFN(addr) * sizeof(u64))
+
+#define PAGEMAP_PME_ERR		((u64)-1)
+
+typedef struct {
+	pid_t			pid;		/* which process it belongs to */
+	unsigned long		start;		/* start of area */
+	unsigned long		end;		/* end of area */
+	struct list_head	*head;		/* list of VMAs we're serving */
+	struct vma_area		*last;		/* last vma we've handled */
+	u64			*map;		/* local buffer */
+	int			fd;		/* file to read PMs from */
+} pmc_t;
+
+extern int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head);
+extern u64 pmc_get_pme(pmc_t *pmc, unsigned long addr);
+extern void pmc_fini(pmc_t *pmc);
+
+#endif /* __CR_PAGEMAP_H__ */
diff --git a/pagemap-cache.c b/pagemap-cache.c
new file mode 100644
index 000000000000..36d880270166
--- /dev/null
+++ b/pagemap-cache.c
@@ -0,0 +1,216 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "pagemap-cache.h"
+#include "compiler.h"
+#include "xmalloc.h"
+#include "util.h"
+#include "log.h"
+#include "vma.h"
+
+#undef	LOG_PREFIX
+#define LOG_PREFIX "pagemap-cache: "
+
+#ifdef PAGEMAP_CACHE_DEBUG
+# define pr_trace	pr_debug
+#else
+# define pr_trace(fmt, ...)
+#endif
+
+static inline void pmc_reset(pmc_t *pmc)
+{
+	memzero(pmc, sizeof(*pmc));
+	pmc->fd = -1;
+}
+
+static inline void pmc_zap(pmc_t *pmc)
+{
+	pmc->start = pmc->end = 0;
+}
+
+int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head)
+{
+	pmc_reset(pmc);
+
+	BUG_ON(!vma_head);
+
+	pmc->pid	= pid;
+	pmc->fd		= open_proc(pid, "pagemap");
+	pmc->map	= xmalloc(PAGEMAP_MAP_LEN);
+	pmc->head	= vma_head;
+	pmc->last	= list_first_entry(vma_head, struct vma_area, list);
+
+	if (!pmc->map || pmc->fd < 0) {
+		pr_err("Failed to init pagemap for %d\n", pid);
+		pmc_fini(pmc);
+		return -1;
+	}
+
+	pr_debug("created pid %d pages %zu covers %lu bytes\n",
+		 pid, PAGEMAP_MAP_PAGES, PAGEMAP_MAP_SIZE);
+
+	return 0;
+}
+
+static u64 __pmc_get_pme(pmc_t *pmc, unsigned long addr)
+{
+	if (likely(pmc->start <= addr && pmc->end > addr)) {
+		pr_trace("\t__pmc_get_pme %lx %lx-%lx index %lu\n",
+			 addr, pmc->start, pmc->end,
+			 PAGEMAP_PFN(addr - pmc->start));
+		return pmc->map[PAGEMAP_PFN(addr - pmc->start)];
+	}
+	return PAGEMAP_PME_ERR;
+}
+
+static struct vma_area *pmc_lookup_vma(pmc_t *pmc, unsigned long addr)
+{
+	struct vma_area *v = pmc->last;
+
+	if (v->e->start <= addr && v->e->end > addr) {
+		pr_trace("pmc_lookup_vma %lx-%lx -> %lx hit\n",
+			 v->e->start,  v->e->end, addr);
+		return v;
+	} else {
+		if (v->e->start < addr) {
+			list_for_each_entry_continue(v, pmc->head, list) {
+				if (v->e->start <= addr && v->e->end > addr) {
+					pr_trace("pmc_lookup_vma %lx-%lx %lx slow fwd\n",
+						 v->e->start,  v->e->end, addr);
+					return v;
+				}
+			}
+		} else {
+			v = pmc->last;
+			list_for_each_entry_continue_reverse(v, pmc->head, list) {
+				if (v->e->start <= addr && v->e->end > addr) {
+					pr_trace("pmc_lookup_vma %lx-%lx %lx slow rwd\n",
+						 v->e->start,  v->e->end, addr);
+					return v;
+				}
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static inline void pmc_tag_last_vma(pmc_t *pmc, struct vma_area *v)
+{
+	pr_trace("pmc_tag_last_vma %lx-%lx\n", v->e->start, v->e->end);
+	pmc->last = v;
+}
+
+static int pmc_fill_cache(pmc_t *pmc, unsigned long from)
+{
+	size_t size_map, size_chunk, nr_vmas = 0;
+	size_t size_left = PAGEMAP_MAP_SIZE;
+	struct vma_area *vma, *prev = NULL;
+	unsigned long size_cov = 0;
+	unsigned long size_gap = 0;
+
+	pmc->start = from;
+	pmc->end = from + PAGEMAP_MAP_SIZE;
+
+	pr_trace("pmc_fill_cache %lx-%lx\n", pmc->start, pmc->end);
+
+	vma = pmc_lookup_vma(pmc, from);
+	if (unlikely(!vma)) {
+		pr_err("No vma for address %p\n", (void *)from);
+		pmc_zap(pmc);
+		return -1;
+	}
+
+	/*
+	 * In the worst scenario we're caching a big VMA which
+	 * doesn't fit into the cache, which in turn forces us
+	 * to read it in chunks over several attempts.
+	 */
+	size_chunk = min(vma->e->end, pmc->end) - from;
+	size_left -= size_chunk;
+	size_cov += size_chunk;
+	prev = vma;
+	nr_vmas++;
+
+	pr_trace("\tinsert %lx-%lx\n", from, from + size_chunk);
+	pmc_tag_last_vma(pmc, vma);
+
+	/*
+	 * In the best scenario we can read ahead VMAs if they
+	 * are small enough to continue filling our cache.
+	 */
+	if (size_left) {
+		list_for_each_entry_continue(vma, pmc->head, list) {
+			pr_trace("\t\tattempt %lx-%lx\n", vma->e->start, vma->e->end);
+			if (vma->e->start > pmc->end	||
+			    vma->e->end > pmc->end	||
+			    vma->e->start < pmc->start)
+				break;
+
+			if (likely(prev))
+				size_gap += (vma->e->start - prev->e->end);
+
+			pr_trace("\tinsert %lx-%lx\n", vma->e->start, vma->e->end);
+			size_chunk = vma->e->end - vma->e->start;
+			size_left -= size_chunk;
+			size_cov += size_chunk;
+			nr_vmas++;
+
+			pmc_tag_last_vma(pmc, vma);
+
+			if (!size_left)
+				break;
+			prev = vma;
+		}
+	}
+
+	/* Cache might be partially filled */
+	pmc->end = from + size_cov;
+
+	size_map = PAGEMAP_PFN_OFF(pmc->end - pmc->start);
+	if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
+		pmc_zap(pmc);
+		pr_perror("Can't read %d's pagemap file", pmc->pid);
+		return -1;
+	}
+
+	pr_debug("nr_vmas %zu size_cov %lu size_gap %lu (%p %p)\n",
+		 nr_vmas, size_cov, size_gap, (void *)pmc->start,
+		 (void *)pmc->end);
+
+	return 0;
+}
+
+u64 pmc_get_pme(pmc_t *pmc, unsigned long addr)
+{
+	u64 pme;
+
+	pr_trace("pmc_get_pme %lx\n", addr);
+
+	/*
+	 * Best case -- cache hit.
+	 */
+	pme = __pmc_get_pme(pmc, addr);
+	if (likely(pme != PAGEMAP_PME_ERR))
+		return pme;
+
+	/*
+	 * Cache miss, refill the cache.
+	 */
+	if (pmc_fill_cache(pmc, addr)) {
+		pr_err("Failed to fill cache for %d (%lx)\n", pmc->pid, addr);
+		return PAGEMAP_PME_ERR;
+	}
+
+	/*
+	 * It must be a cache hit.
+	 */
+	return __pmc_get_pme(pmc, addr);
+}
+
+void pmc_fini(pmc_t *pmc)
+{
+	close_safe(&pmc->fd);
+	xfree(pmc->map);
+	pmc_reset(pmc);
+}
-- 
1.8.3.1


