[CRIU] [RFC 1/2] pagemap: Introduce pagemap cache
Cyrill Gorcunov
gorcunov at openvz.org
Thu Feb 13 01:26:13 PST 2014
Pavel reported that when there is a big number
of small VMAs present in the dumpee we end up reading
/proc/pid/pagemap too frequently.

To speed up this procedure we introduce a pagemap cache.
The idea behind it is simple:

 - the cache carries PMEs covering up to 2M of the task's
   address space
 - when it is asked to fetch a PME and a cache miss is
   detected, we walk over all nearby VMAs and pre-read
   their PMEs (see the numbers below)
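With the default 4K page size one cache refill thus covers
PMC_SIZE / PAGE_SIZE = 512 pages, i.e. a single pread() of
512 * sizeof(u64) = 4096 bytes from /proc/pid/pagemap instead
of a separate small read per VMA.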
The interface is:

 - pmc_init/pmc_fini for cache initialization and freeing
 - pmc_get_pme to retrieve a specific PME from the cache

A usage sketch is shown below.
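Roughly (this is only a sketch, not part of the patch: the vma list
walk and the tests on the returned PME bits stand for whatever the
dumping code already does):

	pmc_t pmc;
	struct vma_area *vma;
	unsigned long addr;
	u64 pme;

	if (pmc_init(&pmc, pid, vma_head))
		return -1;

	list_for_each_entry(vma, vma_head, list) {
		for (addr = vma->e->start; addr < vma->e->end; addr += PAGE_SIZE) {
			pme = pmc_get_pme(&pmc, addr);
			if (pme == PAGEMAP_PME_ERR)
				goto err;
			/* examine pme bits (present/swap/pfn) here */
		}
	}

	pmc_fini(&pmc);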
Reported-by: Pavel Emelyanov <xemul at parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
Makefile.crtools | 1 +
include/pagemap-cache.h | 41 +++++++++
pagemap-cache.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 258 insertions(+)
create mode 100644 include/pagemap-cache.h
create mode 100644 pagemap-cache.c
diff --git a/Makefile.crtools b/Makefile.crtools
index 374cd8570289..6effc3e0958b 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -53,6 +53,7 @@ obj-y += file-lock.o
obj-y += page-pipe.o
obj-y += page-xfer.o
obj-y += page-read.o
+obj-y += pagemap-cache.o
obj-y += kerndat.o
obj-y += stats.o
obj-y += string.o
diff --git a/include/pagemap-cache.h b/include/pagemap-cache.h
new file mode 100644
index 000000000000..21afa188a44d
--- /dev/null
+++ b/include/pagemap-cache.h
@@ -0,0 +1,41 @@
+#ifndef __CR_PAGEMAP_H__
+#define __CR_PAGEMAP_H__
+
+#include <sys/types.h>
+#include "asm/types.h"
+
+#include "list.h"
+
+struct vma_area;
+
+/* To cover up to 2M of the task's address space */
+#define PMC_SHIFT (21)
+#define PMC_SIZE (1ul << PMC_SHIFT)
+#define PMC_MASK (~(PMC_SIZE - 1))
+
+#define PAGEMAP_LEN(pages) ((pages) * sizeof(u64))
+#define PAGEMAP_PFN(addr) ((addr) / PAGE_SIZE)
+
+#define PAGEMAP_MAP_PAGES PAGEMAP_PFN(PMC_SIZE)
+#define PAGEMAP_MAP_LEN PAGEMAP_LEN(PAGEMAP_MAP_PAGES)
+#define PAGEMAP_MAP_SIZE (PAGEMAP_MAP_PAGES * PAGE_SIZE)
+
+#define PAGEMAP_PFN_OFF(addr) (PAGEMAP_PFN(addr) * sizeof(u64))
+
+#define PAGEMAP_PME_ERR ((u64)-1)
+
+typedef struct {
+ pid_t pid; /* which process it belongs to */
+ unsigned long start; /* start of area */
+ unsigned long end; /* end of area */
+ struct list_head *head; /* list of VMAs we're serving */
+ struct vma_area *last; /* last vma we've handled */
+ u64 *map; /* local buffer */
+ int fd; /* file to read PMs from */
+} pmc_t;
+
+extern int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head);
+extern u64 pmc_get_pme(pmc_t *pmc, unsigned long addr);
+extern void pmc_fini(pmc_t *pmc);
+
+#endif /* __CR_PAGEMAP_H__ */
diff --git a/pagemap-cache.c b/pagemap-cache.c
new file mode 100644
index 000000000000..36d880270166
--- /dev/null
+++ b/pagemap-cache.c
@@ -0,0 +1,216 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "pagemap-cache.h"
+#include "compiler.h"
+#include "xmalloc.h"
+#include "util.h"
+#include "log.h"
+#include "vma.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "pagemap-cache: "
+
+#ifdef PAGEMAP_CACHE_DEBUG
+# define pr_trace pr_debug
+#else
+# define pr_trace(fmt, ...)
+#endif
+
+static inline void pmc_reset(pmc_t *pmc)
+{
+ memzero(pmc, sizeof(*pmc));
+ pmc->fd = -1;
+}
+
+static inline void pmc_zap(pmc_t *pmc)
+{
+ pmc->start = pmc->end = 0;
+}
+
+int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head)
+{
+ pmc_reset(pmc);
+
+ BUG_ON(!vma_head);
+
+ pmc->pid = pid;
+ pmc->fd = open_proc(pid, "pagemap");
+ pmc->map = xmalloc(PAGEMAP_MAP_LEN);
+ pmc->head = vma_head;
+ pmc->last = list_first_entry(vma_head, struct vma_area, list);
+
+ if (!pmc->map || pmc->fd < 0) {
+ pr_err("Failed to init pagemap for %d\n", pid);
+ pmc_fini(pmc);
+ return -1;
+ }
+
+ pr_debug("created pid %d pages %zu covers %lu bytes\n",
+ pid, PAGEMAP_MAP_PAGES, PAGEMAP_MAP_SIZE);
+
+ return 0;
+}
+
+static u64 __pmc_get_pme(pmc_t *pmc, unsigned long addr)
+{
+ if (likely(pmc->start <= addr && pmc->end > addr)) {
+ pr_trace("\t__pmc_get_pme %lx %lx-%lx index %lu\n",
+ addr, pmc->start, pmc->end,
+ PAGEMAP_PFN(addr - pmc->start));
+ return pmc->map[PAGEMAP_PFN(addr - pmc->start)];
+ }
+ return PAGEMAP_PME_ERR;
+}
+
+static struct vma_area *pmc_lookup_vma(pmc_t *pmc, unsigned long addr)
+{
+ struct vma_area *v = pmc->last;
+
+ if (v->e->start <= addr && v->e->end > addr) {
+ pr_trace("pmc_lookup_vma %lx-%lx -> %lx hit\n",
+ v->e->start, v->e->end, addr);
+ return v;
+ } else {
+ if (v->e->start < addr) {
+ list_for_each_entry_continue(v, pmc->head, list) {
+ if (v->e->start <= addr && v->e->end > addr) {
+ pr_trace("pmc_lookup_vma %lx-%lx %lx slow fwd\n",
+ v->e->start, v->e->end, addr);
+ return v;
+ }
+ }
+ } else {
+ v = pmc->last;
+ list_for_each_entry_continue_reverse(v, pmc->head, list) {
+ if (v->e->start <= addr && v->e->end > addr) {
+ pr_trace("pmc_lookup_vma %lx-%lx %lx slow rwd\n",
+ v->e->start, v->e->end, addr);
+ return v;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static inline void pmc_tag_last_vma(pmc_t *pmc, struct vma_area *v)
+{
+ pr_trace("pmc_tag_last_vma %lx-%lx\n", v->e->start, v->e->end);
+ pmc->last = v;
+}
+
+static int pmc_fill_cache(pmc_t *pmc, unsigned long from)
+{
+ size_t size_map, size_chunk, nr_vmas = 0;
+ size_t size_left = PAGEMAP_MAP_SIZE;
+ struct vma_area *vma, *prev = NULL;
+ unsigned long size_cov = 0;
+ unsigned long size_gap = 0;
+
+ pmc->start = from;
+ pmc->end = from + PAGEMAP_MAP_SIZE;
+
+ pr_trace("pmc_fill_cache %lx-%lx\n", pmc->start, pmc->end);
+
+ vma = pmc_lookup_vma(pmc, from);
+ if (unlikely(!vma)) {
+ pr_err("No vma for address %p\n", (void *)from);
+ pmc_zap(pmc);
+ return -1;
+ }
+
+ /*
+ * In the worst scenario we're caching a big VMA which
+ * doesn't fit into the cache, which in turn forces us
+ * to read it in chunks over several refills.
+ */
+ size_chunk = min(vma->e->end, pmc->end) - from;
+ size_left -= size_chunk;
+ size_cov += size_chunk;
+ prev = vma;
+ nr_vmas++;
+
+ pr_trace("\tinsert %lx-%lx\n", from, from + size_chunk);
+ pmc_tag_last_vma(pmc, vma);
+
+ /*
+ * In the best scenario we can read ahead the following VMAs
+ * if they are small enough to keep filling our cache.
+ */
+ if (size_left) {
+ list_for_each_entry_continue(vma, pmc->head, list) {
+ pr_trace("\t\tattempt %lx-%lx\n", vma->e->start, vma->e->end);
+ if (vma->e->start > pmc->end ||
+ vma->e->end > pmc->end ||
+ vma->e->start < pmc->start)
+ break;
+
+ if (likely(prev))
+ size_gap += (vma->e->start - prev->e->end);
+
+ pr_trace("\tinsert %lx-%lx\n", vma->e->start, vma->e->end);
+ size_chunk = vma->e->end - vma->e->start;
+ size_left -= size_chunk;
+ size_cov += size_chunk;
+ nr_vmas++;
+
+ pmc_tag_last_vma(pmc, vma);
+
+ if (!size_left)
+ break;
+ prev = vma;
+ }
+ }
+
+ /* Cache might be partially filled */
+ pmc->end = from + size_cov;
+
+ size_map = PAGEMAP_PFN_OFF(pmc->end - pmc->start);
+ if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
+ pmc_zap(pmc);
+ pr_perror("Can't read %d's pagemap file", pmc->pid);
+ return -1;
+ }
+
+ pr_debug("nr_vmas %zu size_cov %lu size_gap %lu (%p %p)\n",
+ nr_vmas, size_cov, size_gap, (void *)pmc->start,
+ (void *)pmc->end);
+
+ return 0;
+}
+
+u64 pmc_get_pme(pmc_t *pmc, unsigned long addr)
+{
+ u64 pme;
+
+ pr_trace("pmc_get_pme %lx\n", addr);
+
+ /*
+ * Best case -- cache hit.
+ */
+ pme = __pmc_get_pme(pmc, addr);
+ if (likely(pme != PAGEMAP_PME_ERR))
+ return pme;
+
+ /*
+ * Cache miss, refill the cache.
+ */
+ if (pmc_fill_cache(pmc, addr)) {
+ pr_err("Failed to fill cache for %d (%lx)\n", pmc->pid, addr);
+ return PAGEMAP_PME_ERR;
+ }
+
+ /*
+ * After the refill this must be a cache hit.
+ */
+ return __pmc_get_pme(pmc, addr);
+}
+
+void pmc_fini(pmc_t *pmc)
+{
+ close_safe(&pmc->fd);
+ xfree(pmc->map);
+ pmc_reset(pmc);
+}
--
1.8.3.1