[CRIU] [RFC 1/3] pagemap-cache: Introduce engine
Cyrill Gorcunov
gorcunov at openvz.org
Thu Feb 13 12:41:11 PST 2014
Pavel reported that when the dumpee has a big number
of small VMAs we end up reading /proc/pid/pagemap
too frequently.

To speed up this procedure, introduce a pagemap cache.
The interface is:
- pmc_init/pmc_fini for cache initialization and freeing
- pmc_get_pme to retrieve a specific PME from the cache
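
For illustration, a rough caller sketch (not part of this patch): the
dump code is expected to walk the task's VMA list and query one PME per
page through the cache. The dump_task_pages name and the loop around it
are hypothetical; only the pmc_* calls and the PMC_/PAGEMAP_ macros come
from this series.

	static int dump_task_pages(pid_t pid, struct list_head *vma_head)
	{
		pmc_t pmc = PMC_INIT;
		struct vma_area *vma;
		unsigned long addr;
		u64 pme;

		if (pmc_init(&pmc, pid, vma_head, PMC_SIZE))
			return -1;

		list_for_each_entry(vma, vma_head, list) {
			for (addr = vma->e->start; addr < vma->e->end; addr += PAGE_SIZE) {
				pme = pmc_get_pme(&pmc, vma, addr);
				if (pme == PAGEMAP_PME_ERR) {
					pmc_fini(&pmc);
					return -1;
				}
				/* inspect @pme bits to decide if the page needs dumping */
			}
		}

		pmc_fini(&pmc);
		return 0;
	}
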
Reported-by: Pavel Emelyanov <xemul at parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
Makefile.crtools | 1 +
include/pagemap-cache.h | 40 +++++++++++++
pagemap-cache.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 191 insertions(+)
create mode 100644 include/pagemap-cache.h
create mode 100644 pagemap-cache.c
diff --git a/Makefile.crtools b/Makefile.crtools
index 374cd8570289..6effc3e0958b 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -53,6 +53,7 @@ obj-y += file-lock.o
obj-y += page-pipe.o
obj-y += page-xfer.o
obj-y += page-read.o
+obj-y += pagemap-cache.o
obj-y += kerndat.o
obj-y += stats.o
obj-y += string.o
diff --git a/include/pagemap-cache.h b/include/pagemap-cache.h
new file mode 100644
index 000000000000..9fe03f17d97f
--- /dev/null
+++ b/include/pagemap-cache.h
@@ -0,0 +1,40 @@
+#ifndef __CR_PAGEMAP_H__
+#define __CR_PAGEMAP_H__
+
+#include <sys/types.h>
+#include "asm/types.h"
+
+#include "list.h"
+
+struct vma_area;
+
+/* The cache covers up to 2M of a task's address space */
+#define PMC_SHIFT (21)
+#define PMC_SIZE (1ul << PMC_SHIFT)
+#define PMC_MASK (~(PMC_SIZE - 1))
+#define PMC_SIZE_GAP (PMC_SIZE / 4)
+
+#define PAGEMAP_LEN(pages) ((pages) * sizeof(u64))
+#define PAGEMAP_PFN(addr) ((addr) / PAGE_SIZE)
+
+#define PAGEMAP_PFN_OFF(addr) (PAGEMAP_PFN(addr) * sizeof(u64))
+
+#define PAGEMAP_PME_ERR ((u64)-1)
+
+typedef struct {
+ pid_t pid; /* which process it belongs to */
+ unsigned long start; /* start of area */
+ unsigned long end; /* end of area */
+ struct list_head *vma_head; /* list head of VMAs we're serving */
+ u64 *map; /* local buffer */
+ size_t map_len; /* length of a buffer */
+ int fd; /* file to read PMs from */
+} pmc_t;
+
+#define PMC_INIT (pmc_t){ }
+
+extern int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head, size_t size);
+extern u64 pmc_get_pme(pmc_t *pmc, struct vma_area *vma, unsigned long addr);
+extern void pmc_fini(pmc_t *pmc);
+
+#endif /* __CR_PAGEMAP_H__ */
diff --git a/pagemap-cache.c b/pagemap-cache.c
new file mode 100644
index 000000000000..ddeca9a12518
--- /dev/null
+++ b/pagemap-cache.c
@@ -0,0 +1,150 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "pagemap-cache.h"
+#include "compiler.h"
+#include "xmalloc.h"
+#include "util.h"
+#include "log.h"
+#include "vma.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "pagemap-cache: "
+
+static inline void pmc_reset(pmc_t *pmc)
+{
+ memzero(pmc, sizeof(*pmc));
+ pmc->fd = -1;
+}
+
+static inline void pmc_zap(pmc_t *pmc)
+{
+ pmc->start = pmc->end = 0;
+}
+
+int pmc_init(pmc_t *pmc, pid_t pid, struct list_head *vma_head, size_t size)
+{
+ size_t map_size = max(size, PMC_SIZE);
+ pmc_reset(pmc);
+
+ BUG_ON(!vma_head);
+
+ pmc->pid = pid;
+ pmc->fd = open_proc(pid, "pagemap");
+ pmc->map_len = PAGEMAP_LEN(PAGEMAP_PFN(map_size));
+ pmc->map = xmalloc(pmc->map_len);
+ pmc->vma_head = vma_head;
+
+ if (!pmc->map || pmc->fd < 0) {
+ pr_err("Failed to init pagemap for %d\n", pid);
+ pmc_fini(pmc);
+ return -1;
+ }
+
+ pr_debug("created for pid %d (takes %zu bytes)\n", pid, pmc->map_len);
+
+ return 0;
+}
+
+static u64 __pmc_get_pme(pmc_t *pmc, unsigned long addr)
+{
+ if (likely(pmc->start <= addr && pmc->end > addr))
+ return pmc->map[PAGEMAP_PFN(addr - pmc->start)];
+ return PAGEMAP_PME_ERR;
+}
+
+static int pmc_fill_cache(pmc_t *pmc, struct vma_area *vma)
+{
+ unsigned long low = vma->e->start & PMC_MASK;
+ unsigned long high = low + PMC_SIZE;
+ size_t len = vma_area_len(vma);
+ size_t size_map;
+
+ pmc->start = vma->e->start;
+ pmc->end = vma->e->end;
+
+ pr_debug("filling VMA %lx-%lx (%luK) [l:%lx h:%lx]\n",
+ vma->e->start, vma->e->end, len >> 10, low, high);
+
+ /*
+ * If we meet a small VMA, try to fill the 2M cache
+ * window at least 75% full, otherwise fall back to a plain
+ * "one VMA at a time" read. Note that VMAs must fit the
+ * cache window completely, i.e. either the whole VMA fits
+ * the window or a plain read is used.
+ *
+ * The benefit (apart from reducing the number of read()
+ * calls) is to hold the in-kernel THP lock for as short
+ * a time as possible.
+ */
+ if (len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) {
+ size_t size_cov = len;
+ size_t nr_vmas = 1;
+
+ pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
+ vma->e->start, vma->e->end, nr_vmas, size_cov);
+
+ list_for_each_entry_continue(vma, pmc->vma_head, list) {
+ if (vma->e->start > high || vma->e->end > high)
+ break;
+
+ BUG_ON(vma->e->start < low);
+ size_cov += vma_area_len(vma);
+ nr_vmas++;
+
+ pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
+ vma->e->start, vma->e->end, nr_vmas, size_cov);
+ }
+
+ if (size_cov > (PMC_SIZE - PMC_SIZE_GAP) && nr_vmas > 1) {
+ pmc->start = low;
+ pmc->end = high;
+ pr_debug("\tcache mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
+ } else
+ pr_debug("\tsimple mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
+ }
+
+ size_map = PAGEMAP_LEN(PAGEMAP_PFN(pmc->end - pmc->start));
+ BUG_ON(pmc->map_len < size_map);
+
+ if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
+ pmc_zap(pmc);
+ pr_perror("Can't read %d's pagemap file", pmc->pid);
+ return -1;
+ }
+
+ return 0;
+}
+
+u64 pmc_get_pme(pmc_t *pmc, struct vma_area *vma, unsigned long addr)
+{
+ u64 pme;
+
+ BUG_ON(addr < vma->e->start || addr > vma->e->end);
+
+ /*
+ * Best case -- cache hit.
+ */
+ pme = __pmc_get_pme(pmc, addr);
+ if (likely(pme != PAGEMAP_PME_ERR))
+ return pme;
+
+ /*
+ * Cache miss, refill the cache.
+ */
+ if (pmc_fill_cache(pmc, vma)) {
+ pr_err("Failed to fill cache for %d (%lx)\n", pmc->pid, addr);
+ return PAGEMAP_PME_ERR;
+ }
+
+ /*
+ * It must be a cache hit.
+ */
+ return __pmc_get_pme(pmc, addr);
+}
+
+void pmc_fini(pmc_t *pmc)
+{
+ close_safe(&pmc->fd);
+ xfree(pmc->map);
+ pmc_reset(pmc);
+}
--
1.8.3.1