[CRIU] [PATCH 8/9] mem: Drain memory from parasite before dumping it into file

Pavel Emelyanov xemul at parallels.com
Fri Mar 1 11:04:06 EST 2013


Currently we dump pages directly from parasite into image files. This
is bad for several reasons:

1. We cannot use any more-or-less custom format for pages easily, since
   parasite code cannot be linked with any libraries;
2. We will not be able to optimize migration with preliminary memory
   migration (a.k.a. iterative migration) this way -- while pages are
   being sent from the parasite over the network, the task we dump
   cannot be allowed to continue running.

So what this patch does is the following: pages from the target task
are put into a page-pipe in one go, then (not in this patch) the
parasite can be released and we can do whatever we want with the
pages. For now the pages are just spliced from the pipe into the image
file.

Some numbers:
In order to drain 1GB of memory from a task we need 1.5M of shared map
in args (for iovecs) and 4 pipes (8 descriptors), each referencing 128MB
of pages, which in turn requires 4 x 640K chunks of sequential kernel
memory (for pipe_buffer). Not that big, I guess.

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>

---
 arch/x86/syscall-x86-64.def |   1 +
 include/parasite.h          |  10 +-
 parasite-syscall.c          | 163 +++++++++++++++++++++++-------
 pie/parasite.c              | 241 +++-----------------------------------------
 4 files changed, 147 insertions(+), 268 deletions(-)

diff --git a/arch/x86/syscall-x86-64.def b/arch/x86/syscall-x86-64.def
index 6e69020..9e2522b 100644
--- a/arch/x86/syscall-x86-64.def
+++ b/arch/x86/syscall-x86-64.def
@@ -75,6 +75,7 @@ __NR_restart_syscall	219		sys_restart_syscall	(void)
 __NR_exit_group		231		sys_exit_group		(int error_code)
 __NR_set_robust_list	273		sys_set_robust_list	(struct robust_list_head *head, size_t len)
 __NR_get_robust_list	274		sys_get_robust_list	(int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+__NR_vmsplice		278		sys_vmsplice		(int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
 __NR_signalfd4		289		sys_signalfd		(int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
 __NR_rt_tgsigqueueinfo	297		sys_rt_tgsigqueueinfo	(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
 __NR_fanotify_init	300		sys_fanotify_init	(unsigned int flags, unsigned int event_f_flags)
diff --git a/include/parasite.h b/include/parasite.h
index 8badb5a..62c4179 100644
--- a/include/parasite.h
+++ b/include/parasite.h
@@ -24,9 +24,7 @@ enum {
 	PARASITE_CMD_FINI,
 	PARASITE_CMD_FINI_THREAD,
 
-	PARASITE_CMD_DUMPPAGES_INIT,
 	PARASITE_CMD_DUMPPAGES,
-	PARASITE_CMD_DUMPPAGES_FINI,
 
 	PARASITE_CMD_DUMP_SIGACTS,
 	PARASITE_CMD_DUMP_ITIMERS,
@@ -55,10 +53,10 @@ struct parasite_log_args {
 };
 
 struct parasite_dump_pages_args {
-	VmaEntry		vma_entry;
-	unsigned long		nrpages_dumped;	/* how many pages are dumped */
-	unsigned long		nrpages_skipped;
-	unsigned long		nrpages_total;
+	unsigned int	off;
+	unsigned int	nr;
+	unsigned int	nr_pages;
+	struct iovec	iovs[0];
 };
 
 struct parasite_dump_sa_args {
diff --git a/parasite-syscall.c b/parasite-syscall.c
index d57fa92..1fedffc 100644
--- a/parasite-syscall.c
+++ b/parasite-syscall.c
@@ -21,6 +21,7 @@
 #include "namespaces.h"
 #include "pstree.h"
 #include "net.h"
+#include "page-pipe.h"
 
 #include <string.h>
 #include <stdlib.h>
@@ -484,18 +485,98 @@ int parasite_dump_creds(struct parasite_ctl *ctl, CredsEntry *ce)
 
 static unsigned int vmas_pagemap_size(struct vm_area_list *vmas)
 {
+	/*
+	 * In the worst case I need one iovec for half of the
+	 * pages (e.g. every odd/even)
+	 */
+
+	return sizeof(struct parasite_dump_pages_args) +
+		vmas->priv_size * sizeof(struct iovec) / 2;
+}
+
+#define PME_PRESENT	(1ULL << 63)
+#define PME_SWAP	(1ULL << 62)
+#define PME_FILE	(1ULL << 61)
+
+static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
+{
+	if (vma_entry_is(vmae, VMA_AREA_VDSO))
+		return true;
+	/*
+	 * Optimisation for private mapping pages, that haven't
+	 * yet being COW-ed
+	 */
+	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
+		return false;
+	if (pme & (PME_PRESENT | PME_SWAP))
+		return true;
+
+	return false;
+}
+
+static int generate_iovs(struct vma_area *vma, int pagemap, struct page_pipe *pp, u64 *map)
+{
+	unsigned long pfn, nr_to_scan;
+	u64 aux;
+
+	aux = vma->vma.start / PAGE_SIZE * sizeof(*map);
+	if (lseek(pagemap, aux, SEEK_SET) != aux) {
+		pr_perror("Can't rewind pagemap file");
+		return -1;
+	}
+
+	nr_to_scan = vma_area_len(vma) / PAGE_SIZE;
+	aux = nr_to_scan * sizeof(*map);
+	if (read(pagemap, map, aux) != aux) {
+		pr_perror("Can't read pagemap file");
+		return -1;
+	}
+
+	for (pfn = 0; pfn < nr_to_scan; pfn++) {
+		if (!should_dump_page(&vma->vma, map[pfn]))
+			continue;
+
+		if (page_pipe_add_page(pp, vma->vma.start + pfn * PAGE_SIZE))
+			return -1;
+	}
+
 	return 0;
 }
 
-/*
- * This routine drives parasite code (been previously injected into a victim
- * process) and tells it to dump pages into the file.
- */
+static int dump_one_page(int pipe, unsigned long addr, void *arg)
+{
+	int fd = *(int *)arg;
+	u64 iaddr;
+
+	iaddr = encode_pointer((void *)addr);
+	if (write_img(fd, &iaddr))
+		return -1;
+
+	if (splice(pipe, NULL, fd, NULL, PAGE_SIZE,
+				SPLICE_F_MOVE | SPLICE_F_NONBLOCK) != PAGE_SIZE) {
+		pr_perror("Can't splice page from page-pipe");
+		return -1;
+	}
+
+	return 0;
+}
+
+static int dump_pages_to_image(struct page_pipe *pp, struct cr_fdset *fds)
+{
+	int fd;
+
+	fd = fdset_fd(fds, CR_FD_PAGES);
+	return page_pipe_iterate_pages(pp, dump_one_page, &fd);
+}
+
 int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct vm_area_list *vma_area_list,
 			       struct cr_fdset *cr_fdset)
 {
-	struct parasite_dump_pages_args *parasite_dumppages;
-	unsigned long nrpages_dumped = 0, nrpages_skipped = 0, nrpages_total = 0;
+	struct parasite_dump_pages_args *args;
+	u64 *map;
+	int pagemap;
+	struct page_pipe *pp;
+	struct page_pipe_buf *ppb;
 	struct vma_area *vma_area;
 	int ret = -1;
 
@@ -503,51 +584,61 @@ int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct vm_area_list *vm
 	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, ctl->pid);
 	pr_info("----------------------------------------\n");
 
-	ret = parasite_send_fd(ctl, fdset_fd(cr_fdset, CR_FD_PAGES));
-	if (ret < 0)
-		goto out;
+	pr_debug("   Private vmas %lu/%lu pages\n",
+			vma_area_list->longest, vma_area_list->priv_size);
+
+	args = parasite_args_s(ctl, vmas_pagemap_size(vma_area_list));
 
-	ret = parasite_execute(PARASITE_CMD_DUMPPAGES_INIT, ctl);
-	if (ret < 0) {
-		pr_err("Dumping pages failed with %i\n", ret);
+	map = xmalloc(vma_area_list->longest * sizeof(*map));
+	if (!map)
 		goto out;
-	}
 
-	parasite_dumppages = parasite_args(ctl, struct parasite_dump_pages_args);
+	ret = pagemap = open_proc(ctl->pid, "pagemap");
+	if (ret < 0)
+		goto out_free;
+
+	ret = -1;
+	pp = create_page_pipe(vma_area_list->priv_size / 2, args->iovs);
+	if (!pp)
+		goto out_close;
 
 	list_for_each_entry(vma_area, &vma_area_list->h, list) {
-		parasite_dumppages->vma_entry = vma_area->vma;
-
 		if (!privately_dump_vma(vma_area))
 			continue;
 
-		ret = parasite_execute(PARASITE_CMD_DUMPPAGES, ctl);
-		if (ret) {
-			pr_err("Dumping pages failed with %d\n", ret);
-			goto out_fini;
-		}
+		ret = generate_iovs(vma_area, pagemap, pp, map);
+		if (ret < 0)
+			goto out_pp;
+	}
+
+	args->off = 0;
+	list_for_each_entry(ppb, &pp->bufs, l) {
+		ret = parasite_send_fd(ctl, ppb->p[1]);
+		if (ret)
+			goto out_pp;
 
-		pr_info("vma %"PRIx64"-%"PRIx64"  dumped: %lu pages %lu skipped %lu total\n",
-				vma_area->vma.start, vma_area->vma.end,
-				parasite_dumppages->nrpages_dumped,
-				parasite_dumppages->nrpages_skipped,
-				parasite_dumppages->nrpages_total);
+		args->nr = ppb->nr_segs;
+		args->nr_pages = ppb->pages_in;
+		pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
+				args->nr_pages, args->nr, ppb->pipe_size, args->off);
+
+		ret = parasite_execute(PARASITE_CMD_DUMPPAGES, ctl);
+		if (ret < 0)
+			goto out_pp;
 
-		nrpages_dumped += parasite_dumppages->nrpages_dumped;
-		nrpages_skipped += parasite_dumppages->nrpages_skipped;
-		nrpages_total += parasite_dumppages->nrpages_total;
+		args->off += args->nr;
 	}
 
-	pr_info("\n");
-	pr_info("Summary: %lu dumped %lu skipped %lu total\n",
-			nrpages_dumped, nrpages_skipped, nrpages_total);
-	ret = 0;
+	ret = dump_pages_to_image(pp, cr_fdset);
 
-out_fini:
-	parasite_execute(PARASITE_CMD_DUMPPAGES_FINI, ctl);
+out_pp:
+	destroy_page_pipe(pp);
+out_close:
+	close(pagemap);
+out_free:
+	xfree(map);
 out:
 	pr_info("----------------------------------------\n");
-
 	return ret;
 }
 
diff --git a/pie/parasite.c b/pie/parasite.c
index a109bbf..ec35a16 100644
--- a/pie/parasite.c
+++ b/pie/parasite.c
@@ -14,8 +14,6 @@
 
 #include "asm/parasite.h"
 
-static void *brk_start, *brk_end, *brk_tail;
-
 static int tsock = -1;
 
 static struct tid_state_s {
@@ -32,228 +30,28 @@ static unsigned int next_tid_state;
 
 #define thread_leader	(&tid_state[0])
 
-#define MAX_HEAP_SIZE	(10 << 20)	/* Hope 10MB will be enough...  */
-
-static int brk_init(void)
-{
-	unsigned long ret;
-	/*
-	 *  Map 10 MB. Hope this will be enough for unix skb's...
-	 */
-	ret = sys_mmap(NULL, MAX_HEAP_SIZE,
-			    PROT_READ | PROT_WRITE,
-			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (ret > TASK_SIZE)
-		return -ENOMEM;
-
-	brk_start = brk_tail = (void *)ret;
-	brk_end = brk_start + MAX_HEAP_SIZE;
-	return 0;
-}
-
-static void brk_fini(void)
-{
-	sys_munmap(brk_start, MAX_HEAP_SIZE);
-}
-
-static void *brk_alloc(unsigned long bytes)
-{
-	void *addr = NULL;
-	if (brk_end >= (brk_tail + bytes)) {
-		addr	= brk_tail;
-		brk_tail+= bytes;
-	}
-	return addr;
-}
-
-static void brk_free(unsigned long bytes)
-{
-	if (brk_start >= (brk_tail - bytes))
-		brk_tail -= bytes;
-}
-
-#define PME_PRESENT	(1ULL << 63)
-#define PME_SWAP	(1ULL << 62)
-#define PME_FILE	(1ULL << 61)
-
-static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
-{
-	if (vma_entry_is(vmae, VMA_AREA_VDSO))
-		return true;
-	/*
-	 * Optimisation for private mapping pages, that haven't
-	 * yet being COW-ed
-	 */
-	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
-		return false;
-	if (pme & (PME_PRESENT | PME_SWAP))
-		return true;
-
-	return false;
-}
-
-static int fd_pages = -1;
-static int fd_pagemap = -1;
-
-static int dump_pages_init()
-{
-	fd_pages = recv_fd(tsock);
-	if (fd_pages < 0)
-		return fd_pages;
-
-	fd_pagemap = sys_open("/proc/self/pagemap", O_RDONLY, 0);
-	if (fd_pagemap < 0) {
-		pr_err("Can't open self pagemap\n");
-		sys_close(fd_pages);
-		return fd_pagemap;
-	}
-
-	return 0;
-}
-
-static int sys_write_safe(int fd, void *buf, int size)
-{
-	int ret;
-
-	ret = sys_write(fd, buf, size);
-	if (ret < 0) {
-		pr_err("sys_write failed\n");
-		return ret;
-	}
-
-	if (ret != size) {
-		pr_err("not all data was written\n");
-		return -EIO;
-	}
-
-	return 0;
-}
+#ifndef SPLICE_F_GIFT
+#define SPLICE_F_GIFT	0x08
+#endif
 
-/*
- * This is the main page dumping routine, it's executed
- * inside a victim process space.
- */
 static int dump_pages(struct parasite_dump_pages_args *args)
 {
-	unsigned long nrpages, pfn, length;
-	unsigned long prot_old, prot_new;
-	bool bigmap = false;
-	u64 *map, off;
-	int ret = -1;
-
-	args->nrpages_dumped = 0;
-	args->nrpages_skipped = 0;
-	prot_old = prot_new = 0;
-
-	pfn = args->vma_entry.start / PAGE_SIZE;
-	nrpages	= (args->vma_entry.end - args->vma_entry.start) / PAGE_SIZE;
-	args->nrpages_total = nrpages;
-	length = nrpages * sizeof(*map);
-
-	/*
-	 * Up to 10M of pagemap will handle 5G mapping.
-	 */
-	map = brk_alloc(length);
-	if (!map) {
-		/*
-		 * Lets try allocate the bitmap inplace. If the VMA
-		 * is that big we assume the node has enough physical
-		 * memory.
-		 */
-		map = (u64 *)sys_mmap(NULL, length,
-				      PROT_READ | PROT_WRITE,
-				      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-		if ((long)(void *)map > TASK_SIZE) {
-			ret = -ENOMEM;
-			goto err;
-		}
-		bigmap = true;
-	}
-
-	off = pfn * sizeof(*map);
-	off = sys_lseek(fd_pagemap, off, SEEK_SET);
-	if (off != pfn * sizeof(*map)) {
-		pr_err("Can't seek pagemap\n");
-		ret = off;
-		goto err_free;
-	}
-
-	ret = sys_read(fd_pagemap, map, length);
-	if (ret != length) {
-		pr_err("Can't read self pagemap\n");
-		goto err_free;
-	}
-
-	/*
-	 * Try to change page protection if needed so we would
-	 * be able to dump contents.
-	 */
-	if (!(args->vma_entry.prot & PROT_READ)) {
-		prot_old = (unsigned long)args->vma_entry.prot;
-		prot_new = prot_old | PROT_READ;
-		ret = sys_mprotect(decode_pointer(args->vma_entry.start),
-				   (unsigned long)vma_entry_len(&args->vma_entry),
-				   prot_new);
-		if (ret) {
-			pr_err("sys_mprotect failed\n");
-			goto err_free;
-		}
-	}
+	int p, ret;
 
-	ret = 0;
-	for (pfn = 0; pfn < nrpages; pfn++) {
-		uint64_t vaddr;
-
-		if (should_dump_page(&args->vma_entry, map[pfn])) {
-			/*
-			 * That's the optimized write of
-			 * page_entry structure, see image.h
-			 */
-			vaddr = (unsigned long)args->vma_entry.start + pfn * PAGE_SIZE;
-
-			ret = sys_write_safe(fd_pages, &vaddr, sizeof(vaddr));
-			if (ret)
-				return ret;
-			ret = sys_write_safe(fd_pages, decode_pointer(vaddr), PAGE_SIZE);
-			if (ret)
-				return ret;
-
-			args->nrpages_dumped++;
-		} else if (map[pfn] & PME_PRESENT)
-			args->nrpages_skipped++;
-	}
+	p = recv_fd(tsock);
+	if (p < 0)
+		return -1;
 
-	/*
-	 * Don't left pages readable if they were not.
-	 */
-	if (prot_old != prot_new) {
-		ret = sys_mprotect(decode_pointer(args->vma_entry.start),
-				   (unsigned long)vma_entry_len(&args->vma_entry),
-				   prot_old);
-		if (ret) {
-			pr_err("PANIC: Ouch! sys_mprotect failed on restore\n");
-			goto err_free;
-		}
+	ret = sys_vmsplice(p, &args->iovs[args->off], args->nr,
+				SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
+	if (ret != PAGE_SIZE * args->nr_pages) {
+		sys_close(p);
+		pr_err("Can't splice pages ti pipe (%d/%d)", ret, args->nr_pages);
+		return -1;
 	}
 
-	ret = 0;
-err_free:
-	if (!bigmap)
-		brk_free(length);
-	else
-		sys_munmap(map, length);
-err:
-	return ret;
-}
-
-static int dump_pages_fini(void)
-{
-	int ret;
-
-	ret = sys_close(fd_pagemap);
-	ret |= sys_close(fd_pages);
-
-	return ret;
+	sys_close(p);
+	return 0;
 }
 
 static int dump_sigact(struct parasite_dump_sa_args *da)
@@ -436,10 +234,6 @@ static int init(struct parasite_init_args *args)
 	if (!args->nr_threads)
 		return -EINVAL;
 
-	ret = brk_init();
-	if (ret < 0)
-		return ret;
-
 	tid_state = (void *)sys_mmap(NULL, TID_STATE_SIZE(args->nr_threads),
 				     PROT_READ | PROT_WRITE,
 				     MAP_PRIVATE | MAP_ANONYMOUS,
@@ -610,7 +404,6 @@ static int fini(void)
 	sys_munmap(tid_state, TID_STATE_SIZE(nr_tid_state));
 	log_set_fd(-1);
 	sys_close(tsock);
-	brk_fini();
 
 	return ret;
 }
@@ -630,10 +423,6 @@ int __used parasite_service(unsigned int cmd, void *args)
 		return fini_thread();
 	case PARASITE_CMD_CFG_LOG:
 		return parasite_cfg_log(args);
-	case PARASITE_CMD_DUMPPAGES_INIT:
-		return dump_pages_init();
-	case PARASITE_CMD_DUMPPAGES_FINI:
-		return dump_pages_fini();
 	case PARASITE_CMD_DUMPPAGES:
 		return dump_pages(args);
 	case PARASITE_CMD_DUMP_SIGACTS:
-- 
1.7.11.7


More information about the CRIU mailing list