[CRIU] [PATCHv2 5/9] shmem: implement used and dirty pages tracking

Eugene Batalov eabatalov89 at gmail.com
Tue Dec 22 07:22:50 PST 2015


From: Fyodor Bocharov <bocharovfedor at gmail.com>

To track the usage of anon shared vma pages we create a bitmap for
each shared memory segment. Each bit in this bitmap corresponds to a
particular page in the segment: a value of 1 means the page is used,
a value of 0 means the page was never used by any dumpee process.
A page is considered used if the PME_PRESENT or PME_SWAP bit is set
for it in the pagemap of at least one dumpee process (a present page
that maps the kernel zero page is not counted as used).
Tracking page usage allows us to skip dumping unused pages entirely.

To track the dirtiness of anon shared vma pages we create a second
bitmap of the same shape. Again, each bit corresponds to a particular
page in the segment: a value of 1 means the page is dirty, a value of
0 means it is clean.
A page is considered dirty if the PME_SOFT_DIRTY bit is set for it in
the pagemap of at least one dumpee process.
Tracking dirty pages allows incremental dumps to skip used pages
whose contents haven't changed.

Signed-off-by: Fyodor Bocharov <fbocharov at yandex.ru>
Signed-off-by: Eugene Batalov <eabatalov89 at gmail.com>
---
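
[ Note: below is a minimal, self-contained sketch of the tracking
  rule described above. It is an illustration only, not CRIU code:
  the PME_* constants are redefined locally with the bit positions
  Linux uses in /proc/<pid>/pagemap, and the zero-page check that
  the real patch performs via kdat.zero_page_pfn is omitted. ]

    #include <stdint.h>
    #include <stdio.h>

    #define PME_PRESENT    (1ULL << 63)
    #define PME_SWAP       (1ULL << 62)
    #define PME_SOFT_DIRTY (1ULL << 55)

    #define BITS_PER_LONG  (sizeof(unsigned long) * 8)

    static void set_bit(unsigned long nr, unsigned long *map)
    {
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
    }

    /* Fold one vma's pagemap entries into the per-segment bitmaps;
     * page_off is the vma's page offset inside the segment. */
    static void update_pmaps(uint64_t *pagemap, unsigned long nr_pages,
                             unsigned long page_off,
                             unsigned long *pused_map,
                             unsigned long *pdirty_map)
    {
        unsigned long p;

        for (p = 0; p < nr_pages; p++) {
            if (pagemap[p] & (PME_PRESENT | PME_SWAP))
                set_bit(p + page_off, pused_map);
            if (pagemap[p] & PME_SOFT_DIRTY)
                set_bit(p + page_off, pdirty_map);
        }
    }

    int main(void)
    {
        uint64_t pagemap[4] = {
            PME_PRESENT | PME_SOFT_DIRTY, /* used and dirty     */
            PME_SWAP,                     /* used (swapped out) */
            0,                            /* never touched      */
            PME_PRESENT,                  /* used, clean        */
        };
        unsigned long used[1] = { 0 }, dirty[1] = { 0 };

        update_pmaps(pagemap, 4, 0, used, dirty);
        printf("used: %#lx dirty: %#lx\n", used[0], dirty[0]);
        /* prints "used: 0xb dirty: 0x1": pages 0, 1 and 3 need
         * dumping at all, and only page 0 needs re-dumping on an
         * incremental dump. */
        return 0;
    }

[ With both bitmaps in place, a dump pass can skip pages whose used
  bit is clear, and an incremental dump can additionally skip used
  pages whose dirty bit is clear. ]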
 shmem.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 62 insertions(+), 4 deletions(-)

diff --git a/shmem.c b/shmem.c
index dcbe19f..c5ab16e 100644
--- a/shmem.c
+++ b/shmem.c
@@ -283,6 +283,8 @@ struct shmem_info_dump {
 	unsigned long	start;
 	unsigned long	end;
 	int		pid;
+	unsigned long	*pdirty_map;
+	unsigned long	*pused_map;
 
 	struct shmem_info_dump *next;
 };
@@ -290,6 +292,55 @@ struct shmem_info_dump {
 #define SHMEM_HASH_SIZE	32
 static struct shmem_info_dump *shmems_hash[SHMEM_HASH_SIZE];
 
+#define BLOCKS_CNT(size, block_size) (((size) + (block_size) - 1) / (block_size))
+
+static int expand_shmem(struct shmem_info_dump *si, unsigned long new_size)
+{
+	unsigned long nr_pages, nr_map_items, map_size,
+				nr_new_map_items, new_map_size;
+
+	nr_pages = BLOCKS_CNT(si->size, PAGE_SIZE);
+	nr_map_items = BLOCKS_CNT(nr_pages, sizeof(*si->pdirty_map) * 8);
+	map_size = nr_map_items * sizeof(*si->pdirty_map);
+
+	nr_pages = BLOCKS_CNT(new_size, PAGE_SIZE);
+	nr_new_map_items = BLOCKS_CNT(nr_pages, sizeof(*si->pdirty_map) * 8);
+	new_map_size = nr_new_map_items * sizeof(*si->pdirty_map);
+
+	BUG_ON(new_map_size < map_size);
+
+	si->pdirty_map = xrealloc(si->pdirty_map, new_map_size);
+	if (!si->pdirty_map)
+		return -1;
+	memzero(si->pdirty_map + nr_map_items, new_map_size - map_size);
+
+	si->pused_map = xrealloc(si->pused_map, new_map_size);
+	if (!si->pused_map)
+		return -1;
+	memzero(si->pused_map + nr_map_items, new_map_size - map_size);
+
+	si->size = new_size;
+	return 0;
+}
+
+static void update_shmem_pmaps(struct shmem_info_dump *si, u64 *map,
+		unsigned long off)
+{
+	unsigned long p, pcount, poff;
+
+	pcount = BLOCKS_CNT(si->size - off, PAGE_SIZE);
+	poff = BLOCKS_CNT(off, PAGE_SIZE);
+	for (p = 0; p < pcount; ++p) {
+		if (map[p] & PME_SOFT_DIRTY)
+			set_bit(p + poff, si->pdirty_map);
+		if (map[p] & PME_SWAP)
+			set_bit(p + poff, si->pused_map);
+		else if ((map[p] & PME_PRESENT) &&
+				((map[p] & PME_PFRAME_MASK) != kdat.zero_page_pfn))
+			set_bit(p + poff, si->pused_map);
+	}
+}
+
 static struct shmem_info_dump *shmem_find(struct shmem_info_dump **chain,
 		unsigned long shmid)
 {
@@ -306,13 +357,16 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
 {
 	struct shmem_info_dump *si, **chain;
 	unsigned long size = vma->pgoff + (vma->end - vma->start);
-	(void)map;
 
 	chain = &shmems_hash[vma->shmid % SHMEM_HASH_SIZE];
 	si = shmem_find(chain, vma->shmid);
 	if (si) {
-		if (si->size < size)
-			si->size = size;
+		if (si->size < size) {
+			if (expand_shmem(si, size))
+				return -1;
+		}
+		update_shmem_pmaps(si, map, vma->pgoff);
+
 		return 0;
 	}
 
@@ -323,12 +377,16 @@ int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map)
 	si->next = *chain;
 	*chain = si;
 
-	si->size = size;
+	si->size = 0;
 	si->pid = pid;
 	si->start = vma->start;
 	si->end = vma->end;
 	si->shmid = vma->shmid;
 
+	if (expand_shmem(si, size))
+		return -1;
+	update_shmem_pmaps(si, map, vma->pgoff);
+
 	return 0;
 }
 
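[ Note: the expand_shmem() growth path above can be exercised in
  isolation. The sketch below is an illustration only, not CRIU
  code: plain realloc/memset stand in for xrealloc/memzero. Sizes
  round up to whole pages and then to whole unsigned-long map words,
  and only the newly appended words are zeroed, so bits already
  collected from earlier vmas survive the expansion. ]

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE 4096UL
    #define BLOCKS_CNT(size, block) (((size) + (block) - 1) / (block))

    struct seg {
        unsigned long size;       /* bytes covered so far */
        unsigned long *pused_map; /* one bit per page     */
    };

    static int expand_maps(struct seg *s, unsigned long new_size)
    {
        unsigned long items, new_items;

        items = BLOCKS_CNT(BLOCKS_CNT(s->size, PAGE_SIZE),
                           sizeof(*s->pused_map) * 8);
        new_items = BLOCKS_CNT(BLOCKS_CNT(new_size, PAGE_SIZE),
                               sizeof(*s->pused_map) * 8);

        s->pused_map = realloc(s->pused_map,
                               new_items * sizeof(*s->pused_map));
        if (!s->pused_map)
            return -1;
        /* zero only the tail; existing bits stay intact */
        memset(s->pused_map + items, 0,
               (new_items - items) * sizeof(*s->pused_map));
        s->size = new_size;
        return 0;
    }

    int main(void)
    {
        struct seg s = { 0, NULL };

        /* first vma covers 10 pages, a later one extends to 100 */
        if (expand_maps(&s, 10 * PAGE_SIZE) ||
            expand_maps(&s, 100 * PAGE_SIZE))
            return 1;
        printf("%lu pages tracked in %lu map words\n",
               s.size / PAGE_SIZE,
               BLOCKS_CNT(s.size / PAGE_SIZE,
                          sizeof(unsigned long) * 8));
        /* prints "100 pages tracked in 2 map words" on LP64 */
        return 0;
    }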
-- 
1.9.1


