[CRIU] [PATCH v2 2/2] ipc: Restore mprotected sysvshmems

Pavel Emelyanov xemul at virtuozzo.com
Thu May 19 05:14:34 PDT 2016


Imagine a process has done shmget(2 pages), then shmat(), then
mprotect(1 page, ro). In this case criu will dump one shmem
segment 2 pages long and two vmas 1 page each.

But at restore time we'd call shmat() for _each_ vma, and the
very first call would occupy the whole 2-page range in virtual
memory (there's no size argument for shmat(), only for
shmget()), thus blocking the 2nd vma from being shmat()-ed
again.
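
For illustration, a minimal userspace sketch of the scenario
above (a hypothetical reproducer, error handling omitted):

	#include <sys/ipc.h>
	#include <sys/shm.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);

		/* One segment, 2 pages long, attached rw */
		int id = shmget(IPC_PRIVATE, 2 * page, IPC_CREAT | 0600);
		char *mem = shmat(id, NULL, 0);

		/* Write-protect the 1st page only: criu now sees
		 * two 1-page vmas backed by one 2-page segment */
		mprotect(mem, page, PROT_READ);

		/* If dumped here, a per-vma shmat() on restore
		 * maps all 2 pages on the 1st call (shmat() takes
		 * no size), leaving no room for the 2nd vma */
		pause();
		return 0;
	}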

The solution is:

1. check that each shmem segment is attached by a
   sequence of vmas that covers it without holes

2. shmat() only the first one

3. mprotect() all of them if needed (there are no hunks
   for this step in this patch, as mprotect() is already
   called in pie/restorer.c and does the job)
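
In userspace terms (continuing the hypothetical reproducer
above, with the same id/page), steps 2 and 3 amount to:

	/* step 2: a single shmat() brings back the whole
	 * 2-page segment, covering both vmas at once */
	char *mem = shmat(id, NULL, 0);

	/* step 3: mprotect() then recreates the ro first
	 * page, just like the original process did */
	mprotect(mem, page, PROT_READ);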

v2:

* List can contain anon shmems (caught by zdtm)
* There can be many attachments of a segment (caught by transition/ipc)

Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>

---
 criu/cr-restore.c    |   6 +-
 criu/include/shmem.h |   6 +-
 criu/ipc_ns.c        |   4 ++
 criu/pie/restorer.c  |  14 ++++-
 criu/shmem.c         | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 200 insertions(+), 4 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 0f6cdde..ceb40fb 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -72,6 +72,7 @@
 #include "timerfd.h"
 #include "file-lock.h"
 #include "action-scripts.h"
+#include "shmem.h"
 #include "aio.h"
 #include "lsm.h"
 #include "seccomp.h"
@@ -700,7 +701,7 @@ static int open_vmas(int pid)
 				vma->e->pgoff, vma->e->status);
 
 		if (vma_area_is(vma, VMA_AREA_SYSVIPC))
-			ret = vma->e->shmid;
+			ret = get_sysv_shmem_fd(vma->e);
 		else if (vma_area_is(vma, VMA_ANON_SHARED))
 			ret = get_shmem_fd(pid, vma->e);
 		else if (vma_area_is(vma, VMA_FILE_SHARED))
@@ -939,6 +940,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
 	if (open_vmas(pid))
 		return -1;
 
+	if (fixup_sysv_shmems())
+		return -1;
+
 	if (open_cores(pid, core))
 		return -1;
 
diff --git a/criu/include/shmem.h b/criu/include/shmem.h
index 7988784..217394c 100644
--- a/criu/include/shmem.h
+++ b/criu/include/shmem.h
@@ -6,10 +6,14 @@
 
 struct _VmaEntry;
 extern int collect_shmem(int pid, struct _VmaEntry *vi);
+extern int collect_sysv_shmem(unsigned long shmid, unsigned long size);
 extern void show_saved_shmems(void);
 extern int get_shmem_fd(int pid, VmaEntry *vi);
-
+extern int get_sysv_shmem_fd(struct _VmaEntry *vi);
 extern int cr_dump_shmem(void);
 extern int add_shmem_area(pid_t pid, VmaEntry *vma);
+extern int fixup_sysv_shmems(void);
+
+#define SYSV_SHMEM_SKIP_FD	(0x7fffffff)
 
 #endif /* __CR_SHMEM_H__ */
diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c
index a383da0..19dfb72 100644
--- a/criu/ipc_ns.c
+++ b/criu/ipc_ns.c
@@ -14,6 +14,7 @@
 #include "namespaces.h"
 #include "sysctl.h"
 #include "ipc_ns.h"
+#include "shmem.h"
 
 #include "protobuf.h"
 #include "images/ipc-var.pb-c.h"
@@ -759,6 +760,9 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
 	};
 	struct shmid_ds shmid;
 
+	if (collect_sysv_shmem(shm->desc->id, shm->size))
+		return -1;
+
 	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
 	if (ret < 0) {
 		pr_err("Failed to set desired IPC shm ID\n");
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 9249b9d..2c439a9 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -40,6 +40,7 @@
 #include "images/creds.pb-c.h"
 #include "images/mm.pb-c.h"
 
+#include "shmem.h"
 #include "asm/restorer.h"
 
 #ifndef PR_SET_PDEATHSIG
@@ -521,9 +522,18 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
 	int flags	= vma_entry->flags | MAP_FIXED;
 	unsigned long addr;
 
-	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC))
+	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
+		/*
+		 * See comment in get_sysv_shmem_fd() for what SYSV_SHMEM_SKIP_FD
+		 * means and why we check for PROT_EXEC a few lines below.
+		 */
+		if (vma_entry->fd == SYSV_SHMEM_SKIP_FD)
+			return vma_entry->start;
+
+		pr_info("Attach SYSV shmem %d at %"PRIx64"\n", (int)vma_entry->fd, vma_entry->start);
 		return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start),
-				 (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY);
+				vma_entry->prot & PROT_EXEC ? 0 : SHM_RDONLY);
+	}
 
 	/*
 	 * Restore or shared mappings are tricky, since
diff --git a/criu/shmem.c b/criu/shmem.c
index 493477e..49fa88a 100644
--- a/criu/shmem.c
+++ b/criu/shmem.c
@@ -25,6 +25,7 @@
  * it's opened in cr-restor, because pgoff may be non zero
  */
 struct shmem_info {
+	struct list_head l;
 	unsigned long	shmid;
 	unsigned long	size;
 	int		pid;
@@ -46,10 +47,32 @@ struct shmem_info {
 	 */
 	int		count;		/* the number of regions */
 	int		self_count;	/* the number of regions, which belongs to "pid" */
+};
+
+/*
+ * Entries collected while creating the sysv shmem
+ * segment. As they sit in the same list with shmem_info-s
+ * their initial fields should match!
+ */
+struct shmem_sysv_info {
+	struct list_head l;
+	unsigned long	shmid;
+	unsigned long	size;
+	int		pid;
 
+	struct list_head att; /* list of shmem_sysv_att-s */
+	int		want_write;
+};
+
+struct shmem_sysv_att {
 	struct list_head l;
+	VmaEntry	*first;
+	unsigned long	prev_end;
 };
 
+/* This is the "pid that will restore shmem" value for sysv */
+#define SYSVIPC_SHMEM_PID	(-1)
+
 /*
  * This list is filled with shared objects before we fork
  * any tasks. Thus the head is private (COW-ed) and the
@@ -77,6 +100,151 @@ static struct shmem_info *find_shmem_by_id(unsigned long shmid)
 	return NULL;
 }
 
+int collect_sysv_shmem(unsigned long shmid, unsigned long size)
+{
+	struct shmem_sysv_info *si;
+
+	/*
+	 * Tasks will not modify this object, so don't
+	 * shmalloc() as we do it for anon shared mem
+	 */
+	si = malloc(sizeof(*si));
+	if (!si)
+		return -1;
+
+	si->shmid = shmid;
+	si->pid = SYSVIPC_SHMEM_PID;
+	si->size = size;
+	si->want_write = 0;
+	INIT_LIST_HEAD(&si->att);
+	list_add_tail(&si->l, &shmems);
+
+	pr_info("Collected SysV shmem %lx, size %ld\n", si->shmid, si->size);
+
+	return 0;
+}
+
+int fixup_sysv_shmems(void)
+{
+	struct shmem_sysv_info *si;
+	struct shmem_sysv_att *att;
+
+	list_for_each_entry(si, &shmems, l) {
+		/* It can be anon shmem */
+		if (si->pid != SYSVIPC_SHMEM_PID)
+			continue;
+
+		list_for_each_entry(att, &si->att, l) {
+			/*
+			 * Same thing is checked in get_sysv_shmem_fd() for
+			 * intermediate holes.
+			 */
+			if (att->first->start + si->size != att->prev_end) {
+				pr_err("Sysv shmem %lx with tail hole not supported\n", si->shmid);
+				return -1;
+			}
+
+			/*
+			 * See comment in get_sysv_shmem_fd() about this PROT_EXEC
+			 */
+			if (si->want_write)
+				att->first->prot |= PROT_EXEC;
+		}
+	}
+
+	return 0;
+}
+
+int get_sysv_shmem_fd(VmaEntry *vme)
+{
+	struct shmem_sysv_info *si;
+	struct shmem_sysv_att *att;
+	int ret_fd;
+
+	si = (struct shmem_sysv_info *)find_shmem_by_id(vme->shmid);
+	if (!si) {
+		pr_err("Can't find sysv shmem for %lx\n", vme->shmid);
+		return -1;
+	}
+
+	if (si->pid != SYSVIPC_SHMEM_PID) {
+		pr_err("SysV shmem vma %lx points to anon vma %lx\n",
+				vme->start, si->shmid);
+		return -1;
+	}
+
+	/*
+	 * We can have a chain of VMAs belonging to the same
+	 * sysv shmem segment all with different access rights
+	 * (ro and rw). But single shmat() system call attaches
+	 * (ro and rw). But a single shmat() system call attaches
+	 * size. This can be achieved by attaching a segment
+	 * and then write-protecting its parts.
+	 *
+	 * So, to restore this thing we note the very first
+	 * area of the segment and make it restore the whole
+	 * thing. All the subsequent ones will carry the sign
+	 * telling the restorer to omit shmat and only do the
+	 * ro protection. Yes, it may happen that some sysv
+	 * shmem vma-s sit in the list (and restorer's array)
+	 * shmem vma-s sit in the list (and in the restorer's
+	 * array) without ever being used.
+	 * Holes in between are not handled now, as well as
+	 * the hole at the end (see fixup_sysv_shmems).
+	 *
+	 * One corner case. At shmat() time we need to know
+	 * whether to create the segment rw or ro, but the
+	 * first vma can have different protection. So the
+	 * segment ro-ness is marked with the PROT_EXEC bit in
+	 * the first vma. Unfortunately, we only know this
+	 * after we scan all the vmas, so this bit is set
+	 * at the end in fixup_sysv_shmems().
+	 */
+
+	if (vme->pgoff == 0) {
+		att = xmalloc(sizeof(*att));
+		if (!att)
+			return -1;
+
+		att->first = vme;
+		list_add(&att->l, &si->att);
+
+		ret_fd = si->shmid;
+	} else {
+		att = list_first_entry(&si->att, struct shmem_sysv_att, l);
+		if (att->prev_end != vme->start) {
+			pr_err("Sysv shmem %lx with a hole not supported\n", si->shmid);
+			return -1;
+		}
+		if (vme->pgoff != att->prev_end - att->first->start) {
+			pr_err("Sysv shmem %lx with misordered attach chunks\n", si->shmid);
+			return -1;
+		}
+
+		/*
+		 * Value that doesn't (shouldn't) match with any real
+		 * sysv shmem ID (thus it cannot be 0, as shmem id can)
+		 * and still is not negative to prevent open_vmas() from
+		 * treating it as error.
+		 */
+		ret_fd = SYSV_SHMEM_SKIP_FD;
+	}
+
+	pr_info("Note 0x%"PRIx64"-0x%"PRIx64" as %lx sysvshmem\n", vme->start, vme->end, si->shmid);
+
+	att->prev_end = vme->end;
+	if (!vme->has_fdflags || vme->fdflags == O_RDWR)
+		/*
+		 * We can't look at vma->prot & PROT_WRITE as all this stuff
+		 * can be read-protected. If !has_fdflags these are old images
+		 * and ... we have no choice other than to make it with
+		 * maximum access :(
+		 */
+		si->want_write = 1;
+
+	return ret_fd;
+}
+
 int collect_shmem(int pid, VmaEntry *vi)
 {
 	unsigned long size = vi->pgoff + vi->end - vi->start;
@@ -84,6 +252,10 @@ int collect_shmem(int pid, VmaEntry *vi)
 
 	si = find_shmem_by_id(vi->shmid);
 	if (si) {
+		if (si->pid == SYSVIPC_SHMEM_PID) {
+			pr_err("Shmem %lx already collected as SYSVIPC\n", vi->shmid);
+			return -1;
+		}
 
 		if (si->size < size)
 			si->size = size;
@@ -210,6 +382,8 @@ int get_shmem_fd(int pid, VmaEntry *vi)
 		return -1;
 	}
 
+	BUG_ON(si->pid == SYSVIPC_SHMEM_PID);
+
 	if (si->pid != pid)
 		return shmem_wait_and_open(pid, si);
 
-- 
2.5.0


