[CRIU] [PATCH v2 2/2] ipc: Restore mprotected sysvshmems
Pavel Emelyanov
xemul at virtuozzo.com
Thu May 19 05:14:34 PDT 2016
Imagine a process has done shmget(2 pages), then shmat() then
mprotect(1 page, ro). In this case criu will dump 1 shmem
segment 2 pages long and 2 vmas 1 page each.
But on restore time we'll call shmat() for _each_ vma and the
very first one will occupy the whole 2 pages space in vm
(there's no size argument for shmat, only for shmget) thus
blocking the 2nd vma from shmat()-in again.
The solution is:
1. check that each shmem segment is attached by
the sequence of vmas that cover it w/o holes
2. shmat() only the first one
3. mprotect() all of them if needed (there are no hunks
for this step in this patch; mprotect is already
called in pie/restorer.c and does things well)
v2:
* List can contain anon shmems (caught by zdtm)
* There can be many attachments of a segment (caught by transition/ipc)
Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
---
criu/cr-restore.c | 6 +-
criu/include/shmem.h | 6 +-
criu/ipc_ns.c | 4 ++
criu/pie/restorer.c | 14 ++++-
criu/shmem.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 200 insertions(+), 4 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 0f6cdde..ceb40fb 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -72,6 +72,7 @@
#include "timerfd.h"
#include "file-lock.h"
#include "action-scripts.h"
+#include "shmem.h"
#include "aio.h"
#include "lsm.h"
#include "seccomp.h"
@@ -700,7 +701,7 @@ static int open_vmas(int pid)
vma->e->pgoff, vma->e->status);
if (vma_area_is(vma, VMA_AREA_SYSVIPC))
- ret = vma->e->shmid;
+ ret = get_sysv_shmem_fd(vma->e);
else if (vma_area_is(vma, VMA_ANON_SHARED))
ret = get_shmem_fd(pid, vma->e);
else if (vma_area_is(vma, VMA_FILE_SHARED))
@@ -939,6 +940,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
if (open_vmas(pid))
return -1;
+ if (fixup_sysv_shmems())
+ return -1;
+
if (open_cores(pid, core))
return -1;
diff --git a/criu/include/shmem.h b/criu/include/shmem.h
index 7988784..217394c 100644
--- a/criu/include/shmem.h
+++ b/criu/include/shmem.h
@@ -6,10 +6,14 @@
struct _VmaEntry;
extern int collect_shmem(int pid, struct _VmaEntry *vi);
+extern int collect_sysv_shmem(unsigned long shmid, unsigned long size);
extern void show_saved_shmems(void);
extern int get_shmem_fd(int pid, VmaEntry *vi);
-
+extern int get_sysv_shmem_fd(struct _VmaEntry *vi);
extern int cr_dump_shmem(void);
extern int add_shmem_area(pid_t pid, VmaEntry *vma);
+extern int fixup_sysv_shmems(void);
+
+#define SYSV_SHMEM_SKIP_FD (0x7fffffff)
#endif /* __CR_SHMEM_H__ */
diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c
index a383da0..19dfb72 100644
--- a/criu/ipc_ns.c
+++ b/criu/ipc_ns.c
@@ -14,6 +14,7 @@
#include "namespaces.h"
#include "sysctl.h"
#include "ipc_ns.h"
+#include "shmem.h"
#include "protobuf.h"
#include "images/ipc-var.pb-c.h"
@@ -759,6 +760,9 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
};
struct shmid_ds shmid;
+ if (collect_sysv_shmem(shm->desc->id, shm->size))
+ return -1;
+
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
if (ret < 0) {
pr_err("Failed to set desired IPC shm ID\n");
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 9249b9d..2c439a9 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -40,6 +40,7 @@
#include "images/creds.pb-c.h"
#include "images/mm.pb-c.h"
+#include "shmem.h"
#include "asm/restorer.h"
#ifndef PR_SET_PDEATHSIG
@@ -521,9 +522,18 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
int flags = vma_entry->flags | MAP_FIXED;
unsigned long addr;
- if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC))
+ if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
+ /*
+ * See comment in get_sysv_shmem_fd() for what SYSV_SHMEM_SKIP_FD
+ * means and why we check for PROT_EXEC a few lines below.
+ */
+ if (vma_entry->fd == SYSV_SHMEM_SKIP_FD)
+ return vma_entry->start;
+
+ pr_info("Attach SYSV shmem %d at %"PRIx64"\n", (int)vma_entry->fd, vma_entry->start);
return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start),
- (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY);
+ vma_entry->prot & PROT_EXEC ? 0 : SHM_RDONLY);
+ }
/*
* Restore or shared mappings are tricky, since
diff --git a/criu/shmem.c b/criu/shmem.c
index 493477e..49fa88a 100644
--- a/criu/shmem.c
+++ b/criu/shmem.c
@@ -25,6 +25,7 @@
* it's opened in cr-restor, because pgoff may be non zero
*/
struct shmem_info {
+ struct list_head l;
unsigned long shmid;
unsigned long size;
int pid;
@@ -46,10 +47,32 @@ struct shmem_info {
*/
int count; /* the number of regions */
int self_count; /* the number of regions, which belongs to "pid" */
+};
+
+/*
+ * Entries collected while creating the sysv shmem
+ * segment. As they sit in the same list with shmem_info-s
+ * their initial fields should match!
+ */
+struct shmem_sysv_info {
+ struct list_head l;
+ unsigned long shmid;
+ unsigned long size;
+ int pid;
+ struct list_head att; /* list of shmem_sysv_att-s */
+ int want_write;
+};
+
+struct shmem_sysv_att {
struct list_head l;
+ VmaEntry *first;
+ unsigned long prev_end;
};
+/* This is the "pid that will restore shmem" value for sysv */
+#define SYSVIPC_SHMEM_PID (-1)
+
/*
* This list is filled with shared objects before we fork
* any tasks. Thus the head is private (COW-ed) and the
@@ -77,6 +100,151 @@ static struct shmem_info *find_shmem_by_id(unsigned long shmid)
return NULL;
}
+int collect_sysv_shmem(unsigned long shmid, unsigned long size)
+{
+ struct shmem_sysv_info *si;
+
+ /*
+ * Tasks will not modify this object, so don't
+ * shmalloc() as we do it for anon shared mem
+ */
+ si = malloc(sizeof(*si));
+ if (!si)
+ return -1;
+
+ si->shmid = shmid;
+ si->pid = SYSVIPC_SHMEM_PID;
+ si->size = size;
+ si->want_write = 0;
+ INIT_LIST_HEAD(&si->att);
+ list_add_tail(&si->l, &shmems);
+
+ pr_info("Collected SysV shmem %lx, size %ld\n", si->shmid, si->size);
+
+ return 0;
+}
+
+int fixup_sysv_shmems(void)
+{
+ struct shmem_sysv_info *si;
+ struct shmem_sysv_att *att;
+
+ list_for_each_entry(si, &shmems, l) {
+ /* It can be anon shmem */
+ if (si->pid != SYSVIPC_SHMEM_PID)
+ continue;
+
+ list_for_each_entry(att, &si->att, l) {
+ /*
+ * Same thing is checked in get_sysv_shmem_fd() for
+ * intermediate holes.
+ */
+ if (att->first->start + si->size != att->prev_end) {
+ pr_err("Sysv shmem %lx with tail hole not supported\n", si->shmid);
+ return -1;
+ }
+
+ /*
+ * See comment in get_shmem_fd() about this PROT_EXEC
+ */
+ if (si->want_write)
+ att->first->prot |= PROT_EXEC;
+ }
+ }
+
+ return 0;
+}
+
+int get_sysv_shmem_fd(VmaEntry *vme)
+{
+ struct shmem_sysv_info *si;
+ struct shmem_sysv_att *att;
+ int ret_fd;
+
+ si = (struct shmem_sysv_info *)find_shmem_by_id(vme->shmid);
+ if (!si) {
+ pr_err("Can't find sysv shmem for %lx\n", vme->shmid);
+ return -1;
+ }
+
+ if (si->pid != SYSVIPC_SHMEM_PID) {
+ pr_err("SysV shmem vma %lx points to anon vma %lx\n",
+ vme->start, si->shmid);
+ return -1;
+ }
+
+ /*
+ * We can have a chain of VMAs belonging to the same
+ * sysv shmem segment all with different access rights
+ * (ro and rw). But single shmat() system call attaches
+ * the whole segment regardless of the actual mapping
+ * size. This can be achieved by attaching a segment
+ * and then write-protecting its parts.
+ *
+ * So, to restore this thing we note the very first
+ * area of the segment and make it restore the whole
+ * thing. All the subsequent ones will carry the sign
+ * telling the restorer to omit shmat and only do the
+ * ro protection. Yes, it may happen that some sysv
+ * shmem vma-s sit in the list (and restorer's array)
+ * for no use.
+ *
+ * Holes in between are not handled now, as well as
+ * the hole at the end (see fixup_sysv_shmems).
+ *
+ * One corner case. At shmat() time we need to know
+ * whether to create the segment rw or ro, but the
+ * first vma can have different protection. So the
+ * segment ro-ness is marked with PROT_EXEC bit in
+ * the first vma. Unfortunately, we only know this
+ * after we scan all the vmas, so this bit is set
+ * at the end in fixup_sysv_shmems().
+ */
+
+ if (vme->pgoff == 0) {
+ att = xmalloc(sizeof(*att));
+ if (!att)
+ return -1;
+
+ att->first = vme;
+ list_add(&att->l, &si->att);
+
+ ret_fd = si->shmid;
+ } else {
+ att = list_first_entry(&si->att, struct shmem_sysv_att, l);
+ if (att->prev_end != vme->start) {
+ pr_err("Sysv shmem %lx with a hole not supported\n", si->shmid);
+ return -1;
+ }
+ if (vme->pgoff != att->prev_end - att->first->start) {
+ pr_err("Sysv shmem %lx with misordered attach chunks\n", si->shmid);
+ return -1;
+ }
+
+ /*
+ * Value that doesn't (shouldn't) match with any real
+ * sysv shmem ID (thus it cannot be 0, as shmem id can)
+ * and still is not negative to prevent open_vmas() from
+ * treating it as error.
+ */
+ ret_fd = SYSV_SHMEM_SKIP_FD;
+ }
+
+ pr_info("Note 0x%"PRIx64"-0x%"PRIx64" as %lx sysvshmem\n", vme->start, vme->end, si->shmid);
+
+ att->prev_end = vme->end;
+ if (!vme->has_fdflags || vme->fdflags == O_RDWR)
+ /*
+ * We can't look at vma->prot & PROT_WRITE as all this stuff
+ * can be read-protected. If !has_fdflags these are old images
+ * and ... we have no choice other than to make it with
+ * maximum access :(
+ */
+ si->want_write = 1;
+
+ return ret_fd;
+}
+
int collect_shmem(int pid, VmaEntry *vi)
{
unsigned long size = vi->pgoff + vi->end - vi->start;
@@ -84,6 +252,10 @@ int collect_shmem(int pid, VmaEntry *vi)
si = find_shmem_by_id(vi->shmid);
if (si) {
+ if (si->pid == SYSVIPC_SHMEM_PID) {
+ pr_err("Shmem %lx already collected as SYSVIPC\n", vi->shmid);
+ return -1;
+ }
if (si->size < size)
si->size = size;
@@ -210,6 +382,8 @@ int get_shmem_fd(int pid, VmaEntry *vi)
return -1;
}
+ BUG_ON(si->pid == SYSVIPC_SHMEM_PID);
+
if (si->pid != pid)
return shmem_wait_and_open(pid, si);
--
2.5.0
More information about the CRIU
mailing list