[CRIU] [PATCH 5/5] vdso: x86 -- Add handling of vvar zones
Cyrill Gorcunov
gorcunov at openvz.org
Fri Jun 6 07:07:20 PDT 2014
The new 3.16 kernel will have the old vDSO zone split into two vmas:
one for the vdso code itself and a second one, named vvar, for the data
referenced from the vdso code.
Because I can't do the 'dump' and 'restore' parts of the code separately
(otherwise the tests would fail), the commit is a pretty big one and hard
to read, so here is a detailed explanation of what's going on.
1) When dumping starts we detect the vvar zone by reading /proc/pid/smaps
and looking for the "[vvar]" token. Note the vvar zone is mapped
by the kernel with PF/IO flags, so we must not fail on those flags here.
Also it's assumed that, at least for now, the kernel won't change
much and the [vvar] zone always follows the [vdso] zone; otherwise
criu will print an error.
2) In previous commits we disabled dumping of the vvar area contents, so
the restorer code never tries to read vvar data, but we still need
to map the vvar zone, thus the vma entry remains in the image.
3) As with the previous vdso format, we might have 2 cases
a) Dump and restore happen on the same kernel
b) Dump and restore are done on different kernels
To detect which case we have, we parse the vdso data from the image,
find the symbol offsets, and compare their values with the runtime
symbols provided to us by the kernel. If they match and (!!!) the
size of the vvar zone is the same -- we simply remap both zones
from the runtime kernel into the positions the dumpee had at checkpoint
time. This is the so-called "inplace" remap (a).
If this happens, the vdso_proxify() routine drops VMA_AREA_REGULAR
from the vvar area provided by the caller code, and the restorer won't
try to handle this vma. It looks somewhat strange and probably should
be reworked, but for now I left it as is to minimize the patch.
In case of (b) we need to generate a proxy. We do that in the same
way as we did before; we just include the vvar zone in the proxy and save
the vvar proxy address inside the vdso mark injected into the vdso area.
Thus on a subsequent checkpoint we can detect the proxy vvar zone and rip
it off the list of vmas to handle.
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
arch/x86/include/asm/vdso.h | 9 ++-
arch/x86/vdso-pie.c | 72 +++++++++++++++++-----
arch/x86/vdso.c | 141 ++++++++++++++++++++++++++++++++++++--------
cr-restore.c | 4 +-
include/vdso.h | 3 +-
pie/restorer.c | 28 ++++++++-
proc_parse.c | 17 +++++-
7 files changed, 225 insertions(+), 49 deletions(-)
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index bab8456531b4..740d47c957f4 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -10,6 +10,7 @@ struct parasite_ctl;
struct vm_area_list;
#define VDSO_PROT (PROT_READ | PROT_EXEC)
+#define VVAR_PROT (PROT_READ)
#define VDSO_BAD_ADDR (-1ul)
#define VVAR_BAD_ADDR VDSO_BAD_ADDR
@@ -68,6 +69,10 @@ static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
return t->vma_end - t->vma_start;
}
+static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
+{
+ return t->vvar_end - t->vvar_start;
+}
/*
* Special mark which allows to identify runtime vdso where
* calls from proxy vdso are redirected. This mark usually
@@ -131,7 +136,9 @@ extern u64 vdso_pfn;
extern int vdso_init(void);
extern int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size);
extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
-extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt, VmaEntry *vma_entry, unsigned long vdso_rt_parked_at);
+extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ VmaEntry *vdso_vma, VmaEntry *vvar_vma,
+ unsigned long vdso_rt_parked_at);
extern int vdso_redirect_calls(void *base_to, void *base_from, struct vdso_symtable *to, struct vdso_symtable *from);
extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
diff --git a/arch/x86/vdso-pie.c b/arch/x86/vdso-pie.c
index 967108d8fea1..c1f7f123a2f6 100644
--- a/arch/x86/vdso-pie.c
+++ b/arch/x86/vdso-pie.c
@@ -15,6 +15,7 @@
#include "compiler.h"
#include "syscall.h"
+#include "image.h"
#include "vdso.h"
#include "vma.h"
#include "log.h"
@@ -261,44 +262,83 @@ int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
return 0;
}
-int vdso_proxify(char *who, struct vdso_symtable *sym_rt, VmaEntry *vma, unsigned long vdso_rt_parked_at)
+int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ VmaEntry *vdso_vma, VmaEntry *vvar_vma,
+ unsigned long vdso_rt_parked_at)
{
struct vdso_symtable s = VDSO_SYMTABLE_INIT;
- size_t size = vma_entry_len(vma);
- bool remap_rt = true;
+ bool remap_rt = false;
+
+ /*
+ * vDSO mark overwrites Elf program header of proxy vDSO thus
+ * it must never ever be greater in size.
+ */
+ BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
/*
* Find symbols in dumpee vdso.
*/
- if (vdso_fill_symtable((void *)vma->start, size, &s))
+ if (vdso_fill_symtable((void *)vdso_vma->start, vma_entry_len(vdso_vma), &s))
return -1;
- if (size == vdso_vma_size(sym_rt)) {
- int i;
+ /*
+ * Try to figure out if the vDSO in image has the same symbols
+ * as run time vDSO, if yes we might try to reuse runtime vDSO
+ * instead of one in image.
+ *
+ * In case if VVAR area is present at least it must have same
+ * size as dumped one for inplace remap.
+ */
+ if (vma_entry_len(vdso_vma) == vdso_vma_size(sym_rt)) {
+ size_t i;
for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
- if (s.symbols[i].offset != sym_rt->symbols[i].offset) {
- remap_rt = false;
+ if (s.symbols[i].offset != sym_rt->symbols[i].offset)
break;
}
+ if (i == ARRAY_SIZE(s.symbols)) {
+ remap_rt = true;
+
+ if (vvar_vma && sym_rt->vvar_start != VVAR_BAD_ADDR)
+ remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vvar_vma));
}
- } else
- remap_rt = false;
+ }
/*
* Easy case -- the vdso from image has same offsets and size
* as runtime, so we simply remap runtime vdso to dumpee position
- * without generating any proxy.
+ * without generating any proxy. Note we may remap VVAR vdso as
+ * well which might not yet been mapped by a caller code. So
+ * drop VMA_AREA_REGULAR from it and caller would not touch it
+ * anymore.
*/
if (remap_rt) {
- pr_info("Runtime vdso matches dumpee, remap inplace\n");
+ unsigned long vvar_rt_parked_at = VVAR_BAD_ADDR;
+ int ret = 0;
- if (sys_munmap((void *)vma->start, size)) {
+ pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
+
+ if (sys_munmap((void *)vdso_vma->start, vma_entry_len(vdso_vma))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
- return vdso_remap(who, vdso_rt_parked_at, vma->start, size);
+ if (vvar_vma) {
+ if (sys_munmap((void *)vvar_vma->start, vma_entry_len(vvar_vma))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+
+ vvar_rt_parked_at = ALIGN(vvar_vma_size(sym_rt), PAGE_SIZE);
+ vvar_rt_parked_at+= vdso_rt_parked_at;
+
+ ret = vdso_remap(who, vvar_rt_parked_at, vvar_vma->start, vma_entry_len(vvar_vma));
+ vvar_vma->status &= ~VMA_AREA_REGULAR;
+ }
+
+ ret |= vdso_remap(who, vdso_rt_parked_at, vdso_vma->start, vma_entry_len(vdso_vma));
+
+ return ret;
}
/*
@@ -309,7 +349,7 @@ int vdso_proxify(char *who, struct vdso_symtable *sym_rt, VmaEntry *vma, unsigne
pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
if (vdso_redirect_calls((void *)vdso_rt_parked_at,
- (void *)vma->start,
+ (void *)vdso_vma->start,
sym_rt, &s)) {
pr_err("Failed to proxify dumpee contents\n");
return -1;
@@ -321,7 +361,7 @@ int vdso_proxify(char *who, struct vdso_symtable *sym_rt, VmaEntry *vma, unsigne
* it's auto-generated every new session if proxy required.
*/
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
- vdso_put_mark((void *)vdso_rt_parked_at, vma->start, VVAR_BAD_ADDR);
+ vdso_put_mark((void *)vdso_rt_parked_at, vdso_vma->start, vvar_vma ? vvar_vma->start : VVAR_BAD_ADDR);
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
return 0;
}
diff --git a/arch/x86/vdso.c b/arch/x86/vdso.c
index 674afbed7bb6..d61a9095cc92 100644
--- a/arch/x86/vdso.c
+++ b/arch/x86/vdso.c
@@ -30,15 +30,19 @@
struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
u64 vdso_pfn = VDSO_BAD_PFN;
/*
- * Find out proxy vdso vma and drop it from the list. Also
- * fix vdso status on vmas if wrong status found.
+ * The VMAs list might have proxy vdso/vvar areas left
+ * from previous dump/restore cycle so we need to detect
+ * them and eliminated from the VMAs list, they will be
+ * generated again on restore if needed.
*/
int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
struct vm_area_list *vma_area_list)
{
- unsigned long proxy_addr = VDSO_BAD_ADDR;
+ unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
+ unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
+ struct vma_area *proxy_vdso_marked = NULL;
+ struct vma_area *proxy_vvar_marked = NULL;
struct parasite_vdso_vma_entry *args;
- struct vma_area *marked = NULL;
struct vma_area *vma;
int fd, ret = -1;
off_t off;
@@ -56,6 +60,23 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
if (vma_area_is(vma, VMA_FILE_SHARED) ||
vma_area_is(vma, VMA_FILE_PRIVATE))
continue;
+ /*
+ * It might be possible VVAR area from marked
+ * vDSO zone, we need to detect it earlier than
+ * VDSO_PROT test because VVAR_PROT is a subset
+ * of it but don't yield continue here,
+ * sigh... what a mess.
+ */
+ BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
+
+ if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
+ if (proxy_vvar_addr != VVAR_BAD_ADDR &&
+ proxy_vvar_addr == vma->e->start) {
+ BUG_ON(proxy_vvar_marked);
+ proxy_vvar_marked = vma;
+ continue;
+ }
+ }
if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
continue;
@@ -81,13 +102,16 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
}
/*
- * Defer handling marked vdso.
+ * Defer handling marked vdso until we walked over
+ * all vmas and restore potentially remapped vDSO
+ * area status.
*/
if (unlikely(args->is_marked)) {
BUG_ON(args->proxy_vdso_addr == VDSO_BAD_ADDR);
- BUG_ON(marked);
- marked = vma;
- proxy_addr = args->proxy_vdso_addr;
+ BUG_ON(proxy_vdso_marked);
+ proxy_vdso_marked = vma;
+ proxy_vdso_addr = args->proxy_vdso_addr;
+ proxy_vvar_addr = args->proxy_vvar_addr;
continue;
}
@@ -103,17 +127,21 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
BUG_ON(!pfn);
/*
- * Set proper VMA statuses.
+ * Setup proper VMA status. Note starting with 3.16
+ * the [vdso]/[vvar] marks are reported correctly
+ * even when they are remapped into a new place,
+ * but only since that particular version of the
+ * kernel!
*/
if (pfn == vdso_pfn) {
if (!vma_area_is(vma, VMA_AREA_VDSO)) {
- pr_debug("vdso: Restore status by pfn at %lx\n",
+ pr_debug("vdso: Restore vDSO status by pfn at %lx\n",
(long)vma->e->start);
vma->e->status |= VMA_AREA_VDSO;
}
} else {
- if (vma_area_is(vma, VMA_AREA_VDSO)) {
- pr_debug("vdso: Drop mishinted status at %lx\n",
+ if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
+ pr_debug("vdso: Drop mishinted vDSO status at %lx\n",
(long)vma->e->start);
vma->e->status &= ~VMA_AREA_VDSO;
}
@@ -124,27 +152,38 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
* There is marked vdso, it means such vdso is autogenerated
* and must be dropped from vma list.
*/
- if (marked) {
- pr_debug("vdso: Found marked at %lx (proxy at %lx)\n",
- (long)marked->e->start, (long)proxy_addr);
+ if (proxy_vdso_marked) {
+ pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
+ (long)proxy_vdso_marked->e->start,
+ (long)proxy_vdso_addr, (long)proxy_vvar_addr);
/*
- * Don't forget to restore the proxy vdso status, since
- * it's being not recognized by the kernel as vdso.
+ * Don't forget to restore the proxy vdso/vvar status, since
+ * it's unknown to the kernel.
*/
list_for_each_entry(vma, &vma_area_list->h, list) {
- if (vma->e->start == proxy_addr) {
+ if (vma->e->start == proxy_vdso_addr) {
vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
- pr_debug("vdso: Restore proxy status at %lx\n",
+ pr_debug("vdso: Restore proxy vDSO status at %lx\n",
+ (long)vma->e->start);
+ } else if (vma->e->start == proxy_vvar_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
+ pr_debug("vdso: Restore proxy VVAR status at %lx\n",
(long)vma->e->start);
- break;
}
}
pr_debug("vdso: Droppping marked vdso at %lx\n",
- (long)vma->e->start);
- list_del(&marked->list);
- xfree(marked);
+ (long)proxy_vdso_marked->e->start);
+ list_del(&proxy_vdso_marked->list);
+ xfree(proxy_vdso_marked);
+
+ if (proxy_vvar_marked) {
+ pr_debug("vdso: Droppping marked vvar at %lx\n",
+ (long)proxy_vvar_marked->e->start);
+ list_del(&proxy_vvar_marked->list);
+ xfree(proxy_vvar_marked);
+ }
}
ret = 0;
err:
@@ -168,24 +207,74 @@ static int vdso_fill_self_symtable(struct vdso_symtable *s)
while (fgets(buf, sizeof(buf), maps)) {
unsigned long start, end;
+ char *has_vdso, *has_vvar;
+
+ has_vdso = strstr(buf, "[vdso]");
+ if (!has_vdso)
+ has_vvar = strstr(buf, "[vvar]");
+ else
+ has_vvar = NULL;
- if (strstr(buf, "[vdso]") == NULL)
+ if (!has_vdso && !has_vvar)
continue;
ret = sscanf(buf, "%lx-%lx", &start, &end);
if (ret != 2) {
ret = -1;
- pr_err("Can't find vDSO bounds\n");
+ pr_err("Can't find vDSO/VVAR bounds\n");
break;
}
+ if (has_vdso) {
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ pr_err("Got second vDSO entry\n");
+ ret = -1;
+ goto err;
+ }
s->vma_start = start;
s->vma_end = end;
ret = vdso_fill_symtable((void *)start, end - start, s);
- break;
+ if (ret)
+ goto err;
+ } else {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ pr_err("Got second VVAR entry\n");
+ ret = -1;
+ goto err;
+ }
+ s->vvar_start = start;
+ s->vvar_end = end;
+ }
}
+ /*
+ * Validate its structure -- for new vDSO format the
+ * structure must be like
+ *
+ * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
+ * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
+ */
+ ret = 0;
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ if (s->vma_end != s->vvar_start) {
+ ret = -1;
+ pr_err("Unexpected rt vDSO area bounds\n");
+ goto err;
+ }
+ }
+ } else {
+ ret = -1;
+ pr_err("Can't find rt vDSO\n");
+ goto err;
+ }
+
+ pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ s->vma_start, s->vma_end,
+ s->vvar_start, s->vvar_end);
+
+err:
fclose(maps);
return ret;
}
diff --git a/cr-restore.c b/cr-restore.c
index 9bc77910a678..e63c278f81a4 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -2377,12 +2377,14 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
#ifdef CONFIG_VDSO
/*
- * Figure out how much memory runtime vdso will need.
+ * Figure out how much memory runtime vdso and vvar will need.
*/
vdso_rt_vma_size = vdso_vma_size(&vdso_sym_rt);
if (vdso_rt_vma_size) {
vdso_rt_delta = ALIGN(restore_bootstrap_len, PAGE_SIZE) - restore_bootstrap_len;
vdso_rt_size = vdso_rt_vma_size + vdso_rt_delta;
+ if (vvar_vma_size(&vdso_sym_rt))
+ vdso_rt_size += ALIGN(vvar_vma_size(&vdso_sym_rt), PAGE_SIZE);
}
restore_bootstrap_len += vdso_rt_size;
diff --git a/include/vdso.h b/include/vdso.h
index c0725a3bfd43..50549d5667fc 100644
--- a/include/vdso.h
+++ b/include/vdso.h
@@ -16,7 +16,8 @@
#define parasite_fixup_vdso(ctl, pid, vma_area_list) (0)
#define vdso_vma_size(t) (0)
#define vdso_remap(who, from, to, size) (0)
-#define vdso_proxify(who, sym_rt, vma, vdso_rt_parked_at) (0)
+#define vdso_proxify(who, sym_rt, vdso_vma, \
+ vvar_vma, vdso_rt_parked_at) (0)
#endif /* CONFIG_VDSO */
diff --git a/pie/restorer.c b/pie/restorer.c
index 9afa480aed4f..8a173eca71f0 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -661,6 +661,12 @@ long __export_restore_task(struct task_restore_args *args)
args->vdso_rt_parked_at,
vdso_vma_size(&args->vdso_sym_rt)))
goto core_restore_end;
+ if (args->vdso_sym_rt.vvar_start != VVAR_BAD_ADDR) {
+ if (vdso_remap("rt-vvar", args->vdso_sym_rt.vvar_start,
+ args->vdso_rt_parked_at + vdso_vma_size(&args->vdso_sym_rt),
+ vvar_vma_size(&args->vdso_sym_rt)))
+ goto core_restore_end;
+ }
#endif
if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
@@ -688,8 +694,17 @@ long __export_restore_task(struct task_restore_args *args)
goto core_restore_end;
#ifdef CONFIG_VDSO
if (vma_entry_is(vma_entry, VMA_AREA_VDSO)) {
+ VmaEntry *vma_vvar;
+
+ if (i + 1 < args->nr_vmas) {
+ vma_vvar = args->tgt_vmas + i + 1;
+ if (!vma_entry_is(vma_entry, VMA_AREA_VVAR))
+ vma_vvar = NULL;
+ } else
+ vma_vvar = NULL;
if (vdso_proxify("left dumpee", &args->vdso_sym_rt,
- vma_entry, args->vdso_rt_parked_at))
+ vma_entry, vma_vvar,
+ args->vdso_rt_parked_at))
goto core_restore_end;
}
#endif
@@ -716,8 +731,17 @@ long __export_restore_task(struct task_restore_args *args)
goto core_restore_end;
#ifdef CONFIG_VDSO
if (vma_entry_is(vma_entry, VMA_AREA_VDSO)) {
+ VmaEntry *vma_vvar;
+
+ if (i + 1 < args->nr_vmas) {
+ vma_vvar = args->tgt_vmas + i + 1;
+ if (!vma_entry_is(vma_entry, VMA_AREA_VVAR))
+ vma_vvar = NULL;
+ } else
+ vma_vvar = NULL;
if (vdso_proxify("right dumpee", &args->vdso_sym_rt,
- vma_entry, args->vdso_rt_parked_at))
+ vma_entry, vma_vvar,
+ args->vdso_rt_parked_at))
goto core_restore_end;
}
#endif
diff --git a/proc_parse.c b/proc_parse.c
index e9fef08ac93b..86b34a25c2da 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -138,8 +138,12 @@ static int parse_vmflags(char *buf, struct vma_area *vma_area)
vma_area->e->madv |= (1ul << MADV_NOHUGEPAGE);
/* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */
- if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf"))
- vma_area->e->status |= VMA_UNSUPP;
+ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) {
+#ifdef CONFIG_VDSO
+ if (!vma_area_is(vma_area, VMA_AREA_VVAR))
+#endif
+ vma_area->e->status |= VMA_UNSUPP;
+ }
/*
* Anything else is just ignored.
@@ -409,6 +413,15 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, bool use_map_file
pr_warn_once("Found vDSO area without support\n");
goto err;
#endif
+ } else if (strstr(buf, "[vvar]")) {
+#ifdef CONFIG_VDSO
+ vma_area->e->status |= VMA_AREA_REGULAR;
+ if ((vma_area->e->prot & VVAR_PROT) == VVAR_PROT)
+ vma_area->e->status |= VMA_AREA_VVAR;
+#else
+ pr_warn_once("Found VVAR area without support\n");
+ goto err;
+#endif
} else if (strstr(buf, "[heap]")) {
vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
} else {
--
1.9.3
More information about the CRIU
mailing list