[PATCH 4/4] arch: x86 -- Add proxification of vDSO calls

Cyrill Gorcunov gorcunov at openvz.org
Tue Apr 16 04:49:04 EDT 2013


When tasks are restored on a kernel whose version differs from the
one they were dumped on, the vDSO contents may have changed, so any
call to a vDSO service routine will lead to a SIGBUS in the better
case, or to wrong results without notice in the worse one.

To work this situation out we need a so-called vDSO proxy: any call
to an original vDSO service routine gets redirected to the valid
run-time vDSO supplied by the kernel we're restoring on.
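
In practice each entry point of the dumpee's vDSO gets overwritten
with a small trampoline (the jmp_t structure in the patch below); in
rough assembly terms it is

    movabs $new_symbol_addr, %rax    /* bytes 48 b8 <imm64> */
    jmp    *%rax                     /* bytes ff e0         */
    int3                             /* cc guard bytes, x4  */

where new_symbol_addr stands for the address of the matching entry in
the valid run-time vDSO. The int3 guards simply trap execution should
anything fall through past the jump.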

In terms of ugly ascii graphics it might be represented as follows.

Original vDSO in image
======================

    +---------------------------+
    | __vdso_getcpu             |
    |       ...                 |
    |       function body here  |
    +---------------------------+
    | __vdso_time               |
    |       ...                 |
    |       function body here  |
    +---------------------------+
    ...

Run-time proxified calls
========================

    Mapped from image and then patched

    +---------------------------+
    | __vdso_getcpu             |
    |       redirect            +----+
    |       int3                |    |
    +---------------------------+    |
    | __vdso_time               |    |
    |       redirect            +-+  |
    |       int3                | |  |
    +---------------------------+ |  |
    ...                           |  |
                                  |  |  New memory area with valid vDSO content
                                  |  |  +---------------------------+
                                  |  |  | __vdso_getcpu             |
                                  |  +->|       ...                 |
                                  |     |       function body here  |
                                  |     +---------------------------+
                                  |     | __vdso_time               |
                                  +---->|       ...                 |
                                        |       function body here  |
                                        +---------------------------+
                                        ...

To achieve this we

 1) Read the vDSO layout from the image. If there is no image, we do
    nothing.
 2) Read the vDSO the kernel provides us at run time and remember its
    structure and function addresses.
 3) Once all VMAs the task has are read, we append a new one with the
    size of the run-time vDSO.
 4) Fill the new vDSO VMA with data from the run-time vDSO.
 5) Patch the old vDSO entry points to redirect calls to the new vDSO
    entries (a condensed code sketch follows the list).
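
Condensed to code (error handling mostly dropped), the flow is
roughly the following, using the names this patch introduces:

    /* Steps 1 and 2: done once in cr_restore_tasks() */
    if (arch_read_vdso_layout(&vdso_proxy.sym_dumpee))
            return -1;
    if (!arch_is_vdso_symbols_empty(&vdso_proxy.sym_dumpee)) {
            if (arch_fill_self_vdso(&vdso_proxy.sym_rt))
                    return -1;
            vdso_proxy.proxify = true;
    }

    /*
     * Step 3: read_vmas() appends a new VMA (vdso_proxy.vma_to) of
     * size symtable_vma_size(&vdso_proxy.sym_rt) after the last VMA
     * read from the image.
     *
     * Steps 4 and 5: in restore_priv_vma_content()
     */
    memcpy(base_to, (void *)vdso_proxy.sym_rt.vma_start,
           symtable_vma_size(&vdso_proxy.sym_rt));
    if (arch_proxify_vdso(base_to, base_from,
                          &vdso_proxy.sym_rt, &vdso_proxy.sym_dumpee))
            return -1;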

This is far from being optimal, and it would be better to

 - look up an appropriate hole between the VMAs read from the image
   and map the new vDSO there, because there might be no room for a
   new VMA after the last one read from the image;
 - not copy the run-time vDSO contents but rather remap the run-time
   vDSO to the needed address at a very late stage of restore (a
   hypothetical sketch follows the list).
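
For instance, the remap variant might boil down to something like the
following (a hypothetical sketch, not part of this patch;
vdso_rt_start, vdso_rt_size and proxy_target are illustrative names
for the run-time vDSO address, its length and the reserved target
address):

    #define _GNU_SOURCE         /* for MREMAP_FIXED/MREMAP_MAYMOVE */
    #include <sys/mman.h>

    /* Move the run-time vDSO mapping into place instead of
     * memcpy'ing its contents. */
    if (mremap((void *)vdso_rt_start, vdso_rt_size, vdso_rt_size,
               MREMAP_FIXED | MREMAP_MAYMOVE,
               (void *)proxy_target) == MAP_FAILED)
            return -1;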

Even so, this patch provides a ground for future development and is
known to work.

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 arch/x86/crtools.c |  55 +++++++++++++++++++++++++++
 arch/x86/vdso.c    |  37 ++++++++++++++++++
 cr-restore.c       | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/vdso.h     |   4 ++
 4 files changed, 201 insertions(+), 2 deletions(-)

diff --git a/arch/x86/crtools.c b/arch/x86/crtools.c
index e061816..06d6631 100644
--- a/arch/x86/crtools.c
+++ b/arch/x86/crtools.c
@@ -147,6 +147,61 @@ int arch_fill_self_vdso(symtable_t *t)
 	return ret;
 }
 
+int arch_read_vdso_layout(symtable_t *t)
+{
+	unsigned int i, nr_entries = 0;
+	VdsoSymbolEntry *symbol;
+	int ret = -1, fd;
+
+	fd = open_image(CR_FD_VDSO, O_RSTR);
+	if (fd < 0)
+		goto err;
+
+	INIT_SYMTABLE(t);
+
+	while (1) {
+		ret = pb_read_one_eof(fd, &symbol, PB_VDSO);
+		if (ret < 0)
+			goto err;
+		else if (ret == 0)
+			break;
+
+		pr_debug("vdso: read name %s offset %lx\n",
+			 symbol->name, symbol->offset);
+
+		i = arch_vdso_get_symbol_index(symbol->name);
+		if (i == VDSO_SYMBOL_MAX) {
+			pr_err("vDSO symbol %s is not recognized\n",
+			       symbol->name);
+			goto err;
+		}
+
+		strncpy(t->sym[i].name, symbol->name, sizeof(t->sym[i].name));
+		t->sym[i].name[sizeof(t->sym[i].name) - 1] = '\0';
+		t->sym[i].offset = symbol->offset;
+
+		vdso_symbol_entry__free_unpacked(symbol, NULL);
+
+		nr_entries++;
+	}
+
+	/* Verify read data */
+	if (nr_entries) {
+		for (i = 0; i < ARRAY_SIZE(t->sym); i++) {
+			if (!arch_is_vdso_symbol_valid(&t->sym[i])) {
+				pr_err("Invalid vDSO data for symbol %s\n",
+				       arch_vdso_get_symbol_name(i));
+				goto err;
+			}
+		}
+	}
+	ret = 0;
+
+err:
+	close(fd);
+	return ret;
+}
+
 int arch_dump_vdso_layout(void)
 {
 	VdsoSymbolEntry symbol = VDSO_SYMBOL_ENTRY__INIT;
diff --git a/arch/x86/vdso.c b/arch/x86/vdso.c
index 2d7c715..73dadec 100644
--- a/arch/x86/vdso.c
+++ b/arch/x86/vdso.c
@@ -52,6 +52,43 @@ static const char vdso_ident[] = {
 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 };
 
+typedef struct {
+	u16	movabs;
+	u64	imm64;
+	u16	jmp_rax;
+	u32	guards;
+} __packed jmp_t;
+
+int arch_proxify_vdso(void *base_to, void *base_from, symtable_t *to, symtable_t *from)
+{
+	jmp_t jmp = {
+		.movabs		= 0xb848,
+		.jmp_rax	= 0xe0ff,
+		.guards		= 0xcccccccc,
+	};
+	unsigned int i;
+
+	/*
+	 * For simplicity we support forward jumps only,
+	 * thus the caller must provide us with validated
+	 * data only.
+	 */
+	for (i = 0; i < ARRAY_SIZE(to->sym); i++) {
+		if (arch_is_vdso_symbol_empty(&from->sym[i]))
+			continue;
+
+		pr_debug("jmp: %lx/%lx -> %lx/%lx\n",
+			 (unsigned long)base_from, from->sym[i].offset,
+			 (unsigned long)base_to, to->sym[i].offset);
+
+		jmp.imm64 = (unsigned long)base_to + to->sym[i].offset;
+
+		memcpy((void *)(base_from + from->sym[i].offset), &jmp, sizeof(jmp));
+	}
+
+	return 0;
+}
+
 int arch_parse_vdso(char *mem, size_t size, symtable_t *t)
 {
 	Elf64_Ehdr *ehdr = (void *)mem;
diff --git a/cr-restore.c b/cr-restore.c
index 0b9d601..a28a152 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -56,6 +56,7 @@
 #include "cpu.h"
 #include "file-lock.h"
 #include "page-read.h"
+#include "vdso.h"
 
 #include "protobuf.h"
 #include "protobuf/sa.pb-c.h"
@@ -75,6 +76,16 @@ static int prepare_restorer_blob(void);
 
 static VM_AREA_LIST(rst_vmas); /* XXX .longest is NOT tracked for this guy */
 
+static struct vdso_proxy_s {
+	symtable_t	sym_rt;		/* Run time symbols retrieved from crtools itself */
+	symtable_t	sym_dumpee;	/* Symbols from dumped image */
+
+	struct vma_area	*vma_to;	/* VMA where vDSO calls redirect to */
+	struct vma_area	*sym_from;	/* vDSO VMA of dumpee which we should patch */
+
+	bool		proxify;
+} vdso_proxy;
+
 static int shmem_remap(void *old_addr, void *new_addr, unsigned long size)
 {
 	void *ret;
@@ -265,7 +276,7 @@ static int map_private_vma(pid_t pid, struct vma_area *vma, void *tgt_addr,
 	return 0;
 }
 
-static int restore_priv_vma_content(pid_t pid)
+static int restore_priv_vma_content(pid_t pid, struct vdso_proxy_s *vdso_proxy)
 {
 	struct vma_area *vma;
 	int ret = 0;
@@ -381,6 +392,30 @@ static int restore_priv_vma_content(pid_t pid)
 		}
 	}
 
+	/*
+	 * Proxify vdso content.
+	 */
+	if (vdso_proxy->proxify) {
+		void *base_to, *base_from;
+
+		base_to = decode_pointer(vma_premmaped_start(&vdso_proxy->vma_to->vma));
+		base_from = decode_pointer(vma_premmaped_start(&vdso_proxy->sym_from->vma));
+
+		/*
+		 * Fill new vDSO with content of run-time vDSO
+		 */
+		pr_debug("vdso: Copy run-time contents %p -> %p\n",
+			 (void *)vdso_proxy->sym_rt.vma_start, base_to);
+
+		memcpy(base_to, (void *)vdso_proxy->sym_rt.vma_start,
+		       symtable_vma_size(&vdso_proxy->sym_rt));
+
+		if (arch_proxify_vdso(base_to, base_from,
+				      &vdso_proxy->sym_rt,
+				      &vdso_proxy->sym_dumpee))
+			return -1;
+	}
+
 	pr_info("nr_restored_pages: %d\n", nr_restored);
 	pr_info("nr_shared_pages:   %d\n", nr_shared);
 	pr_info("nr_droped_pages:   %d\n", nr_droped);
@@ -415,6 +450,9 @@ static int read_vmas(int pid)
 		goto out;
 	}
 
+	vdso_proxy.sym_from = NULL;
+	vdso_proxy.vma_to = NULL;
+
 	while (1) {
 		struct vma_area *vma;
 		VmaEntry *e;
@@ -446,12 +484,65 @@ static int read_vmas(int pid)
 		if (!vma_priv(&vma->vma))
 			continue;
 
+		if (vma_entry_is(&vma->vma, VMA_AREA_VDSO)) {
+			pr_debug("vdso: Got dumpee area %lx-%lx\n",
+				 (long)vma->vma.start, (long)vma->vma.end);
+
+			vdso_proxy.sym_from = vma;
+		}
+
 		priv_size += vma_area_len(vma);
 	}
 
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * If no vDSO area is present in the image, we have
+	 * nothing more to do here.
+	 *
+	 * FIXME At the moment we look up the last VMA present in
+	 *       the image and append a new VMA after it. The new
+	 *       VMA gets filled with the run-time vDSO contents,
+	 *       but this is overkill:
+	 *
+	 *       - there might be no space after the last image VMA
+	 *       - even being small (typically 8K) the vDSO should
+	 *         not be copied to a new place but rather remapped
+	 *         late, in the last phase of restore.
+	 *
+	 */
+	if (vdso_proxy.proxify) {
+		if (vdso_proxy.sym_from) {
+			struct vma_area *last;
+			struct vma_area *vma;
+
+			ret = -1;
+			vma = alloc_vma_area();
+			if (!vma)
+				goto out;
+
+			last = list_entry(rst_vmas.h.prev, struct vma_area, list);
+
+			vma->vma.start	= last->vma.end;
+			vma->vma.end	= vma->vma.start + symtable_vma_size(&vdso_proxy.sym_rt);
+			vma->vma.prot	= PROT_READ | PROT_EXEC;
+			vma->vma.flags	= MAP_PRIVATE | MAP_ANONYMOUS;
+			vma->vma.status	= VMA_AREA_REGULAR | VMA_AREA_VDSO | VMA_ANON_PRIVATE;
+
+			rst_vmas.nr++;
+			list_add_tail(&vma->list, &rst_vmas.h);
+
+			priv_size += vma_area_len(vma);
+
+			pr_debug("vdso: Add proxy area %lx-%lx\n",
+				 (long)vma->vma.start, (long)vma->vma.end);
+
+			vdso_proxy.vma_to = vma;
+		} else
+			vdso_proxy.proxify = false;
+	}
+
 	/* Reserve a place for mapping private vma-s one by one */
 	addr = mmap(NULL, priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
 	if (addr == MAP_FAILED) {
@@ -485,7 +576,7 @@ static int read_vmas(int pid)
 	}
 
 	if (ret == 0)
-		ret = restore_priv_vma_content(pid);
+		ret = restore_priv_vma_content(pid, &vdso_proxy);
 	close(fd);
 
 out:
@@ -1304,6 +1395,18 @@ int cr_restore_tasks(pid_t pid, struct cr_options *opts)
 	if (cpu_init() < 0)
 		return -1;
 
+	if (arch_read_vdso_layout(&vdso_proxy.sym_dumpee))
+		return -1;
+
+	if (!arch_is_vdso_symbols_empty(&vdso_proxy.sym_dumpee)) {
+		if (arch_fill_self_vdso(&vdso_proxy.sym_rt))
+			return -1;
+		/*
+		 * For now, always proxify the calls.
+		 */
+		vdso_proxy.proxify = true;
+	}
+
 	if (prepare_task_entries() < 0)
 		return -1;
 
diff --git a/include/vdso.h b/include/vdso.h
index b41451b..02d60c1 100644
--- a/include/vdso.h
+++ b/include/vdso.h
@@ -85,14 +85,18 @@ static inline bool arch_is_vdso_symbols_empty(symtable_t *t)
 extern const char *arch_vdso_get_symbol_name(unsigned int index);
 extern unsigned int arch_vdso_get_symbol_index(char *symbol);
 extern int arch_fill_self_vdso(symtable_t *t);
+extern int arch_read_vdso_layout(symtable_t *t);
 extern int arch_dump_vdso_layout(void);
 extern int arch_parse_vdso(char *mem, size_t size, symtable_t *t);
+extern int arch_proxify_vdso(void *base_to, void *base_from, symtable_t *to, symtable_t *from);
 #else
 extern const char *arch_vdso_get_symbol_name(unsigned int index) { return NULL; }
 static inline unsigned int arch_vdso_get_symbol_index(char *symbol) { return VDSO_SYMBOL_MAX; };
 static inline int arch_fill_self_vdso(symtable_t *t) { return 0; }
+static inline int arch_read_vdso_layout(symtable_t *t) { return 0; }
 static inline int arch_dump_vdso_layout(void) { }
 static inline int arch_parse_vdso(char *mem, size_t size, symtable_t *t) { return 0; }
+static inline int arch_proxify_vdso(void *base_to, void *base_from, symtable_t *to, symtable_t *from) { return 0; }
 #endif
 
 #endif /* __CR_VDSO_H__ */
-- 
1.8.1.4

