[CRIU] [PATCH 1/2] Copy x86 VDSO code to AArch64

Christopher Covington cov at codeaurora.org
Tue Aug 5 13:20:23 PDT 2014


I don't like duplicating hundreds of lines of code, but it's
what Cyrill wants [1].

1. http://lists.openvz.org/pipermail/criu/2014-August/015218.html

Signed-off-by: Christopher Covington <cov at codeaurora.org>
---
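Note for reviewers: below is a rough sketch of how the restore side is
expected to drive this API, added to make the copied code easier to
review. It is purely illustrative -- park_area, park_size, the VMA
arguments and the caller itself are assumptions, not part of this
patch; the real callers live elsewhere in the restore path.

	#include "asm/vdso.h"

	static int restore_vdso_sketch(unsigned long park_area, size_t park_size,
				       VmaEntry *vmas, size_t nr_vmas, size_t vdso_idx)
	{
		/* Learn the bounds and symbols of our own (runtime) vdso/vvar. */
		if (vdso_init())
			return -1;

		/* Move the runtime vdso/vvar pair out of the way of the dumpee VMAs. */
		if (vdso_do_park(&vdso_sym_rt, park_area, park_size))
			return -1;

		/*
		 * Either remap the parked runtime vdso over the dumpee's one
		 * (when sizes and symbol offsets match) or turn the dumpee's
		 * vdso into a proxy that jumps into the parked copy.
		 */
		return vdso_proxify("dumpee", &vdso_sym_rt, park_area,
				    vdso_idx, vmas, nr_vmas);
	}
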
 arch/aarch64/include/asm/vdso.h | 159 +++++++++++++++
 arch/aarch64/vdso-pie.c         | 441 ++++++++++++++++++++++++++++++++++++++++
 arch/aarch64/vdso.c             | 303 +++++++++++++++++++++++++++
 3 files changed, 903 insertions(+)
 create mode 100644 arch/aarch64/include/asm/vdso.h
 create mode 100644 arch/aarch64/vdso-pie.c
 create mode 100644 arch/aarch64/vdso.c
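
One caveat worth calling out before the diff: the jmp_t trampoline in
vdso_redirect_calls() below is raw x86-64 machine code (0xb848 is
"movabs rax, imm64", 0xe0ff is "jmp rax", the 0xcc bytes are int3
guards) and cannot work on AArch64 as-is, and likewise the __vdso_*
symbol names are the x86 ones while the AArch64 kernel exports
__kernel_* entry points. One possible A64 trampoline -- only a sketch
assuming standard A64 encodings, not necessarily what the follow-up
patch does -- is an LDR-literal/BR pair with the 64-bit target stored
right after it, using x16 (IP0) as scratch:

	typedef struct {
		u32	ldr_x16;	/* 0x58000050: ldr x16, #8 -- load target from PC + 8 */
		u32	br_x16;		/* 0xd61f0200: br x16 */
		u64	target;		/* absolute address of the runtime vdso symbol */
	} __packed jmp_t;

After patching the entry points the instruction cache would also need
to be invalidated, which the x86 code never had to worry about.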

diff --git a/arch/aarch64/include/asm/vdso.h b/arch/aarch64/include/asm/vdso.h
new file mode 100644
index 0000000..56761fa
--- /dev/null
+++ b/arch/aarch64/include/asm/vdso.h
@@ -0,0 +1,159 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include <sys/types.h>
+
+#include "asm/int.h"
+#include "protobuf/vma.pb-c.h"
+
+struct parasite_ctl;
+struct vm_area_list;
+
+#define VDSO_PROT		(PROT_READ | PROT_EXEC)
+#define VVAR_PROT		(PROT_READ)
+
+#define VDSO_BAD_ADDR		(-1ul)
+#define VVAR_BAD_ADDR		VDSO_BAD_ADDR
+#define VDSO_BAD_PFN		(-1ull)
+#define VVAR_BAD_PFN		VDSO_BAD_PFN
+
+struct vdso_symbol {
+	char			name[32];
+	unsigned long		offset;
+};
+
+#define VDSO_SYMBOL_INIT	{ .offset = VDSO_BAD_ADDR, }
+
+/* Check if the symbol is present in the symtable */
+static inline bool vdso_symbol_empty(struct vdso_symbol *s)
+{
+	return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
+}
+
+/*
+ * This is the minimal set of symbols
+ * we need to support at the moment.
+ */
+enum {
+	VDSO_SYMBOL_CLOCK_GETTIME,
+	VDSO_SYMBOL_GETCPU,
+	VDSO_SYMBOL_GETTIMEOFDAY,
+	VDSO_SYMBOL_TIME,
+
+	VDSO_SYMBOL_MAX
+};
+
+struct vdso_symtable {
+	unsigned long		vma_start;
+	unsigned long		vma_end;
+	unsigned long		vvar_start;
+	unsigned long		vvar_end;
+	struct vdso_symbol	symbols[VDSO_SYMBOL_MAX];
+};
+
+#define VDSO_SYMTABLE_INIT						\
+	{								\
+		.vma_start	= VDSO_BAD_ADDR,			\
+		.vma_end	= VDSO_BAD_ADDR,			\
+		.vvar_start	= VVAR_BAD_ADDR,			\
+		.vvar_end	= VVAR_BAD_ADDR,			\
+		.symbols		= {				\
+			[0 ... VDSO_SYMBOL_MAX - 1] =			\
+				(struct vdso_symbol)VDSO_SYMBOL_INIT,	\
+			},						\
+	}
+
+/* Size of VMA associated with vdso */
+static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
+{
+	return t->vma_end - t->vma_start;
+}
+
+static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
+{
+	return t->vvar_end - t->vvar_start;
+}
+/*
+ * Special mark which allows us to identify the runtime vdso
+ * that calls from the proxy vdso are redirected to. This mark
+ * is usually placed at the start of the vdso area, where the
+ * Elf header lives. Since such a runtime vdso is solely used
+ * by the proxy and nobody else is supposed to access it, it's
+ * more or less safe to overwrite the Elf header with
+ * @signature and @proxy_addr.
+ *
+ * The @proxy_addr deserves a few comments. Once we redirect
+ * calls from the proxy to the runtime vdso, on the next
+ * checkpoint it won't be possible to tell which VMA is the
+ * proxy, so we save its address in this member.
+ */
+struct vdso_mark {
+	u64			signature;
+	unsigned long		proxy_vdso_addr;
+
+	unsigned long		version;
+
+	/*
+	 * With the new vDSO format the VVAR area address is
+	 * needed so that we can discover where it lives
+	 * without relying on procfs output.
+	 */
+	unsigned long		proxy_vvar_addr;
+};
+
+#define VDSO_MARK_SIGNATURE	(0x6f73647675697263ULL)	/* Magic number (criuvdso) */
+#define VDSO_MARK_SIGNATURE_V2	(0x4f53447675697263ULL)	/* Magic number (criuvDSO) */
+#define VDSO_MARK_CUR_VERSION	(2)
+
+static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
+{
+	struct vdso_mark *m = where;
+
+	m->signature		= VDSO_MARK_SIGNATURE_V2;
+	m->proxy_vdso_addr	= proxy_vdso_addr;
+	m->version		= VDSO_MARK_CUR_VERSION;
+	m->proxy_vvar_addr	= proxy_vvar_addr;
+}
+
+static inline bool is_vdso_mark(void *addr)
+{
+	struct vdso_mark *m = addr;
+
+	if (m->signature == VDSO_MARK_SIGNATURE_V2) {
+		/*
+		 * New format
+		 */
+		return true;
+	} else if (m->signature == VDSO_MARK_SIGNATURE) {
+		/*
+		 * Old format -- simply extend the mark up
+		 * to the version we support.
+		 */
+		vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
+		return true;
+	}
+	return false;
+}
+
+#define VDSO_SYMBOL_CLOCK_GETTIME_NAME	"__vdso_clock_gettime"
+#define VDSO_SYMBOL_GETCPU_NAME		"__vdso_getcpu"
+#define VDSO_SYMBOL_GETTIMEOFDAY_NAME	"__vdso_gettimeofday"
+#define VDSO_SYMBOL_TIME_NAME		"__vdso_time"
+
+
+
+extern struct vdso_symtable vdso_sym_rt;
+extern u64 vdso_pfn;
+
+extern int vdso_init(void);
+extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
+extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
+extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+			unsigned long vdso_rt_parked_at, size_t index,
+			VmaEntry *vmas, size_t nr_vmas);
+
+extern int vdso_redirect_calls(void *base_to, void *base_from, struct vdso_symtable *to, struct vdso_symtable *from);
+extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+			       struct vm_area_list *vma_area_list);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/aarch64/vdso-pie.c b/arch/aarch64/vdso-pie.c
new file mode 100644
index 0000000..0a55d71
--- /dev/null
+++ b/arch/aarch64/vdso-pie.c
@@ -0,0 +1,441 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "image.h"
+#include "vdso.h"
+#include "vma.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+typedef struct {
+	u16	movabs;
+	u64	imm64;
+	u16	jmp_rax;
+	u32	guards;
+} __packed jmp_t;
+
+int vdso_redirect_calls(void *base_to, void *base_from,
+			struct vdso_symtable *to,
+			struct vdso_symtable *from)
+{
+	jmp_t jmp = {
+		.movabs		= 0xb848,
+		.jmp_rax	= 0xe0ff,
+		.guards		= 0xcccccccc,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+		if (vdso_symbol_empty(&from->symbols[i]))
+			continue;
+
+		pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
+			 (unsigned long)base_from, from->symbols[i].offset,
+			 (unsigned long)base_to, to->symbols[i].offset, i);
+
+		jmp.imm64 = (unsigned long)base_to + to->symbols[i].offset;
+		builtin_memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp));
+	}
+
+	return 0;
+}
+
+
+/* Check if a pointer is out of bounds */
+static bool __ptr_oob(void *ptr, void *start, size_t size)
+{
+	void *end = (void *)((unsigned long)start + size);
+	return ptr > end || ptr < start;
+}
+
+/*
+ * Elf hash, see format specification.
+ */
+static unsigned long elf_hash(const unsigned char *name)
+{
+	unsigned long h = 0, g;
+
+	while (*name) {
+		h = (h << 4) + *name++;
+		g = h & 0xf0000000ul;
+		if (g)
+			h ^= g >> 24;
+		h &= ~g;
+	}
+	return h;
+}
+
+int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
+{
+	Elf64_Phdr *dynamic = NULL, *load = NULL;
+	Elf64_Ehdr *ehdr = (void *)mem;
+	Elf64_Dyn *dyn_strtab = NULL;
+	Elf64_Dyn *dyn_symtab = NULL;
+	Elf64_Dyn *dyn_strsz = NULL;
+	Elf64_Dyn *dyn_syment = NULL;
+	Elf64_Dyn *dyn_hash = NULL;
+	Elf64_Word *hash = NULL;
+	Elf64_Phdr *phdr;
+	Elf64_Dyn *d;
+
+	Elf64_Word *bucket, *chain;
+	Elf64_Word nbucket, nchain;
+
+	/*
+	 * See the Elf specification for these magic values.
+	 */
+	const char elf_ident[] = {
+		0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	};
+
+	const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
+		[VDSO_SYMBOL_CLOCK_GETTIME]	= VDSO_SYMBOL_CLOCK_GETTIME_NAME,
+		[VDSO_SYMBOL_GETCPU]		= VDSO_SYMBOL_GETCPU_NAME,
+		[VDSO_SYMBOL_GETTIMEOFDAY]	= VDSO_SYMBOL_GETTIMEOFDAY_NAME,
+		[VDSO_SYMBOL_TIME]		= VDSO_SYMBOL_TIME_NAME,
+	};
+
+	char *dynsymbol_names;
+	unsigned int i, j, k;
+
+	BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
+
+	pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
+
+	/*
+	 * Make sure it's a file we support.
+	 */
+	if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
+		pr_err("Elf header magic mismatch\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * We need PT_LOAD and PT_DYNAMIC here, exactly one of each.
+	 */
+	phdr = (void *)&mem[ehdr->e_phoff];
+	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+		if (__ptr_oob(phdr, mem, size))
+			goto err_oob;
+		switch (phdr->p_type) {
+		case PT_DYNAMIC:
+			if (dynamic) {
+				pr_err("Second PT_DYNAMIC header\n");
+				return -EINVAL;
+			}
+			dynamic = phdr;
+			break;
+		case PT_LOAD:
+			if (load) {
+				pr_err("Second PT_LOAD header\n");
+				return -EINVAL;
+			}
+			load = phdr;
+			break;
+		}
+	}
+
+	if (!load || !dynamic) {
+		pr_err("A required program header is missing\n");
+		return -EINVAL;
+	}
+
+	pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
+
+	/*
+	 * Dynamic section tags should provide us with the rest of the
+	 * information needed. Note that we're only interested in a small set of tags.
+	 */
+	d = (void *)&mem[dynamic->p_offset];
+	for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
+		if (__ptr_oob(d, mem, size))
+			goto err_oob;
+
+		if (d->d_tag == DT_NULL) {
+			break;
+		} else if (d->d_tag == DT_STRTAB) {
+			dyn_strtab = d;
+			pr_debug("DT_STRTAB: %p\n", (void *)d->d_un.d_ptr);
+		} else if (d->d_tag == DT_SYMTAB) {
+			dyn_symtab = d;
+			pr_debug("DT_SYMTAB: %p\n", (void *)d->d_un.d_ptr);
+		} else if (d->d_tag == DT_STRSZ) {
+			dyn_strsz = d;
+			pr_debug("DT_STRSZ: %lu\n", (unsigned long)d->d_un.d_val);
+		} else if (d->d_tag == DT_SYMENT) {
+			dyn_syment = d;
+			pr_debug("DT_SYMENT: %lu\n", (unsigned long)d->d_un.d_val);
+		} else if (d->d_tag == DT_HASH) {
+			dyn_hash = d;
+			pr_debug("DT_HASH: %p\n", (void *)d->d_un.d_ptr);
+		}
+	}
+
+	if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
+		pr_err("Not all dynamic entries are present\n");
+		return -EINVAL;
+	}
+
+	dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
+	if (__ptr_oob(dynsymbol_names, mem, size))
+		goto err_oob;
+
+	hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
+	if (__ptr_oob(hash, mem, size))
+		goto err_oob;
+
+	nbucket = hash[0];
+	nchain = hash[1];
+	bucket = &hash[2];
+	chain = &hash[nbucket + 2];
+
+	pr_debug("nbucket %lu nchain %lu bucket %p chain %p\n",
+		 (long)nbucket, (long)nchain, bucket, chain);
+
+	for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) {
+		k = elf_hash((const unsigned char *)vdso_symbols[i]);
+
+		for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
+			Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
+			char *name;
+
+			sym = &sym[j];
+			if (__ptr_oob(sym, mem, size))
+				continue;
+
+			if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
+			    ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
+				continue;
+
+			name = &dynsymbol_names[sym->st_name];
+			if (__ptr_oob(name, mem, size))
+				continue;
+
+			if (builtin_strcmp(name, vdso_symbols[i]))
+				continue;
+
+			builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
+			t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
+			break;
+		}
+	}
+
+	return 0;
+
+err_oob:
+	pr_err("Corrupted Elf data\n");
+	return -EFAULT;
+}
+
+static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
+{
+	unsigned long addr;
+
+	pr_debug("Remap %s %lx -> %lx\n", who, from, to);
+
+	addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
+	if (addr != to) {
+		pr_err("Unable to remap %lx -> %lx %lx\n",
+		       from, to, addr);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Park the runtime vDSO in some safe place where it is accessible from the restorer */
+int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
+{
+	int ret;
+
+	BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
+
+	if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
+		if (sym_rt->vma_start < sym_rt->vvar_start) {
+			ret  = vdso_remap("rt-vdso", sym_rt->vma_start,
+					  park_at, vdso_vma_size(sym_rt));
+			park_at += vdso_vma_size(sym_rt);
+			ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
+					  park_at, vvar_vma_size(sym_rt));
+		} else {
+			ret  = vdso_remap("rt-vvar", sym_rt->vvar_start,
+					  park_at, vvar_vma_size(sym_rt));
+			park_at += vvar_vma_size(sym_rt);
+			ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
+					  park_at, vdso_vma_size(sym_rt));
+		}
+	} else
+		ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+				 park_at, vdso_vma_size(sym_rt));
+	return ret;
+}
+
+int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+		 unsigned long vdso_rt_parked_at, size_t index,
+		 VmaEntry *vmas, size_t nr_vmas)
+{
+	VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
+	struct vdso_symtable s = VDSO_SYMTABLE_INIT;
+	bool remap_rt = false;
+
+	/*
+	 * Figure out which kind of vdso tuple we've got.
+	 */
+	if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
+		vma_vdso = &vmas[index];
+	else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
+		vma_vvar = &vmas[index];
+
+	if (index < (nr_vmas - 1)) {
+		if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
+			vma_vdso = &vmas[index + 1];
+		else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
+			vma_vvar = &vmas[index + 1];
+	}
+
+	if (!vma_vdso) {
+		pr_err("Can't find vDSO area in image\n");
+		return -1;
+	}
+
+	/*
+	 * The vDSO mark overwrites an Elf program header of the proxy vDSO,
+	 * thus it must never ever be greater in size.
+	 */
+	BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
+
+	/*
+	 * Find symbols in vDSO zone read from image.
+	 */
+	if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
+		return -1;
+
+	/*
+	 * Proxification strategy
+	 *
+	 *  - There might be two vDSO zones: vdso code and optionally vvar data
+	 *  - To be able to use in-place remapping we need
+	 *
+	 *    a) the size and order of the vDSO zones to match
+	 *    b) the symbol offsets to match
+	 *    c) the same number of vDSO zones
+	 */
+	if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
+		size_t i;
+
+		for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
+			if (s.symbols[i].offset != sym_rt->symbols[i].offset)
+				break;
+		}
+
+		if (i == ARRAY_SIZE(s.symbols)) {
+			if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
+				remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
+				if (remap_rt) {
+					long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
+					long delta_this = vma_vvar->start - vma_vdso->start;
+
+					remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
+				}
+			}
+		}
+	}
+
+	pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
+		 vma_vdso->start, vma_vdso->end,
+		 vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
+		 vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
+
+	/*
+	 * Easy case -- the vdso from the image has the same offsets, order
+	 * and size as the runtime one, so we simply remap the runtime vdso
+	 * to the dumpee position without generating any proxy.
+	 *
+	 * Note we may remap the VVAR area as well, which might not yet have
+	 * been mapped by the caller code. So drop VMA_AREA_REGULAR from it
+	 * and the caller won't touch it anymore.
+	 */
+	if (remap_rt) {
+		int ret = 0;
+
+		pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
+
+		if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
+			pr_err("Failed to unmap %s\n", who);
+			return -1;
+		}
+
+		if (vma_vvar) {
+			if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
+				pr_err("Failed to unmap %s\n", who);
+				return -1;
+			}
+		}
+
+		if (vma_vvar) {
+			if (vma_vdso->start < vma_vvar->start) {
+				ret  = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+				vdso_rt_parked_at += vdso_vma_size(sym_rt);
+				ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+			} else {
+				ret  = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+				vdso_rt_parked_at += vvar_vma_size(sym_rt);
+				ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+			}
+		} else
+			ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+
+		return ret;
+	}
+
+	/*
+	 * Now the complex case -- we need to proxify the calls. We redirect
+	 * calls from the dumpee vdso to the runtime vdso, making the dumpee
+	 * vdso operate as a proxy.
+	 */
+	pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
+
+	/*
+	 * Don't forget to shift if vvar is before vdso.
+	 */
+	if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
+	    sym_rt->vvar_start < sym_rt->vma_start)
+		vdso_rt_parked_at += vvar_vma_size(sym_rt);
+
+	if (vdso_redirect_calls((void *)vdso_rt_parked_at,
+				(void *)vma_vdso->start,
+				sym_rt, &s)) {
+		pr_err("Failed to proxify dumpee contents\n");
+		return -1;
+	}
+
+	/*
+	 * Put a special mark into the runtime vdso so that at the next
+	 * checkpoint we can detect this vdso and skip dumping it, since
+	 * it's auto-generated anew each session whenever a proxy is required.
+	 */
+	sys_mprotect((void *)vdso_rt_parked_at,  vdso_vma_size(sym_rt), PROT_WRITE);
+	vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
+	sys_mprotect((void *)vdso_rt_parked_at,  vdso_vma_size(sym_rt), VDSO_PROT);
+	return 0;
+}
diff --git a/arch/aarch64/vdso.c b/arch/aarch64/vdso.c
new file mode 100644
index 0000000..ac47a1e
--- /dev/null
+++ b/arch/aarch64/vdso.c
@@ -0,0 +1,303 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <elf.h>
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/types.h"
+#include "asm/parasite-syscall.h"
+
+#include "parasite-syscall.h"
+#include "parasite.h"
+#include "compiler.h"
+#include "kerndat.h"
+#include "vdso.h"
+#include "util.h"
+#include "log.h"
+#include "mem.h"
+#include "vma.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
+u64 vdso_pfn = VDSO_BAD_PFN;
+/*
+ * The VMA list might have proxy vdso/vvar areas left
+ * over from a previous dump/restore cycle, so we need
+ * to detect them and eliminate them from the VMA list;
+ * they will be generated again on restore if needed.
+ */
+int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+			struct vm_area_list *vma_area_list)
+{
+	unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
+	unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
+	struct vma_area *proxy_vdso_marked = NULL;
+	struct vma_area *proxy_vvar_marked = NULL;
+	struct parasite_vdso_vma_entry *args;
+	struct vma_area *vma;
+	int fd, ret = -1;
+	off_t off;
+	u64 pfn;
+
+	args = parasite_args(ctl, struct parasite_vdso_vma_entry);
+	fd = open_proc(pid, "pagemap");
+	if (fd < 0)
+		return -1;
+
+	list_for_each_entry(vma, &vma_area_list->h, list) {
+		if (!vma_area_is(vma, VMA_AREA_REGULAR))
+			continue;
+
+		if (vma_area_is(vma, VMA_FILE_SHARED) ||
+				vma_area_is(vma, VMA_FILE_PRIVATE))
+			continue;
+		/*
+		 * This might be the VVAR area of a marked vDSO
+		 * zone; we need to detect it before the VDSO_PROT
+		 * test because VVAR_PROT is a subset of it, but
+		 * we must not simply continue here,
+		 * sigh... what a mess.
+		 */
+		BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
+
+		if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
+			if (proxy_vvar_addr != VVAR_BAD_ADDR &&
+			    proxy_vvar_addr == vma->e->start) {
+				BUG_ON(proxy_vvar_marked);
+				proxy_vvar_marked = vma;
+				continue;
+			}
+		}
+
+		if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
+			continue;
+
+		if (vma->e->start > TASK_SIZE)
+			continue;
+
+		if (vma->e->flags & MAP_GROWSDOWN)
+			continue;
+
+		/*
+		 * We need to poke every potentially marked vma,
+		 * otherwise if the task never called any vdso functions
+		 * the page frame number won't be reported.
+		 */
+		args->start = vma->e->start;
+		args->len = vma_area_len(vma);
+
+		if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
+			pr_err("vdso: Parasite failed to poke for mark\n");
+			ret = -1;
+			goto err;
+		}
+
+		/*
+		 * Defer handling the marked vdso until we have walked
+		 * over all vmas and restored the status of any
+		 * potentially remapped vDSO area.
+		 */
+		if (unlikely(args->is_marked)) {
+			if (proxy_vdso_marked) {
+				pr_err("Ow! Second vdso mark detected!\n");
+				ret = -1;
+				goto err;
+			}
+			proxy_vdso_marked = vma;
+			proxy_vdso_addr = args->proxy_vdso_addr;
+			proxy_vvar_addr = args->proxy_vvar_addr;
+			continue;
+		}
+
+		off = (vma->e->start / PAGE_SIZE) * sizeof(u64);
+		ret = pread(fd, &pfn, sizeof(pfn), off);
+		if (ret < 0 || ret != sizeof(pfn)) {
+			pr_perror("Can't read pme for pid %d", pid);
+			ret = -1;
+			goto err;
+		}
+
+		pfn = PME_PFRAME(pfn);
+		if (!pfn) {
+			pr_err("Unexpected page frame number 0 for pid %d\n", pid);
+			ret = -1;
+			goto err;
+		}
+
+		/*
+		 * Set up the proper VMA status. Note that starting with
+		 * kernel 3.16 the [vdso]/[vvar] marks are reported
+		 * correctly even when the areas are remapped to a new
+		 * place, but only since that particular version of
+		 * the kernel!
+		 */
+		if (pfn == vdso_pfn) {
+			if (!vma_area_is(vma, VMA_AREA_VDSO)) {
+				pr_debug("vdso: Restore vDSO status by pfn at %lx\n",
+					 (long)vma->e->start);
+				vma->e->status |= VMA_AREA_VDSO;
+			}
+		} else {
+			if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
+				pr_debug("vdso: Drop mishinted vDSO status at %lx\n",
+					 (long)vma->e->start);
+				vma->e->status &= ~VMA_AREA_VDSO;
+			}
+		}
+	}
+
+	/*
+	 * If there is a marked vdso, it is autogenerated
+	 * and must be dropped from the vma list.
+	 */
+	if (proxy_vdso_marked) {
+		pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
+			 (long)proxy_vdso_marked->e->start,
+			 (long)proxy_vdso_addr, (long)proxy_vvar_addr);
+
+		/*
+		 * Don't forget to restore the proxy vdso/vvar status, since
+		 * it's unknown to the kernel.
+		 */
+		list_for_each_entry(vma, &vma_area_list->h, list) {
+			if (vma->e->start == proxy_vdso_addr) {
+				vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
+				pr_debug("vdso: Restore proxy vDSO status at %lx\n",
+					 (long)vma->e->start);
+			} else if (vma->e->start == proxy_vvar_addr) {
+				vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
+				pr_debug("vdso: Restore proxy VVAR status at %lx\n",
+					 (long)vma->e->start);
+			}
+		}
+
+		pr_debug("vdso: Dropping marked vdso at %lx\n",
+			 (long)proxy_vdso_marked->e->start);
+		list_del(&proxy_vdso_marked->list);
+		xfree(proxy_vdso_marked);
+		vma_area_list->nr--;
+
+		if (proxy_vvar_marked) {
+			pr_debug("vdso: Dropping marked vvar at %lx\n",
+				 (long)proxy_vvar_marked->e->start);
+			list_del(&proxy_vvar_marked->list);
+			xfree(proxy_vvar_marked);
+			vma_area_list->nr--;
+		}
+	}
+	ret = 0;
+err:
+	close(fd);
+	return ret;
+}
+
+static int vdso_fill_self_symtable(struct vdso_symtable *s)
+{
+	char buf[512];
+	int ret = -1;
+	FILE *maps;
+
+	*s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
+
+	maps = fopen("/proc/self/maps", "r");
+	if (!maps) {
+		pr_perror("Can't open self-vma");
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), maps)) {
+		unsigned long start, end;
+		char *has_vdso, *has_vvar;
+
+		has_vdso = strstr(buf, "[vdso]");
+		if (!has_vdso)
+			has_vvar = strstr(buf, "[vvar]");
+		else
+			has_vvar = NULL;
+
+		if (!has_vdso && !has_vvar)
+			continue;
+
+		ret = sscanf(buf, "%lx-%lx", &start, &end);
+		if (ret != 2) {
+			ret = -1;
+			pr_err("Can't find vDSO/VVAR bounds\n");
+			goto err;
+		}
+
+		if (has_vdso) {
+			if (s->vma_start != VDSO_BAD_ADDR) {
+				pr_err("Got second vDSO entry\n");
+				ret = -1;
+				goto err;
+			}
+			s->vma_start = start;
+			s->vma_end = end;
+
+			ret = vdso_fill_symtable((void *)start, end - start, s);
+			if (ret)
+				goto err;
+		} else {
+			if (s->vvar_start != VVAR_BAD_ADDR) {
+				pr_err("Got second VVAR entry\n");
+				ret = -1;
+				goto err;
+			}
+			s->vvar_start = start;
+			s->vvar_end = end;
+		}
+	}
+
+	/*
+	 * Validate the structure -- with the new vDSO format the
+	 * layout must look like
+	 *
+	 * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
+	 * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
+	 *
+	 * The areas may be in reverse order.
+	 *
+	 * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
+	 * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
+	 *
+	 */
+	ret = 0;
+	if (s->vma_start != VDSO_BAD_ADDR) {
+		if (s->vvar_start != VVAR_BAD_ADDR) {
+			if (s->vma_end != s->vvar_start &&
+			    s->vvar_end != s->vma_start) {
+				ret = -1;
+				pr_err("Unexpected rt vDSO area bounds\n");
+				goto err;
+			}
+		}
+	} else {
+		ret = -1;
+		pr_err("Can't find rt vDSO\n");
+		goto err;
+	}
+
+	pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
+		 s->vma_start, s->vma_end,
+		 s->vvar_start, s->vvar_end);
+
+err:
+	fclose(maps);
+	return ret;
+}
+
+int vdso_init(void)
+{
+	if (vdso_fill_self_symtable(&vdso_sym_rt))
+		return -1;
+	return vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn);
+}
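
Not shown in this patch is the parasite-side handler behind
PARASITE_CMD_CHECK_VDSO_MARK used by parasite_fixup_vdso() above. For
context, it is presumably along the lines of the following sketch,
which assumes only the parasite_vdso_vma_entry fields already
referenced in vdso.c; the real handler lives in the parasite code and
is not included here:

	static int check_vdso_mark(struct parasite_vdso_vma_entry *args)
	{
		struct vdso_mark *m = (void *)args->start;

		if (is_vdso_mark(m)) {
			args->is_marked = 1;
			args->proxy_vdso_addr = m->proxy_vdso_addr;
			args->proxy_vvar_addr = m->proxy_vvar_addr;
		} else {
			args->is_marked = 0;
			args->proxy_vdso_addr = VDSO_BAD_ADDR;
			args->proxy_vvar_addr = VVAR_BAD_ADDR;
		}
		return 0;
	}
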
-- 
Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by the Linux Foundation.


