[CRIU] [PATCH 1/2] DO NOT MERGE: Move x86 VDSO code to AArch64
Christopher Covington
cov at codeaurora.org
Fri Jul 25 12:19:58 PDT 2014
Purely to illustrate, in the following patch, exactly how the AArch64
VDSO handling differs from x86. Not intended for merge.
Signed-off-by: Christopher Covington <cov at codeaurora.org>
---
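For readers comparing the two architectures before the next patch
arrives: below is a minimal sketch of what an AArch64 counterpart of
the x86-64 jmp_t trampoline in vdso-pie.c might look like, assuming a
literal-pool load through x16. The names and layout here are
illustrative assumptions only, not necessarily what the following
patch does.

	/* Sketch only: load the branch target from the literal that
	 * follows the two instructions, then jump through x16. */
	typedef struct {
		u32 ldr_x16;	/* 0x58000050: ldr x16, #8 */
		u32 br_x16;	/* 0xd61f0200: br x16 */
		u64 target;	/* filled in per redirected symbol */
	} __packed aarch64_jmp_t;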
arch/aarch64/include/asm/vdso.h | 159 +++++++++++++++
arch/aarch64/vdso-pie.c | 441 ++++++++++++++++++++++++++++++++++++++++
arch/aarch64/vdso.c | 303 +++++++++++++++++++++++++++
arch/x86/include/asm/vdso.h | 159 ---------------
arch/x86/vdso-pie.c | 441 ----------------------------------------
arch/x86/vdso.c | 303 ---------------------------
6 files changed, 903 insertions(+), 903 deletions(-)
create mode 100644 arch/aarch64/include/asm/vdso.h
create mode 100644 arch/aarch64/vdso-pie.c
create mode 100644 arch/aarch64/vdso.c
delete mode 100644 arch/x86/include/asm/vdso.h
delete mode 100644 arch/x86/vdso-pie.c
delete mode 100644 arch/x86/vdso.c
diff --git a/arch/aarch64/include/asm/vdso.h b/arch/aarch64/include/asm/vdso.h
new file mode 100644
index 0000000..56761fa
--- /dev/null
+++ b/arch/aarch64/include/asm/vdso.h
@@ -0,0 +1,159 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include <sys/types.h>
+
+#include "asm/int.h"
+#include "protobuf/vma.pb-c.h"
+
+struct parasite_ctl;
+struct vm_area_list;
+
+#define VDSO_PROT (PROT_READ | PROT_EXEC)
+#define VVAR_PROT (PROT_READ)
+
+#define VDSO_BAD_ADDR (-1ul)
+#define VVAR_BAD_ADDR VDSO_BAD_ADDR
+#define VDSO_BAD_PFN (-1ull)
+#define VVAR_BAD_PFN VDSO_BAD_PFN
+
+struct vdso_symbol {
+ char name[32];
+ unsigned long offset;
+};
+
+#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
+
+/* Check whether a symtable slot is still empty */
+static inline bool vdso_symbol_empty(struct vdso_symbol *s)
+{
+ return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
+}
+
+/*
+ * This is the minimal set of symbols
+ * we must support at the moment.
+ */
+enum {
+ VDSO_SYMBOL_CLOCK_GETTIME,
+ VDSO_SYMBOL_GETCPU,
+ VDSO_SYMBOL_GETTIMEOFDAY,
+ VDSO_SYMBOL_TIME,
+
+ VDSO_SYMBOL_MAX
+};
+
+struct vdso_symtable {
+ unsigned long vma_start;
+ unsigned long vma_end;
+ unsigned long vvar_start;
+ unsigned long vvar_end;
+ struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
+};
+
+#define VDSO_SYMTABLE_INIT \
+ { \
+ .vma_start = VDSO_BAD_ADDR, \
+ .vma_end = VDSO_BAD_ADDR, \
+ .vvar_start = VVAR_BAD_ADDR, \
+ .vvar_end = VVAR_BAD_ADDR, \
+ .symbols = { \
+ [0 ... VDSO_SYMBOL_MAX - 1] = \
+ (struct vdso_symbol)VDSO_SYMBOL_INIT, \
+ }, \
+ }
+
+/* Size of VMA associated with vdso */
+static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
+{
+ return t->vma_end - t->vma_start;
+}
+
+static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
+{
+ return t->vvar_end - t->vvar_start;
+}
+/*
+ * Special mark which allows us to identify the runtime vdso where
+ * calls from the proxy vdso are redirected. This mark is usually
+ * placed at the start of the vdso area, where the ELF header lives.
+ * Since such a runtime vdso is solely used by the proxy and
+ * nobody else is supposed to access it, it's more or less
+ * safe to clobber the ELF header with @signature and
+ * @proxy_vdso_addr.
+ *
+ * The @proxy_vdso_addr member deserves a few comments. When we redirect
+ * the calls from the proxy to the runtime vdso, on the next checkpoint
+ * it won't be possible to find which VMA is the proxy, thus
+ * we save its address in this member.
+ */
+struct vdso_mark {
+ u64 signature;
+ unsigned long proxy_vdso_addr;
+
+ unsigned long version;
+
+	/*
+	 * With the new vDSO format, the VVAR area address is
+	 * needed to make it easier to discover where it lives
+	 * without relying on procfs output.
+	 */
+ unsigned long proxy_vvar_addr;
+};
+
+#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
+#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
+#define VDSO_MARK_CUR_VERSION (2)
+
+static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
+{
+ struct vdso_mark *m = where;
+
+ m->signature = VDSO_MARK_SIGNATURE_V2;
+ m->proxy_vdso_addr = proxy_vdso_addr;
+ m->version = VDSO_MARK_CUR_VERSION;
+ m->proxy_vvar_addr = proxy_vvar_addr;
+}
+
+static inline bool is_vdso_mark(void *addr)
+{
+ struct vdso_mark *m = addr;
+
+ if (m->signature == VDSO_MARK_SIGNATURE_V2) {
+ /*
+ * New format
+ */
+ return true;
+ } else if (m->signature == VDSO_MARK_SIGNATURE) {
+ /*
+ * Old format -- simply extend the mark up
+ * to the version we support.
+ */
+ vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
+ return true;
+ }
+ return false;
+}
+
+#define VDSO_SYMBOL_CLOCK_GETTIME_NAME "__vdso_clock_gettime"
+#define VDSO_SYMBOL_GETCPU_NAME "__vdso_getcpu"
+#define VDSO_SYMBOL_GETTIMEOFDAY_NAME "__vdso_gettimeofday"
+#define VDSO_SYMBOL_TIME_NAME "__vdso_time"
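+/*
+ * Note: these are the x86-64 vdso export names. The AArch64 kernel
+ * vdso exports __kernel_-prefixed symbols (e.g. __kernel_clock_gettime)
+ * instead, presumably one of the differences the following patch has
+ * to handle.
+ */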
+
+
+
+extern struct vdso_symtable vdso_sym_rt;
+extern u64 vdso_pfn;
+
+extern int vdso_init(void);
+extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
+extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
+extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ unsigned long vdso_rt_parked_at, size_t index,
+ VmaEntry *vmas, size_t nr_vmas);
+
+extern int vdso_redirect_calls(void *base_to, void *base_from, struct vdso_symtable *to, struct vdso_symtable *from);
+extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+ struct vm_area_list *vma_area_list);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/aarch64/vdso-pie.c b/arch/aarch64/vdso-pie.c
new file mode 100644
index 0000000..0a55d71
--- /dev/null
+++ b/arch/aarch64/vdso-pie.c
@@ -0,0 +1,441 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "image.h"
+#include "vdso.h"
+#include "vma.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+typedef struct {
+ u16 movabs;
+ u64 imm64;
+ u16 jmp_rax;
+ u32 guards;
+} __packed jmp_t;
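+
+/*
+ * For reference, each jmp_t generated below encodes the x86-64
+ * sequence (the constants are stored little-endian):
+ *
+ *	48 b8 <imm64>	movabs $imm64, %rax
+ *	ff e0		jmp    *%rax
+ *	cc cc cc cc	int3 guard bytes
+ */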
+
+int vdso_redirect_calls(void *base_to, void *base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from)
+{
+ jmp_t jmp = {
+ .movabs = 0xb848,
+ .jmp_rax = 0xe0ff,
+ .guards = 0xcccccccc,
+ };
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+ if (vdso_symbol_empty(&from->symbols[i]))
+ continue;
+
+ pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
+ (unsigned long)base_from, from->symbols[i].offset,
+ (unsigned long)base_to, to->symbols[i].offset, i);
+
+ jmp.imm64 = (unsigned long)base_to + to->symbols[i].offset;
+ builtin_memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp));
+ }
+
+ return 0;
+}
+
+
+/* Check if a pointer is out of bounds */
+static bool __ptr_oob(void *ptr, void *start, size_t size)
+{
+ void *end = (void *)((unsigned long)start + size);
+ return ptr > end || ptr < start;
+}
+
+/*
+ * The classic SysV ELF hash function; DT_HASH tables are
+ * built with it (see the ELF format specification).
+ */
+static unsigned long elf_hash(const unsigned char *name)
+{
+ unsigned long h = 0, g;
+
+ while (*name) {
+ h = (h << 4) + *name++;
+ g = h & 0xf0000000ul;
+ if (g)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return h;
+}
+
+int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
+{
+ Elf64_Phdr *dynamic = NULL, *load = NULL;
+ Elf64_Ehdr *ehdr = (void *)mem;
+ Elf64_Dyn *dyn_strtab = NULL;
+ Elf64_Dyn *dyn_symtab = NULL;
+ Elf64_Dyn *dyn_strsz = NULL;
+ Elf64_Dyn *dyn_syment = NULL;
+ Elf64_Dyn *dyn_hash = NULL;
+ Elf64_Word *hash = NULL;
+ Elf64_Phdr *phdr;
+ Elf64_Dyn *d;
+
+ Elf64_Word *bucket, *chain;
+ Elf64_Word nbucket, nchain;
+
+	/*
+	 * See the ELF specification for these magic values.
+	 */
+ const char elf_ident[] = {
+ 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
+ [VDSO_SYMBOL_CLOCK_GETTIME] = VDSO_SYMBOL_CLOCK_GETTIME_NAME,
+ [VDSO_SYMBOL_GETCPU] = VDSO_SYMBOL_GETCPU_NAME,
+ [VDSO_SYMBOL_GETTIMEOFDAY] = VDSO_SYMBOL_GETTIMEOFDAY_NAME,
+ [VDSO_SYMBOL_TIME] = VDSO_SYMBOL_TIME_NAME,
+ };
+
+ char *dynsymbol_names;
+ unsigned int i, j, k;
+
+ BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
+
+ pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
+
+ /*
+ * Make sure it's a file we support.
+ */
+ if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
+ pr_err("Elf header magic mismatch\n");
+ return -EINVAL;
+ }
+
+	/*
+	 * We need PT_LOAD and PT_DYNAMIC here, each exactly once.
+	 */
+ phdr = (void *)&mem[ehdr->e_phoff];
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (__ptr_oob(phdr, mem, size))
+ goto err_oob;
+ switch (phdr->p_type) {
+ case PT_DYNAMIC:
+ if (dynamic) {
+ pr_err("Second PT_DYNAMIC header\n");
+ return -EINVAL;
+ }
+ dynamic = phdr;
+ break;
+ case PT_LOAD:
+ if (load) {
+ pr_err("Second PT_LOAD header\n");
+ return -EINVAL;
+ }
+ load = phdr;
+ break;
+ }
+ }
+
+ if (!load || !dynamic) {
+		pr_err("A required program header is missing\n");
+ return -EINVAL;
+ }
+
+ pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
+
+	/*
+	 * The dynamic section tags should provide the rest of the
+	 * information we need. Note that we're interested in only
+	 * a small set of tags.
+	 */
+ d = (void *)&mem[dynamic->p_offset];
+ for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
+ if (__ptr_oob(d, mem, size))
+ goto err_oob;
+
+ if (d->d_tag == DT_NULL) {
+ break;
+ } else if (d->d_tag == DT_STRTAB) {
+ dyn_strtab = d;
+ pr_debug("DT_STRTAB: %p\n", (void *)d->d_un.d_ptr);
+ } else if (d->d_tag == DT_SYMTAB) {
+ dyn_symtab = d;
+ pr_debug("DT_SYMTAB: %p\n", (void *)d->d_un.d_ptr);
+ } else if (d->d_tag == DT_STRSZ) {
+ dyn_strsz = d;
+ pr_debug("DT_STRSZ: %lu\n", (unsigned long)d->d_un.d_val);
+ } else if (d->d_tag == DT_SYMENT) {
+ dyn_syment = d;
+ pr_debug("DT_SYMENT: %lu\n", (unsigned long)d->d_un.d_val);
+ } else if (d->d_tag == DT_HASH) {
+ dyn_hash = d;
+ pr_debug("DT_HASH: %p\n", (void *)d->d_un.d_ptr);
+ }
+ }
+
+ if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
+ pr_err("Not all dynamic entries are present\n");
+ return -EINVAL;
+ }
+
+ dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
+ if (__ptr_oob(dynsymbol_names, mem, size))
+ goto err_oob;
+
+ hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
+ if (__ptr_oob(hash, mem, size))
+ goto err_oob;
+
+ nbucket = hash[0];
+ nchain = hash[1];
+ bucket = &hash[2];
+ chain = &hash[nbucket + 2];
+
+ pr_debug("nbucket %lu nchain %lu bucket %p chain %p\n",
+ (long)nbucket, (long)nchain, bucket, chain);
+
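+	/*
+	 * DT_HASH layout: hash[0] is nbucket, hash[1] is nchain, followed
+	 * by nbucket bucket entries and then nchain chain entries. A name
+	 * hashing to h starts its lookup at bucket[h % nbucket] and walks
+	 * the chain array on collisions.
+	 */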
+ for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) {
+ k = elf_hash((const unsigned char *)vdso_symbols[i]);
+
+ for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
+ Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
+ char *name;
+
+ sym = &sym[j];
+ if (__ptr_oob(sym, mem, size))
+ continue;
+
+ if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
+ ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
+ continue;
+
+ name = &dynsymbol_names[sym->st_name];
+ if (__ptr_oob(name, mem, size))
+ continue;
+
+ if (builtin_strcmp(name, vdso_symbols[i]))
+ continue;
+
+ builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
+ t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
+ break;
+ }
+ }
+
+ return 0;
+
+err_oob:
+ pr_err("Corrupted Elf data\n");
+ return -EFAULT;
+}
+
+static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
+{
+ unsigned long addr;
+
+ pr_debug("Remap %s %lx -> %lx\n", who, from, to);
+
+ addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
+ if (addr != to) {
+ pr_err("Unable to remap %lx -> %lx %lx\n",
+ from, to, addr);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Park the runtime vDSO in a safe place where it is accessible from the restorer */
+int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
+{
+ int ret;
+
+ BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
+
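+	/*
+	 * Move vdso and vvar into the park area back to back,
+	 * keeping their current relative order.
+	 */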
+ if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
+ if (sym_rt->vma_start < sym_rt->vvar_start) {
+ ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ park_at += vdso_vma_size(sym_rt);
+ ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
+ park_at, vvar_vma_size(sym_rt));
+ } else {
+ ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
+ park_at, vvar_vma_size(sym_rt));
+ park_at += vvar_vma_size(sym_rt);
+ ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ }
+ } else
+ ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ return ret;
+}
+
+int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ unsigned long vdso_rt_parked_at, size_t index,
+ VmaEntry *vmas, size_t nr_vmas)
+{
+ VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
+ struct vdso_symtable s = VDSO_SYMTABLE_INIT;
+ bool remap_rt = false;
+
+	/*
+	 * Figure out which kind of vdso tuple we've got.
+	 */
+ if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
+ vma_vdso = &vmas[index];
+ else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
+ vma_vvar = &vmas[index];
+
+ if (index < (nr_vmas - 1)) {
+ if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
+ vma_vdso = &vmas[index + 1];
+ else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
+ vma_vvar = &vmas[index + 1];
+ }
+
+ if (!vma_vdso) {
+ pr_err("Can't find vDSO area in image\n");
+ return -1;
+ }
+
+	/*
+	 * The vDSO mark overwrites an ELF program header of the proxy
+	 * vDSO, thus it must never be greater in size.
+	 */
+ BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
+
+	/*
+	 * Find symbols in the vDSO zone read from the image.
+	 */
+ if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
+ return -1;
+
+	/*
+	 * Proxification strategy
+	 *
+	 * - There might be two vDSO zones: vdso code and, optionally, vvar data
+	 * - To be able to use in-place remapping we need:
+	 *
+	 *  a) the size and order of the vDSO zones to match
+	 *  b) the symbol offsets to match
+	 *  c) the same number of vDSO zones
+	 */
+ if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
+ if (s.symbols[i].offset != sym_rt->symbols[i].offset)
+ break;
+ }
+
+ if (i == ARRAY_SIZE(s.symbols)) {
+ if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
+ remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
+ if (remap_rt) {
+ long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
+ long delta_this = vma_vvar->start - vma_vdso->start;
+
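+					/*
+					 * Deltas of the same sign mean vvar
+					 * sits on the same side of vdso both
+					 * in the image and at runtime, so the
+					 * parked pair can be remapped as is.
+					 */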
+ remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
+ }
+ }
+ }
+ }
+
+ pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ vma_vdso->start, vma_vdso->end,
+ vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
+ vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
+
+	/*
+	 * Easy case -- the vdso from the image has the same offsets, order
+	 * and size as the runtime one, so we simply remap the runtime vdso
+	 * to the dumpee's position without generating any proxy.
+	 *
+	 * Note we may remap the VVAR area as well, which might not yet have
+	 * been mapped by the caller's code. So drop VMA_AREA_REGULAR from it
+	 * and the caller won't touch it anymore.
+	 */
+ if (remap_rt) {
+ int ret = 0;
+
+ pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
+
+ if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+
+ if (vma_vvar) {
+ if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+ }
+
+ if (vma_vvar) {
+ if (vma_vdso->start < vma_vvar->start) {
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+ vdso_rt_parked_at += vdso_vma_size(sym_rt);
+ ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+ } else {
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+ vdso_rt_parked_at += vvar_vma_size(sym_rt);
+ ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+ }
+ } else
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+
+ return ret;
+ }
+
+	/*
+	 * Now the complex case -- we need to proxify the calls. We redirect
+	 * calls from the dumpee vdso to the runtime vdso, making the dumpee
+	 * vdso operate as a proxy.
+	 */
+ pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
+
+ /*
+ * Don't forget to shift if vvar is before vdso.
+ */
+ if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
+ sym_rt->vvar_start < sym_rt->vma_start)
+ vdso_rt_parked_at += vvar_vma_size(sym_rt);
+
+ if (vdso_redirect_calls((void *)vdso_rt_parked_at,
+ (void *)vma_vdso->start,
+ sym_rt, &s)) {
+ pr_err("Failed to proxify dumpee contents\n");
+ return -1;
+ }
+
+	/*
+	 * Put a special mark into the runtime vdso so that the next
+	 * checkpoint routine can detect this vdso and skip dumping it,
+	 * since it is auto-generated anew each session whenever a proxy
+	 * is required.
+	 */
+ sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
+ vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
+ sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
+ return 0;
+}
diff --git a/arch/aarch64/vdso.c b/arch/aarch64/vdso.c
new file mode 100644
index 0000000..ac47a1e
--- /dev/null
+++ b/arch/aarch64/vdso.c
@@ -0,0 +1,303 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <elf.h>
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/types.h"
+#include "asm/parasite-syscall.h"
+
+#include "parasite-syscall.h"
+#include "parasite.h"
+#include "compiler.h"
+#include "kerndat.h"
+#include "vdso.h"
+#include "util.h"
+#include "log.h"
+#include "mem.h"
+#include "vma.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
+u64 vdso_pfn = VDSO_BAD_PFN;
+/*
+ * The VMAs list might have proxy vdso/vvar areas left over
+ * from a previous dump/restore cycle, so we need to detect
+ * them and eliminate them from the VMAs list; they will be
+ * generated again on restore if needed.
+ */
+int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+ struct vm_area_list *vma_area_list)
+{
+ unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
+ unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
+ struct vma_area *proxy_vdso_marked = NULL;
+ struct vma_area *proxy_vvar_marked = NULL;
+ struct parasite_vdso_vma_entry *args;
+ struct vma_area *vma;
+ int fd, ret = -1;
+ off_t off;
+ u64 pfn;
+
+ args = parasite_args(ctl, struct parasite_vdso_vma_entry);
+ fd = open_proc(pid, "pagemap");
+ if (fd < 0)
+ return -1;
+
+ list_for_each_entry(vma, &vma_area_list->h, list) {
+ if (!vma_area_is(vma, VMA_AREA_REGULAR))
+ continue;
+
+ if (vma_area_is(vma, VMA_FILE_SHARED) ||
+ vma_area_is(vma, VMA_FILE_PRIVATE))
+ continue;
+		/*
+		 * This might be a VVAR area belonging to a marked vDSO
+		 * zone; we need to test for it before the VDSO_PROT check
+		 * because VVAR_PROT is a subset of it, but we can't simply
+		 * continue on every match either.
+		 * Sigh... what a mess.
+		 */
+ BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
+
+ if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
+ if (proxy_vvar_addr != VVAR_BAD_ADDR &&
+ proxy_vvar_addr == vma->e->start) {
+ BUG_ON(proxy_vvar_marked);
+ proxy_vvar_marked = vma;
+ continue;
+ }
+ }
+
+ if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
+ continue;
+
+ if (vma->e->start > TASK_SIZE)
+ continue;
+
+ if (vma->e->flags & MAP_GROWSDOWN)
+ continue;
+
+		/*
+		 * We need to poke every potentially marked vma; otherwise,
+		 * if the task never called any vdso function, the page
+		 * frame number won't be reported.
+		 */
+ args->start = vma->e->start;
+ args->len = vma_area_len(vma);
+
+ if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
+ pr_err("vdso: Parasite failed to poke for mark\n");
+ ret = -1;
+ goto err;
+ }
+
+		/*
+		 * Defer handling the marked vdso until we have walked over
+		 * all vmas and can restore the status of potentially
+		 * remapped vDSO areas.
+		 */
+ if (unlikely(args->is_marked)) {
+ if (proxy_vdso_marked) {
+ pr_err("Ow! Second vdso mark detected!\n");
+ ret = -1;
+ goto err;
+ }
+ proxy_vdso_marked = vma;
+ proxy_vdso_addr = args->proxy_vdso_addr;
+ proxy_vvar_addr = args->proxy_vvar_addr;
+ continue;
+ }
+
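+		/*
+		 * /proc/<pid>/pagemap holds one u64 entry per virtual page;
+		 * read the entry covering the first page of this vma and
+		 * extract the page frame number from it.
+		 */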
+ off = (vma->e->start / PAGE_SIZE) * sizeof(u64);
+ ret = pread(fd, &pfn, sizeof(pfn), off);
+ if (ret < 0 || ret != sizeof(pfn)) {
+ pr_perror("Can't read pme for pid %d", pid);
+ ret = -1;
+ goto err;
+ }
+
+ pfn = PME_PFRAME(pfn);
+ if (!pfn) {
+			pr_err("Unexpected page frame number 0 for pid %d\n", pid);
+ ret = -1;
+ goto err;
+ }
+
+		/*
+		 * Set up the proper VMA status. Note that starting with
+		 * kernel 3.16 the [vdso]/[vvar] marks are reported
+		 * correctly even when the areas have been remapped to a
+		 * new place -- but only from that kernel version on!
+		 */
+ if (pfn == vdso_pfn) {
+ if (!vma_area_is(vma, VMA_AREA_VDSO)) {
+ pr_debug("vdso: Restore vDSO status by pfn at %lx\n",
+ (long)vma->e->start);
+ vma->e->status |= VMA_AREA_VDSO;
+ }
+ } else {
+ if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
+ pr_debug("vdso: Drop mishinted vDSO status at %lx\n",
+ (long)vma->e->start);
+ vma->e->status &= ~VMA_AREA_VDSO;
+ }
+ }
+ }
+
+	/*
+	 * If there is a marked vdso, it is auto-generated
+	 * and must be dropped from the vma list.
+	 */
+ if (proxy_vdso_marked) {
+ pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
+ (long)proxy_vdso_marked->e->start,
+ (long)proxy_vdso_addr, (long)proxy_vvar_addr);
+
+ /*
+ * Don't forget to restore the proxy vdso/vvar status, since
+ * it's unknown to the kernel.
+ */
+ list_for_each_entry(vma, &vma_area_list->h, list) {
+ if (vma->e->start == proxy_vdso_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
+ pr_debug("vdso: Restore proxy vDSO status at %lx\n",
+ (long)vma->e->start);
+ } else if (vma->e->start == proxy_vvar_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
+ pr_debug("vdso: Restore proxy VVAR status at %lx\n",
+ (long)vma->e->start);
+ }
+ }
+
+	pr_debug("vdso: Dropping marked vdso at %lx\n",
+ (long)proxy_vdso_marked->e->start);
+ list_del(&proxy_vdso_marked->list);
+ xfree(proxy_vdso_marked);
+ vma_area_list->nr--;
+
+ if (proxy_vvar_marked) {
+		pr_debug("vdso: Dropping marked vvar at %lx\n",
+ (long)proxy_vvar_marked->e->start);
+ list_del(&proxy_vvar_marked->list);
+ xfree(proxy_vvar_marked);
+ vma_area_list->nr--;
+ }
+ }
+ ret = 0;
+err:
+ close(fd);
+ return ret;
+}
+
+static int vdso_fill_self_symtable(struct vdso_symtable *s)
+{
+ char buf[512];
+ int ret = -1;
+ FILE *maps;
+
+ *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
+
+ maps = fopen("/proc/self/maps", "r");
+ if (!maps) {
+ pr_perror("Can't open self-vma");
+ return -1;
+ }
+
+ while (fgets(buf, sizeof(buf), maps)) {
+ unsigned long start, end;
+ char *has_vdso, *has_vvar;
+
+ has_vdso = strstr(buf, "[vdso]");
+ if (!has_vdso)
+ has_vvar = strstr(buf, "[vvar]");
+ else
+ has_vvar = NULL;
+
+ if (!has_vdso && !has_vvar)
+ continue;
+
+ ret = sscanf(buf, "%lx-%lx", &start, &end);
+ if (ret != 2) {
+ ret = -1;
+ pr_err("Can't find vDSO/VVAR bounds\n");
+ goto err;
+ }
+
+ if (has_vdso) {
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ pr_err("Got second vDSO entry\n");
+ ret = -1;
+ goto err;
+ }
+ s->vma_start = start;
+ s->vma_end = end;
+
+ ret = vdso_fill_symtable((void *)start, end - start, s);
+ if (ret)
+ goto err;
+ } else {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ pr_err("Got second VVAR entry\n");
+ ret = -1;
+ goto err;
+ }
+ s->vvar_start = start;
+ s->vvar_end = end;
+ }
+ }
+
+	/*
+	 * Validate the layout -- with the new vDSO format the
+	 * structure must look like
+	 *
+	 * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
+	 * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
+	 *
+	 * though the areas may appear in reverse order:
+	 *
+	 * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
+	 * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
+	 */
+ ret = 0;
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ if (s->vma_end != s->vvar_start &&
+ s->vvar_end != s->vma_start) {
+ ret = -1;
+ pr_err("Unexpected rt vDSO area bounds\n");
+ goto err;
+ }
+ }
+ } else {
+ ret = -1;
+ pr_err("Can't find rt vDSO\n");
+ goto err;
+ }
+
+ pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ s->vma_start, s->vma_end,
+ s->vvar_start, s->vvar_end);
+
+err:
+ fclose(maps);
+ return ret;
+}
+
+int vdso_init(void)
+{
+ if (vdso_fill_self_symtable(&vdso_sym_rt))
+ return -1;
+ return vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn);
+}
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
deleted file mode 100644
index 56761fa..0000000
--- a/arch/x86/include/asm/vdso.h
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef __CR_ASM_VDSO_H__
-#define __CR_ASM_VDSO_H__
-
-#include <sys/types.h>
-
-#include "asm/int.h"
-#include "protobuf/vma.pb-c.h"
-
-struct parasite_ctl;
-struct vm_area_list;
-
-#define VDSO_PROT (PROT_READ | PROT_EXEC)
-#define VVAR_PROT (PROT_READ)
-
-#define VDSO_BAD_ADDR (-1ul)
-#define VVAR_BAD_ADDR VDSO_BAD_ADDR
-#define VDSO_BAD_PFN (-1ull)
-#define VVAR_BAD_PFN VDSO_BAD_PFN
-
-struct vdso_symbol {
- char name[32];
- unsigned long offset;
-};
-
-#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
-
-/* Check whether a symtable slot is still empty */
-static inline bool vdso_symbol_empty(struct vdso_symbol *s)
-{
- return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
-}
-
-/*
- * This is the minimal set of symbols
- * we must support at the moment.
- */
-enum {
- VDSO_SYMBOL_CLOCK_GETTIME,
- VDSO_SYMBOL_GETCPU,
- VDSO_SYMBOL_GETTIMEOFDAY,
- VDSO_SYMBOL_TIME,
-
- VDSO_SYMBOL_MAX
-};
-
-struct vdso_symtable {
- unsigned long vma_start;
- unsigned long vma_end;
- unsigned long vvar_start;
- unsigned long vvar_end;
- struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
-};
-
-#define VDSO_SYMTABLE_INIT \
- { \
- .vma_start = VDSO_BAD_ADDR, \
- .vma_end = VDSO_BAD_ADDR, \
- .vvar_start = VVAR_BAD_ADDR, \
- .vvar_end = VVAR_BAD_ADDR, \
- .symbols = { \
- [0 ... VDSO_SYMBOL_MAX - 1] = \
- (struct vdso_symbol)VDSO_SYMBOL_INIT, \
- }, \
- }
-
-/* Size of VMA associated with vdso */
-static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
-{
- return t->vma_end - t->vma_start;
-}
-
-static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
-{
- return t->vvar_end - t->vvar_start;
-}
-/*
- * Special mark which allows us to identify the runtime vdso where
- * calls from the proxy vdso are redirected. This mark is usually
- * placed at the start of the vdso area, where the ELF header lives.
- * Since such a runtime vdso is solely used by the proxy and
- * nobody else is supposed to access it, it's more or less
- * safe to clobber the ELF header with @signature and
- * @proxy_vdso_addr.
- *
- * The @proxy_vdso_addr member deserves a few comments. When we redirect
- * the calls from the proxy to the runtime vdso, on the next checkpoint
- * it won't be possible to find which VMA is the proxy, thus
- * we save its address in this member.
- */
-struct vdso_mark {
- u64 signature;
- unsigned long proxy_vdso_addr;
-
- unsigned long version;
-
-	/*
-	 * With the new vDSO format, the VVAR area address is
-	 * needed to make it easier to discover where it lives
-	 * without relying on procfs output.
-	 */
- unsigned long proxy_vvar_addr;
-};
-
-#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
-#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
-#define VDSO_MARK_CUR_VERSION (2)
-
-static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
-{
- struct vdso_mark *m = where;
-
- m->signature = VDSO_MARK_SIGNATURE_V2;
- m->proxy_vdso_addr = proxy_vdso_addr;
- m->version = VDSO_MARK_CUR_VERSION;
- m->proxy_vvar_addr = proxy_vvar_addr;
-}
-
-static inline bool is_vdso_mark(void *addr)
-{
- struct vdso_mark *m = addr;
-
- if (m->signature == VDSO_MARK_SIGNATURE_V2) {
- /*
- * New format
- */
- return true;
- } else if (m->signature == VDSO_MARK_SIGNATURE) {
- /*
- * Old format -- simply extend the mark up
- * to the version we support.
- */
- vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
- return true;
- }
- return false;
-}
-
-#define VDSO_SYMBOL_CLOCK_GETTIME_NAME "__vdso_clock_gettime"
-#define VDSO_SYMBOL_GETCPU_NAME "__vdso_getcpu"
-#define VDSO_SYMBOL_GETTIMEOFDAY_NAME "__vdso_gettimeofday"
-#define VDSO_SYMBOL_TIME_NAME "__vdso_time"
-
-
-
-extern struct vdso_symtable vdso_sym_rt;
-extern u64 vdso_pfn;
-
-extern int vdso_init(void);
-extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
-extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
-extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
- unsigned long vdso_rt_parked_at, size_t index,
- VmaEntry *vmas, size_t nr_vmas);
-
-extern int vdso_redirect_calls(void *base_to, void *base_from, struct vdso_symtable *to, struct vdso_symtable *from);
-extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
- struct vm_area_list *vma_area_list);
-
-#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/x86/vdso-pie.c b/arch/x86/vdso-pie.c
deleted file mode 100644
index 0a55d71..0000000
--- a/arch/x86/vdso-pie.c
+++ /dev/null
@@ -1,441 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <elf.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm/string.h"
-#include "asm/types.h"
-
-#include "compiler.h"
-#include "syscall.h"
-#include "image.h"
-#include "vdso.h"
-#include "vma.h"
-#include "log.h"
-#include "bug.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-typedef struct {
- u16 movabs;
- u64 imm64;
- u16 jmp_rax;
- u32 guards;
-} __packed jmp_t;
-
-int vdso_redirect_calls(void *base_to, void *base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from)
-{
- jmp_t jmp = {
- .movabs = 0xb848,
- .jmp_rax = 0xe0ff,
- .guards = 0xcccccccc,
- };
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
- if (vdso_symbol_empty(&from->symbols[i]))
- continue;
-
- pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
- (unsigned long)base_from, from->symbols[i].offset,
- (unsigned long)base_to, to->symbols[i].offset, i);
-
- jmp.imm64 = (unsigned long)base_to + to->symbols[i].offset;
- builtin_memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp));
- }
-
- return 0;
-}
-
-
-/* Check if a pointer is out of bounds */
-static bool __ptr_oob(void *ptr, void *start, size_t size)
-{
- void *end = (void *)((unsigned long)start + size);
- return ptr > end || ptr < start;
-}
-
-/*
- * The classic SysV ELF hash function; DT_HASH tables are
- * built with it (see the ELF format specification).
- */
-static unsigned long elf_hash(const unsigned char *name)
-{
- unsigned long h = 0, g;
-
- while (*name) {
- h = (h << 4) + *name++;
- g = h & 0xf0000000ul;
- if (g)
- h ^= g >> 24;
- h &= ~g;
- }
- return h;
-}
-
-int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
-{
- Elf64_Phdr *dynamic = NULL, *load = NULL;
- Elf64_Ehdr *ehdr = (void *)mem;
- Elf64_Dyn *dyn_strtab = NULL;
- Elf64_Dyn *dyn_symtab = NULL;
- Elf64_Dyn *dyn_strsz = NULL;
- Elf64_Dyn *dyn_syment = NULL;
- Elf64_Dyn *dyn_hash = NULL;
- Elf64_Word *hash = NULL;
- Elf64_Phdr *phdr;
- Elf64_Dyn *d;
-
- Elf64_Word *bucket, *chain;
- Elf64_Word nbucket, nchain;
-
-	/*
-	 * See the ELF specification for these magic values.
-	 */
- const char elf_ident[] = {
- 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- };
-
- const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
- [VDSO_SYMBOL_CLOCK_GETTIME] = VDSO_SYMBOL_CLOCK_GETTIME_NAME,
- [VDSO_SYMBOL_GETCPU] = VDSO_SYMBOL_GETCPU_NAME,
- [VDSO_SYMBOL_GETTIMEOFDAY] = VDSO_SYMBOL_GETTIMEOFDAY_NAME,
- [VDSO_SYMBOL_TIME] = VDSO_SYMBOL_TIME_NAME,
- };
-
- char *dynsymbol_names;
- unsigned int i, j, k;
-
- BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
-
- pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
-
- /*
- * Make sure it's a file we support.
- */
- if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
- pr_err("Elf header magic mismatch\n");
- return -EINVAL;
- }
-
-	/*
-	 * We need PT_LOAD and PT_DYNAMIC here, each exactly once.
-	 */
- phdr = (void *)&mem[ehdr->e_phoff];
- for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
- if (__ptr_oob(phdr, mem, size))
- goto err_oob;
- switch (phdr->p_type) {
- case PT_DYNAMIC:
- if (dynamic) {
- pr_err("Second PT_DYNAMIC header\n");
- return -EINVAL;
- }
- dynamic = phdr;
- break;
- case PT_LOAD:
- if (load) {
- pr_err("Second PT_LOAD header\n");
- return -EINVAL;
- }
- load = phdr;
- break;
- }
- }
-
- if (!load || !dynamic) {
-		pr_err("A required program header is missing\n");
- return -EINVAL;
- }
-
- pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
-
-	/*
-	 * The dynamic section tags should provide the rest of the
-	 * information we need. Note that we're interested in only
-	 * a small set of tags.
-	 */
- d = (void *)&mem[dynamic->p_offset];
- for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
- if (__ptr_oob(d, mem, size))
- goto err_oob;
-
- if (d->d_tag == DT_NULL) {
- break;
- } else if (d->d_tag == DT_STRTAB) {
- dyn_strtab = d;
- pr_debug("DT_STRTAB: %p\n", (void *)d->d_un.d_ptr);
- } else if (d->d_tag == DT_SYMTAB) {
- dyn_symtab = d;
- pr_debug("DT_SYMTAB: %p\n", (void *)d->d_un.d_ptr);
- } else if (d->d_tag == DT_STRSZ) {
- dyn_strsz = d;
- pr_debug("DT_STRSZ: %lu\n", (unsigned long)d->d_un.d_val);
- } else if (d->d_tag == DT_SYMENT) {
- dyn_syment = d;
- pr_debug("DT_SYMENT: %lu\n", (unsigned long)d->d_un.d_val);
- } else if (d->d_tag == DT_HASH) {
- dyn_hash = d;
- pr_debug("DT_HASH: %p\n", (void *)d->d_un.d_ptr);
- }
- }
-
- if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
- pr_err("Not all dynamic entries are present\n");
- return -EINVAL;
- }
-
- dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
- if (__ptr_oob(dynsymbol_names, mem, size))
- goto err_oob;
-
- hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
- if (__ptr_oob(hash, mem, size))
- goto err_oob;
-
- nbucket = hash[0];
- nchain = hash[1];
- bucket = &hash[2];
- chain = &hash[nbucket + 2];
-
- pr_debug("nbucket %lu nchain %lu bucket %p chain %p\n",
- (long)nbucket, (long)nchain, bucket, chain);
-
- for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) {
- k = elf_hash((const unsigned char *)vdso_symbols[i]);
-
- for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
- Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
- char *name;
-
- sym = &sym[j];
- if (__ptr_oob(sym, mem, size))
- continue;
-
- if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
- ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
- continue;
-
- name = &dynsymbol_names[sym->st_name];
- if (__ptr_oob(name, mem, size))
- continue;
-
- if (builtin_strcmp(name, vdso_symbols[i]))
- continue;
-
- builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
- t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
- break;
- }
- }
-
- return 0;
-
-err_oob:
- pr_err("Corrupted Elf data\n");
- return -EFAULT;
-}
-
-static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
-{
- unsigned long addr;
-
- pr_debug("Remap %s %lx -> %lx\n", who, from, to);
-
- addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
- if (addr != to) {
- pr_err("Unable to remap %lx -> %lx %lx\n",
- from, to, addr);
- return -1;
- }
-
- return 0;
-}
-
-/* Park the runtime vDSO in a safe place where it is accessible from the restorer */
-int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
-{
- int ret;
-
- BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
-
- if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
- if (sym_rt->vma_start < sym_rt->vvar_start) {
- ret = vdso_remap("rt-vdso", sym_rt->vma_start,
- park_at, vdso_vma_size(sym_rt));
- park_at += vdso_vma_size(sym_rt);
- ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
- park_at, vvar_vma_size(sym_rt));
- } else {
- ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
- park_at, vvar_vma_size(sym_rt));
- park_at += vvar_vma_size(sym_rt);
- ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
- park_at, vdso_vma_size(sym_rt));
- }
- } else
- ret = vdso_remap("rt-vdso", sym_rt->vma_start,
- park_at, vdso_vma_size(sym_rt));
- return ret;
-}
-
-int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
- unsigned long vdso_rt_parked_at, size_t index,
- VmaEntry *vmas, size_t nr_vmas)
-{
- VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
- struct vdso_symtable s = VDSO_SYMTABLE_INIT;
- bool remap_rt = false;
-
-	/*
-	 * Figure out which kind of vdso tuple we've got.
-	 */
- if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
- vma_vdso = &vmas[index];
- else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
- vma_vvar = &vmas[index];
-
- if (index < (nr_vmas - 1)) {
- if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
- vma_vdso = &vmas[index + 1];
- else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
- vma_vvar = &vmas[index + 1];
- }
-
- if (!vma_vdso) {
- pr_err("Can't find vDSO area in image\n");
- return -1;
- }
-
-	/*
-	 * The vDSO mark overwrites an ELF program header of the proxy
-	 * vDSO, thus it must never be greater in size.
-	 */
- BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
-
-	/*
-	 * Find symbols in the vDSO zone read from the image.
-	 */
- if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
- return -1;
-
-	/*
-	 * Proxification strategy
-	 *
-	 * - There might be two vDSO zones: vdso code and, optionally, vvar data
-	 * - To be able to use in-place remapping we need:
-	 *
-	 *  a) the size and order of the vDSO zones to match
-	 *  b) the symbol offsets to match
-	 *  c) the same number of vDSO zones
-	 */
- if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
- size_t i;
-
- for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
- if (s.symbols[i].offset != sym_rt->symbols[i].offset)
- break;
- }
-
- if (i == ARRAY_SIZE(s.symbols)) {
- if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
- remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
- if (remap_rt) {
- long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
- long delta_this = vma_vvar->start - vma_vdso->start;
-
- remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
- }
- }
- }
- }
-
- pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
- vma_vdso->start, vma_vdso->end,
- vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
- vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
-
-	/*
-	 * Easy case -- the vdso from the image has the same offsets, order
-	 * and size as the runtime one, so we simply remap the runtime vdso
-	 * to the dumpee's position without generating any proxy.
-	 *
-	 * Note we may remap the VVAR area as well, which might not yet have
-	 * been mapped by the caller's code. So drop VMA_AREA_REGULAR from it
-	 * and the caller won't touch it anymore.
-	 */
- if (remap_rt) {
- int ret = 0;
-
- pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
-
- if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
- pr_err("Failed to unmap %s\n", who);
- return -1;
- }
-
- if (vma_vvar) {
- if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
- pr_err("Failed to unmap %s\n", who);
- return -1;
- }
- }
-
- if (vma_vvar) {
- if (vma_vdso->start < vma_vvar->start) {
- ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
- vdso_rt_parked_at += vdso_vma_size(sym_rt);
- ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
- } else {
- ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
- vdso_rt_parked_at += vvar_vma_size(sym_rt);
- ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
- }
- } else
- ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
-
- return ret;
- }
-
-	/*
-	 * Now the complex case -- we need to proxify the calls. We redirect
-	 * calls from the dumpee vdso to the runtime vdso, making the dumpee
-	 * vdso operate as a proxy.
-	 */
- pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
-
- /*
- * Don't forget to shift if vvar is before vdso.
- */
- if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
- sym_rt->vvar_start < sym_rt->vma_start)
- vdso_rt_parked_at += vvar_vma_size(sym_rt);
-
- if (vdso_redirect_calls((void *)vdso_rt_parked_at,
- (void *)vma_vdso->start,
- sym_rt, &s)) {
- pr_err("Failed to proxify dumpee contents\n");
- return -1;
- }
-
-	/*
-	 * Put a special mark into the runtime vdso so that the next
-	 * checkpoint routine can detect this vdso and skip dumping it,
-	 * since it is auto-generated anew each session whenever a proxy
-	 * is required.
-	 */
- sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
- vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
- sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
- return 0;
-}
diff --git a/arch/x86/vdso.c b/arch/x86/vdso.c
deleted file mode 100644
index ac47a1e..0000000
--- a/arch/x86/vdso.c
+++ /dev/null
@@ -1,303 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <elf.h>
-#include <fcntl.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm/types.h"
-#include "asm/parasite-syscall.h"
-
-#include "parasite-syscall.h"
-#include "parasite.h"
-#include "compiler.h"
-#include "kerndat.h"
-#include "vdso.h"
-#include "util.h"
-#include "log.h"
-#include "mem.h"
-#include "vma.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
-u64 vdso_pfn = VDSO_BAD_PFN;
-/*
- * The VMAs list might have proxy vdso/vvar areas left over
- * from a previous dump/restore cycle, so we need to detect
- * them and eliminate them from the VMAs list; they will be
- * generated again on restore if needed.
- */
-int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
- struct vm_area_list *vma_area_list)
-{
- unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
- unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
- struct vma_area *proxy_vdso_marked = NULL;
- struct vma_area *proxy_vvar_marked = NULL;
- struct parasite_vdso_vma_entry *args;
- struct vma_area *vma;
- int fd, ret = -1;
- off_t off;
- u64 pfn;
-
- args = parasite_args(ctl, struct parasite_vdso_vma_entry);
- fd = open_proc(pid, "pagemap");
- if (fd < 0)
- return -1;
-
- list_for_each_entry(vma, &vma_area_list->h, list) {
- if (!vma_area_is(vma, VMA_AREA_REGULAR))
- continue;
-
- if (vma_area_is(vma, VMA_FILE_SHARED) ||
- vma_area_is(vma, VMA_FILE_PRIVATE))
- continue;
-		/*
-		 * This might be a VVAR area belonging to a marked vDSO
-		 * zone; we need to test for it before the VDSO_PROT check
-		 * because VVAR_PROT is a subset of it, but we can't simply
-		 * continue on every match either.
-		 * Sigh... what a mess.
-		 */
- BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
-
- if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
- if (proxy_vvar_addr != VVAR_BAD_ADDR &&
- proxy_vvar_addr == vma->e->start) {
- BUG_ON(proxy_vvar_marked);
- proxy_vvar_marked = vma;
- continue;
- }
- }
-
- if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
- continue;
-
- if (vma->e->start > TASK_SIZE)
- continue;
-
- if (vma->e->flags & MAP_GROWSDOWN)
- continue;
-
-		/*
-		 * We need to poke every potentially marked vma; otherwise,
-		 * if the task never called any vdso function, the page
-		 * frame number won't be reported.
-		 */
- args->start = vma->e->start;
- args->len = vma_area_len(vma);
-
- if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
- pr_err("vdso: Parasite failed to poke for mark\n");
- ret = -1;
- goto err;
- }
-
-		/*
-		 * Defer handling the marked vdso until we have walked over
-		 * all vmas and can restore the status of potentially
-		 * remapped vDSO areas.
-		 */
- if (unlikely(args->is_marked)) {
- if (proxy_vdso_marked) {
- pr_err("Ow! Second vdso mark detected!\n");
- ret = -1;
- goto err;
- }
- proxy_vdso_marked = vma;
- proxy_vdso_addr = args->proxy_vdso_addr;
- proxy_vvar_addr = args->proxy_vvar_addr;
- continue;
- }
-
- off = (vma->e->start / PAGE_SIZE) * sizeof(u64);
- ret = pread(fd, &pfn, sizeof(pfn), off);
- if (ret < 0 || ret != sizeof(pfn)) {
- pr_perror("Can't read pme for pid %d", pid);
- ret = -1;
- goto err;
- }
-
- pfn = PME_PFRAME(pfn);
- if (!pfn) {
-			pr_err("Unexpected page frame number 0 for pid %d\n", pid);
- ret = -1;
- goto err;
- }
-
-		/*
-		 * Set up the proper VMA status. Note that starting with
-		 * kernel 3.16 the [vdso]/[vvar] marks are reported
-		 * correctly even when the areas have been remapped to a
-		 * new place -- but only from that kernel version on!
-		 */
- if (pfn == vdso_pfn) {
- if (!vma_area_is(vma, VMA_AREA_VDSO)) {
- pr_debug("vdso: Restore vDSO status by pfn at %lx\n",
- (long)vma->e->start);
- vma->e->status |= VMA_AREA_VDSO;
- }
- } else {
- if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
- pr_debug("vdso: Drop mishinted vDSO status at %lx\n",
- (long)vma->e->start);
- vma->e->status &= ~VMA_AREA_VDSO;
- }
- }
- }
-
-	/*
-	 * If there is a marked vdso, it is auto-generated
-	 * and must be dropped from the vma list.
-	 */
- if (proxy_vdso_marked) {
- pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
- (long)proxy_vdso_marked->e->start,
- (long)proxy_vdso_addr, (long)proxy_vvar_addr);
-
- /*
- * Don't forget to restore the proxy vdso/vvar status, since
- * it's unknown to the kernel.
- */
- list_for_each_entry(vma, &vma_area_list->h, list) {
- if (vma->e->start == proxy_vdso_addr) {
- vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
- pr_debug("vdso: Restore proxy vDSO status at %lx\n",
- (long)vma->e->start);
- } else if (vma->e->start == proxy_vvar_addr) {
- vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
- pr_debug("vdso: Restore proxy VVAR status at %lx\n",
- (long)vma->e->start);
- }
- }
-
-	pr_debug("vdso: Dropping marked vdso at %lx\n",
- (long)proxy_vdso_marked->e->start);
- list_del(&proxy_vdso_marked->list);
- xfree(proxy_vdso_marked);
- vma_area_list->nr--;
-
- if (proxy_vvar_marked) {
-		pr_debug("vdso: Dropping marked vvar at %lx\n",
- (long)proxy_vvar_marked->e->start);
- list_del(&proxy_vvar_marked->list);
- xfree(proxy_vvar_marked);
- vma_area_list->nr--;
- }
- }
- ret = 0;
-err:
- close(fd);
- return ret;
-}
-
-static int vdso_fill_self_symtable(struct vdso_symtable *s)
-{
- char buf[512];
- int ret = -1;
- FILE *maps;
-
- *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
-
- maps = fopen("/proc/self/maps", "r");
- if (!maps) {
- pr_perror("Can't open self-vma");
- return -1;
- }
-
- while (fgets(buf, sizeof(buf), maps)) {
- unsigned long start, end;
- char *has_vdso, *has_vvar;
-
- has_vdso = strstr(buf, "[vdso]");
- if (!has_vdso)
- has_vvar = strstr(buf, "[vvar]");
- else
- has_vvar = NULL;
-
- if (!has_vdso && !has_vvar)
- continue;
-
- ret = sscanf(buf, "%lx-%lx", &start, &end);
- if (ret != 2) {
- ret = -1;
- pr_err("Can't find vDSO/VVAR bounds\n");
- goto err;
- }
-
- if (has_vdso) {
- if (s->vma_start != VDSO_BAD_ADDR) {
- pr_err("Got second vDSO entry\n");
- ret = -1;
- goto err;
- }
- s->vma_start = start;
- s->vma_end = end;
-
- ret = vdso_fill_symtable((void *)start, end - start, s);
- if (ret)
- goto err;
- } else {
- if (s->vvar_start != VVAR_BAD_ADDR) {
- pr_err("Got second VVAR entry\n");
- ret = -1;
- goto err;
- }
- s->vvar_start = start;
- s->vvar_end = end;
- }
- }
-
-	/*
-	 * Validate the layout -- with the new vDSO format the
-	 * structure must look like
-	 *
-	 * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
-	 * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
-	 *
-	 * though the areas may appear in reverse order:
-	 *
-	 * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
-	 * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
-	 */
- ret = 0;
- if (s->vma_start != VDSO_BAD_ADDR) {
- if (s->vvar_start != VVAR_BAD_ADDR) {
- if (s->vma_end != s->vvar_start &&
- s->vvar_end != s->vma_start) {
- ret = -1;
- pr_err("Unexpected rt vDSO area bounds\n");
- goto err;
- }
- }
- } else {
- ret = -1;
- pr_err("Can't find rt vDSO\n");
- goto err;
- }
-
- pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
- s->vma_start, s->vma_end,
- s->vvar_start, s->vvar_end);
-
-err:
- fclose(maps);
- return ret;
-}
-
-int vdso_init(void)
-{
- if (vdso_fill_self_symtable(&vdso_sym_rt))
- return -1;
- return vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn);
-}
--
Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by the Linux Foundation.