[CRIU] [PATCH] arch/ppc64: Add PowerPC 64 LE support
Laurent Dufour
ldufour at linux.vnet.ibm.com
Wed Apr 29 08:47:15 PDT 2015
This patch initiates the ppc64le architecture support in CRIU.
Note that ppc64 (Big Endian) architecture is not yet supported since there
are still several issues to address with this architecture. However, in the
long term, the two architectures should be addressed using almost the
same code, so sharing the ppc64 directory.
Major ppc64 issues:
Loader is not involved when the parasite code is loaded. So no relocation
is done for the parasite code. As a consequence r2 must be set manually
when entering the parasite code, and GOT is not filled.
Furthermore, the r2 fixup code at the service's global entry address, which
has not been fixed up by the loader, should not be run. Branching to the
local entry address, as the assembly code does, jumps over it.
In the long term, relocation should be done when loading the parasite code.
We are introducing 2 trampolines for the 2 entry points of the restorer
blob. These entry points are dealing with r2. These ppc64 specific entry
points are overwriting the standard one in sigreturn_restore() from
cr-restore.c. Instead of using #ifdef, we may introduce a per arch wrapper
here.
CRIU needs 2 kernel patches to run on powerpc which are not yet upstream:
- Tracking the vDSO remapping
- Enabling the kcmp system call on powerpc
Feature not yet supported:
- Altivec registers C/R
- VSX registers C/R
- TM support
- a lot of things I missed...
Signed-off-by: Laurent Dufour <ldufour at linux.vnet.ibm.com>
---
Makefile | 18 +-
arch/ppc64/Makefile | 55 +++
arch/ppc64/cpu.c | 45 +++
arch/ppc64/crtools.c | 293 +++++++++++++++
arch/ppc64/include/asm/atomic.h | 112 ++++++
arch/ppc64/include/asm/bitops.h | 11 +
arch/ppc64/include/asm/bitsperlong.h | 6 +
arch/ppc64/include/asm/cmpxchg.h | 96 +++++
arch/ppc64/include/asm/cpu.h | 1 +
arch/ppc64/include/asm/dump.h | 11 +
arch/ppc64/include/asm/fpu.h | 4 +
arch/ppc64/include/asm/int.h | 6 +
arch/ppc64/include/asm/linkage.h | 20 +
arch/ppc64/include/asm/page.h | 23 ++
arch/ppc64/include/asm/parasite-syscall.h | 17 +
arch/ppc64/include/asm/parasite.h | 7 +
arch/ppc64/include/asm/prlimit.h | 14 +
arch/ppc64/include/asm/processor-flags.h | 4 +
arch/ppc64/include/asm/restore.h | 33 ++
arch/ppc64/include/asm/restorer.h | 136 +++++++
arch/ppc64/include/asm/string.h | 11 +
arch/ppc64/include/asm/types.h | 111 ++++++
arch/ppc64/include/asm/vdso.h | 172 +++++++++
arch/ppc64/parasite-head.S | 44 +++
arch/ppc64/restorer-trampoline.S | 33 ++
arch/ppc64/restorer.c | 14 +
arch/ppc64/syscall-common-ppc64.S | 32 ++
arch/ppc64/syscall-ppc64.def | 99 +++++
arch/ppc64/syscalls-ppc64.sh | 54 +++
arch/ppc64/vdso-pie.c | 594 ++++++++++++++++++++++++++++++
arch/ppc64/vdso-trampoline.S | 11 +
arch/ppc64/vdso.c | 309 ++++++++++++++++
cr-restore.c | 5 +
include/image.h | 4 +
pie/Makefile | 6 +
pie/pie.lds.S.in | 2 +
protobuf/Makefile | 1 +
protobuf/core-ppc64.proto | 23 ++
protobuf/core.proto | 3 +
39 files changed, 2439 insertions(+), 1 deletion(-)
diff --git a/Makefile b/Makefile
index ed2a1992efd5..208557e33b21 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,6 @@ ARCH ?= $(shell uname -m | sed \
-e s/sun4u/sparc64/ \
-e s/s390x/s390/ \
-e s/parisc64/parisc/ \
- -e s/ppc.*/powerpc/ \
-e s/mips.*/mips/ \
-e s/sh[234].*/sh/)
@@ -85,6 +84,20 @@ ifeq ($(SRCARCH),arm)
export PROTOUFIX
endif
+#
+# The PowerPC 64 bits architecture could be big or little endian.
+# They are handled in the same way.
+#
+ifeq ($(shell echo $(ARCH) | sed -e 's/ppc64.*/ppc64/'),ppc64)
+ ifeq ($(ARCH),ppc64)
+ error := $(error ppc64 big endian not yet supported)
+ endif
+ SRCARCH := ppc64
+ DEFINES := -DCONFIG_PPC64
+ LDARCH := powerpc:common64
+ VDSO := y
+endif
+
SRCARCH ?= $(ARCH)
LDARCH ?= $(SRCARCH)
@@ -193,6 +206,9 @@ PROGRAM-BUILTINS += $(ARCH_DIR)/vdso-pie.o
ifeq ($(SRCARCH),aarch64)
PROGRAM-BUILTINS += $(ARCH_DIR)/intraprocedure.o
endif
+ifeq ($(SRCARCH),ppc64)
+PROGRAM-BUILTINS += $(ARCH_DIR)/vdso-trampoline.o
+endif
endif
PROGRAM-BUILTINS += pie/util-fd.o
diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile
new file mode 100644
index 000000000000..c5d332364aa2
--- /dev/null
+++ b/arch/ppc64/Makefile
@@ -0,0 +1,55 @@
+targets += syscalls
+targets += crtools
+
+SYS-ASM := syscalls.S
+
+syscalls-asm-y += $(SYS-ASM:.S=).o
+crtools-obj-y += crtools.o
+crtools-obj-y += cpu.o
+
+SYS-DEF := syscall-ppc64.def
+SYS-ASM-COMMON := syscall-common-ppc64.S
+
+SYS-TYPES := include/syscall-types.h
+SYS-CODES := include/syscall-codes.h
+SYS-PROTO := include/syscall.h
+
+SYS-GEN := syscalls-ppc64.sh
+
+SYS-EXEC-TBL := sys-exec-tbl.c
+
+syscalls-asm-y-asmflags := -fpie -Wstrict-prototypes -Wa,--noexecstack
+syscalls-asm-y-asmflags += -nostdlib -fomit-frame-pointer -I$(obj)
+
+ASMFLAGS += -D__ASSEMBLY__
+
+$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF) $(obj)/$(SYS-ASM-COMMON) $(SYS-TYPES)
+ $(E) " GEN " $@
+ $(Q) $(SH) \
+ $(obj)/$(SYS-GEN) --asm \
+ $(obj)/$(SYS-DEF) \
+ $(SYS-CODES) \
+ $(SYS-PROTO) \
+ $(obj)/$(SYS-ASM) \
+ $(SYS-ASM-COMMON) \
+ $(SYS-TYPES)
+
+$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
+
+$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) $(SH) \
+ $(obj)/$(SYS-GEN) --exec \
+ $(obj)/$(SYS-DEF) \
+ $(obj)/$(SYS-EXEC-TBL)
+
+_all += $(obj)/$(SYS-EXEC-TBL)
+
+cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
+cleanup-y += $(SYS-CODES)
+cleanup-y += $(SYS-PROTO)
+
+ifneq ($(MAKECMDGOALS),clean)
+deps-after := $(obj)/$(SYS-ASM)
+incdeps := y
+endif
diff --git a/arch/ppc64/cpu.c b/arch/ppc64/cpu.c
new file mode 100644
index 000000000000..040fe14fcfb7
--- /dev/null
+++ b/arch/ppc64/cpu.c
@@ -0,0 +1,45 @@
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include <errno.h>
+#include "cpu.h"
+
+bool cpu_has_feature(unsigned int feature)
+{
+ return false;
+}
+
+int cpu_init(void)
+{
+ return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+ return 0;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+ return 0;
+}
+
+int cpu_dump_cpuinfo_single(void)
+{
+ return -ENOTSUP;
+}
+
+int cpu_validate_image_cpuinfo_single(void)
+{
+ return -ENOTSUP;
+}
+
+int cpuinfo_dump(void)
+{
+ return -ENOTSUP;
+}
+
+int cpuinfo_check(void)
+{
+ return -ENOTSUP;
+}
diff --git a/arch/ppc64/crtools.c b/arch/ppc64/crtools.c
new file mode 100644
index 000000000000..31cef5d222d8
--- /dev/null
+++ b/arch/ppc64/crtools.c
@@ -0,0 +1,293 @@
+#include <string.h>
+#include <unistd.h>
+#include <elf.h>
+#include <sys/user.h>
+
+#include "asm/types.h"
+#include "asm/fpu.h"
+#include "asm/restorer.h"
+
+#include "cr_options.h"
+#include "compiler.h"
+#include "ptrace.h"
+#include "parasite-syscall.h"
+#include "syscall.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "errno.h"
+
+#include "protobuf.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+
+/*
+ * Injected syscall instruction
+ */
+const u32 code_syscall[] = {
+ 0x44000002, /* sc */
+ 0x0fe00000 /* twi 31,0,0 */
+};
+
+const int code_syscall_size = sizeof(code_syscall);
+
+static inline void __check_code_syscall(void)
+{
+ BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ regs->nip = new_ip;
+ if (stack)
+ regs->gpr[1] = (unsigned long) stack;
+ regs->trap = 0;
+}
+
+bool arch_can_dump_task(pid_t pid)
+{
+ /*
+ * TODO: We should detect 32bit task when BE support is done.
+ */
+ return true;
+}
+
+int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+
+ regs.gpr[0] = (unsigned long)nr;
+ regs.gpr[3] = arg1;
+ regs.gpr[4] = arg2;
+ regs.gpr[5] = arg3;
+ regs.gpr[6] = arg4;
+ regs.gpr[7] = arg5;
+ regs.gpr[8] = arg6;
+
+ err = __parasite_execute_syscall(ctl, &regs);
+
+ *ret = regs.gpr[3];
+ return err;
+}
+
+/* This is the layout of the POWER7 VSX registers and the way they
+ * overlap with the existing FPR and VMX registers.
+ *
+ * VSR doubleword 0 VSR doubleword 1
+ * ----------------------------------------------------------------
+ * VSR[0] | FPR[0] | |
+ * ----------------------------------------------------------------
+ * VSR[1] | FPR[1] | |
+ * ----------------------------------------------------------------
+ * | ... | |
+ * ----------------------------------------------------------------
+ * VSR[30] | FPR[30] | |
+ * ----------------------------------------------------------------
+ * VSR[31] | FPR[31] | |
+ * ----------------------------------------------------------------
+ * VSR[32] | VR[0] |
+ * ----------------------------------------------------------------
+ * VSR[33] | VR[1] |
+ * ----------------------------------------------------------------
+ * | ... |
+ * ----------------------------------------------------------------
+ * VSR[62] | VR[30] |
+ * ----------------------------------------------------------------
+ * VSR[63] | VR[31] |
+ * ----------------------------------------------------------------
+ *
+ * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR
+ * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE
+ * PTRACE_GETVSRREGS returns VSR[0..31]
+ *
+ * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need
+ * to save FPSCR too.
+ */
+static int get_fpu_regs(pid_t pid, CoreEntry *core)
+{
+ elf_fpregset_t fpregs;
+ UserPpc64FpstateEntry *fpe;
+ int i;
+
+ if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fpregs) < 0) {
+ pr_err("Couldn't get floating-point registers.");
+ return -1;
+ }
+
+ fpe = xmalloc(sizeof(UserPpc64FpstateEntry));
+ if (!fpe)
+ return -1;
+ user_ppc64_fpstate_entry__init(fpe);
+
+ fpe->n_fpregs = NFPREG;
+ fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0]));
+ if (!fpe->fpregs) {
+ xfree(fpe);
+ return -1;
+ }
+
+ /* FPSCR is the last (33rd) register in the set */
+ for (i=0; i<NFPREG; i++)
+ fpe->fpregs[i] = fpregs[i];
+
+ core->ti_ppc64->fpstate = fpe;
+ return 0;
+}
+
+static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe)
+{
+ int i;
+
+ for (i=0; i<fpe->n_fpregs; i++)
+ mc->fp_regs[i] = (double)(fpe->fpregs[i]);
+}
+
+int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
+{
+ int i;
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ /*
+ * This is inspired by kernel function check_syscall_restart in
+ * arch/powerpc/kernel/signal.c
+ */
+#ifndef TRAP
+#define TRAP(r) ((r).trap & ~0xF)
+#endif
+
+ if (TRAP(regs) == 0x0C00 && regs.ccr & 0x10000000) {
+ /* Restart the system call */
+ switch (regs.gpr[3]) {
+ case ERESTARTNOHAND:
+ case ERESTARTSYS:
+ case ERESTARTNOINTR:
+ regs.gpr[3] = regs.orig_gpr3;
+ regs.nip -= 4;
+ break;
+ case ERESTART_RESTARTBLOCK:
+ regs.gpr[0] = __NR_restart_syscall;
+ regs.nip -= 4;
+ break;
+ }
+ }
+
+ /* Resetting trap since we are now coming from user space. */
+ regs.trap = 0;
+
+#define assign_reg(dst, src, e) do { \
+ dst->e = (__typeof__(dst->e))src.e; \
+} while (0)
+
+ for (i=0; i<32; i++)
+ assign_reg(core->ti_ppc64->gpregs, regs, gpr[i]);
+
+ assign_reg(core->ti_ppc64->gpregs, regs, nip);
+ assign_reg(core->ti_ppc64->gpregs, regs, msr);
+ assign_reg(core->ti_ppc64->gpregs, regs, orig_gpr3);
+ assign_reg(core->ti_ppc64->gpregs, regs, ctr);
+ assign_reg(core->ti_ppc64->gpregs, regs, link);
+ assign_reg(core->ti_ppc64->gpregs, regs, xer);
+ assign_reg(core->ti_ppc64->gpregs, regs, ccr);
+ assign_reg(core->ti_ppc64->gpregs, regs, trap);
+#undef assign_reg
+
+ if (get_fpu_regs(pid, core))
+ return -1;
+
+ return 0;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+ ThreadInfoPpc64 *ti_ppc64;
+ UserPpc64RegsEntry *regs;
+
+ ti_ppc64 = xmalloc(sizeof(*ti_ppc64));
+ if(!ti_ppc64)
+ goto err;
+ thread_info_ppc64__init(ti_ppc64);
+ CORE_THREAD_ARCH_INFO(core) = ti_ppc64;
+
+ /* user_ppc64_regs_entry */
+ regs = xmalloc(sizeof(*regs));
+ if (!regs)
+ goto err;
+ user_ppc64_regs_entry__init(regs);
+
+ regs->gpr = xmalloc(32*sizeof(uint64_t));
+ if (!regs->gpr)
+ goto err;
+ regs->n_gpr = 32;
+
+ ti_ppc64->gpregs = regs;
+
+ return 0;
+err:
+ return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+ if (CORE_THREAD_ARCH_INFO(core)) {
+ if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
+ xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs);
+ xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
+ }
+ xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr);
+ xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+ xfree(CORE_THREAD_ARCH_INFO(core));
+ CORE_THREAD_ARCH_INFO(core) = NULL;
+ }
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+ if (CORE_THREAD_ARCH_INFO(core)->fpstate)
+ put_fpu_regs(&sigframe->uc.uc_mcontext,
+ CORE_THREAD_ARCH_INFO(core)->fpstate);
+ return 0;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r)
+{
+ int i;
+
+ /* r0 to r31 */
+ for (i=0; i<32; i++)
+ f->uc.uc_mcontext.gp_regs[i] = r->gpr[i];
+
+ f->uc.uc_mcontext.gp_regs[PT_NIP] = r->nip;
+ f->uc.uc_mcontext.gp_regs[PT_MSR] = r->msr;
+ f->uc.uc_mcontext.gp_regs[PT_ORIG_R3] = r->orig_gpr3;
+ f->uc.uc_mcontext.gp_regs[PT_CTR] = r->ctr;
+ f->uc.uc_mcontext.gp_regs[PT_LNK] = r->link;
+ f->uc.uc_mcontext.gp_regs[PT_XER] = r->xer;
+ f->uc.uc_mcontext.gp_regs[PT_CCR] = r->ccr;
+ f->uc.uc_mcontext.gp_regs[PT_TRAP] = r->trap;
+
+ return 0;
+}
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map = 0;
+ int err;
+
+ err = syscall_seized(ctl, __NR_mmap, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0 || (long)map < 0)
+ map = 0;
+
+ return (void *)map;
+}
diff --git a/arch/ppc64/include/asm/atomic.h b/arch/ppc64/include/asm/atomic.h
new file mode 100644
index 000000000000..4fa33b1c7005
--- /dev/null
+++ b/arch/ppc64/include/asm/atomic.h
@@ -0,0 +1,112 @@
+#ifndef __CR_ATOMIC_H__
+#define __CR_ATOMIC_H__
+
+/*
+ * PowerPC atomic operations
+ *
+ * Copied from kernel header file arch/powerpc/include/asm/atomic.h
+ */
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+#include "asm/cmpxchg.h"
+
+#define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n"
+#define PPC_ATOMIC_EXIT_BARRIER "sync \n"
+
+#define ATOMIC_INIT(i) { (i) }
+
+static __inline__ int atomic_read(const atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter));
+
+ return t;
+}
+
+static __inline__ void atomic_set(atomic_t *v, int i)
+{
+ __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
+}
+
+#define ATOMIC_OP(op, asm_op) \
+static __inline__ void atomic_##op(int a, atomic_t *v) \
+{ \
+ int t; \
+ \
+ __asm__ __volatile__( \
+"1: lwarx %0,0,%3 # atomic_" #op "\n" \
+ #asm_op " %0,%2,%0\n" \
+" stwcx. %0,0,%3 \n" \
+" bne- 1b\n" \
+ : "=&r" (t), "+m" (v->counter) \
+ : "r" (a), "r" (&v->counter) \
+ : "cc"); \
+} \
+
+ATOMIC_OP(add, add)
+ATOMIC_OP(sub, subf)
+
+#undef ATOMIC_OP
+
+static __inline__ void atomic_inc(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+"1: lwarx %0,0,%2 # atomic_inc\n\
+ addic %0,%0,1\n"
+" stwcx. %0,0,%2 \n\
+ bne- 1b"
+ : "=&r" (t), "+m" (v->counter)
+ : "r" (&v->counter)
+ : "cc", "xer");
+}
+
+static __inline__ int atomic_inc_return(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+ PPC_ATOMIC_ENTRY_BARRIER \
+"1: lwarx %0,0,%1 # atomic_inc_return\n\
+ addic %0,%0,1\n"
+" stwcx. %0,0,%1 \n\
+ bne- 1b \n" \
+ PPC_ATOMIC_EXIT_BARRIER
+ : "=&r" (t)
+ : "r" (&v->counter)
+ : "cc", "xer", "memory");
+
+ return t;
+}
+
+/*
+ * atomic_inc_and_test - increment and test
+ * @v: pointer of type atomic_t
+ *
+ * Atomically increments @v by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+
+static __inline__ void atomic_dec(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+"1: lwarx %0,0,%2 # atomic_dec\n\
+ addic %0,%0,-1\n"
+" stwcx. %0,0,%2\n\
+ bne- 1b"
+ : "=&r" (t), "+m" (v->counter)
+ : "r" (&v->counter)
+ : "cc", "xer");
+}
+
+#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+
+#endif /* __CR_ATOMIC_H__ */
diff --git a/arch/ppc64/include/asm/bitops.h b/arch/ppc64/include/asm/bitops.h
new file mode 100644
index 000000000000..f310c5284a2f
--- /dev/null
+++ b/arch/ppc64/include/asm/bitops.h
@@ -0,0 +1,11 @@
+#ifndef __CR_BITOPS_H__
+#define __CR_BITOPS_H__
+
+#include "compiler.h"
+/*
+ * TODO: create some optimized version instead of falling back to the
+ * generic ones.
+ */
+#include "asm-generic/bitops.h"
+
+#endif /* __CR_BITOPS_H__ */
diff --git a/arch/ppc64/include/asm/bitsperlong.h b/arch/ppc64/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..d95727d193e8
--- /dev/null
+++ b/arch/ppc64/include/asm/bitsperlong.h
@@ -0,0 +1,6 @@
+#ifndef __CR_BITSPERLONG_H__
+#define __CR_BITSPERLONG_H__
+
+#define BITS_PER_LONG 64
+
+#endif /* __CR_BITSPERLONG_H__ */
diff --git a/arch/ppc64/include/asm/cmpxchg.h b/arch/ppc64/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..b93fbdef06c7
--- /dev/null
+++ b/arch/ppc64/include/asm/cmpxchg.h
@@ -0,0 +1,96 @@
+#ifndef __CR_CMPXCHG_H__
+#define __CR_CMPXCHG_H__
+
+/*
+ * Copied from kernel header file arch/powerpc/include/asm/cmpxchg.h
+ */
+
+#define PPC_ACQUIRE_BARRIER "isync \n"
+#define PPC_RELEASE_BARRIER "lwsync \n"
+
+/*
+ * Compare and exchange - if *p == old, set it to new,
+ * and return the old value of *p.
+ */
+
+static __always_inline unsigned long
+__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
+{
+ unsigned int prev;
+
+ __asm__ __volatile__ (
+ PPC_RELEASE_BARRIER \
+"1: lwarx %0,0,%2 # __cmpxchg_u32\n\
+ cmpw 0,%0,%3\n\
+ bne- 2f\n"
+" stwcx. %4,0,%2\n\
+ bne- 1b \n" \
+ PPC_ACQUIRE_BARRIER
+ "\n\
+2:"
+ : "=&r" (prev), "+m" (*p)
+ : "r" (p), "r" (old), "r" (new)
+ : "cc", "memory");
+
+ return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
+{
+ unsigned long prev;
+
+ __asm__ __volatile__ (
+ PPC_RELEASE_BARRIER \
+"1: ldarx %0,0,%2 # __cmpxchg_u64\n\
+ cmpd 0,%0,%3\n\
+ bne- 2f\n\
+ stdcx. %4,0,%2\n\
+ bne- 1b \n" \
+ PPC_ACQUIRE_BARRIER
+ "\n\
+2:"
+ : "=&r" (prev), "+m" (*p)
+ : "r" (p), "r" (old), "r" (new)
+ : "cc", "memory");
+
+ return prev;
+}
+
+/* This function doesn't exist, so you'll get a linker error
+ if something tries to do an invalid cmpxchg(). */
+#ifdef CR_DEBUG
+static inline void __cmpxchg_called_with_bad_pointer(void)
+{
+ __asm__ __volatile__ (
+ "1: twi 31,0,0 # trap\n"
+ " b 1b"
+ : : : "memory");
+}
+#else
+extern void __cmpxchg_called_with_bad_pointer(void);
+#endif
+
+static __always_inline unsigned long
+__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+ unsigned int size)
+{
+ switch (size) {
+ case 4:
+ return __cmpxchg_u32(ptr, old, new);
+ case 8:
+ return __cmpxchg_u64(ptr, old, new);
+ }
+ __cmpxchg_called_with_bad_pointer();
+ return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+ ({ \
+ __typeof__(*(ptr)) _o_ = (o); \
+ __typeof__(*(ptr)) _n_ = (n); \
+ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \
+ (unsigned long)_n_, sizeof(*(ptr))); \
+ })
+
+#endif /* __CR_CMPXCHG_H__ */
diff --git a/arch/ppc64/include/asm/cpu.h b/arch/ppc64/include/asm/cpu.h
new file mode 100644
index 000000000000..59118c211d10
--- /dev/null
+++ b/arch/ppc64/include/asm/cpu.h
@@ -0,0 +1 @@
+#include <stdbool.h>
diff --git a/arch/ppc64/include/asm/dump.h b/arch/ppc64/include/asm/dump.h
new file mode 100644
index 000000000000..1505fd2983b0
--- /dev/null
+++ b/arch/ppc64/include/asm/dump.h
@@ -0,0 +1,11 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+#define core_put_tls(core, tls)
+
+#endif
diff --git a/arch/ppc64/include/asm/fpu.h b/arch/ppc64/include/asm/fpu.h
new file mode 100644
index 000000000000..7f476d541a7d
--- /dev/null
+++ b/arch/ppc64/include/asm/fpu.h
@@ -0,0 +1,4 @@
+#ifndef __CR_ASM_FPU_H__
+#define __CR_ASM_FPU_H__
+
+#endif /* __CR_ASM_FPU_H__ */
diff --git a/arch/ppc64/include/asm/int.h b/arch/ppc64/include/asm/int.h
new file mode 100644
index 000000000000..642804e9b485
--- /dev/null
+++ b/arch/ppc64/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/arch/ppc64/include/asm/linkage.h b/arch/ppc64/include/asm/linkage.h
new file mode 100644
index 000000000000..03e01dc96543
--- /dev/null
+++ b/arch/ppc64/include/asm/linkage.h
@@ -0,0 +1,20 @@
+#ifndef __CR_LINKAGE_H__
+#define __CR_LINKAGE_H__
+
+#ifdef __ASSEMBLY__
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#define ENTRY(name) \
+ .globl name; \
+ .type name, @function; \
+ name:
+
+#define END(sym) \
+ .size sym, . - sym
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __CR_LINKAGE_H__ */
diff --git a/arch/ppc64/include/asm/page.h b/arch/ppc64/include/asm/page.h
new file mode 100644
index 000000000000..169c6943d844
--- /dev/null
+++ b/arch/ppc64/include/asm/page.h
@@ -0,0 +1,23 @@
+#ifndef __CR_ASM_PAGE_H__
+#define __CR_ASM_PAGE_H__
+
+/*
+ * Default config for Pseries is to use 64K pages.
+ * See kernel file arch/powerpc/configs/pseries_*defconfig
+ */
+#ifndef PAGE_SHIFT
+# define PAGE_SHIFT 16
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE (1UL << PAGE_SHIFT)
+#endif
+
+#ifndef PAGE_MASK
+# define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
+#define page_size() PAGE_SIZE
+
+#endif /* __CR_ASM_PAGE_H__ */
diff --git a/arch/ppc64/include/asm/parasite-syscall.h b/arch/ppc64/include/asm/parasite-syscall.h
new file mode 100644
index 000000000000..7665e207b75e
--- /dev/null
+++ b/arch/ppc64/include/asm/parasite-syscall.h
@@ -0,0 +1,17 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+struct parasite_ctl;
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+extern const char code_syscall[];
+extern const int code_syscall_size;
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+
+#endif
diff --git a/arch/ppc64/include/asm/parasite.h b/arch/ppc64/include/asm/parasite.h
new file mode 100644
index 000000000000..fdbc340b05e2
--- /dev/null
+++ b/arch/ppc64/include/asm/parasite.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+/* TLS is accessed through r13, which is already processed */
+static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }
+
+#endif
diff --git a/arch/ppc64/include/asm/prlimit.h b/arch/ppc64/include/asm/prlimit.h
new file mode 100644
index 000000000000..6746ba0e6f19
--- /dev/null
+++ b/arch/ppc64/include/asm/prlimit.h
@@ -0,0 +1,14 @@
+#ifndef __CR_PRLIMIT_H__
+#define __CR_PRLIMIT_H__
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+
+#ifndef CONFIG_HAS_PRLIMIT
+extern int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit);
+#endif
+
+#endif /* __CR_PRLIMIT_H__ */
diff --git a/arch/ppc64/include/asm/processor-flags.h b/arch/ppc64/include/asm/processor-flags.h
new file mode 100644
index 000000000000..c1888af36fa0
--- /dev/null
+++ b/arch/ppc64/include/asm/processor-flags.h
@@ -0,0 +1,4 @@
+#ifndef __CR_PROCESSOR_FLAGS_H__
+#define __CR_PROCESSOR_FLAGS_H__
+
+#endif
diff --git a/arch/ppc64/include/asm/restore.h b/arch/ppc64/include/asm/restore.h
new file mode 100644
index 000000000000..3ca0c534d843
--- /dev/null
+++ b/arch/ppc64/include/asm/restore.h
@@ -0,0 +1,33 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+/*
+ * Set R2 to blob + 8000 which is the default value
+ * Jump to restore_task_exec_start + 8 since R2 is already set (local call)
+ */
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
+ task_args) \
+ asm volatile( \
+ "mr 1,%0 \n" \
+ "mr 3,%1 \n" \
+ "mtctr 3 \n" \
+ "mr 3,%2 \n" \
+ "mr 2,%3 \n" \
+ "bctr \n" \
+ : \
+ : "r"(new_sp), \
+ "r"((unsigned long)restore_task_exec_start), \
+ "r"(task_args), \
+ "r"((unsigned long)task_args->bootstrap_start + 0x8000) \
+ : "sp", "1", "2", "3", "memory")
+
+/* There is nothing to do since TLS is accessed through r13 */
+#define core_get_tls(pcore, ptls)
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif /* __CR_ASM_RESTORE_H__ */
diff --git a/arch/ppc64/include/asm/restorer.h b/arch/ppc64/include/asm/restorer.h
new file mode 100644
index 000000000000..0549992def65
--- /dev/null
+++ b/arch/ppc64/include/asm/restorer.h
@@ -0,0 +1,136 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include <asm/ptrace.h>
+#include <asm/elf.h>
+#include <asm/types.h>
+
+/*
+ * sigcontext structure defined in file
+ * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h,
+ * included from /usr/include/signal.h
+ *
+ * Kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h
+ */
+#include <signal.h>
+
+// XXX: the identifier rt_sigcontext is expected to be a struct by the CRIU code
+#define rt_sigcontext sigcontext
+
+#include "sigframe.h"
+#define SIGFRAME_OFFSET 0
+
+/* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */
+#define USER_REDZONE_SIZE 512
+
+/* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */
+#define TRAMP_SIZE 6
+
+/*
+ * ucontext defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h
+ */
+struct rt_sigframe {
+ /* sys_rt_sigreturn requires the ucontext be the first field */
+ struct ucontext uc;
+#if 1
+ /*
+ * XXX: Assuming that transactional is turned on by default in
+ * most of the Linux distribution.
+ */
+ struct ucontext uc_transact;
+#endif
+ unsigned long _unused[2];
+ unsigned int tramp[TRAMP_SIZE];
+ struct rt_siginfo *pinfo;
+ void *puc;
+ struct rt_siginfo info;
+ /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
+ char abigap[USER_REDZONE_SIZE];
+} __attribute__ ((aligned (16)));
+
+#define ARCH_RT_SIGRETURN(new_sp) \
+ asm volatile( \
+ "mr 1, %0 \n" \
+ "li 0, "__stringify(__NR_rt_sigreturn)" \n" \
+ "sc \n" \
+ : \
+ : "r"(new_sp) \
+ : "1", "memory")
+
+/*
+ * Clone trampoline
+ *
+ * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines
+ */
+#if _CALL_ELF != 2
+#error Only supporting ABIv2.
+#else
+#define FRAME_MIN_SIZE_PARM 96
+#endif
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
+ thread_args, clone_restore_fn) \
+ asm volatile( \
+ "clone_emul: \n" \
+ "/* Save fn, args, stack across syscall. */ \n" \
+ "mr 14, %5 /* clone_restore_fn in r14 */ \n" \
+ "mr 15, %6 /* &thread_args[i] in r15 */ \n" \
+ "mr 3, %1 /* clone_flags */ \n" \
+ "ld 4, %2 /* new_sp */ \n" \
+ "mr 5, %3 /* &parent_tid */ \n" \
+ "li 6, 0 /* tls = 0 ? */ \n" \
+ "mr 7, %4 /* &thread_args[i].pid */ \n" \
+ "li 0,"__stringify(__NR_clone)" \n" \
+ "sc \n" \
+ "/* Check for child process. */ \n" \
+ "cmpdi cr1,3,0 \n" \
+ "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \
+ "bne- cr1,clone_end \n" \
+ "/* child */ \n" \
+ "addi 14, 14, 8 /* jump over r2 fixup */ \n" \
+ "mtctr 14 \n" \
+ "mr 3,15 \n" \
+ "bctr \n" \
+ "clone_end: \n" \
+ "mr %0,3 \n" \
+ : "=r"(ret) /* %0 */ \
+ : "r"(clone_flags), /* %1 */ \
+ "m"(new_sp), /* %2 */ \
+ "r"(&parent_tid), /* %3 */ \
+ "r"(&thread_args[i].pid), /* %4 */ \
+ "r"(clone_restore_fn), /* %5 */ \
+ "r"(&thread_args[i]) /* %6 */ \
+ : "memory","0","3","4","5","6","7","14","15")
+
+#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->uc
+#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP])
+#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1)
+#define RT_SIGFRAME_FPU(rt_sigframe) ((rt_sigframe)->uc.uc_mcontext)
+
+int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r);
+int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r);
+
+/* Nothing to do, TLS is accessed through r13 */
+static inline void restore_tls(tls_t *ptls) { (void)ptls; }
+
+static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
+{
+ return 0;
+}
+
+static inline int ptrace_flush_breakpoints(pid_t pid)
+{
+ return 0;
+}
+
+static inline int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe,
+ mcontext_t *sigcontext)
+{
+ return 0;
+}
+
+/*
+ * Defined in arch/ppc64/syscall-common-ppc64.S
+ */
+int sys_shmat(int shmid, const void *shmaddr, int shmflg);
+
+#endif /*__CR_ASM_RESTORER_H__*/
diff --git a/arch/ppc64/include/asm/string.h b/arch/ppc64/include/asm/string.h
new file mode 100644
index 000000000000..034442781678
--- /dev/null
+++ b/arch/ppc64/include/asm/string.h
@@ -0,0 +1,11 @@
+#ifndef __CR_ASM_STRING_H__
+#define __CR_ASM_STRING_H__
+
+#include "compiler.h"
+
+/*
+ * TODO: We may optimize some code here instead of using the generic ones.
+ */
+#include "asm-generic/string.h"
+
+#endif /* __CR_ASM_STRING_H__ */
diff --git a/arch/ppc64/include/asm/types.h b/arch/ppc64/include/asm/types.h
new file mode 100644
index 000000000000..67b7fe2ec4c6
--- /dev/null
+++ b/arch/ppc64/include/asm/types.h
@@ -0,0 +1,111 @@
+#ifndef __CR_ASM_TYPES_H__
+#define __CR_ASM_TYPES_H__
+
+#include <stdbool.h>
+#include <signal.h>
+#include "protobuf/core.pb-c.h"
+
+#include "asm/page.h"
+#include "asm/bitops.h"
+#include "asm/int.h"
+
+/*
+ * Copied from kernel header include/uapi/asm-generic/signal-defs.h
+ */
+typedef void rt_signalfn_t(int, siginfo_t *, void *);
+typedef rt_signalfn_t *rt_sighandler_t;
+
+typedef void rt_restorefn_t(void);
+typedef rt_restorefn_t *rt_sigrestore_t;
+
+#define SIGMAX_OLD 31
+#define SIGMAX 64
+
+/*Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */
+#define _KNSIG 64
+#define _NSIG_BPW 64
+#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
+
+typedef struct {
+ uint64_t sig[_KNSIG_WORDS];
+} k_rtsigset_t;
+
+static inline void ksigfillset(k_rtsigset_t *set)
+{
+ int i;
+ for (i = 0; i < _KNSIG_WORDS; i++)
+ set->sig[i] = (unsigned long)-1;
+}
+
+/* Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */
+#define SA_RESTORER 0x04000000U
+
+typedef struct {
+ rt_sighandler_t rt_sa_handler;
+ unsigned long rt_sa_flags;
+ rt_sigrestore_t rt_sa_restorer;
+ k_rtsigset_t rt_sa_mask; /* mask last for extensibility */
+} rt_sigaction_t;
+
+/*
+ * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h
+ */
+typedef struct {
+ unsigned long gpr[32];
+ unsigned long nip;
+ unsigned long msr;
+ unsigned long orig_gpr3; /* Used for restarting system calls */
+ unsigned long ctr;
+ unsigned long link;
+ unsigned long xer;
+ unsigned long ccr;
+ unsigned long softe; /* Soft enabled/disabled */
+ unsigned long trap; /* Reason for being here */
+ /* N.B. for critical exceptions on 4xx, the dar and dsisr
+ fields are overloaded to hold srr0 and srr1. */
+ unsigned long dar; /* Fault registers */
+ unsigned long dsisr; /* on 4xx/Book-E used for ESR */
+ unsigned long result; /* Result of a system call */
+} user_regs_struct_t;
+
+typedef UserPpc64RegsEntry UserRegsEntry;
+
+#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64
+
+#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0)
+#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
+
+#define REG_RES(regs) ((u64)(regs).gpr[3])
+#define REG_IP(regs) ((u64)(regs).nip)
+#define REG_SYSCALL_NR(regs) ((u64)(regs).gpr[0])
+
+
+#define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64
+
+/*
+ * Copied from the following kernel header files :
+ * include/linux/auxvec.h
+ * arch/powerpc/include/uapi/asm/auxvec.h
+ * include/linux/mm_types.h
+ */
+#define AT_VECTOR_SIZE_BASE 20
+#define AT_VECTOR_SIZE_ARCH 6
+#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
+
+typedef uint64_t auxv_t;
+
+/* Not used but the structure parasite_dump_thread needs a tls_t field */
+typedef uint64_t tls_t;
+
+/*
+ * Copied from the Linux kernel arch/powerpc/include/asm/processor.h
+ *
+ * NOTE: 32bit tasks are supported.
+ */
+#define TASK_SIZE_USER64 (0x0000400000000000UL)
+#define TASK_SIZE TASK_SIZE_USER64
+
+static inline void *decode_pointer(uint64_t v) { return (void*)v; }
+static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/arch/ppc64/include/asm/vdso.h b/arch/ppc64/include/asm/vdso.h
new file mode 100644
index 000000000000..8d089dde3a5d
--- /dev/null
+++ b/arch/ppc64/include/asm/vdso.h
@@ -0,0 +1,172 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include <sys/types.h>
+
+#include "asm/int.h"
+#include "protobuf/vma.pb-c.h"
+
+struct parasite_ctl;
+struct vm_area_list;
+
+#define VDSO_PROT (PROT_READ | PROT_EXEC)
+#define VVAR_PROT (PROT_READ)
+
+#define VDSO_BAD_ADDR (-1ul)
+#define VVAR_BAD_ADDR VDSO_BAD_ADDR
+#define VDSO_BAD_PFN (-1ull)
+#define VVAR_BAD_PFN VDSO_BAD_PFN
+
+struct vdso_symbol {
+ char name[32];
+ unsigned long offset;
+};
+
+#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
+
+/* Check if the symbol is present in the symtable */
+static inline bool vdso_symbol_empty(struct vdso_symbol *s)
+{
+ return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
+}
+
+/*
+ * Picked from the kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S
+ *
+ * Note that '__kernel_datapage_offset' is not a service but rather data
+ * inside the text page which should not be used as is from user space.
+ */
+enum {
+ VDSO_SYMBOL_CLOCK_GETRES,
+ VDSO_SYMBOL_CLOCK_GETTIME,
+ VDSO_SYMBOL_GET_SYSCALL_MAP,
+ VDSO_SYMBOL_GET_TBFREQ,
+ VDSO_SYMBOL_GETCPU,
+ VDSO_SYMBOL_GETTIMEOFDAY,
+ VDSO_SYMBOL_SIGTRAMP_RT64,
+ VDSO_SYMBOL_SYNC_DICACHE,
+ VDSO_SYMBOL_SYNC_DICACHE_P5,
+ VDSO_SYMBOL_TIME,
+
+ VDSO_SYMBOL_MAX
+};
+
+#define VDSO_SYMBOL_CLOCK_GETRES_NAME "__kernel_clock_getres"
+#define VDSO_SYMBOL_CLOCK_GETTIME_NAME "__kernel_clock_gettime"
+#define VDSO_SYMBOL_GET_SYSCALL_MAP_NAME "__kernel_get_syscall_map"
+#define VDSO_SYMBOL_GET_TBFREQ_NAME "__kernel_get_tbfreq"
+#define VDSO_SYMBOL_GETCPU_NAME "__kernel_getcpu"
+#define VDSO_SYMBOL_GETTIMEOFDAY_NAME "__kernel_gettimeofday"
+#define VDSO_SYMBOL_SIGTRAMP_RT64_NAME "__kernel_sigtramp_rt64"
+#define VDSO_SYMBOL_SYNC_DICACHE_NAME "__kernel_sync_dicache"
+#define VDSO_SYMBOL_SYNC_DICACHE_P5_NAME "__kernel_sync_dicache_p5"
+#define VDSO_SYMBOL_TIME_NAME "__kernel_time"
+
+struct vdso_symtable {
+ unsigned long vma_start;
+ unsigned long vma_end;
+ unsigned long vvar_start;
+ unsigned long vvar_end;
+ struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
+};
+
+#define VDSO_SYMTABLE_INIT \
+ { \
+ .vma_start = VDSO_BAD_ADDR, \
+ .vma_end = VDSO_BAD_ADDR, \
+ .vvar_start = VVAR_BAD_ADDR, \
+ .vvar_end = VVAR_BAD_ADDR, \
+ .symbols = { \
+ [0 ... VDSO_SYMBOL_MAX - 1] = \
+ (struct vdso_symbol)VDSO_SYMBOL_INIT, \
+ }, \
+ }
+
+/* Size of VMA associated with vdso */
+static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
+{
+ return t->vma_end - t->vma_start;
+}
+
+static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
+{
+ return t->vvar_end - t->vvar_start;
+}
+/*
+ * Special mark which allows us to identify the runtime vdso where
+ * calls from the proxy vdso are redirected. This mark is usually
+ * placed at the start of the vdso area where the Elf header lives.
+ * Since such a runtime vdso is solely used by the proxy and
+ * nobody else is supposed to access it, it's more or less
+ * safe to overwrite the Elf header with @signature and
+ * @proxy_addr.
+ *
+ * The @proxy_addr deserves a few comments. When we redirect
+ * the calls from proxy to runtime vdso, on next checkpoint
+ * it won't be possible to find which VMA is proxy, thus
+ * we save its address in the member.
+ */
+struct vdso_mark {
+ u64 signature;
+ unsigned long proxy_vdso_addr;
+
+ unsigned long version;
+
+ /*
+ * In case of the new vDSO format, the VVAR area address
+ * is needed to make it easier to discover where it lives
+ * without relying on procfs output.
+ */
+ unsigned long proxy_vvar_addr;
+};
+
+#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
+#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
+#define VDSO_MARK_CUR_VERSION (2)
+
+static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
+{
+ struct vdso_mark *m = where;
+
+ m->signature = VDSO_MARK_SIGNATURE_V2;
+ m->proxy_vdso_addr = proxy_vdso_addr;
+ m->version = VDSO_MARK_CUR_VERSION;
+ m->proxy_vvar_addr = proxy_vvar_addr;
+}
+
+static inline bool is_vdso_mark(void *addr)
+{
+ struct vdso_mark *m = addr;
+
+ if (m->signature == VDSO_MARK_SIGNATURE_V2) {
+ /*
+ * New format
+ */
+ return true;
+ } else if (m->signature == VDSO_MARK_SIGNATURE) {
+ /*
+ * Old format -- simply extend the mark up
+ * to the version we support.
+ */
+ vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
+ return true;
+ }
+ return false;
+}
+
+
+extern struct vdso_symtable vdso_sym_rt;
+extern u64 vdso_pfn;
+
+extern int vdso_init(void);
+extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
+extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
+extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ unsigned long vdso_rt_parked_at, size_t index,
+ VmaEntry *vmas, size_t nr_vmas);
+
+extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+ struct vm_area_list *vma_area_list);
+extern void write_intraprocedure_branch(void *to, void *from);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/ppc64/parasite-head.S b/arch/ppc64/parasite-head.S
new file mode 100644
index 000000000000..c7e5bdc66c52
--- /dev/null
+++ b/arch/ppc64/parasite-head.S
@@ -0,0 +1,44 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+ .section .head.text
+ .align 8
+
+ENTRY(__export_parasite_head_start)
+
+ // int __used parasite_service(unsigned int cmd, void *args)
+ // cmd = r3 = *__export_parasite_cmd (u32 ?)
+ // args = r4 = @parasite_args_ptr + @pc
+
+ bl 0f
+0: mflr 2
+
+#define LOAD_REG_ADDR(reg, name) \
+ addis reg,2,(name - 0b)@ha; \
+ addi reg,2,(name - 0b)@l;
+
+ LOAD_REG_ADDR(3,__export_parasite_cmd)
+ lwz 3,0(3)
+
+ LOAD_REG_ADDR(4,parasite_args_ptr)
+ lwz 4,0(4)
+ add 4,4,2 // Fix up ptr
+
+ // Set the TOC pointer
+ LOAD_REG_ADDR(5,parasite_toc_ptr)
+ ld 5,0(5)
+ add 2,2,5 // Fix up ptr
+
+ bl parasite_service
+ twi 31,0,0 // Should generate SIGTRAP
+
+parasite_args_ptr:
+ .long __export_parasite_args - (0b - __export_parasite_head_start)
+
+__export_parasite_cmd:
+ .long 0
+
+parasite_toc_ptr:
+ .long .TOC. - (0b - __export_parasite_head_start)
+
+END(__export_parasite_head_start)
diff --git a/arch/ppc64/restorer-trampoline.S b/arch/ppc64/restorer-trampoline.S
new file mode 100644
index 000000000000..5e15615ae1aa
--- /dev/null
+++ b/arch/ppc64/restorer-trampoline.S
@@ -0,0 +1,33 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+ .section .head.text
+ .align 8
+
+ // Called through parasite_unmap
+ // This trampoline is there to restore r2 before jumping back to the
+ // C code.
+#define LOAD_REG_ADDR(reg, name) \
+ addis reg,7,(name - 0b)@ha; \
+ addi reg,7,(name - 0b)@l;
+
+ENTRY(__export_unmap_trampoline)
+ bl 0f
+0: mflr 7
+ LOAD_REG_ADDR(8,restorer_r2)
+ ld 2,0(8)
+ b __export_unmap
+ //END(__export_restore_unmap_trampoline)
+
+ // Called from JUMP_TO_RESTORER_BLOB, ctr contains the address where
+ // to jump to, and r3 etc contains the parameter.
+ // Assuming up to 4 parameters here since we are using r7 and r8.
+ENTRY(__export_restore_task_trampoline)
+ bl 0f
+0: mflr 7
+ LOAD_REG_ADDR(8,restorer_r2)
+ std 2,0(8)
+ b __export_restore_task
+
+restorer_r2:
+ .long 0
diff --git a/arch/ppc64/restorer.c b/arch/ppc64/restorer.c
new file mode 100644
index 000000000000..c5e19d9fb977
--- /dev/null
+++ b/arch/ppc64/restorer.c
@@ -0,0 +1,14 @@
+#include <unistd.h>
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include "asm/fpu.h"
+
+#include "syscall.h"
+#include "log.h"
+//#include "cpu.h"
+
+int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r)
+{
+ return 0;
+}
diff --git a/arch/ppc64/syscall-common-ppc64.S b/arch/ppc64/syscall-common-ppc64.S
new file mode 100644
index 000000000000..78bc1b7e6e85
--- /dev/null
+++ b/arch/ppc64/syscall-common-ppc64.S
@@ -0,0 +1,32 @@
+#include "asm/linkage.h"
+#include <asm/unistd.h> /* for __NR_ipc */
+
+#define SYSCALL(name, opcode) \
+ ENTRY(name); \
+ li 0, opcode; \
+ b __syscall_common; \
+ END(name)
+
+ .text
+ .align 4
+
+ENTRY(__syscall_common)
+ sc
+ bnslr+ /* if no error return to LR */
+ neg 3,3 /* r3 = -r3 to return -errno value */
+ blr
+END(__syscall_common)
+
+ENTRY(__cr_restore_rt)
+ li 0, __NR_rt_sigreturn
+ b __syscall_common
+END(__cr_restore_rt)
+
+ # On Power, shmat is done through the ipc system call.
+ENTRY(sys_shmat)
+ mr 7, 4 # shmaddr -> ptr
+ mr 4, 3 # shmid -> first
+ li 3, 21 # call = SHMAT
+ li 0, __NR_ipc
+ b __syscall_common
+END(sys_shmat)
diff --git a/arch/ppc64/syscall-ppc64.def b/arch/ppc64/syscall-ppc64.def
new file mode 100644
index 000000000000..d8ae4491c679
--- /dev/null
+++ b/arch/ppc64/syscall-ppc64.def
@@ -0,0 +1,99 @@
+#
+# System calls table; please make sure the table consists of only the syscalls
+# really used somewhere in the project.
+#
+# The template is (name and arguments are optional if you need only __NR_x
+# defined, but no real entry point in the syscalls lib).
+#
+# name code name arguments
+# -----------------------------------------------------------------------
+#
+__NR_read 3 sys_read (int fd, void *buf, unsigned long count)
+__NR_write 4 sys_write (int fd, const void *buf, unsigned long count)
+__NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode)
+__NR_close 6 sys_close (int fd)
+__NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin)
+__NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
+__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
+__NR_munmap 91 sys_munmap (void *addr, unsigned long len)
+__NR_brk 45 sys_brk (void *addr)
+__NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
+__NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
+__NR_rt_sigreturn 172 sys_rt_sigreturn (void)
+__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
+__NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos)
+__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
+__NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
+__NR_madvise 205 sys_madvise (unsigned long start, size_t len, int behavior)
+__NR_pause 29 sys_pause (void)
+__NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem)
+__NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val)
+__NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old)
+__NR_getpid 20 sys_getpid (void)
+__NR_socket 326 sys_socket (int domain, int type, int protocol)
+__NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int addrlen)
+__NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
+__NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
+__NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags)
+__NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags)
+__NR_shutdown 338 sys_shutdown (int sockfd, int how)
+__NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen)
+__NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
+__NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
+__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
+__NR_exit 1 sys_exit (unsigned long error_code)
+__NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru)
+__NR_kill 37 sys_kill (long pid, int sig)
+__NR_fcntl 55 sys_fcntl (int fd, int type, long arg)
+__NR_flock 143 sys_flock (int fd, unsigned long cmd)
+__NR_mkdir 39 sys_mkdir (const char *name, int mode)
+__NR_rmdir 40 sys_rmdir (const char *name)
+__NR_unlink 10 sys_unlink (char *pathname)
+__NR_readlink 85 sys_readlink (const char *path, char *buf, int bufsize)
+__NR_umask 60 sys_umask (int mask)
+__NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups)
+__NR_setresuid 164 sys_setresuid (int uid, int euid, int suid)
+__NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid)
+__NR_setresgid 169 sys_setresgid (int gid, int egid, int sgid)
+__NR_getresgid 170 sys_getresgid (int *gid, int *egid, int *sgid)
+__NR_getpgid 132 sys_getpgid (pid_t pid)
+__NR_setfsuid 138 sys_setfsuid (int fsuid)
+__NR_setfsgid 139 sys_setfsgid (int fsgid)
+__NR_getsid 147 sys_getsid (void)
+__NR_capget 183 sys_capget (struct cap_header *h, struct cap_data *d)
+__NR_capset 184 sys_capset (struct cap_header *h, struct cap_data *d)
+__NR_rt_sigqueueinfo 177 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info)
+__NR_sigaltstack 185 sys_sigaltstack (const void *uss, void *uoss)
+__NR_personality 136 sys_personality (unsigned int personality)
+__NR_setpriority 97 sys_setpriority (int which, int who, int nice)
+__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
+__NR_prctl 171 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
+__NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim)
+__NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
+__NR_umount2 52 sys_umount2 (char *name, int flags)
+__NR_gettid 207 sys_gettid (void)
+__NR_futex 221 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
+__NR_set_tid_address 232 sys_set_tid_address (int *tid_addr)
+__NR_restart_syscall 0 sys_restart_syscall (void)
+__NR_sys_timer_create 240 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, timer_t *created_timer_id)
+__NR_sys_timer_settime 241 sys_timer_settime (timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
+__NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting)
+__NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id)
+__NR_sys_timer_delete 244 sys_timer_delete (timer_t timer_id)
+__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp)
+__NR_exit_group 234 sys_exit_group (int error_code)
+__NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len)
+__NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+__NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+__NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
+__NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
+__NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+__NR_fanotify_init 323 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
+__NR_fanotify_mark 324 sys_fanotify_mark (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
+__NR_prlimit64 325 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
+__NR_open_by_handle_at 346 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
+__NR_setns 350 sys_setns (int fd, int nstype)
+__NR_kcmp 354 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
+__NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
+__NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
+__NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
\ No newline at end of file
diff --git a/arch/ppc64/syscalls-ppc64.sh b/arch/ppc64/syscalls-ppc64.sh
new file mode 100644
index 000000000000..22c81293dfff
--- /dev/null
+++ b/arch/ppc64/syscalls-ppc64.sh
@@ -0,0 +1,54 @@
+#!/bin/sh
+
+gen_asm() {
+ in=$1
+ codesout=$2
+ codesinc=`echo $2 | sed -e 's/.*include\///g'`
+ protosout=$3
+ asmout=$4
+ asmcommon=`echo $5 | sed -e 's/.*include\///g'`
+ prototypes=`echo $6 | sed -e 's/.*include\///g'`
+
+ codesdef=`echo $codesout | sed -e 's/.*include\///g' | tr "[[:space:]].-" _`
+ protosdef=`echo $protosout | sed -e 's/.*include\///g' | tr "[[:space:]].-" _`
+
+ echo "/* Autogenerated, don't edit */" > $codesout
+ echo "#ifndef $codesdef" >> $codesout
+ echo "#define $codesdef" >> $codesout
+
+ echo "/* Autogenerated, don't edit */" > $protosout
+ echo "#ifndef $protosdef" >> $protosout
+ echo "#define $protosdef" >> $protosout
+ echo "#include \"$prototypes\"" >> $protosout
+ echo "#include \"$codesinc\"" >> $protosout
+
+ echo "/* Autogenerated, don't edit */" > $asmout
+ echo "#include \"$codesinc\"" >> $asmout
+ echo "#include \"$asmcommon\"" >> $asmout
+
+ cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "#define", $1, $2}' >> $codesout
+ cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "extern long ", $3, $4, ";"}' >> $protosout
+ cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "SYSCALL(", $3, ",", $2, ")"}' >> $asmout
+
+ echo "#endif /* $codesdef */" >> $codesout
+ echo "#endif /* $protosdef */" >> $protosout
+}
+
+gen_exec() {
+ in=$1
+ codecout=$2
+
+ echo "/* Autogenerated, don't edit */" > $codecout
+
+ cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "SYSCALL(", substr($3, 5), ",", $2, ")"}' >> $codecout
+}
+
+if [ "$1" = "--asm" ]; then
+ shift
+ gen_asm $@
+fi
+
+if [ "$1" = "--exec" ]; then
+ shift
+ gen_exec $@
+fi
diff --git a/arch/ppc64/vdso-pie.c b/arch/ppc64/vdso-pie.c
new file mode 100644
index 000000000000..8219e4af1be3
--- /dev/null
+++ b/arch/ppc64/vdso-pie.c
@@ -0,0 +1,594 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "syscall.h"
+#include "image.h"
+#include "vdso.h"
+#include "vma.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/* These symbols are defined in vdso-trampoline.S */
+extern char *vdso_trampoline, *vdso_trampoline_end;
+
+static inline void invalidate_caches(unsigned long at)
+{
+ asm volatile("isync \n" \
+ "li 3,0 \n" \
+ "dcbf 3,%0 \n" \
+ "sync \n" \
+ "icbi 3,%0 \n" \
+ "isync \n" \
+ : /* no output */ \
+ : "r"(at) \
+ :"memory", "r3");
+}
+
+/* This is the size of the trampoline call:
+ * mflr r0
+ * bl trampoline
+ * <64 bit address>
+ */
+#define TRAMP_CALL_SIZE (2*sizeof(uint32_t) + sizeof(uint64_t))
+
+/*
+ * put_trampoline does 2 things :
+ *
+ * 1. it looks for a place in the checkpointed vDSO where to put the
+ * trampoline code (see vdso-trampoline.S).
+ *
+ * 2. for each symbol from the checkpointed vDSO, it checks that there is
+ * enough room to put the call to the vDSO trampoline (see
+ * TRAMP_CALL_SIZE's comment above).
+ * This is done by checking that there are no interesting symbols in the range
+ * of the current one's offset -> (current one's offset + TRAMP_CALL_SIZE).
+ * Unfortunately the symbols are not sorted by address so we have to scan
+ * the complete table every time. Since the vDSO is small, this is
+ * not a big issue.
+ */
+static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym)
+{
+ int i,j;
+ unsigned long size;
+ unsigned long trampoline = 0;
+
+ /* First of all we have to find a place where to put the trampoline
+ * code.
+ */
+ size = (unsigned long)&vdso_trampoline_end
+ - (unsigned long)&vdso_trampoline;
+
+ for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
+ if (vdso_symbol_empty(&sym->symbols[i]))
+ continue;
+
+ pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name,
+ sym->symbols[i].offset);
+
+ /* find the nearest following symbol we are interested in */
+ for (j=0; j < ARRAY_SIZE(sym->symbols); j++) {
+ if (i==j || vdso_symbol_empty(&sym->symbols[j]))
+ continue;
+
+ /* pr_debug("next:%s(%lx)\n", sym->symbols[j].name, */
+ /* sym->symbols[j].offset); */
+
+ if (sym->symbols[j].offset <= sym->symbols[i].offset)
+ /* this symbol is at or before the current one */
+ continue;
+
+ if ((sym->symbols[i].offset+TRAMP_CALL_SIZE) >
+ sym->symbols[j].offset) {
+ /* we have a major issue here since we cannot
+ * even put the trampoline call for this symbol
+ */
+ pr_err("Can't handle small vDSO symbol %s\n",
+ sym->symbols[i].name);
+ return 0;
+ }
+
+ if (trampoline)
+ /* no need to put it twice */
+ continue;
+
+ if ((sym->symbols[j].offset -
+ (sym->symbols[i].offset+TRAMP_CALL_SIZE)) <= size)
+ /* not enough room */
+ continue;
+
+ /* We can put the trampoline there */
+ trampoline = at + sym->symbols[i].offset;
+ trampoline += TRAMP_CALL_SIZE;
+
+ pr_debug("Puting vDSO trampoline in %s at %lx",
+ sym->symbols[i].name, trampoline);
+ builtin_memcpy((void *)trampoline, &vdso_trampoline,
+ size);
+ invalidate_caches(trampoline);
+ }
+ }
+
+ return trampoline;
+}
+
+static inline void put_trampoline_call(unsigned long at, unsigned long to,
+ unsigned long tr)
+{
+ uint32_t *addr = (uint32_t *)at;;
+
+ *addr++ = 0x7C0802a6; /* mflr r0 */
+ *addr++ = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc); /* bl tr */
+ *(uint64_t *)addr = to; /* the address to read by the trampoline */
+
+ invalidate_caches(at);
+}
+
+static int vdso_redirect_calls(unsigned long base_to,
+ unsigned long base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from)
+{
+ unsigned int i;
+ unsigned long trampoline;
+
+ trampoline = (unsigned long)put_trampoline(base_from, from);
+ if (!trampoline)
+ return 1;
+
+ for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+ if (vdso_symbol_empty(&from->symbols[i]))
+ continue;
+
+ pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n",
+ base_from, from->symbols[i].offset,
+ base_to, to->symbols[i].offset, i,
+ from->symbols[i].name);
+
+ put_trampoline_call(base_from + from->symbols[i].offset,
+ base_to + to->symbols[i].offset,
+ trampoline);
+ }
+
+ return 0;
+}
+
+/* Check if pointer is out-of-bound */
+static bool __ptr_oob(void *ptr, void *start, size_t size)
+{
+ void *end = (void *)((unsigned long)start + size);
+ return ptr > end || ptr < start;
+}
+
+/*
+ * Elf hash, see format specification.
+ */
+static unsigned long elf_hash(const unsigned char *name)
+{
+ unsigned long h = 0, g;
+
+ while (*name) {
+ h = (h << 4) + *name++;
+ g = h & 0xf0000000ul;
+ if (g)
+ h ^= g >> 24;
+ h &= ~g;
+ }
+ return h;
+}
+
+/*
+ * TODO:
+ * PIE linking doesn't work for this kind of definition.
+ * When built for the parasite code, the pointers to the strings are
+ * computed from the start of the object, but the generated code
+ * assumes that the pointers are fixed up by the loader.
+ *
+ * In addition, GCC creates a call to the C library memcpy when the table
+ * contains more than 9 items. Since the parasite code is not linked
+ * with the C library, an undefined symbol error is raised at build time.
+ * By initialising the table at run time, we work around this
+ * issue.
+ */
+#ifdef __pie__
+static const char *VDSO_SYMBOL(int i)
+{
+ static char *vdso_symbols[VDSO_SYMBOL_MAX];
+ static int init_done = 0;
+
+#define SET_VDSO_SYM(s) vdso_symbols[VDSO_SYMBOL_##s] = VDSO_SYMBOL_##s##_NAME
+ if (!init_done) {
+ SET_VDSO_SYM(CLOCK_GETRES);
+ SET_VDSO_SYM(CLOCK_GETTIME);
+ SET_VDSO_SYM(GET_SYSCALL_MAP);
+ SET_VDSO_SYM(GET_TBFREQ);
+ SET_VDSO_SYM(GETCPU);
+ SET_VDSO_SYM(GETTIMEOFDAY);
+ SET_VDSO_SYM(SIGTRAMP_RT64);
+ SET_VDSO_SYM(SYNC_DICACHE);
+ SET_VDSO_SYM(SYNC_DICACHE_P5);
+ SET_VDSO_SYM(TIME);
+ init_done = 1;
+ }
+ return vdso_symbols[i];
+}
+#else
+#define SET_VDSO_SYM(s) [VDSO_SYMBOL_##s] = VDSO_SYMBOL_##s##_NAME
+const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
+ SET_VDSO_SYM(CLOCK_GETRES),
+ SET_VDSO_SYM(CLOCK_GETTIME),
+ SET_VDSO_SYM(GET_SYSCALL_MAP),
+ SET_VDSO_SYM(GET_TBFREQ),
+ SET_VDSO_SYM(GETCPU),
+ SET_VDSO_SYM(GETTIMEOFDAY),
+ SET_VDSO_SYM(SIGTRAMP_RT64),
+ SET_VDSO_SYM(SYNC_DICACHE),
+ SET_VDSO_SYM(SYNC_DICACHE_P5),
+ SET_VDSO_SYM(TIME)
+};
+#define VDSO_SYMBOL(i) vdso_symbols[i]
+#endif
+
+int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
+{
+ Elf64_Phdr *dynamic = NULL, *load = NULL;
+ Elf64_Ehdr *ehdr = (void *)mem;
+ Elf64_Dyn *dyn_strtab = NULL;
+ Elf64_Dyn *dyn_symtab = NULL;
+ Elf64_Dyn *dyn_strsz = NULL;
+ Elf64_Dyn *dyn_syment = NULL;
+ Elf64_Dyn *dyn_hash = NULL;
+ Elf64_Word *hash = NULL;
+ Elf64_Phdr *phdr;
+ Elf64_Dyn *d;
+
+ Elf64_Word *bucket, *chain;
+ Elf64_Word nbucket, nchain;
+
+ /*
+ * See Elf specification for this magic values.
+ */
+ static const char elf_ident[] = {
+ 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ char *dynsymbol_names;
+ unsigned int i, j, k;
+
+ BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
+
+ pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
+
+ /*
+ * Make sure it's a file we support.
+ */
+ if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
+ pr_err("Elf header magic mismatch\n");
+ return -EINVAL;
+ }
+
+ /*
+ * We need PT_LOAD and PT_DYNAMIC here, each exactly once.
+ */
+ phdr = (void *)&mem[ehdr->e_phoff];
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (__ptr_oob(phdr, mem, size))
+ goto err_oob;
+ switch (phdr->p_type) {
+ case PT_DYNAMIC:
+ if (dynamic) {
+ pr_err("Second PT_DYNAMIC header\n");
+ return -EINVAL;
+ }
+ dynamic = phdr;
+ break;
+ case PT_LOAD:
+ if (load) {
+ pr_err("Second PT_LOAD header\n");
+ return -EINVAL;
+ }
+ load = phdr;
+ break;
+ }
+ }
+
+ if (!load || !dynamic) {
+ pr_err("One of obligated program headers is missed\n");
+ return -EINVAL;
+ }
+
+ pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
+
+ /*
+ * Dynamic section tags should provide us the rest of information
+ * needed. Note that we're interested in a small set of tags.
+ */
+ d = (void *)&mem[dynamic->p_offset];
+ for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
+ if (__ptr_oob(d, mem, size))
+ goto err_oob;
+
+ if (d->d_tag == DT_NULL) {
+ break;
+ } else if (d->d_tag == DT_STRTAB) {
+ dyn_strtab = d;
+ pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
+ } else if (d->d_tag == DT_SYMTAB) {
+ dyn_symtab = d;
+ pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
+ } else if (d->d_tag == DT_STRSZ) {
+ dyn_strsz = d;
+ pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val);
+ } else if (d->d_tag == DT_SYMENT) {
+ dyn_syment = d;
+ pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
+ } else if (d->d_tag == DT_HASH) {
+ dyn_hash = d;
+ pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
+ }
+ }
+
+ if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
+ pr_err("Not all dynamic entries are present\n");
+ return -EINVAL;
+ }
+
+ dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
+ if (__ptr_oob(dynsymbol_names, mem, size))
+ goto err_oob;
+
+ hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
+ if (__ptr_oob(hash, mem, size))
+ goto err_oob;
+
+ nbucket = hash[0];
+ nchain = hash[1];
+ bucket = &hash[2];
+ chain = &hash[nbucket + 2];
+
+ pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
+ (long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain);
+
+ for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
+ const char * symbol = VDSO_SYMBOL(i);
+ k = elf_hash((const unsigned char *)symbol);
+
+ for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
+ Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
+ char *name;
+
+ sym = &sym[j];
+ if (__ptr_oob(sym, mem, size))
+ continue;
+
+ if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
+ ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
+ continue;
+
+ name = &dynsymbol_names[sym->st_name];
+ if (__ptr_oob(name, mem, size))
+ continue;
+
+ if (builtin_strcmp(name, symbol))
+ continue;
+
+ builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
+ t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
+ break;
+ }
+ }
+
+ return 0;
+
+err_oob:
+ pr_err("Corrupted Elf data\n");
+ return -EFAULT;
+}
+
+static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
+{
+ unsigned long addr;
+
+ pr_debug("Remap %s %lx -> %lx\n", who, from, to);
+
+ addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
+ if (addr != to) {
+ pr_err("Unable to remap %lx -> %lx %lx\n",
+ from, to, addr);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Park runtime vDSO in some safe place where it can be accessible from restorer */
+int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
+{
+ int ret;
+
+ BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
+
+ if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
+ if (sym_rt->vma_start < sym_rt->vvar_start) {
+ ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ park_at += vdso_vma_size(sym_rt);
+ ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
+ park_at, vvar_vma_size(sym_rt));
+ } else {
+ ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
+ park_at, vvar_vma_size(sym_rt));
+ park_at += vvar_vma_size(sym_rt);
+ ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ }
+ } else
+ ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ return ret;
+}
+
+int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ unsigned long vdso_rt_parked_at, size_t index,
+ VmaEntry *vmas, size_t nr_vmas)
+{
+ VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
+ struct vdso_symtable s = VDSO_SYMTABLE_INIT;
+ bool remap_rt = false;
+
+ /*
+ * Figure out which kind of vdso tuple we get.
+ */
+ if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
+ vma_vdso = &vmas[index];
+ else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
+ vma_vvar = &vmas[index];
+
+ if (index < (nr_vmas - 1)) {
+ if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
+ vma_vdso = &vmas[index + 1];
+ else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
+ vma_vvar = &vmas[index + 1];
+ }
+
+ if (!vma_vdso) {
+ pr_err("Can't find vDSO area in image\n");
+ return -1;
+ }
+
+ /*
+ * vDSO mark overwrites Elf program header of proxy vDSO thus
+ * it must never ever be greater in size.
+ */
+ BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
+
+ /*
+ * Find symbols in vDSO zone read from image.
+ */
+ if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
+ return -1;
+
+ /*
+ * Proxification strategy
+ *
+ * - There might be two vDSO zones: vdso code and optionally vvar data
+ * - To be able to use in-place remapping we need
+ *
+ * a) Size and order of vDSO zones are to match
+ * b) Symbols offsets must match
+ * c) Have same number of vDSO zones
+ */
+ if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
+ if (s.symbols[i].offset != sym_rt->symbols[i].offset)
+ break;
+ }
+
+ if (i == ARRAY_SIZE(s.symbols)) {
+ if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
+ remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
+ if (remap_rt) {
+ long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
+ long delta_this = vma_vvar->start - vma_vdso->start;
+
+ remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
+ }
+ } else
+ remap_rt = true;
+ }
+ }
+
+ pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ vma_vdso->start, vma_vdso->end,
+ vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
+ vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
+
+ /*
+ * Easy case -- the vdso from image has same offsets, order and size
+ * as runtime, so we simply remap runtime vdso to dumpee position
+ * without generating any proxy.
+ *
+ * Note we may remap the VVAR vdso as well, which might not yet have been
+ * mapped by the caller code. So drop VMA_AREA_REGULAR from it and the
+ * caller will not touch it anymore.
+ */
+ if (remap_rt) {
+ int ret = 0;
+
+ pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
+
+ if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+
+ if (vma_vvar) {
+ if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+
+ if (vma_vdso->start < vma_vvar->start) {
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+ vdso_rt_parked_at += vdso_vma_size(sym_rt);
+ ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+ } else {
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+ vdso_rt_parked_at += vvar_vma_size(sym_rt);
+ ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+ }
+ } else
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+
+ return ret;
+ }
+
+ /*
+ * Now complex case -- we need to proxify calls. We redirect
+ * calls from dumpee vdso to runtime vdso, making dumpee
+ * to operate as proxy vdso.
+ */
+ pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
+
+ /*
+ * Don't forget to shift if vvar is before vdso.
+ */
+ if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
+ sym_rt->vvar_start < sym_rt->vma_start)
+ vdso_rt_parked_at += vvar_vma_size(sym_rt);
+
+ if (vdso_redirect_calls(vdso_rt_parked_at,
+ vma_vdso->start,
+ sym_rt, &s)) {
+ pr_err("Failed to proxify dumpee contents\n");
+ return -1;
+ }
+
+ /*
+ * Put a special mark into runtime vdso, thus at next checkpoint
+ * routine we could detect this vdso and do not dump it, since
+ * it's auto-generated every new session if proxy required.
+ */
+ sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
+ vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
+ sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
+ return 0;
+}
diff --git a/arch/ppc64/vdso-trampoline.S b/arch/ppc64/vdso-trampoline.S
new file mode 100644
index 000000000000..54a22453701a
--- /dev/null
+++ b/arch/ppc64/vdso-trampoline.S
@@ -0,0 +1,11 @@
+#include "asm/linkage.h"
+
+ .section .text
+
+GLOBAL(vdso_trampoline)
+ mflr 12 /* r12 vdso_ptr's address */
+ mtlr 0 /* restore lr */
+ ld 12,0(12) /* read value store in vdso_ptr */
+ mtctr 12 /* branch to it */
+ bctr
+GLOBAL(vdso_trampoline_end)
diff --git a/arch/ppc64/vdso.c b/arch/ppc64/vdso.c
new file mode 100644
index 000000000000..43d9637f00af
--- /dev/null
+++ b/arch/ppc64/vdso.c
@@ -0,0 +1,309 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <elf.h>
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/types.h"
+#include "asm/parasite-syscall.h"
+
+#include "parasite-syscall.h"
+#include "parasite.h"
+#include "compiler.h"
+#include "kerndat.h"
+#include "vdso.h"
+#include "util.h"
+#include "log.h"
+#include "mem.h"
+#include "vma.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
+u64 vdso_pfn = VDSO_BAD_PFN;
+/*
+ * The VMAs list might have proxy vdso/vvar areas left
+ * from a previous dump/restore cycle, so we need to detect
+ * them and eliminate them from the VMAs list; they will be
+ * generated again on restore if needed.
+ */
+int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+ struct vm_area_list *vma_area_list)
+{
+ unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
+ unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
+ struct vma_area *proxy_vdso_marked = NULL;
+ struct vma_area *proxy_vvar_marked = NULL;
+ struct parasite_vdso_vma_entry *args;
+ struct vma_area *vma;
+ int fd, ret = -1;
+ off_t off;
+ u64 pfn;
+
+ args = parasite_args(ctl, struct parasite_vdso_vma_entry);
+ fd = open_proc(pid, "pagemap");
+ if (fd < 0)
+ return -1;
+
+ list_for_each_entry(vma, &vma_area_list->h, list) {
+ if (!vma_area_is(vma, VMA_AREA_REGULAR))
+ continue;
+
+ if (vma_area_is(vma, VMA_FILE_SHARED) ||
+ vma_area_is(vma, VMA_FILE_PRIVATE))
+ continue;
+ /*
+ * This might be a VVAR area from a marked
+ * vDSO zone; we need to detect it earlier than
+ * the VDSO_PROT test because VVAR_PROT is a subset
+ * of it, but don't simply 'continue' here,
+ * sigh... what a mess.
+ */
+ BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
+
+ if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
+ if (proxy_vvar_addr != VVAR_BAD_ADDR &&
+ proxy_vvar_addr == vma->e->start) {
+ BUG_ON(proxy_vvar_marked);
+ proxy_vvar_marked = vma;
+ continue;
+ }
+ }
+
+ if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
+ continue;
+
+ if (vma->e->prot != VDSO_PROT) {
+ pr_debug("Dropping %lx using extra protection test\n",
+ vma->e->start);
+ continue;
+ }
+
+ if (vma->e->start > TASK_SIZE)
+ continue;
+
+ if (vma->e->flags & MAP_GROWSDOWN)
+ continue;
+
+ /*
+ * We need to poke every potentially marked vma,
+ * otherwise if the task never called any vdso function
+ * the page frame number won't be reported.
+ */
+ args->start = vma->e->start;
+ args->len = vma_area_len(vma);
+
+ if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
+ pr_err("vdso: Parasite failed to poke for mark\n");
+ ret = -1;
+ goto err;
+ }
+
+ /*
+ * Defer handling marked vdso until we walked over
+ * all vmas and restore potentially remapped vDSO
+ * area status.
+ */
+ if (unlikely(args->is_marked)) {
+ if (proxy_vdso_marked) {
+ pr_err("Ow! Second vdso mark detected!\n");
+ ret = -1;
+ goto err;
+ }
+ proxy_vdso_marked = vma;
+ proxy_vdso_addr = args->proxy_vdso_addr;
+ proxy_vvar_addr = args->proxy_vvar_addr;
+ continue;
+ }
+
+ off = (vma->e->start / PAGE_SIZE) * sizeof(u64);
+ ret = pread(fd, &pfn, sizeof(pfn), off);
+ if (ret < 0 || ret != sizeof(pfn)) {
+ pr_perror("Can't read pme for pid %d", pid);
+ ret = -1;
+ goto err;
+ }
+
+ pfn = PME_PFRAME(pfn);
+ if (!pfn) {
+ pr_err("Unexpected page fram number 0 for pid %d\n", pid);
+ ret = -1;
+ goto err;
+ }
+
+ /*
+ * Setup proper VMA status. Note starting with 3.16
+ * the [vdso]/[vvar] marks are reported correctly
+ * even when they are remapped into a new place,
+ * but only since that particular version of the
+ * kernel!
+ */
+ if (pfn == vdso_pfn) {
+ if (!vma_area_is(vma, VMA_AREA_VDSO)) {
+ pr_debug("vdso: Restore vDSO status by pfn at %lx\n",
+ (long)vma->e->start);
+ vma->e->status |= VMA_AREA_VDSO;
+ }
+ } else {
+ if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
+ pr_debug("vdso: Drop mishinted vDSO status at %lx\n",
+ (long)vma->e->start);
+ vma->e->status &= ~VMA_AREA_VDSO;
+ }
+ }
+ }
+
+ /*
+ * If there is a marked vdso, it means such a vdso is
+ * autogenerated and must be dropped from the vma list.
+ */
+ if (proxy_vdso_marked) {
+ pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
+ (long)proxy_vdso_marked->e->start,
+ (long)proxy_vdso_addr, (long)proxy_vvar_addr);
+
+ /*
+ * Don't forget to restore the proxy vdso/vvar status, since
+ * it's unknown to the kernel.
+ */
+ list_for_each_entry(vma, &vma_area_list->h, list) {
+ if (vma->e->start == proxy_vdso_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
+ pr_debug("vdso: Restore proxy vDSO status at %lx\n",
+ (long)vma->e->start);
+ } else if (vma->e->start == proxy_vvar_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
+ pr_debug("vdso: Restore proxy VVAR status at %lx\n",
+ (long)vma->e->start);
+ }
+ }
+
+ pr_debug("vdso: Droppping marked vdso at %lx\n",
+ (long)proxy_vdso_marked->e->start);
+ list_del(&proxy_vdso_marked->list);
+ xfree(proxy_vdso_marked);
+ vma_area_list->nr--;
+
+ if (proxy_vvar_marked) {
+ pr_debug("vdso: Droppping marked vvar at %lx\n",
+ (long)proxy_vvar_marked->e->start);
+ list_del(&proxy_vvar_marked->list);
+ xfree(proxy_vvar_marked);
+ vma_area_list->nr--;
+ }
+ }
+ ret = 0;
+err:
+ close(fd);
+ return ret;
+}
+
+static int vdso_fill_self_symtable(struct vdso_symtable *s)
+{
+ char buf[512];
+ int ret = -1;
+ FILE *maps;
+
+ *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
+
+ maps = fopen_proc(PROC_SELF, "maps");
+ if (!maps) {
+ pr_perror("Can't open self-vma");
+ return -1;
+ }
+
+ while (fgets(buf, sizeof(buf), maps)) {
+ unsigned long start, end;
+ char *has_vdso, *has_vvar;
+
+ has_vdso = strstr(buf, "[vdso]");
+ if (!has_vdso)
+ has_vvar = strstr(buf, "[vvar]");
+ else
+ has_vvar = NULL;
+
+ if (!has_vdso && !has_vvar)
+ continue;
+
+ ret = sscanf(buf, "%lx-%lx", &start, &end);
+ if (ret != 2) {
+ ret = -1;
+ pr_err("Can't find vDSO/VVAR bounds\n");
+ goto err;
+ }
+
+ if (has_vdso) {
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ pr_err("Got second vDSO entry\n");
+ ret = -1;
+ goto err;
+ }
+ s->vma_start = start;
+ s->vma_end = end;
+
+ ret = vdso_fill_symtable((void *)start, end - start, s);
+ if (ret)
+ goto err;
+ } else {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ pr_err("Got second VVAR entry\n");
+ ret = -1;
+ goto err;
+ }
+ s->vvar_start = start;
+ s->vvar_end = end;
+ }
+ }
+
+ /*
+ * Validate its structure -- for new vDSO format the
+ * structure must be like
+ *
+ * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
+ * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
+ *
+ * The areas may be in reverse order.
+ *
+ * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
+ * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
+ *
+ */
+ ret = 0;
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ if (s->vma_end != s->vvar_start &&
+ s->vvar_end != s->vma_start) {
+ ret = -1;
+ pr_err("Unexpected rt vDSO area bounds\n");
+ goto err;
+ }
+ }
+ } else {
+ ret = -1;
+ pr_err("Can't find rt vDSO\n");
+ goto err;
+ }
+
+ pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ s->vma_start, s->vma_end,
+ s->vvar_start, s->vvar_end);
+
+err:
+ fclose(maps);
+ return ret;
+}
+
+int vdso_init(void)
+{
+ if (vdso_fill_self_symtable(&vdso_sym_rt))
+ return -1;
+ return vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn);
+}
diff --git a/cr-restore.c b/cr-restore.c
index 9d28e69e268d..e100164d4fcb 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -2751,8 +2751,13 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
* might be completely unused so it's here just for convenience.
*/
restore_thread_exec_start = restorer_sym(exec_mem_hint, __export_restore_thread);
+#ifdef CONFIG_PPC64
+ restore_task_exec_start = restorer_sym(exec_mem_hint, __export_restore_task_trampoline);
+ rsti(current)->munmap_restorer = restorer_sym(exec_mem_hint, __export_unmap_trampoline);
+#else
restore_task_exec_start = restorer_sym(exec_mem_hint, __export_restore_task);
rsti(current)->munmap_restorer = restorer_sym(exec_mem_hint, __export_unmap);
+#endif
exec_mem_hint += restorer_len;
diff --git a/include/image.h b/include/image.h
index 55e63dd9c7e2..c13ead0e51aa 100644
--- a/include/image.h
+++ b/include/image.h
@@ -11,7 +11,11 @@
#include "bfd.h"
#include "bug.h"
+#ifdef _ARCH_PPC64
+#define PAGE_IMAGE_SIZE 65536
+#else
#define PAGE_IMAGE_SIZE 4096
+#endif /* _ARCH_PPC64 */
#define PAGE_RSS 1
#define PAGE_ANON 2
diff --git a/pie/Makefile b/pie/Makefile
index c0e8f62cee23..11620d7dcee9 100644
--- a/pie/Makefile
+++ b/pie/Makefile
@@ -10,6 +10,9 @@ obj-e += $(ARCH_DIR)/vdso-pie.o
ifeq ($(SRCARCH),aarch64)
asm-e += $(ARCH_DIR)/intraprocedure.o
endif
+ifeq ($(SRCARCH), ppc64)
+asm-e += $(ARCH_DIR)/vdso-trampoline.o
+endif
endif
parasite-obj-y += parasite.o
@@ -18,6 +21,9 @@ parasite-libs-e += $(SYSCALL-LIB)
restorer-obj-y += restorer.o
restorer-obj-e += $(ARCH_DIR)/restorer.o
+ifeq ($(SRCARCH), ppc64)
+restorer-asm-e += $(ARCH_DIR)/restorer-trampoline.o
+endif
restorer-libs-e += $(SYSCALL-LIB)
#
diff --git a/pie/pie.lds.S.in b/pie/pie.lds.S.in
index f1dc526ef762..9e9c97f003c3 100644
--- a/pie/pie.lds.S.in
+++ b/pie/pie.lds.S.in
@@ -12,6 +12,8 @@ SECTIONS
. = ALIGN(32);
*(.got*)
. = ALIGN(32);
+ *(.toc*)
+ . = ALIGN(32);
} =0x00000000,
/DISCARD/ : {
diff --git a/protobuf/Makefile b/protobuf/Makefile
index d4e177462d11..0b1185203573 100644
--- a/protobuf/Makefile
+++ b/protobuf/Makefile
@@ -3,6 +3,7 @@ proto-obj-y += core.o
proto-obj-y += core-x86.o
proto-obj-y += core-arm.o
proto-obj-y += core-aarch64.o
+proto-obj-y += core-ppc64.o
proto-obj-y += cpuinfo.o
proto-obj-y += inventory.o
proto-obj-y += fdinfo.o
diff --git a/protobuf/core-ppc64.proto b/protobuf/core-ppc64.proto
new file mode 100644
index 000000000000..b874ccf88b9e
--- /dev/null
+++ b/protobuf/core-ppc64.proto
@@ -0,0 +1,23 @@
+message user_ppc64_regs_entry {
+ // Following is the list of registers starting at r0.
+ repeated uint64 gpr = 1;
+ required uint64 nip = 2;
+ required uint64 msr = 3;
+ required uint64 orig_gpr3 = 4;
+ required uint64 ctr = 5;
+ required uint64 link = 6;
+ required uint64 xer = 7;
+ required uint64 ccr = 8;
+ required uint64 trap = 9;
+}
+
+message user_ppc64_fpstate_entry {
+ // Following is the list of registers starting at fpr0
+ repeated uint64 fpregs = 1;
+}
+
+message thread_info_ppc64 {
+ required uint64 clear_tid_addr = 1;
+ required user_ppc64_regs_entry gpregs = 2;
+ optional user_ppc64_fpstate_entry fpstate = 3;
+}
diff --git a/protobuf/core.proto b/protobuf/core.proto
index 1f44a470cb78..9f70da929aab 100644
--- a/protobuf/core.proto
+++ b/protobuf/core.proto
@@ -1,6 +1,7 @@
import "core-x86.proto";
import "core-arm.proto";
import "core-aarch64.proto";
+import "core-ppc64.proto";
import "rlimit.proto";
import "timer.proto";
@@ -70,12 +71,14 @@ message core_entry {
X86_64 = 1;
ARM = 2;
AARCH64 = 3;
+ PPC64 = 4;
}
required march mtype = 1;
optional thread_info_x86 thread_info = 2;
optional thread_info_arm ti_arm = 6;
optional thread_info_aarch64 ti_aarch64 = 8;
+ optional thread_info_ppc64 ti_ppc64 = 9;
optional task_core_entry tc = 3;
optional task_kobj_ids_entry ids = 4;
--
1.9.1
More information about the CRIU
mailing list