[Devel] [PATCH 2/2] c/r: x86-64: checkpoint/restart implementation
Oren Laadan
orenl at cs.columbia.edu
Sun Dec 6 12:31:09 PST 2009
Support for checkpoint and restart for X86_64 architecture.
Partly based on Alexey's work.
Checkpoint Restart
(app/arch) (app/arch)
--------------------------------
64/x86-64 -> 64/x86-64 works
32/x86-64 -> 32/x86-64 ?
32/x86-64 -> 32/x86-32 ?
32/x86-32 -> 32/x86-64 ?
Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
---
arch/x86/Kconfig | 2 +-
arch/x86/include/asm/checkpoint_hdr.h | 6 +
arch/x86/include/asm/syscalls.h | 6 +
arch/x86/include/asm/unistd_64.h | 4 +
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/checkpoint_64.c | 251 +++++++++++++++++++++++++++++++++
arch/x86/kernel/entry_64.S | 5 +
include/linux/checkpoint_hdr.h | 2 +
8 files changed, 277 insertions(+), 1 deletions(-)
create mode 100644 arch/x86/kernel/checkpoint_64.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 69d6077..f6260f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,7 +88,7 @@ config HAVE_LATENCYTOP_SUPPORT
config CHECKPOINT_SUPPORT
bool
- default y if X86_32
+ default y
config MMU
def_bool y
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index 65511ca..0033bfe 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -36,6 +36,10 @@
#include <asm/processor.h>
#endif
+#ifdef CONFIG_X86_64
+#define CKPT_ARCH_ID CKPT_ARCH_X86_64
+#endif
+
#ifdef CONFIG_X86_32
#define CKPT_ARCH_ID CKPT_ARCH_X86_32
#endif
@@ -135,6 +139,8 @@ struct ckpt_hdr_cpu {
#define CKPT_X86_SEG_NULL 0
#define CKPT_X86_SEG_USER32_CS 1
#define CKPT_X86_SEG_USER32_DS 2
+#define CKPT_X86_SEG_USER64_CS 3
+#define CKPT_X86_SEG_USER64_DS 4
#define CKPT_X86_SEG_TLS 0x4000 /* 0100 0000 0000 00xx */
#define CKPT_X86_SEG_LDT 0x8000 /* 100x xxxx xxxx xxxx */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 1079447..063cdd0 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -88,6 +88,12 @@ asmlinkage long sys_execve(char __user *, char __user * __user *,
struct pt_regs *);
long sys_arch_prctl(int, unsigned long);
+/* kernel/checkpoint_64.c */
+#ifdef CONFIG_CHECKPOINT
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd,
+ struct pt_regs *regs);
+#endif
+
/* kernel/signal.c */
asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
struct pt_regs *);
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2ffc89..c360707 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
#define __NR_eclone 299
__SYSCALL(__NR_eclone, stub_eclone)
+#define __NR_checkpoint 300
+__SYSCALL(__NR_checkpoint, sys_checkpoint)
+#define __NR_restart 301
+__SYSCALL(__NR_restart, stub_restart)
#ifndef __NO_STUBS
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2821fd6..ded0ee2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,4 +138,6 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
+
+ obj-$(CONFIG_CHECKPOINT) += checkpoint_64.o
endif
diff --git a/arch/x86/kernel/checkpoint_64.c b/arch/x86/kernel/checkpoint_64.c
new file mode 100644
index 0000000..3901a53
--- /dev/null
+++ b/arch/x86/kernel/checkpoint_64.c
@@ -0,0 +1,251 @@
+/*
+ * Checkpoint/restart - architecture specific support for x86_64
+ *
+ * Copyright (C) 2009 Oren Laadan
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * sys_restart needs to access and modify the pt_regs structure to
+ * restore the original state from the time of the checkpoint.
+ *
+ * This is the x86_64 entry point: it forwards pid, fd, flags and logfd
+ * to do_sys_restart() and returns its result unchanged.
+ *
+ * NOTE(review): @regs is accepted but not referenced in this body;
+ * presumably the register state is reached by other means during
+ * restart - confirm.
+ */
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd,
+ struct pt_regs *regs)
+{
+ return do_sys_restart(pid, fd, flags, logfd);
+}
+
+/* helpers to encode/decode/validate segments */
+
+/*
+ * check_segment - validate an encoded (CKPT_X86_SEG_*) segment value.
+ *
+ * Returns 1 if @seg is one of the fixed encodings (null, user 64-bit
+ * cs/ds and, with CONFIG_COMPAT, user 32-bit cs/ds), a TLS encoding
+ * whose index fits the GDT TLS range, or an LDT encoding whose index
+ * does not exceed 0x1fff. Returns 0 for anything else.
+ */
+int check_segment(__u16 seg)
+{
+	__u16 idx;
+
+	/* fixed, well-known encodings */
+	if (seg == CKPT_X86_SEG_NULL ||
+	    seg == CKPT_X86_SEG_USER64_CS ||
+	    seg == CKPT_X86_SEG_USER64_DS)
+		return 1;
+#ifdef CONFIG_COMPAT
+	if (seg == CKPT_X86_SEG_USER32_CS ||
+	    seg == CKPT_X86_SEG_USER32_DS)
+		return 1;
+#endif
+
+	/* the TLS flag takes precedence over the LDT flag */
+	if (seg & CKPT_X86_SEG_TLS) {
+		idx = seg & ~CKPT_X86_SEG_TLS;
+		return idx <= GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN;
+	}
+
+	if (seg & CKPT_X86_SEG_LDT) {
+		idx = seg & ~CKPT_X86_SEG_LDT;
+		return idx <= 0x1fff;
+	}
+
+	return 0;
+}
+
+/*
+ * encode_segment - translate a hardware segment selector into its
+ * checkpoint-image (CKPT_X86_SEG_*) representation.
+ *
+ * A zero selector maps to CKPT_X86_SEG_NULL. The well-known user
+ * selectors (__USER_CS/__USER_DS and, with CONFIG_COMPAT,
+ * __USER32_CS/__USER32_DS) map to their dedicated constants. Any other
+ * selector is encoded as CKPT_X86_SEG_LDT or CKPT_X86_SEG_TLS combined
+ * with an index.
+ *
+ * A non-zero selector whose two low bits are not both set, or a GDT
+ * selector outside the TLS range, triggers BUG().
+ */
+__u16 encode_segment(unsigned short seg)
+{
+	unsigned short index;
+
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	/* the two low bits of any non-zero selector must both be set */
+	BUG_ON((seg & 3) != 3);
+
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+
+	/* bit 2 set: treated as an LDT selector, keep its index */
+	if (seg & 4)
+		return CKPT_X86_SEG_LDT | (seg >> 3);
+
+	/* otherwise only GDT entries inside the TLS range are accepted */
+	index = seg >> 3;
+	if (GDT_ENTRY_TLS_MIN <= index && index <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (index - GDT_ENTRY_TLS_MIN);
+
+	/* report the full, unshifted selector that could not be encoded */
+	printk(KERN_ERR "c/r: (encode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+/*
+ * decode_segment - translate a checkpoint-image (CKPT_X86_SEG_*) value
+ * back into a hardware segment selector.
+ *
+ * Fixed encodings map to 0, __USER_CS, __USER_DS and (with
+ * CONFIG_COMPAT) __USER32_CS, __USER32_DS. TLS encodings become a GDT
+ * selector offset from GDT_ENTRY_TLS_MIN with the two low bits set; LDT
+ * encodings become a selector with the three low bits set. Any other
+ * value triggers BUG().
+ */
+unsigned short decode_segment(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+		return 0;
+	case CKPT_X86_SEG_USER64_CS:
+		return __USER_CS;
+	case CKPT_X86_SEG_USER64_DS:
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+		return __USER32_CS;
+	case CKPT_X86_SEG_USER32_DS:
+		return __USER32_DS;
+#endif
+	}
+
+	/* the TLS flag is examined before the LDT flag */
+	if (seg & CKPT_X86_SEG_TLS) {
+		__u16 slot = seg & ~CKPT_X86_SEG_TLS;
+
+		return ((GDT_ENTRY_TLS_MIN + slot) << 3) | 3;
+	}
+
+	if (seg & CKPT_X86_SEG_LDT) {
+		__u16 entry = seg & ~CKPT_X86_SEG_LDT;
+
+		return (entry << 3) | 7;
+	}
+
+	BUG();
+}
+
+/*
+ * save_cpu_regs - record the register state of task @t in the checkpoint
+ * header @h.
+ *
+ * General purpose registers, ip, flags and sp are copied from the task's
+ * saved pt_regs. Segment selectors are stored in their encode_segment()
+ * form rather than as raw selector values.
+ */
+void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+ struct pt_regs *regs = task_pt_regs(t);
+ unsigned long _ds, _es, _fs, _gs;
+
+ h->r15 = regs->r15;
+ h->r14 = regs->r14;
+ h->r13 = regs->r13;
+ h->r12 = regs->r12;
+ h->r11 = regs->r11;
+ h->r10 = regs->r10;
+ h->r9 = regs->r9;
+ h->r8 = regs->r8;
+
+ h->bp = regs->bp;
+ h->bx = regs->bx;
+ h->ax = regs->ax;
+ h->cx = regs->cx;
+ h->dx = regs->dx;
+ h->si = regs->si;
+ h->di = regs->di;
+ h->orig_ax = regs->orig_ax;
+ h->ip = regs->ip;
+
+ h->flags = regs->flags;
+ h->sp = regs->sp;
+
+ /* cs and ss are always taken from the saved pt_regs */
+
+ h->cs = encode_segment(regs->cs);
+ h->ss = encode_segment(regs->ss);
+
+ /*
+ * for checkpoint in process context (from within a container)
+ * DS, ES, FS, GS registers should be saved from the hardware;
+ * otherwise they are already saved on the thread structure
+ */
+ if (t == current) {
+ savesegment(ds, _ds);
+ savesegment(es, _es);
+ savesegment(fs, _fs);
+ savesegment(gs, _gs);
+ } else {
+ _ds = t->thread.ds;
+ _es = t->thread.es;
+ _fs = t->thread.fsindex;
+ _gs = t->thread.gsindex;
+ }
+ h->ds = encode_segment(_ds);
+ h->es = encode_segment(_es);
+ h->fsindex = encode_segment(_fs);
+ h->gsindex = encode_segment(_gs);
+
+ /*
+ * thread.fs and thread.gs are recorded only for tasks without
+ * TIF_IA32; for TIF_IA32 tasks h->fs and h->gs are not written here
+ */
+ if (!test_tsk_thread_flag(t, TIF_IA32)) {
+ h->fs = t->thread.fs;
+ h->gs = t->thread.gs;
+ }
+
+ /*
+ * for checkpoint in process context (from within a container),
+ * the actual syscall is taking place at this very moment; so
+ * we (optimistically) substitute the future return value (0) of
+ * this syscall into the saved ax, so that upon restart it will
+ * succeed (or it will endlessly retry checkpoint...)
+ *
+ * NOTE(review): if h->orig_ax is declared as an unsigned field,
+ * the BUG_ON below can never trigger - confirm the field type.
+ */
+ if (t == current) {
+ BUG_ON(h->orig_ax < 0);
+ h->ax = 0;
+ }
+}
+
+/*
+ * load_cpu_regs - install the register state stored in checkpoint header
+ * @h into task @t.
+ *
+ * All encoded segment values are validated with check_segment() before
+ * anything is written; a null cs is rejected as well.
+ *
+ * Returns 0 on success, or -EINVAL if validation fails.
+ *
+ * NOTE(review): h->flags is not applied to regs->flags in this function;
+ * presumably that is handled elsewhere - confirm.
+ */
+int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	if (h->cs == CKPT_X86_SEG_NULL)
+		return -EINVAL;
+	if (!check_segment(h->cs) || !check_segment(h->ds) ||
+	    !check_segment(h->es) || !check_segment(h->ss) ||
+	    !check_segment(h->fsindex) || !check_segment(h->gsindex))
+		return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+	/*
+	 * NOTE(review): for TIF_IA32 tasks h->fs/h->gs are run through
+	 * check_segment(), although save_cpu_regs() does not write these
+	 * fields for such tasks - confirm this is intended.
+	 */
+	if (test_tsk_thread_flag(t, TIF_IA32) &&
+	    (!check_segment(h->fs) || !check_segment(h->gs)))
+		return -EINVAL;
+#endif
+
+	regs->r15 = h->r15;
+	regs->r14 = h->r14;
+	regs->r13 = h->r13;
+	regs->r12 = h->r12;
+	regs->r11 = h->r11;
+	regs->r10 = h->r10;
+	regs->r9 = h->r9;
+	regs->r8 = h->r8;
+
+	regs->bp = h->bp;
+	regs->bx = h->bx;
+	regs->ax = h->ax;
+	regs->cx = h->cx;
+	regs->dx = h->dx;
+	regs->si = h->si;
+	regs->di = h->di;
+	regs->orig_ax = h->orig_ax;
+	regs->ip = h->ip;
+
+	regs->sp = h->sp;
+	thread->usersp = h->sp;
+
+	preempt_disable();
+
+	regs->cs = decode_segment(h->cs);
+	regs->ss = decode_segment(h->ss);
+	thread->ds = decode_segment(h->ds);
+	thread->es = decode_segment(h->es);
+	thread->fsindex = decode_segment(h->fsindex);
+	thread->gsindex = decode_segment(h->gsindex);
+
+	/*
+	 * save_cpu_regs() records fs/gs for every task without TIF_IA32,
+	 * whether or not CONFIG_COMPAT is set, so restore them under the
+	 * same condition; guarding this with CONFIG_COMPAT would drop the
+	 * saved values on kernels built without compat support.
+	 */
+	if (!test_tsk_thread_flag(t, TIF_IA32)) {
+		thread->fs = h->fs;
+		thread->gs = h->gs;
+	}
+
+	/*
+	 * XXX - unsure if this is really needed ...
+	 *
+	 * NOTE(review): the loads and MSR writes below act on the CPU
+	 * executing this code, which presumably requires @t to be
+	 * current - confirm against the caller.
+	 */
+	loadsegment(fs, thread->fsindex);
+	if (thread->fs)
+		wrmsrl(MSR_FS_BASE, thread->fs);
+	load_gs_index(thread->gsindex);
+	if (thread->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, thread->gs);
+
+	preempt_enable();
+
+	return 0;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6d60cd1..e692193 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -699,6 +699,11 @@ END(\label)
PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
PTREGSCALL stub_iopl, sys_iopl, %rsi
PTREGSCALL stub_eclone, sys_eclone, %r8
+#ifdef CONFIG_CHECKPOINT
+ PTREGSCALL stub_restart, sys_restart, %r8
+#else
+ PTREGSCALL stub_restart, sys_ni_syscall, %r8
+#endif
ENTRY(ptregscall_common)
DEFAULT_FRAME 1 8 /* offset 8: return address */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 4e57d37..6468fa9 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -195,6 +195,8 @@ enum {
#define CKPT_ARCH_PPC32 CKPT_ARCH_PPC32
CKPT_ARCH_PPC64,
#define CKPT_ARCH_PPC64 CKPT_ARCH_PPC64
+ CKPT_ARCH_X86_64,
+#define CKPT_ARCH_X86_64 CKPT_ARCH_X86_64
};
/* shared objrects (objref) */
--
1.6.3.3
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list