[Devel] Re: [RFC v14-rc2][PATCH 05/29] x86 support for checkpoint/restart
Sukadev Bhattiprolu
sukadev at linux.vnet.ibm.com
Mon Apr 6 20:25:31 PDT 2009
Couple of nits:
Oren Laadan [orenl at cs.columbia.edu] wrote:
| From f5dec38baa6a2cc2a88783db3a9afd676821d293 Mon Sep 17 00:00:00 2001
| From: Oren Laadan <orenl at cs.columbia.edu>
| Date: Mon, 30 Mar 2009 11:14:32 -0400
| Subject: [PATCH 05/29] x86 support for checkpoint/restart
|
| Add logic to save and restore architecture specific state, including
| thread-specific state, CPU registers and FPU state.
|
| In addition, architecture capabilities are saved in an architecure
| specific extension of the header (cr_hdr_head_arch); Currently this
| includes only FPU capabilities.
|
| Currently only x86-32 is supported. Compiling on x86-64 will trigger
| an explicit error.
|
| Changelog[v14]:
| - Remove preempt_disable/enable() around init_fpu() and fix leak
| - Revert change to pr_debug(), back to cr_debug()
| - Use only unsigned fields in checkpoint headers
| - Discard field 'h->parent'
| - Check whether calls to cr_hbuf_get() fail
|
| Changelog[v12]:
| - A couple of missed calls to cr_hbuf_put()
| - Replace obsolete cr_debug() with pr_debug()
|
| Changelog[v9]:
| - Add arch-specific header that details architecture capabilities;
| split FPU restore to send capabilities only once.
| - Test for zero TLS entries in cr_write_thread()
| - Fix asm/checkpoint_hdr.h so it can be included from user-space
|
| Changelog[v7]:
| - Fix save/restore state of FPU
|
| Changelog[v5]:
| - Remove preempt_disable() when restoring debug registers
|
| Changelog[v4]:
| - Fix header structure alignment
|
| Changelog[v2]:
| - Pad header structures to 64 bits to ensure compatibility
| - Follow Dave Hansen's refactoring of the original post
|
| Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
| Acked-by: Serge Hallyn <serue at us.ibm.com>
| Signed-off-by: Dave Hansen <dave at linux.vnet.ibm.com>
| ---
| arch/x86/include/asm/checkpoint_hdr.h | 98 +++++++++++++
| arch/x86/mm/Makefile | 2 +
| arch/x86/mm/checkpoint.c | 242 +++++++++++++++++++++++++++++++++
| arch/x86/mm/restart.c | 223 ++++++++++++++++++++++++++++++
| checkpoint/checkpoint.c | 19 ++-
| checkpoint/checkpoint_arch.h | 9 ++
| checkpoint/restart.c | 17 ++-
| include/linux/checkpoint_hdr.h | 2 +
| 8 files changed, 608 insertions(+), 4 deletions(-)
| create mode 100644 arch/x86/include/asm/checkpoint_hdr.h
| create mode 100644 arch/x86/mm/checkpoint.c
| create mode 100644 arch/x86/mm/restart.c
| create mode 100644 checkpoint/checkpoint_arch.h
|
| diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
| new file mode 100644
| index 0000000..ffdb5f5
| --- /dev/null
| +++ b/arch/x86/include/asm/checkpoint_hdr.h
| @@ -0,0 +1,98 @@
| +#ifndef __ASM_X86_CKPT_HDR_H
| +#define __ASM_X86_CKPT_HDR_H
| +/*
| + * Checkpoint/restart - architecture specific headers x86
| + *
| + * Copyright (C) 2008-2009 Oren Laadan
| + *
| + * This file is subject to the terms and conditions of the GNU General Public
| + * License. See the file COPYING in the main directory of the Linux
| + * distribution for more details.
| + */
| +
| +#include <linux/types.h>
| +
| +/*
| + * To maintain compatibility between 32-bit and 64-bit architecture flavors,
| + * keep data 64-bit aligned: use padding for structure members, and use
| + * __attribute__ ((aligned (8))) for the entire structure.
| + *
| + * Quoting Arnd Bergmann:
| + * "This structure has an odd multiple of 32-bit members, which means
| + * that if you put it into a larger structure that also contains 64-bit
| + * members, the larger structure may get different alignment on x86-32
| + * and x86-64, which you might want to avoid. I can't tell if this is
| + * an actual problem here. ... In this case, I'm pretty sure that
| + * sizeof(cr_hdr_task) on x86-32 is different from x86-64, since it
| + * will be 32-bit aligned on x86-32."
| + */
| +
| +/* i387 structure seen from kernel/userspace */
| +#ifdef __KERNEL__
| +#include <asm/processor.h>
| +#else
| +#include <sys/user.h>
| +#endif
| +
| +struct cr_hdr_head_arch {
| + /* FIXME: add HAVE_HWFP */
| +
| + __u16 has_fxsr;
| + __u16 has_xsave;
| + __u16 xstate_size;
| + __u16 _pading;
| +} __attribute__((aligned(8)));
| +
| +struct cr_hdr_thread {
| + /* FIXME: restart blocks */
| +
| + __u16 gdt_entry_tls_entries;
| + __u16 sizeof_tls_array;
| + __u16 ntls; /* number of TLS entries to follow */
| +} __attribute__((aligned(8)));
| +
| +struct cr_hdr_cpu {
| + /* see struct pt_regs (x86-64) */
| + __u64 r15;
| + __u64 r14;
| + __u64 r13;
| + __u64 r12;
| + __u64 bp;
| + __u64 bx;
| + __u64 r11;
| + __u64 r10;
| + __u64 r9;
| + __u64 r8;
| + __u64 ax;
| + __u64 cx;
| + __u64 dx;
| + __u64 si;
| + __u64 di;
| + __u64 orig_ax;
| + __u64 ip;
| + __u64 cs;
| + __u64 flags;
| + __u64 sp;
| + __u64 ss;
| +
| + /* segment registers */
| + __u64 ds;
| + __u64 es;
| + __u64 fs;
| + __u64 gs;
| +
| + /* debug registers */
| + __u64 debugreg0;
| + __u64 debugreg1;
| + __u64 debugreg2;
| + __u64 debugreg3;
| + __u64 debugreg6;
| + __u64 debugreg7;
| +
| + __u32 uses_debug;
| + __u32 used_math;
| +
| + /* thread_xstate contents follow (if used_math) */
| +} __attribute__((aligned(8)));
| +
| +#endif /* __ASM_X86_CKPT_HDR__H */
| diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
| index d8cc96a..e1cb5f8 100644
| --- a/arch/x86/mm/Makefile
| +++ b/arch/x86/mm/Makefile
| @@ -17,3 +17,5 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o
| obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
|
| obj-$(CONFIG_MEMTEST) += memtest.o
| +
| +obj-$(CONFIG_CHECKPOINT) += checkpoint.o restart.o
| diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
| new file mode 100644
| index 0000000..946fac1
| --- /dev/null
| +++ b/arch/x86/mm/checkpoint.c
| @@ -0,0 +1,242 @@
| +/*
| + * Checkpoint/restart - architecture specific support for x86
| + *
| + * Copyright (C) 2008-2009 Oren Laadan
| + *
| + * This file is subject to the terms and conditions of the GNU General Public
| + * License. See the file COPYING in the main directory of the Linux
| + * distribution for more details.
| + */
| +
| +#include <asm/desc.h>
| +#include <asm/i387.h>
| +
| +#include <linux/checkpoint.h>
| +#include <linux/checkpoint_hdr.h>
| +
| +/* dump the thread_struct of a given task */
| +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
| +{
| + struct cr_hdr h;
| + struct cr_hdr_thread *hh;
| + struct thread_struct *thread;
| + struct desc_struct *desc;
| + int ntls = 0;
| + int n, ret;
| +
| + h.type = CR_HDR_THREAD;
| + h.len = sizeof(*hh);
| +
| + hh = cr_hbuf_get(ctx, sizeof(*hh));
| + if (!hh)
| + return -ENOMEM;
| +
| + thread = &t->thread;
| +
| + /* calculate no. of TLS entries that follow */
| + desc = thread->tls_array;
| + for (n = GDT_ENTRY_TLS_ENTRIES; n > 0; n--, desc++) {
| + if (desc->a || desc->b)
| + ntls++;
| + }
| +
| + hh->gdt_entry_tls_entries = GDT_ENTRY_TLS_ENTRIES;
| + hh->sizeof_tls_array = sizeof(thread->tls_array);
| + hh->ntls = ntls;
| +
| + ret = cr_write_obj(ctx, &h, hh);
| + cr_hbuf_put(ctx, sizeof(*hh));
| + if (ret < 0)
| + return ret;
| +
| + cr_debug("ntls %d\n", ntls);
| + if (ntls == 0)
| + return 0;
| +
| + /*
| + * For simplicity dump the entire array, cherry-pick upon restart
| + * FIXME: the TLS descriptors in the GDT should be called out and
| + * not tied to the in-kernel representation.
| + */
| + ret = cr_kwrite(ctx, thread->tls_array, sizeof(thread->tls_array));
| +
| + /* IGNORE RESTART BLOCKS FOR NOW ... */
| +
| + return ret;
| +}
| +
| +#ifdef CONFIG_X86_64
| +
| +#error "CONFIG_X86_64 unsupported yet."
| +
| +#else /* !CONFIG_X86_64 */
| +
| +static void cr_save_cpu_regs(struct cr_hdr_cpu *hh, struct task_struct *t)
| +{
| + struct thread_struct *thread = &t->thread;
| + struct pt_regs *regs = task_pt_regs(t);
| +
| + hh->bp = regs->bp;
| + hh->bx = regs->bx;
| + hh->ax = regs->ax;
| + hh->cx = regs->cx;
| + hh->dx = regs->dx;
| + hh->si = regs->si;
| + hh->di = regs->di;
| + hh->orig_ax = regs->orig_ax;
| + hh->ip = regs->ip;
| + hh->cs = regs->cs;
| + hh->flags = regs->flags;
| + hh->sp = regs->sp;
| + hh->ss = regs->ss;
| +
| + hh->ds = regs->ds;
| + hh->es = regs->es;
| +
| + /*
| + * for checkpoint in process context (from within a container)
| + * the GS and FS registers should be saved from the hardware;
| + * otherwise they are already sabed on the thread structure
| + */
| + if (t == current) {
| + savesegment(gs, hh->gs);
| + savesegment(fs, hh->fs);
| + } else {
| + hh->gs = thread->gs;
| + hh->fs = thread->fs;
| + }
| +
| + /*
| + * for checkpoint in process context (from within a container),
| + * the actual syscall is taking place at this very moment; so
| + * we (optimistically) subtitute the future return value (0) of
| + * this syscall into the orig_eax, so that upon restart it will
| + * succeed (or it will endlessly retry checkpoint...)
| + */
| + if (t == current) {
| + BUG_ON(hh->orig_ax < 0);
| + hh->ax = 0;
| + }
| +}
| +
| +static void cr_save_cpu_debug(struct cr_hdr_cpu *hh, struct task_struct *t)
| +{
| + struct thread_struct *thread = &t->thread;
| +
| + /* debug regs */
| +
| + /*
| + * for checkpoint in process context (from within a container),
| + * get the actual registers; otherwise get the saved values.
| + */
| +
| + if (t == current) {
| + get_debugreg(hh->debugreg0, 0);
| + get_debugreg(hh->debugreg1, 1);
| + get_debugreg(hh->debugreg2, 2);
| + get_debugreg(hh->debugreg3, 3);
| + get_debugreg(hh->debugreg6, 6);
| + get_debugreg(hh->debugreg7, 7);
| + } else {
| + hh->debugreg0 = thread->debugreg0;
| + hh->debugreg1 = thread->debugreg1;
| + hh->debugreg2 = thread->debugreg2;
| + hh->debugreg3 = thread->debugreg3;
| + hh->debugreg6 = thread->debugreg6;
| + hh->debugreg7 = thread->debugreg7;
| + }
| +
| + hh->uses_debug = !!(task_thread_info(t)->flags & TIF_DEBUG);
| +}
| +
| +static void cr_save_cpu_fpu(struct cr_hdr_cpu *hh, struct task_struct *t)
| +{
| + hh->used_math = tsk_used_math(t) ? 1 : 0;
| +}
| +
| +static int cr_write_cpu_fpu(struct cr_ctx *ctx, struct task_struct *t)
| +{
| + void *xstate_buf = cr_hbuf_get(ctx, xstate_size);
| + int ret;
| +
| + /* i387 + MMU + SSE logic */
| + preempt_disable(); /* needed it (t == current) */
| +
| + /*
| + * normally, no need to unlazy_fpu(), since TS_USEDFPU flag
| + * have been cleared when task was context-switched out...
Nit: s/have been/was/
| + * except if we are in process context, in which case we do
| + */
| + if (t == current && (task_thread_info(t)->status & TS_USEDFPU))
| + unlazy_fpu(current);
| +
| + /*
| + * For simplicity dump the entire structure.
| + * FIXME: need to be deliberate about what registers we are
| + * dumping for traceability and compatibility.
| + */
| + memcpy(xstate_buf, t->thread.xstate, xstate_size);
| + preempt_enable(); /* needed it (t == current) */
Nit: s/it/if/
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list