[Devel] Re: [RFC v14-rc2][PATCH 05/29] x86 support for checkpoint/restart

Sukadev Bhattiprolu sukadev at linux.vnet.ibm.com
Mon Apr 6 20:25:31 PDT 2009


Couple of nits:

Oren Laadan [orenl at cs.columbia.edu] wrote:
| From f5dec38baa6a2cc2a88783db3a9afd676821d293 Mon Sep 17 00:00:00 2001
| From: Oren Laadan <orenl at cs.columbia.edu>
| Date: Mon, 30 Mar 2009 11:14:32 -0400
| Subject: [PATCH 05/29] x86 support for checkpoint/restart
| 
| Add logic to save and restore architecture specific state, including
| thread-specific state, CPU registers and FPU state.
| 
| In addition, architecture capabilities are saved in an architecure
| specific extension of the header (cr_hdr_head_arch); Currently this
| includes only FPU capabilities.
| 
| Currently only x86-32 is supported. Compiling on x86-64 will trigger
| an explicit error.
| 
| Changelog[v14]:
|   - Remove preempt_disable/enable() around init_fpu() and fix leak
|   - Revert change to pr_debug(), back to cr_debug()
|   - Use only unsigned fields in checkpoint headers
|   - Discard field 'h->parent'
|   - Check whether calls to cr_hbuf_get() fail
| 
| Changelog[v12]:
|   - A couple of missed calls to cr_hbuf_put()
|   - Replace obsolete cr_debug() with pr_debug()
| 
| Changelog[v9]:
|   - Add arch-specific header that details architecture capabilities;
|     split FPU restore to send capabilities only once.
|   - Test for zero TLS entries in cr_write_thread()
|   - Fix asm/checkpoint_hdr.h so it can be included from user-space
| 
| Changelog[v7]:
|   - Fix save/restore state of FPU
| 
| Changelog[v5]:
|   - Remove preempt_disable() when restoring debug registers
| 
| Changelog[v4]:
|   - Fix header structure alignment
| 
| Changelog[v2]:
|   - Pad header structures to 64 bits to ensure compatibility
|   - Follow Dave Hansen's refactoring of the original post
| 
| Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
| Acked-by: Serge Hallyn <serue at us.ibm.com>
| Signed-off-by: Dave Hansen <dave at linux.vnet.ibm.com>
| ---
|  arch/x86/include/asm/checkpoint_hdr.h |   98 +++++++++++++
|  arch/x86/mm/Makefile                  |    2 +
|  arch/x86/mm/checkpoint.c              |  242 +++++++++++++++++++++++++++++++++
|  arch/x86/mm/restart.c                 |  223 ++++++++++++++++++++++++++++++
|  checkpoint/checkpoint.c               |   19 ++-
|  checkpoint/checkpoint_arch.h          |    9 ++
|  checkpoint/restart.c                  |   17 ++-
|  include/linux/checkpoint_hdr.h        |    2 +
|  8 files changed, 608 insertions(+), 4 deletions(-)
|  create mode 100644 arch/x86/include/asm/checkpoint_hdr.h
|  create mode 100644 arch/x86/mm/checkpoint.c
|  create mode 100644 arch/x86/mm/restart.c
|  create mode 100644 checkpoint/checkpoint_arch.h
| 
| diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
| new file mode 100644
| index 0000000..ffdb5f5
| --- /dev/null
| +++ b/arch/x86/include/asm/checkpoint_hdr.h
| @@ -0,0 +1,98 @@
| +#ifndef __ASM_X86_CKPT_HDR_H
| +#define __ASM_X86_CKPT_HDR_H
| +/*
| + *  Checkpoint/restart - architecture specific headers x86
| + *
| + *  Copyright (C) 2008-2009 Oren Laadan
| + *
| + *  This file is subject to the terms and conditions of the GNU General Public
| + *  License.  See the file COPYING in the main directory of the Linux
| + *  distribution for more details.
| + */
| +
| +#include <linux/types.h>
| +
| +/*
| + * To maintain compatibility between 32-bit and 64-bit architecture flavors,
| + * keep data 64-bit aligned: use padding for structure members, and use
| + * __attribute__ ((aligned (8))) for the entire structure.
| + *
| + * Quoting Arnd Bergmann:
| + *   "This structure has an odd multiple of 32-bit members, which means
| + *   that if you put it into a larger structure that also contains 64-bit
| + *   members, the larger structure may get different alignment on x86-32
| + *   and x86-64, which you might want to avoid. I can't tell if this is
| + *   an actual problem here. ... In this case, I'm pretty sure that
| + *   sizeof(cr_hdr_task) on x86-32 is different from x86-64, since it
| + *   will be 32-bit aligned on x86-32."
| + */
| +
| +/* i387 structure seen from kernel/userspace */
| +#ifdef __KERNEL__
| +#include <asm/processor.h>
| +#else
| +#include <sys/user.h>
| +#endif
| +
| +struct cr_hdr_head_arch {
| +	/* FIXME: add HAVE_HWFP */
| +
| +	__u16 has_fxsr;
| +	__u16 has_xsave;
| +	__u16 xstate_size;
| +	__u16 _pading;
| +} __attribute__((aligned(8)));
| +
| +struct cr_hdr_thread {
| +	/* FIXME: restart blocks */
| +
| +	__u16 gdt_entry_tls_entries;
| +	__u16 sizeof_tls_array;
| +	__u16 ntls;	/* number of TLS entries to follow */
| +} __attribute__((aligned(8)));
| +
| +struct cr_hdr_cpu {
| +	/* see struct pt_regs (x86-64) */
| +	__u64 r15;
| +	__u64 r14;
| +	__u64 r13;
| +	__u64 r12;
| +	__u64 bp;
| +	__u64 bx;
| +	__u64 r11;
| +	__u64 r10;
| +	__u64 r9;
| +	__u64 r8;
| +	__u64 ax;
| +	__u64 cx;
| +	__u64 dx;
| +	__u64 si;
| +	__u64 di;
| +	__u64 orig_ax;
| +	__u64 ip;
| +	__u64 cs;
| +	__u64 flags;
| +	__u64 sp;
| +	__u64 ss;
| +
| +	/* segment registers */
| +	__u64 ds;
| +	__u64 es;
| +	__u64 fs;
| +	__u64 gs;
| +
| +	/* debug registers */
| +	__u64 debugreg0;
| +	__u64 debugreg1;
| +	__u64 debugreg2;
| +	__u64 debugreg3;
| +	__u64 debugreg6;
| +	__u64 debugreg7;
| +
| +	__u32 uses_debug;
| +	__u32 used_math;
| +
| +	/* thread_xstate contents follow (if used_math) */
| +} __attribute__((aligned(8)));
| +
| +#endif /* __ASM_X86_CKPT_HDR__H */
| diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
| index d8cc96a..e1cb5f8 100644
| --- a/arch/x86/mm/Makefile
| +++ b/arch/x86/mm/Makefile
| @@ -17,3 +17,5 @@ obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
|  obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
|  
|  obj-$(CONFIG_MEMTEST)		+= memtest.o
| +
| +obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o restart.o
| diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
| new file mode 100644
| index 0000000..946fac1
| --- /dev/null
| +++ b/arch/x86/mm/checkpoint.c
| @@ -0,0 +1,242 @@
| +/*
| + *  Checkpoint/restart - architecture specific support for x86
| + *
| + *  Copyright (C) 2008-2009 Oren Laadan
| + *
| + *  This file is subject to the terms and conditions of the GNU General Public
| + *  License.  See the file COPYING in the main directory of the Linux
| + *  distribution for more details.
| + */
| +
| +#include <asm/desc.h>
| +#include <asm/i387.h>
| +
| +#include <linux/checkpoint.h>
| +#include <linux/checkpoint_hdr.h>
| +
| +/* dump the thread_struct of a given task */
| +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
| +{
| +	struct cr_hdr h;
| +	struct cr_hdr_thread *hh;
| +	struct thread_struct *thread;
| +	struct desc_struct *desc;
| +	int ntls = 0;
| +	int n, ret;
| +
| +	h.type = CR_HDR_THREAD;
| +	h.len = sizeof(*hh);
| +
| +	hh = cr_hbuf_get(ctx, sizeof(*hh));
| +	if (!hh)
| +		return -ENOMEM;
| +
| +	thread = &t->thread;
| +
| +	/* calculate no. of TLS entries that follow */
| +	desc = thread->tls_array;
| +	for (n = GDT_ENTRY_TLS_ENTRIES; n > 0; n--, desc++) {
| +		if (desc->a || desc->b)
| +			ntls++;
| +	}
| +
| +	hh->gdt_entry_tls_entries = GDT_ENTRY_TLS_ENTRIES;
| +	hh->sizeof_tls_array = sizeof(thread->tls_array);
| +	hh->ntls = ntls;
| +
| +	ret = cr_write_obj(ctx, &h, hh);
| +	cr_hbuf_put(ctx, sizeof(*hh));
| +	if (ret < 0)
| +		return ret;
| +
| +	cr_debug("ntls %d\n", ntls);
| +	if (ntls == 0)
| +		return 0;
| +
| +	/*
| +	 * For simplicity dump the entire array, cherry-pick upon restart
| +	 * FIXME: the TLS descriptors in the GDT should be called out and
| +	 * not tied to the in-kernel representation.
| +	 */
| +	ret = cr_kwrite(ctx, thread->tls_array, sizeof(thread->tls_array));
| +
| +	/* IGNORE RESTART BLOCKS FOR NOW ... */
| +
| +	return ret;
| +}
| +
| +#ifdef CONFIG_X86_64
| +
| +#error "CONFIG_X86_64 unsupported yet."
| +
| +#else	/* !CONFIG_X86_64 */
| +
| +static void cr_save_cpu_regs(struct cr_hdr_cpu *hh, struct task_struct *t)
| +{
| +	struct thread_struct *thread = &t->thread;
| +	struct pt_regs *regs = task_pt_regs(t);
| +
| +	hh->bp = regs->bp;
| +	hh->bx = regs->bx;
| +	hh->ax = regs->ax;
| +	hh->cx = regs->cx;
| +	hh->dx = regs->dx;
| +	hh->si = regs->si;
| +	hh->di = regs->di;
| +	hh->orig_ax = regs->orig_ax;
| +	hh->ip = regs->ip;
| +	hh->cs = regs->cs;
| +	hh->flags = regs->flags;
| +	hh->sp = regs->sp;
| +	hh->ss = regs->ss;
| +
| +	hh->ds = regs->ds;
| +	hh->es = regs->es;
| +
| +	/*
| +	 * for checkpoint in process context (from within a container)
| +	 * the GS and FS registers should be saved from the hardware;
| +	 * otherwise they are already sabed on the thread structure
| +	 */
| +	if (t == current) {
| +		savesegment(gs, hh->gs);
| +		savesegment(fs, hh->fs);
| +	} else {
| +		hh->gs = thread->gs;
| +		hh->fs = thread->fs;
| +	}
| +
| +	/*
| +	 * for checkpoint in process context (from within a container),
| +	 * the actual syscall is taking place at this very moment; so
| +	 * we (optimistically) subtitute the future return value (0) of
| +	 * this syscall into the orig_eax, so that upon restart it will
| +	 * succeed (or it will endlessly retry checkpoint...)
| +	 */
| +	if (t == current) {
| +		BUG_ON(hh->orig_ax < 0);
| +		hh->ax = 0;
| +	}
| +}
| +
| +static void cr_save_cpu_debug(struct cr_hdr_cpu *hh, struct task_struct *t)
| +{
| +	struct thread_struct *thread = &t->thread;
| +
| +	/* debug regs */
| +
| +	/*
| +	 * for checkpoint in process context (from within a container),
| +	 * get the actual registers; otherwise get the saved values.
| +	 */
| +
| +	if (t == current) {
| +		get_debugreg(hh->debugreg0, 0);
| +		get_debugreg(hh->debugreg1, 1);
| +		get_debugreg(hh->debugreg2, 2);
| +		get_debugreg(hh->debugreg3, 3);
| +		get_debugreg(hh->debugreg6, 6);
| +		get_debugreg(hh->debugreg7, 7);
| +	} else {
| +		hh->debugreg0 = thread->debugreg0;
| +		hh->debugreg1 = thread->debugreg1;
| +		hh->debugreg2 = thread->debugreg2;
| +		hh->debugreg3 = thread->debugreg3;
| +		hh->debugreg6 = thread->debugreg6;
| +		hh->debugreg7 = thread->debugreg7;
| +	}
| +
| +	hh->uses_debug = !!(task_thread_info(t)->flags & TIF_DEBUG);
| +}
| +
| +static void cr_save_cpu_fpu(struct cr_hdr_cpu *hh, struct task_struct *t)
| +{
| +	hh->used_math = tsk_used_math(t) ? 1 : 0;
| +}
| +
| +static int cr_write_cpu_fpu(struct cr_ctx *ctx, struct task_struct *t)
| +{
| +	void *xstate_buf = cr_hbuf_get(ctx, xstate_size);
| +	int ret;
| +
| +	/* i387 + MMU + SSE logic */
| +	preempt_disable();	/* needed it (t == current) */
| +
| +	/*
| +	 * normally, no need to unlazy_fpu(), since TS_USEDFPU flag
| +	 * have been cleared when task was context-switched out...

Nit: s/have been/was/

| +	 * except if we are in process context, in which case we do
| +	 */
| +	if (t == current && (task_thread_info(t)->status & TS_USEDFPU))
| +		unlazy_fpu(current);
| +
| +	/*
| +	 * For simplicity dump the entire structure.
| +	 * FIXME: need to be deliberate about what registers we are
| +	 * dumping for traceability and compatibility.
| +	 */
| +	memcpy(xstate_buf, t->thread.xstate, xstate_size);
| +	preempt_enable();	/* needed it (t == current) */

Nit: s/it/if/
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list