[Devel] Re: [PATCH 1/1] c/r: define s390-specific checkpoint-restart code (v3)
Oren Laadan
orenl at cs.columbia.edu
Tue Feb 3 11:08:10 PST 2009
Serge E. Hallyn wrote:
> Implement the s390 arch-specific checkpoint/restart helpers. This
> is on top of Oren Laadan's c/r code.
>
> With these, I am able to checkpoint and restart simple
> programs as per Oren's patch intro. While on x86 I never had
> to freeze a single task to checkpoint it, on s390 I do need
> to. That is a prereq for consistent snapshots (esp with
> multiple processes) anyway so I don't see that as a problem.
>
> I'm having a strange problem with libraries though. If I link a
> program with some extra libraries (-lm, -lcrypt, -lpthread,
> whatever), then after restart, if I do a fprintf("%f), the program
> segfaults. Not linking with extra libraries beside libc, or not
> doing a fprintf of a float, doesn't cause any segfaults after
> restart. ltrace and strace aren't helpful, and gdb says
> that the restarted program faulted at __printf_fp@@GLIBC2.4.
> objdump -d output shows no difference (of course, since this
> is after linking), but mentions a __dso_handle which doesn't
> look familiar compared to x86 output. /proc/$$/maps looks
> the same on original and restarted task too. So I'm
> flummoxed.
You can try to force a core-dump of memory contents (and registers)
at the end of the checkpoint and just before resuming to user space
in the restart. Then compare the two. This technique proved invaluable
to debug c/r issues.
>
> Changelog:
> Jan 30:
> . Switched types in cr_hdr_cpu to __u64 etc.
> (Per Oren suggestion)
> . Replaced direct inclusion of structs in
> cr_hdr_cpu with the struct members.
> (Per Oren suggestion)
> . Also ended up adding a bunch of new things
> into restart (mm_segment, ksp, etc) in vain
> attempt to get code using fpu to not segfault
> after restart.
>
> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
> ---
> arch/s390/include/asm/checkpoint_hdr.h | 91 ++++++++++++++
> arch/s390/include/asm/unistd.h | 4 +-
> arch/s390/kernel/compat_wrapper.S | 12 ++
> arch/s390/kernel/syscalls.S | 2 +
> arch/s390/mm/Makefile | 1 +
> arch/s390/mm/checkpoint.c | 206 ++++++++++++++++++++++++++++++++
> checkpoint/Kconfig | 2 +-
> 7 files changed, 316 insertions(+), 2 deletions(-)
> create mode 100644 arch/s390/include/asm/checkpoint_hdr.h
> create mode 100644 arch/s390/mm/checkpoint.c
>
> diff --git a/arch/s390/include/asm/checkpoint_hdr.h b/arch/s390/include/asm/checkpoint_hdr.h
> new file mode 100644
> index 0000000..f11ec74
> --- /dev/null
> +++ b/arch/s390/include/asm/checkpoint_hdr.h
> @@ -0,0 +1,91 @@
> +#ifndef __ASM_S390_CKPT_HDR_H
> +#define __ASM_S390_CKPT_HDR_H
> +/*
> + * Checkpoint/restart - architecture specific headers s/390
> + *
> + * Copyright (C) 2008 Oren Laadan
> + *
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file COPYING in the main directory of the Linux
> + * distribution for more details.
> + */
> +
> +#include <linux/types.h>
> +
> +/*
> + * To maintain compatibility between 32-bit and 64-bit architecture flavors,
> + * keep data 64-bit aligned: use padding for structure members, and use
> + * __attribute__((aligned (8))) for the entire structure.
> + *
> + * Quoting Arnd Bergmann:
> + * "This structure has an odd multiple of 32-bit members, which means
> + * that if you put it into a larger structure that also contains 64-bit
> + * members, the larger structure may get different alignment on x86-32
> + * and x86-64, which you might want to avoid. I can't tell if this is
> + * an actual problem here. ... In this case, I'm pretty sure that
> + * sizeof(cr_hdr_task) on x86-32 is different from x86-64, since it
> + * will be 32-bit aligned on x86-32."
> + */
> +
> +#ifdef __KERNEL__
> +#include <asm/processor.h>
> +#else
> +#include <sys/user.h>
> +#endif
> +
> +#ifdef __s390x__
> +/*
> + * Notes
> + * NUM_GPRS defined in <asm/ptrace.h> to be 16
> + * NUM_FPRS defined in <asm/ptrace.h> to be 16
> + * NUM_APRS defined in <asm/ptrace.h> to be 16
> + */
> +struct cr_hdr_cpu {
> + __u64 args[1];
> + __u64 gprs[NUM_GPRS];
> + __u64 orig_gpr2;
> + __u16 svcnr;
> + __u16 ilc;
> + __u32 acrs[NUM_ACRS];
> + __u64 ksp;
> + __u64 prot_addr;
> + __u32 trap_no;
> + __u64 ieee_instruction_pointer;
> + __u64 pfault_wait;
> +
> + /* mm_segment_t */
> + __u32 mm_segment_t_ar4;
> +
> + /* psw_t */
> + __u64 psw_t_mask;
> + __u64 psw_t_addr;
> +
> + /* s390_fp_regs_t */
> + __u32 fpc;
> + struct {
> + float f;
> + double d;
> + __u64 ui;
> + __u32 fp_hi;
> + __u32 fp_lo;
> + } fprs[NUM_FPRS];
> +
> + /* per_struct */
> + __u64 per_control_regs[3];
> + __u32 em_instr;
> + __u64 starting_addr;
> + __u64 ending_addr;
> + __u16 perc_atmid;
> + __u64 address;
> + __u8 access_id;
> +};
> +
> +struct cr_hdr_mm_context {
> + unsigned long vdso_base;
> +};
> +
> +struct cr_hdr_head_arch {
> +};
> +#endif /* __s390x__ */
> +
> +#endif /* __ASM_S390_CKPT_HDR__H */
> diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h
> index c8ad350..ffe64a0 100644
> --- a/arch/s390/include/asm/unistd.h
> +++ b/arch/s390/include/asm/unistd.h
> @@ -265,7 +265,9 @@
> #define __NR_pipe2 325
> #define __NR_dup3 326
> #define __NR_epoll_create1 327
> -#define NR_syscalls 328
> +#define __NR_checkpoint 328
> +#define __NR_restart 329
> +#define NR_syscalls 330
>
> /*
> * There are some system calls that are not present on 64 bit, some
> diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
> index fc2c971..9546a81 100644
> --- a/arch/s390/kernel/compat_wrapper.S
> +++ b/arch/s390/kernel/compat_wrapper.S
> @@ -1767,3 +1767,15 @@ sys_dup3_wrapper:
> sys_epoll_create1_wrapper:
> lgfr %r2,%r2 # int
> jg sys_epoll_create1 # branch to system call
> +
> + .globl sys_checkpoint_wrapper
> +sys_checkpoint_wrapper:
> + lgfr %r2,%r2 # pid_t
> + lgfr %r3,%r3 # int
> + llgfr %r4,%r4 # unsigned long
> +
> + .globl sys_restart_wrapper
> +sys_restart_wrapper:
> + lgfr %r2,%r2 # int
> + lgfr %r3,%r3 # int
> + llgfr %r4,%r4 # unsigned long
> diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
> index 2d61787..54316c8 100644
> --- a/arch/s390/kernel/syscalls.S
> +++ b/arch/s390/kernel/syscalls.S
> @@ -336,3 +336,5 @@ SYSCALL(sys_inotify_init1,sys_inotify_init1,sys_inotify_init1_wrapper)
> SYSCALL(sys_pipe2,sys_pipe2,sys_pipe2_wrapper) /* 325 */
> SYSCALL(sys_dup3,sys_dup3,sys_dup3_wrapper)
> SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper)
> +SYSCALL(sys_checkpoint,sys_checkpoint,sys_checkpoint_wrapper)
> +SYSCALL(sys_restart,sys_restart,sys_restart_wrapper)
> diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
> index 2a74581..b3f0f32 100644
> --- a/arch/s390/mm/Makefile
> +++ b/arch/s390/mm/Makefile
> @@ -6,3 +6,4 @@ obj-y := init.o fault.o extmem.o mmap.o vmem.o pgtable.o
> obj-$(CONFIG_CMM) += cmm.o
> obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
> obj-$(CONFIG_PAGE_STATES) += page-states.o
> +obj-$(CONFIG_CHECKPOINT_RESTART) += checkpoint.o
> diff --git a/arch/s390/mm/checkpoint.c b/arch/s390/mm/checkpoint.c
> new file mode 100644
> index 0000000..2c96493
> --- /dev/null
> +++ b/arch/s390/mm/checkpoint.c
> @@ -0,0 +1,206 @@
> +/*
> + * Checkpoint/restart - architecture specific support for s390
> + *
> + * Copyright (C) 2008 Oren Laadan
> + *
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License. See the file COPYING in the main directory of the Linux
> + * distribution for more details.
> + */
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/kernel.h>
> +#include <asm/system.h>
> +#include <asm/pgtable.h>
> +
> +int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> +{
> + return 0;
> +}
> +
> +static void cr_save_cpu_regs(struct cr_hdr_cpu *hh, struct task_struct *t)
> +{
> + struct thread_struct *thread = &t->thread;
> + struct pt_regs *regs = task_pt_regs(t);
> +
> + hh->fpc = thread->fp_regs.fpc;
> + memcpy(&hh->fprs, &thread->fp_regs.fprs, NUM_FPRS*sizeof(freg_t));
> + memcpy(hh->acrs, &thread->acrs[0], NUM_ACRS * sizeof(unsigned int));
> + hh->mm_segment_t_ar4 = thread->mm_segment.ar4;
> + hh->psw_t_mask = regs->psw.mask;
> + hh->psw_t_addr = regs->psw.addr;
> +
> + hh->ksp = thread->ksp; /* unsure */
> +
> + hh->args[0] = regs->args[0];
> + hh->svcnr = regs->svcnr;
> + hh->ilc = regs->ilc;
> + memcpy(hh->gprs, regs->gprs, NUM_GPRS*sizeof(unsigned long));
> + hh->orig_gpr2 = regs->orig_gpr2;
> +
> + hh->prot_addr = thread->prot_addr;
> + hh->trap_no = thread->trap_no;
> + hh->ieee_instruction_pointer = thread->ieee_instruction_pointer;
> + hh->pfault_wait = thread->pfault_wait;
> +
> + /* per_info */
> + memcpy(&hh->per_control_regs, &thread->per_info.control_regs.words,
> + 3 * sizeof(unsigned long));
> + hh->em_instr = 0;
> + if (thread->per_info.single_step)
> + hh->em_instr |= 1;
> + if (thread->per_info.instruction_fetch)
> + hh->em_instr |= 2;
> + hh->starting_addr = thread->per_info.starting_addr;
> + hh->ending_addr = thread->per_info.ending_addr;
> + hh->perc_atmid = thread->per_info.lowcore.words.perc_atmid;
> + hh->address = thread->per_info.lowcore.words.address;
> + hh->access_id = thread->per_info.lowcore.words.access_id;
> +}
> +
> +/* dump the cpu state and registers of a given task */
> +int cr_write_cpu(struct cr_ctx *ctx, struct task_struct *t)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_cpu *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + int ret;
> +
> + h.type = CR_HDR_CPU;
> + h.len = sizeof(*hh);
> + h.parent = task_pid_vnr(t);
> +
> + cr_save_cpu_regs(hh, t);
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + cr_hbuf_put(ctx, sizeof(*hh));
> + WARN_ON_ONCE(ret < 0);
> +
> + return ret;
> +}
> +
> +int cr_write_head_arch(struct cr_ctx *ctx)
> +{
> + return 0;
> +}
> +
> +/* Nothing to do for mm context state */
> +int cr_write_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int parent)
> +{
> + struct cr_hdr h;
> + struct cr_hdr_mm_context *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + int ret;
> +
> + h.type = CR_HDR_MM_CONTEXT;
> + h.len = sizeof(*hh);
> + h.parent = parent;
> +
> +#if 0
> + /* Oren's v13 is on an older kernel which has no vdso_base */
> + /* on newer kernel, we'll have to enable this */
> + hh->vdso_base = mm->context.vdso_base;
> + printk(KERN_NOTICE "checkpointing vdso_base %lx\n", hh->vdso_base);
> +#else
> + hh->vdso_base = 0;
> +#endif
> +
> + ret = cr_write_obj(ctx, &h, hh);
> + cr_hbuf_put(ctx, sizeof(*hh));
> +
> + return ret;
> +}
> +
> +/* restart APIs */
> +
> +int cr_read_thread(struct cr_ctx *ctx)
> +{
> + return 0;
> +}
> +
> +int cr_read_cpu(struct cr_ctx *ctx)
> +{
> + struct cr_hdr_cpu *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + struct thread_struct *thread = ¤t->thread;
> + struct pt_regs *regs = task_pt_regs(current);
> + int parent, ret;
> +
> + parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_CPU);
> + if (parent < 0) {
> + ret = parent;
> + goto out;
> + }
> + ret = 0;
> +
> + regs->psw.addr &= ~PSW_ADDR_INSN;
> + regs->psw.addr |= hh->psw_t_addr & PSW_ADDR_INSN;
> +
> + regs->args[0] = hh->args[0];
> + regs->svcnr = hh->svcnr;
> + regs->ilc = hh->ilc;
> + memcpy(regs->gprs, hh->gprs, NUM_GPRS*sizeof(unsigned long));
> + regs->orig_gpr2 = hh->orig_gpr2;
> +
> + thread->ksp = hh->ksp; /* unsure */
> +
> + memcpy(thread->acrs, hh->acrs, NUM_ACRS * sizeof(unsigned int));
> + thread->prot_addr = hh->prot_addr;
> + thread->trap_no = hh->trap_no;
> + thread->ieee_instruction_pointer = hh->ieee_instruction_pointer;
> + thread->pfault_wait = hh->pfault_wait;
> +
> + /* s390_fp_regs_t */
> + thread->fp_regs.fpc = hh->fpc;
> + memcpy(&thread->fp_regs.fprs, &hh->fprs, NUM_FPRS*sizeof(freg_t));
> +
> + thread->mm_segment.ar4 = hh->mm_segment_t_ar4;
> +
> + /* per_struct */
> + memcpy(&thread->per_info.control_regs.words, &hh->per_control_regs,
> + 3 * sizeof(unsigned long));
> + if (hh->em_instr & 0x01)
> + thread->per_info.single_step = 1;
> + if (hh->em_instr & 0x02)
> + thread->per_info.instruction_fetch = 1;
> + thread->per_info.starting_addr = hh->starting_addr;
> + thread->per_info.ending_addr = hh->ending_addr;
> + thread->per_info.lowcore.words.perc_atmid = hh->perc_atmid;
> + thread->per_info.lowcore.words.address = hh->address;
> + thread->per_info.lowcore.words.access_id = hh->access_id;
> +
> +out:
> + cr_hbuf_put(ctx, sizeof(*hh));
> + return ret;
> +}
> +
> +int cr_read_head_arch(struct cr_ctx *ctx)
> +{
> + return 0;
> +}
> +
> +
> +int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm, int rparent)
> +{
> + struct cr_hdr_mm_context *hh = cr_hbuf_get(ctx, sizeof(*hh));
> + int parent, ret = -EINVAL;
> +
> + s390_enable_sie();
> +
> + parent = cr_read_obj_type(ctx, hh, sizeof(*hh), CR_HDR_MM_CONTEXT);
> + if (parent < 0) {
> + ret = parent;
> + goto out;
> + }
> + if (parent != rparent)
> + goto out;
> +
> +#if 0
> + /* enable this when s390 defines vdso-base */
> + mm->context.vdso_base = hh->vdso_base;
> + printk(KERN_NOTICE "read vdso_base %lx\n", hh->vdso_base);
> +#endif
> + ret = 0;
> +
> + out:
> + cr_hbuf_put(ctx, sizeof(*hh));
> + return ret;
> +}
> diff --git a/checkpoint/Kconfig b/checkpoint/Kconfig
> index ffaa635..cb1d29d 100644
> --- a/checkpoint/Kconfig
> +++ b/checkpoint/Kconfig
> @@ -1,7 +1,7 @@
> config CHECKPOINT_RESTART
> prompt "Enable checkpoint/restart (EXPERIMENTAL)"
> def_bool n
> - depends on X86_32 && EXPERIMENTAL
> + depends on (X86_32 || (S390 && 64BIT)) && EXPERIMENTAL
> help
> Application checkpoint/restart is the ability to save the
> state of a running application so that it can later resume
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list