[Devel] Re: [PATCH 1/1] cr: remap vdso at original address
Oren Laadan
orenl at cs.columbia.edu
Mon Mar 30 06:30:31 PDT 2009
Serge E. Hallyn wrote:
> Well, here is my current attempt at properly handling vdso for
> the x86 and s390 c/r code. I was going to test with gettimeofday,
> but x86 doesn't use vdso for that, and I'm still recompiling glibc
> on s390 to exploit vdso for it. So what I can tell so far is that
> CONFIG_COMPAT_VDSO=n is now safe on x86 - the kernel vdso segment
> is re-mapped from the kernel into the checkpointed location, so
> the fact that the task was started with a random vdso base doesn't
> matter.
>
> Once I get a proper testcase, I intend to show that checkpointing
> a task which calls gettimeofday(), waiting 10 seconds, and then
> restarting it ends up with bad __vdso_gettimeofday() results
> without this patch, and good ones with it.
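>
> A rough sketch of what I have in mind (sketch only, not the real
> testcase yet, and it assumes libc actually routes gettimeofday()
> through the vdso on the architecture being tested):
>
>     #include <stdio.h>
>     #include <unistd.h>
>     #include <sys/time.h>
>
>     int main(void)
>     {
>             struct timeval before, after;
>
>             /* first call, before the checkpoint */
>             gettimeofday(&before, NULL);
>             printf("before: %ld.%06ld\n",
>                    (long)before.tv_sec, (long)before.tv_usec);
>             fflush(stdout);
>
>             sleep(10);      /* checkpoint here, restart later */
>
>             /* second call, after restart; with a stale vdso mapping
>              * this is the call expected to misbehave */
>             gettimeofday(&after, NULL);
>             printf("after: %ld.%06ld (delta %ld s)\n",
>                    (long)after.tv_sec, (long)after.tv_usec,
>                    (long)(after.tv_sec - before.tv_sec));
>             return 0;
>     }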
It would be helpful if you split the patch into the non-c/r changes
(i.e. adding a parameter to request a mapping address for the vdso),
followed by the c/r changes per architecture.

Also, I noticed that it may be more complicated than that. For example,
the function get_gate_vma() (at least on x86) tests the value of
context.vdso against some constant to decide on its return value. I
don't know whether this will break if the vdso of a task ends up
somewhere the system didn't expect it to be.
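
For reference, the check I have in mind looks roughly like this
(quoting from memory of arch/x86/vdso/vdso32-setup.c, so the exact
details may be off):

    struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
    {
            struct mm_struct *mm = tsk->mm;

            /* Check to see if this task was created in compat vdso mode */
            if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
                    return &gate_vma;
            return NULL;
    }

If a restored task's vdso no longer sits where this test expects it,
the gate vma it sees after restart may differ from what it saw before
the checkpoint.
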
Oren.
>
> -serge
>
> From f79593f4b17bef93b067584c6222fa9e510ab5a7 Mon Sep 17 00:00:00 2001
> From: Serge E. Hallyn <serue at us.ibm.com>
> Date: Tue, 24 Mar 2009 15:07:40 -0400
> Subject: [PATCH 1/1] cr: remap vdso at original address
>
> Remap the vdso at the original address in the case of sys_restart.
>
> sys_checkpoint does not save away the vdso vma; skipping it is done
> in the arch-independent code.
>
> sys_restart uses arch_setup_additional_pages() to request that the
> vdso be mapped at the original (checkpointed) location. This is
> done in the arch-dependent code.
>
> arch_setup_additional_pages() no longer takes the bprm argument,
> which was unused. Instead it takes an unsigned long reqaddr, the
> requested address. If reqaddr is 0, the usual calculation of the
> vdso base is performed. Otherwise, the vdso is installed at the
> requested location (reqaddr), and -EINVAL is returned if that is
> not possible.
>
> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
> ---
> arch/powerpc/include/asm/elf.h | 3 +--
> arch/powerpc/kernel/vdso.c | 8 +++++++-
> arch/s390/include/asm/elf.h | 2 +-
> arch/s390/kernel/vdso.c | 8 +++++++-
> arch/s390/mm/checkpoint.c | 1 +
> arch/s390/mm/restart.c | 9 ++++++++-
> arch/sh/include/asm/elf.h | 3 +--
> arch/sh/kernel/vsyscall/vsyscall.c | 5 ++++-
> arch/x86/include/asm/elf.h | 5 ++---
> arch/x86/mm/restart.c | 14 ++++++++++++--
> arch/x86/vdso/vdso32-setup.c | 8 ++++++--
> arch/x86/vdso/vma.c | 11 +++++++++--
> checkpoint/ckpt_mem.c | 10 ++++++++++
> fs/binfmt_elf.c | 2 +-
> 14 files changed, 70 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
> index cd46f02..133e108 100644
> --- a/arch/powerpc/include/asm/elf.h
> +++ b/arch/powerpc/include/asm/elf.h
> @@ -266,8 +266,7 @@ extern int ucache_bsize;
> /* vDSO has arch_setup_additional_pages */
> #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
> struct linux_binprm;
> -extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> - int uses_interp);
> +extern int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr);
> #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b);
>
> #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
> index ad06d5c..e91310c 100644
> --- a/arch/powerpc/kernel/vdso.c
> +++ b/arch/powerpc/kernel/vdso.c
> @@ -184,7 +184,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
> * This is called from binfmt_elf, we create the special vma for the
> * vDSO and insert it into the mm struct tree
> */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
> {
> struct mm_struct *mm = current->mm;
> struct page **vdso_pagelist;
> @@ -210,6 +210,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> vdso_pages = vdso32_pages;
> vdso_base = VDSO32_MBASE;
> #endif
> + if (reqaddr)
> + vdso_base = reqaddr;
>
> current->mm->context.vdso_base = 0;
>
> @@ -233,6 +235,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> rc = vdso_base;
> goto fail_mmapsem;
> }
> + if (reqaddr && reqaddr != vdso_base) {
> + rc = -EINVAL;
> + goto fail_mmapsem;
> + }
>
> /*
> * our vma flags don't have VM_WRITE so by default, the process isn't
> diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
> index 74d0bbb..49aaab8 100644
> --- a/arch/s390/include/asm/elf.h
> +++ b/arch/s390/include/asm/elf.h
> @@ -205,6 +205,6 @@ do { \
> struct linux_binprm;
>
> #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
> -int arch_setup_additional_pages(struct linux_binprm *, int);
> +int arch_setup_additional_pages(int, unsigned long);
>
> #endif
> diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
> index 690e178..e50ac7f 100644
> --- a/arch/s390/kernel/vdso.c
> +++ b/arch/s390/kernel/vdso.c
> @@ -184,7 +184,7 @@ static void vdso_init_cr5(void)
> * This is called from binfmt_elf, we create the special vma for the
> * vDSO and insert it into the mm struct tree
> */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
> {
> struct mm_struct *mm = current->mm;
> struct page **vdso_pagelist;
> @@ -214,6 +214,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> vdso_pagelist = vdso32_pagelist;
> vdso_pages = vdso32_pages;
> #endif
> + if (reqaddr)
> + vdso_base = reqaddr;
>
> /*
> * vDSO has a problem and was disabled, just don't "enable" it for
> @@ -236,6 +238,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> rc = vdso_base;
> goto out_up;
> }
> + if (reqaddr && vdso_base != reqaddr) {
> + rc = -EINVAL;
> + goto out_up;
> + }
>
> /*
> * our vma flags don't have VM_WRITE so by default, the process
> diff --git a/arch/s390/mm/checkpoint.c b/arch/s390/mm/checkpoint.c
> index c4e3719..1c79418 100644
> --- a/arch/s390/mm/checkpoint.c
> +++ b/arch/s390/mm/checkpoint.c
> @@ -63,6 +63,7 @@ void cr_s390_mm(int op, struct cr_hdr_mm_context *hh, struct mm_struct *mm)
> CR_COPY(op, hh->alloc_pgste, mm->context.alloc_pgste);
> CR_COPY(op, hh->asce_bits, mm->context.asce_bits);
> CR_COPY(op, hh->asce_limit, mm->context.asce_limit);
> + CR_COPY(op, hh->vdso_base, mm->context.vdso_base);
> }
>
> int cr_write_thread(struct cr_ctx *ctx, struct task_struct *t)
> diff --git a/arch/s390/mm/restart.c b/arch/s390/mm/restart.c
> index 6892a2a..401f862 100644
> --- a/arch/s390/mm/restart.c
> +++ b/arch/s390/mm/restart.c
> @@ -13,6 +13,7 @@
> #include <linux/kernel.h>
> #include <asm/system.h>
> #include <asm/pgtable.h>
> +#include <asm/elf.h>
>
> #include "checkpoint_s390.h"
>
> @@ -71,7 +72,13 @@ int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
> }
>
> cr_s390_mm(CR_RST, hh, mm);
> - ret = 0;
> + if (mm->context.vdso_base) {
> + ret = arch_setup_additional_pages(1, mm->context.vdso_base);
> + printk(KERN_NOTICE "%s: resetting vdso to %lx ret %d\n",
> + __func__, mm->context.vdso_base, ret);
> + }
> + else
> + ret = 0;
> out:
> cr_hbuf_put(ctx, sizeof(*hh));
> return ret;
> diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h
> index ccb1d93..de173dc 100644
> --- a/arch/sh/include/asm/elf.h
> +++ b/arch/sh/include/asm/elf.h
> @@ -201,8 +201,7 @@ do { \
> /* vDSO has arch_setup_additional_pages */
> #define ARCH_HAS_SETUP_ADDITIONAL_PAGES
> struct linux_binprm;
> -extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> - int uses_interp);
> +extern int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr);
>
> extern unsigned int vdso_enabled;
> extern void __kernel_vsyscall;
> diff --git a/arch/sh/kernel/vsyscall/vsyscall.c b/arch/sh/kernel/vsyscall/vsyscall.c
> index 3f7e415..232b415 100644
> --- a/arch/sh/kernel/vsyscall/vsyscall.c
> +++ b/arch/sh/kernel/vsyscall/vsyscall.c
> @@ -59,12 +59,15 @@ int __init vsyscall_init(void)
> }
>
> /* Setup a VMA at program startup for the vsyscall page */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
> {
> struct mm_struct *mm = current->mm;
> unsigned long addr;
> int ret;
>
> + if (reqaddr) /* no restart support for sh yet */
> + return -EINVAL;
> +
> down_write(&mm->mmap_sem);
> addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
> if (IS_ERR_VALUE(addr)) {
> diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
> index f51a3dd..cb16bf6 100644
> --- a/arch/x86/include/asm/elf.h
> +++ b/arch/x86/include/asm/elf.h
> @@ -324,10 +324,9 @@ do { \
> struct linux_binprm;
>
> #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
> -extern int arch_setup_additional_pages(struct linux_binprm *bprm,
> - int uses_interp);
> +extern int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr);
>
> -extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
> +extern int syscall32_setup_pages(int exstack, unsigned long reqaddr);
> #define compat_arch_setup_additional_pages syscall32_setup_pages
>
> extern unsigned long arch_randomize_brk(struct mm_struct *mm);
> diff --git a/arch/x86/mm/restart.c b/arch/x86/mm/restart.c
> index c76d2b2..f5eedf6 100644
> --- a/arch/x86/mm/restart.c
> +++ b/arch/x86/mm/restart.c
> @@ -14,6 +14,8 @@
> #include <linux/checkpoint.h>
> #include <linux/checkpoint_hdr.h>
>
> +#include <asm/elf.h>
> +
> /* read the thread_struct into the current task */
> int cr_read_thread(struct cr_ctx *ctx)
> {
> @@ -240,8 +242,16 @@ int cr_read_mm_context(struct cr_ctx *ctx, struct mm_struct *mm)
> hh->nldt, (unsigned long) hh->vdso, mm->context.vdso);
>
> ret = -EINVAL;
> - if (hh->vdso != (unsigned long) mm->context.vdso)
> - goto out;
> + printk(KERN_NOTICE "%s: setting vdso from %p to %lx\n",
> + __func__, mm->context.vdso, (unsigned long)hh->vdso);
> + mm->context.vdso = (void *) hh->vdso;
> + if (hh->vdso) {
> + ret = arch_setup_additional_pages(1, hh->vdso);
> + printk(KERN_NOTICE "%s: setting vdso to %p, ret %d\n",
> + __func__, mm->context.vdso, ret);
> + if (ret)
> + goto out;
> + }
> if (hh->ldt_entry_size != LDT_ENTRY_SIZE)
> goto out;
>
> diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
> index 1241f11..7d4e99b 100644
> --- a/arch/x86/vdso/vdso32-setup.c
> +++ b/arch/x86/vdso/vdso32-setup.c
> @@ -310,7 +310,7 @@ int __init sysenter_setup(void)
> }
>
> /* Setup a VMA at program startup for the vsyscall page */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
> {
> struct mm_struct *mm = current->mm;
> unsigned long addr;
> @@ -331,11 +331,15 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> if (compat)
> addr = VDSO_HIGH_BASE;
> else {
> - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
> + addr = get_unmapped_area(NULL, reqaddr, PAGE_SIZE, 0, 0);
> if (IS_ERR_VALUE(addr)) {
> ret = addr;
> goto up_fail;
> }
> + if (reqaddr && addr != reqaddr) {
> + ret = -EINVAL;
> + goto up_fail;
> + }
> }
>
> if (compat_uses_vma || !compat) {
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index 9c98cc6..098b077 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -98,7 +98,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
>
> /* Setup a VMA at program startup for the vsyscall page.
> Not called for compat tasks */
> -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> +int arch_setup_additional_pages(int uses_interp, unsigned long reqaddr)
> {
> struct mm_struct *mm = current->mm;
> unsigned long addr;
> @@ -108,12 +108,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> return 0;
>
> down_write(&mm->mmap_sem);
> - addr = vdso_addr(mm->start_stack, vdso_size);
> + if (reqaddr)
> + addr = reqaddr;
> + else
> + addr = vdso_addr(mm->start_stack, vdso_size);
> addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
> if (IS_ERR_VALUE(addr)) {
> ret = addr;
> goto up_fail;
> }
> + if (reqaddr && addr != reqaddr) {
> + ret = -EINVAL;
> + goto up_fail;
> + }
>
> ret = install_special_mapping(mm, addr, vdso_size,
> VM_READ|VM_EXEC|
> diff --git a/checkpoint/ckpt_mem.c b/checkpoint/ckpt_mem.c
> index b1b50b5..e58b348 100644
> --- a/checkpoint/ckpt_mem.c
> +++ b/checkpoint/ckpt_mem.c
> @@ -741,6 +741,13 @@ int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
>
> hh->map_count = mm->map_count;
>
> + /* there's got to be a better way to do this */
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + const char *name = arch_vma_name(vma);
> + if (name && strcmp(name, "[vdso]") == 0)
> + hh->map_count--;
> + }
> +
> /* FIX: need also mm->flags */
>
> ret = cr_write_obj(ctx, &h, hh);
> @@ -750,6 +757,9 @@ int cr_write_mm(struct cr_ctx *ctx, struct task_struct *t)
>
> /* write the vma's */
> for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + const char *name = arch_vma_name(vma);
> + if (name && strcmp(name, "[vdso]") == 0)
> + continue;
> ret = cr_write_vma(ctx, vma);
> if (ret < 0)
> goto out;
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 33b7235..02168b5 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -961,7 +961,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
> set_binfmt(&elf_format);
>
> #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
> - retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
> + retval = arch_setup_additional_pages(!!elf_interpreter, 0);
> if (retval < 0) {
> send_sig(SIGKILL, current, 0);
> goto out;
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers