[Devel] Re: [PATCH] c/r: do not hold mmap_sem while checkpointing vma's
Matt Helsley
matthltc at us.ibm.com
Mon Oct 26 13:52:36 PDT 2009
On Sun, Oct 25, 2009 at 06:23:29PM -0400, Oren Laadan wrote:
> This patch modifies the memory checkpoint code to _not_ hold the
> mmap_sem while dumping out the vma's.
>
> The problem with holding the mmap_sem is that it first takes the
> mmap_sem and then takes the file's inode semaphore. This violates the
> normal locking order, e.g. when taking a page fault during a copyout,
> which is inode sem and then the mmap_sem.
>
> Normally this reverse locking order won't cause a lockup because the
> output file for the checkpoint image isn't used by the checkpointee.
> However, there are a couple of cases where it may be a problem, e.g. when
> some async-IO happens to complete and triggers a page fault at the
> wrong time.
>
> This fixes complaints from lockdep about this reverse ordering.
>
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
> ---
> checkpoint/memory.c | 133 ++++++++++++++++++++++++++++++++++++---------------
> 1 files changed, 94 insertions(+), 39 deletions(-)
>
> diff --git a/checkpoint/memory.c b/checkpoint/memory.c
> index 0da948f..656614c 100644
> --- a/checkpoint/memory.c
> +++ b/checkpoint/memory.c
> @@ -644,11 +644,80 @@ static int anonymous_checkpoint(struct ckpt_ctx *ctx,
> return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
> }
>
> +static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
> +{
> + struct vm_area_struct *vma, *next;
> + int map_count = 0;
> + int ret = 0;
> +
> + vma = kzalloc(sizeof(*vma), GFP_KERNEL);
> + if (!vma)
> + return -ENOMEM;
> +
> + /*
> + * Must not hold mm->mmap_sem when writing to image file, so
> + * can't simply traverse the vma list. Instead, use find_vma()
> + * to get the @next and make a local "copy" of it.
> + */
> + while (1) {
> + down_read(&mm->mmap_sem);
> + next = find_vma(mm, vma->vm_end);
> + if (!next) {
> + up_read(&mm->mmap_sem);
> + break;
> + }
> + if (vma->vm_file)
> + fput(vma->vm_file);
> + *vma = *next;
> + if (vma->vm_file)
> + get_file(vma->vm_file);
> + up_read(&mm->mmap_sem);
> +
> + map_count++;
> +
> + ckpt_debug("vma %#lx-%#lx flags %#lx\n",
> + vma->vm_start, vma->vm_end, vma->vm_flags);
> +
> + if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
> + ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
> + -ENOSYS, vma->vm_flags);
> + ret = -ENOSYS;
> + break;
> + }
> +
> + if (!vma->vm_ops)
> + ret = anonymous_checkpoint(ctx, vma);
> + else if (vma->vm_ops->checkpoint)
> + ret = (*vma->vm_ops->checkpoint)(ctx, vma);
> + else
> + ret = -ENOSYS;
> + if (ret < 0) {
> + ckpt_write_err(ctx, "TE", "vma: failed", ret);
> + break;
> + }
> + /*
> + * The file was collected, but not always checkpointed;
> + * be safe and mark as visited to appease leak detection
> + */
> + if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
> + ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
> + if (ret < 0)
> + break;
> + }
> + }
> +
> + if (vma->vm_file)
> + fput(vma->vm_file);
> +
> + kfree(vma);
> +
> + return ret < 0 ? ret : map_count;
> +}
> +
> static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
> {
> struct ckpt_hdr_mm *h;
> - struct vm_area_struct *vma;
> - int exe_objref = 0;
> + struct file *exe_file = NULL;
> int ret;
>
> h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
> @@ -674,14 +743,23 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
>
> h->map_count = mm->map_count;
>
> - /* checkpoint the ->exe_file */
> - if (mm->exe_file) {
> - exe_objref = checkpoint_obj(ctx, mm->exe_file, CKPT_OBJ_FILE);
> - if (exe_objref < 0) {
> - ret = exe_objref;
> + if (mm->exe_file) { /* checkpoint the ->exe_file */
> + exe_file = mm->exe_file;
> + get_file(exe_file);
> + }
> +
> + /*
> + * Drop mm->mmap_sem before writing data to checkpoint image
> + * to avoid reverse locking order (inode must come before mm).
> + */
> + up_read(&mm->mmap_sem);
> +
> + if (exe_file) {
> + h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
> + if (h->exe_objref < 0) {
> + ret = h->exe_objref;
> goto out;
> }
> - h->exe_objref = exe_objref;
> }
>
> ret = ckpt_write_obj(ctx, &h->h);
> @@ -692,40 +770,17 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
> if (ret < 0)
> return ret;
>
> - /* write the vma's */
> - for (vma = mm->mmap; vma; vma = vma->vm_next) {
> - ckpt_debug("vma %#lx-%#lx flags %#lx\n",
> - vma->vm_start, vma->vm_end, vma->vm_flags);
> - if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
> - ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
> - -ENOSYS, vma->vm_flags);
> - return -ENOSYS;
> - }
> - if (!vma->vm_ops)
> - ret = anonymous_checkpoint(ctx, vma);
> - else if (vma->vm_ops->checkpoint)
> - ret = (*vma->vm_ops->checkpoint)(ctx, vma);
> - else
> - ret = -ENOSYS;
> - if (ret < 0) {
> - ckpt_write_err(ctx, "TE", "vma: failed", ret);
> - goto out;
> - }
> - /*
> - * The file was collected, but not always checkpointed;
> - * be safe and mark as visited to appease leak detection
> - */
> - if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
> - ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
> - if (ret < 0)
> - goto out;
> - }
> - }
> + ret = checkpoint_vmas(ctx, mm);
> + if (ret != h->map_count && ret >= 0)
> + ret = -EBUSY; /* checkpoint mm leak */
> + if (ret < 0)
> + goto out;
>
> ret = checkpoint_mm_context(ctx, mm);
> out:
> + if (exe_file)
> + fput(exe_file);
> ckpt_hdr_put(ctx, h);
> - up_read(&mm->mmap_sem);
> return ret;
> }
>
> @@ -1288,9 +1343,9 @@ static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
> }
> set_mm_exe_file(mm, file);
> }
> + up_write(&mm->mmap_sem);
>
> ret = _ckpt_read_buffer(ctx, mm->saved_auxv, sizeof(mm->saved_auxv));
> - up_write(&mm->mmap_sem);
> if (ret < 0)
> goto out;
>
> --
In the restart path, it's interesting to see how Alexey did it
without mmap_sem, at least for part of it:
http://patchwork.kernel.org/patch/25337/
(search for kstate_restore_mm_struct())
Is that a feasible and more-suitable approach for the initial portions
of mm restore?
Cheers,
-Matt Helsley
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list