[Devel] [PATCH] c/r: do not hold mmap_sem while checkpointing vma's
Oren Laadan
orenl at librato.com
Sun Oct 25 15:23:29 PDT 2009
This patch modifies the memory checkpoint code to _not_ hold the
mmap_sem while dumping out the vma's.
The problem with holding the mmap_sem is that the checkpoint code first
takes the mmap_sem and then takes the file's inode semaphore. This
violates the normal locking order (seen, e.g., when taking a page fault
during a copyout), which is the inode semaphore first and then the mmap_sem.
Normally this reverse locking order won't cause a lockup because the
output file for the checkpoint image isn't used by the checkpointee.
However, there are a couple of cases where it may be a problem, e.g. when
some async-IO happens to complete and triggers a page fault at the
wrong time.
This fixes complaints from the lockdep about this reverse ordering.
Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
---
checkpoint/memory.c | 133 ++++++++++++++++++++++++++++++++++++---------------
1 files changed, 94 insertions(+), 39 deletions(-)
diff --git a/checkpoint/memory.c b/checkpoint/memory.c
index 0da948f..656614c 100644
--- a/checkpoint/memory.c
+++ b/checkpoint/memory.c
@@ -644,11 +644,80 @@ static int anonymous_checkpoint(struct ckpt_ctx *ctx,
return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
}
+static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
+{
+ struct vm_area_struct *vma, *next;
+ int map_count = 0;
+ int ret = 0;
+
+ vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+ if (!vma)
+ return -ENOMEM;
+
+ /*
+ * Must not hold mm->mmap_sem when writing to image file, so
+ * can't simply traverse the vma list. Instead, use find_vma()
+ * to get the @next and make a local "copy" of it.
+ */
+ while (1) {
+ down_read(&mm->mmap_sem);
+ next = find_vma(mm, vma->vm_end);
+ if (!next) {
+ up_read(&mm->mmap_sem);
+ break;
+ }
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ *vma = *next;
+ if (vma->vm_file)
+ get_file(vma->vm_file);
+ up_read(&mm->mmap_sem);
+
+ map_count++;
+
+ ckpt_debug("vma %#lx-%#lx flags %#lx\n",
+ vma->vm_start, vma->vm_end, vma->vm_flags);
+
+ if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
+ ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
+ -ENOSYS, vma->vm_flags);
+ ret = -ENOSYS;
+ break;
+ }
+
+ if (!vma->vm_ops)
+ ret = anonymous_checkpoint(ctx, vma);
+ else if (vma->vm_ops->checkpoint)
+ ret = (*vma->vm_ops->checkpoint)(ctx, vma);
+ else
+ ret = -ENOSYS;
+ if (ret < 0) {
+ ckpt_write_err(ctx, "TE", "vma: failed", ret);
+ break;
+ }
+ /*
+ * The file was collected, but not always checkpointed;
+ * be safe and mark as visited to appease leak detection
+ */
+ if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
+ ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+ if (ret < 0)
+ break;
+ }
+ }
+
+ if (vma->vm_file)
+ fput(vma->vm_file);
+
+ kfree(vma);
+
+ return ret < 0 ? ret : map_count;
+}
+
static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
{
struct ckpt_hdr_mm *h;
- struct vm_area_struct *vma;
- int exe_objref = 0;
+ struct file *exe_file = NULL;
int ret;
h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
@@ -674,14 +743,23 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
h->map_count = mm->map_count;
- /* checkpoint the ->exe_file */
- if (mm->exe_file) {
- exe_objref = checkpoint_obj(ctx, mm->exe_file, CKPT_OBJ_FILE);
- if (exe_objref < 0) {
- ret = exe_objref;
+ if (mm->exe_file) { /* checkpoint the ->exe_file */
+ exe_file = mm->exe_file;
+ get_file(exe_file);
+ }
+
+ /*
+ * Drop mm->mmap_sem before writing data to checkpoint image
+ * to avoid reverse locking order (inode must come before mm).
+ */
+ up_read(&mm->mmap_sem);
+
+ if (exe_file) {
+ h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
+ if (h->exe_objref < 0) {
+ ret = h->exe_objref;
goto out;
}
- h->exe_objref = exe_objref;
}
ret = ckpt_write_obj(ctx, &h->h);
@@ -692,40 +770,17 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
if (ret < 0)
return ret;
- /* write the vma's */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- ckpt_debug("vma %#lx-%#lx flags %#lx\n",
- vma->vm_start, vma->vm_end, vma->vm_flags);
- if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
- ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
- -ENOSYS, vma->vm_flags);
- return -ENOSYS;
- }
- if (!vma->vm_ops)
- ret = anonymous_checkpoint(ctx, vma);
- else if (vma->vm_ops->checkpoint)
- ret = (*vma->vm_ops->checkpoint)(ctx, vma);
- else
- ret = -ENOSYS;
- if (ret < 0) {
- ckpt_write_err(ctx, "TE", "vma: failed", ret);
- goto out;
- }
- /*
- * The file was collected, but not always checkpointed;
- * be safe and mark as visited to appease leak detection
- */
- if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
- ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
- if (ret < 0)
- goto out;
- }
- }
+ ret = checkpoint_vmas(ctx, mm);
+ if (ret != h->map_count && ret >= 0)
+ ret = -EBUSY; /* checkpoint mm leak */
+ if (ret < 0)
+ goto out;
ret = checkpoint_mm_context(ctx, mm);
out:
+ if (exe_file)
+ fput(exe_file);
ckpt_hdr_put(ctx, h);
- up_read(&mm->mmap_sem);
return ret;
}
@@ -1288,9 +1343,9 @@ static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
}
set_mm_exe_file(mm, file);
}
+ up_write(&mm->mmap_sem);
ret = _ckpt_read_buffer(ctx, mm->saved_auxv, sizeof(mm->saved_auxv));
- up_write(&mm->mmap_sem);
if (ret < 0)
goto out;
--
1.6.0.4
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list