[Devel] [PATCH RHEL7 COMMIT] ms/aio: Make it possible to remap aio ring
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Oct 15 04:04:14 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.8.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.8.6
------>
commit a3ffce64acc927dd35825252566389966520dc94
Author: Andrew Vagin <avagin at openvz.org>
Date: Thu Oct 15 15:04:14 2015 +0400
ms/aio: Make it possible to remap aio ring
The patch is required for CRIU.
https://jira.sw.ru/browse/PSBM-39834
From: Pavel Emelyanov <xemul at parallels.com>
ML: e4a0d3e720e7e508749c1439b5ba3aff56c92976
There are actually two issues this patch addresses. Let me start with
the one I tried to solve in the beginning.
So, in the checkpoint-restore project (criu) we try to dump tasks'
state and restore one back exactly as it was. One of the tasks' state
bits is the rings set up with the io_setup() call. There are (almost) no
problems in dumping them; the problem is in restoring them -- if I dump a task
with aio ring originally mapped at address A, I want to restore one
back at exactly the same address A. Unfortunately, the io_setup() does
not allow for that -- it mmaps the ring at whatever place mm finds
appropriate (it calls do_mmap_pgoff() with zero address and without
the MAP_FIXED flag).
To make restore possible I'm going to mremap() the freshly created ring
into the address A (under which it was seen before dump). The problem is
that the ring's virtual address is passed back to the user-space as the
context ID and this ID is then used as search key by all the other io_foo()
calls. Reworking this ID to be just some integer doesn't seem to work, as
this value is already used by libaio as a pointer through which the library
accesses the memory holding the aio meta-data.
So, to make restore work we need to make sure that
a) ring is mapped at desired virtual address
b) kioctx->user_id matches this value
Having said that, the patch makes mremap() on aio region update the
kioctx's user_id and mmap_base values.
This brings up the 2nd issue I mentioned at the beginning of this mail.
If (regardless of the C/R dances I do) someone creates an io context
with io_setup(), then mremap()-s the ring and then destroys the context,
the kill_ioctx() routine will call munmap() on the wrong (old) address.
This will result in a) the aio ring remaining in memory and b) some other
vma getting unexpectedly unmapped.
What do you think?
Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
Acked-by: Dmitry Monakhov <dmonakhov at openvz.org>
Signed-off-by: Benjamin LaHaise <bcrl at kvack.org>
Signed-off-by: Andrew Vagin <avagin at openvz.org>
---
fs/aio.c | 20 ++++++++++++++++++++
include/linux/fs.h | 1 +
mm/mremap.c | 3 ++-
3 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/fs/aio.c b/fs/aio.c
index 9d700b0..301da77 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -257,12 +257,32 @@ static void aio_free_ring(struct kioctx *ctx)
static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
{
+ vma->vm_flags |= VM_DONTEXPAND;
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
+static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct kioctx *ctx;
+
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
+ if (ctx && ctx->aio_ring_file == file) {
+ ctx->user_id = ctx->mmap_base = vma->vm_start;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
+}
+
static const struct file_operations aio_ring_fops = {
.mmap = aio_ring_mmap,
+ .mremap = aio_ring_remap,
};
#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7e7bd3f..bbbf186 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1734,6 +1734,7 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
+ void (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
diff --git a/mm/mremap.c b/mm/mremap.c
index e1db886..0b40af6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -293,7 +293,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
- }
+ } else if (vma->vm_file && vma->vm_file->f_op->mremap)
+ vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
More information about the Devel
mailing list