[Devel] [PATCH RHEL8 COMMIT] ve/aio: Add a handle to checkpoint/restore AIO context

Konstantin Khorenko khorenko at virtuozzo.com
Tue Dec 22 17:56:03 MSK 2020


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
------>
commit f5d12793d7fcea5477c0e5736adb1d7129a0e762
Author: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Date:   Tue Dec 22 17:56:03 2020 +0300

    ve/aio: Add a handle to checkpoint/restore AIO context
    
    This adds an ioctl that allows setting the ring buffer tail
    and waiting until AIO requests are finished.
    
    v2: Add pseudosuper check
    
    https://jira.sw.ru/browse/PSBM-42488
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    khorenko@: we don't support migration of incomplete aio requests
               https://jira.sw.ru/browse/PSBM-41425
    
    so, using the added interface, we wait until all AIO requests are completed
    and migrate the results (AIO req contexts with status).
    
    ======================================
    
    ve/aio: Enumerate ioctl numbers right
    
    Do not use commonly used numbers; use custom ones.
    Also, make the error codes distinct.
    
    https://jira.sw.ru/browse/PSBM-42488
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    ======================================
    
    ve/aio: Kill ve_aio_set_tail()
    
    Since the tail is restored by submitting write requests to /dev/null,
    we do not need this interface anymore.
    
    https://jira.sw.ru/browse/PSBM-42488
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    ======================================
    
    ve/aio: Wait for all inflight AIO reqs of a task
    
    Make it wait for all of the task's AIO contexts instead of a single AIO request.
    This minimizes the number of syscalls we do to dump AIO contexts.
    
    https://jira.sw.ru/browse/PSBM-42488
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
    ======================================
    
    Ported with respect to ms commits:
    34e83fc ("aio: reqs_active -> reqs_available")
    723be6e ("aio: percpu ioctx refcount")
    db446a0 ("aio: convert the ioctx list to table lookup v3")
    
    https://jira.sw.ru/browse/PSBM-123159
    
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
 fs/aio.c            | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/base.c      | 27 ++++++++++++++++++
 include/linux/aio.h | 13 +++++++++
 3 files changed, 122 insertions(+)

diff --git a/fs/aio.c b/fs/aio.c
index 492f1a8b7661..7c547247b056 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -20,6 +20,7 @@
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 
+#include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -1892,6 +1893,87 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	return ret;
 }
 
+#ifdef CONFIG_VE
+/* True while some of @ctx's request slots are neither available nor completed. */
+static bool has_reqs_active(struct kioctx *ctx)
+{
+	unsigned long irqflags;
+	unsigned inflight;
+
+	spin_lock_irqsave(&ctx->completion_lock, irqflags);
+	inflight = ctx->nr_events - 1;
+	inflight -= atomic_read(&ctx->reqs_available) + ctx->completed_events;
+	spin_unlock_irqrestore(&ctx->completion_lock, irqflags);
+	return inflight != 0;
+}
+
+/*
+ * Block until no AIO context of @p's mm has requests in flight.
+ * Returns 0, -EINVAL (kthread), -ESRCH (no mm) or the failed wait's result.
+ */
+static int ve_aio_wait_inflight_reqs(struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct kioctx_table *table;
+	int ret, i;
+
+	if (p->flags & PF_KTHREAD)
+		return -EINVAL;
+	/* Pin the mm_struct so it stays allocated across the waits below. */
+	task_lock(p);
+	mm = p->mm;
+	if (mm)
+		mmgrab(mm);
+	task_unlock(p);
+	if (!mm)
+		return -ESRCH;
+
+again:
+	spin_lock_irq(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+	/* The table pointer is NULL for an mm that never set up an AIO context. */
+	for (i = 0; table && i < table->nr; i++) {
+		struct kioctx *ctx = rcu_dereference(table->table[i]);
+
+		if (!ctx || !has_reqs_active(ctx))
+			continue;
+		/* Keep ctx alive: the locks have to be dropped before sleeping. */
+		percpu_ref_get(&ctx->users);
+		rcu_read_unlock();
+		spin_unlock_irq(&mm->ioctx_lock);
+		ret = wait_event_interruptible(ctx->wait, !has_reqs_active(ctx));
+		percpu_ref_put(&ctx->users);
+		if (ret)
+			goto mmdrop;
+		/* The table may have changed while unlocked, so rescan it. */
+		goto again;
+	}
+
+	rcu_read_unlock();
+	spin_unlock_irq(&mm->ioctx_lock);
+	ret = 0;
+mmdrop:
+	mmdrop(mm);
+	return ret;
+}
+
+/*
+ * Entry point for VE AIO ioctls issued against @task.
+ *
+ * VE_AIO_IOC_WAIT_ACTIVE waits for the task's in-flight AIO requests and
+ * returns that wait's result; every other @cmd gets -EINVAL.  @arg is
+ * currently not read.
+ */
+int ve_aio_ioctl(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+	if (cmd != VE_AIO_IOC_WAIT_ACTIVE)
+		return -EINVAL;
+
+	return ve_aio_wait_inflight_reqs(task);
+}
+#endif
+
 struct __aio_sigset {
 	const sigset_t __user	*sigmask;
 	size_t		sigsetsize;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7c0fd93ba7d1..38268a980989 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -95,6 +95,7 @@
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
 #include <linux/resctrl.h>
+#include <linux/aio.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -2429,6 +2430,29 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = {
 	.release	= single_release,
 };
 
+#ifdef CONFIG_VE
+/*
+ * ioctl handler installed in proc_aio_operations: resolves the task behind
+ * the file's inode and forwards @cmd/@arg to ve_aio_ioctl().
+ */
+static long proc_aio_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct task_struct *tsk = get_proc_task(file_inode(file));
+	int err;
+
+	if (!tsk)
+		return -ESRCH;
+
+	err = ve_aio_ioctl(tsk, cmd, arg);
+	put_task_struct(tsk);	/* balances the successful get_proc_task() */
+	return err;
+}
+
+static const struct file_operations proc_aio_operations = {
+	.unlocked_ioctl		= proc_aio_ioctl,	/* ioctl is the only operation provided */
+};
+#endif /* CONFIG_VE */
+
 static struct dentry *proc_pident_instantiate(struct dentry *dentry,
 	struct task_struct *task, const void *ptr)
 {
@@ -3010,6 +3034,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("timers",	  S_IRUGO, proc_timers_operations),
 #endif
 	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#if defined(CONFIG_VE) && defined(CONFIG_CHECKPOINT_RESTORE) /* ops exist only with CONFIG_VE */
+	REG("aio",	  S_IRUGO|S_IWUSR, proc_aio_operations),
+#endif
 #ifdef CONFIG_LIVEPATCH
 	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
 #endif
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 4b7a331156ff..ccaaae0db31d 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -12,14 +12,27 @@ typedef int (kiocb_cancel_fn)(struct kiocb *);
 
 #define AIO_MAX_NR_DEFAULT	0x10000
 
+/* Argument type that VE_AIO_IOC_WAIT_ACTIVE's ioctl number is encoded with. */
+struct ve_ioc_arg {
+	aio_context_t	ctx_id;
+	unsigned	val;
+};
+
+#define VE_AIO_IOC_WAIT_ACTIVE	_IOW('a',  1, struct ve_ioc_arg)	/* dispatched by ve_aio_ioctl() */
+
 /* prototypes */
 #ifdef CONFIG_AIO
 extern void exit_aio(struct mm_struct *mm);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
+#ifdef CONFIG_VE
+int ve_aio_ioctl(struct task_struct *task, unsigned int cmd, unsigned long arg);
+#endif /* CONFIG_VE */
 #else
 static inline void exit_aio(struct mm_struct *mm) { }
 static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
+static inline int ve_aio_ioctl(struct task_struct *task, unsigned int cmd,
+			unsigned long arg) { return 0; } /* !CONFIG_AIO stub; inline like its neighbours */
 #endif /* CONFIG_AIO */
 
 #endif /* __LINUX__AIO_H */


More information about the Devel mailing list