[Devel] [PATCH vz9 27/27] ve/aio: Add a handle to checkpoint/restore AIO context
Nikita Yushchenko
nikita.yushchenko at virtuozzo.com
Wed Oct 6 11:57:38 MSK 2021
From: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
This adds an ioctl which allows setting the ring buffer tail
and waiting until aio requests are finished.
v2: Add pseudosuper check
https://jira.sw.ru/browse/PSBM-42488
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>
khorenko@: we don't support migration of incomplete aio requests
https://jira.sw.ru/browse/PSBM-41425
so using added instruments we wait till all AIO requests are completed
and migrate the results (AIO req contexts with status).
======================================
ve/aio: Enumerate ioctl numbers right
Do not use commonly used numbers; use custom ones.
Also, make error codes different.
https://jira.sw.ru/browse/PSBM-42488
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
======================================
ve/aio: Kill ve_aio_set_tail()
Since the tail is restored by submitting write requests to /dev/null,
we do not need this interface anymore.
https://jira.sw.ru/browse/PSBM-42488
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
======================================
ve/aio: Wait for all inflight AIO reqs of a task
Make it wait for all of the task's AIO contexts instead of a single AIO request.
This minimizes the number of syscalls we do to dump aios.
https://jira.sw.ru/browse/PSBM-42488
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
======================================
Ported with respect to ms commits:
34e83fc ("aio: reqs_active -> reqs_available")
723be6e ("aio: percpu ioctx refcount")
db446a0 ("aio: convert the ioctx list to table lookup v3")
https://jira.sw.ru/browse/PSBM-123159
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
+++
aio: ioctl(VE_AIO_IOC_WAIT_ACTIVE) in-flight reqs counting fix
We have to take into account percpu part of reqs_available
counter on struct kioctx.
Fixes: f5d1279 ("ve/aio: Add a handle to checkpoint/restore AIO context")
https://jira.sw.ru/browse/PSBM-128710
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
(cherry-picked from vz8 commit c04e652e2451 ("ve/aio: Add a handle
to checkpoint/restore AIO context"))
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
fs/aio.c | 92 +++++++++++++++++++++++++++++++++++++++++++++
fs/proc/base.c | 27 +++++++++++++
include/linux/aio.h | 13 +++++++
3 files changed, 132 insertions(+)
diff --git a/fs/aio.c b/fs/aio.c
index d2e99e348b7a..779a528bcc25 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -22,6 +22,7 @@
#include <linux/refcount.h>
#include <linux/uio.h>
+#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -2274,3 +2275,94 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
return ret;
}
#endif
+
+#ifdef CONFIG_VE
+/*
+ * Report whether @ctx has a non-zero count of active requests.
+ *
+ * The available request slots are split between the atomic
+ * ctx->reqs_available and the per-cpu ctx->cpu->reqs_available parts
+ * (see get_reqs_available()/put_reqs_available()), so both are summed.
+ * The tested value is nr_events - 1, minus all available slots, minus
+ * ctx->completed_events; it is sampled under ctx->completion_lock.
+ */
+static bool has_reqs_active(struct kioctx *ctx)
+{
+	unsigned long irq_flags;
+	unsigned percpu_avail = 0;
+	unsigned inflight;
+	int cpu;
+
+	spin_lock_irqsave(&ctx->completion_lock, irq_flags);
+
+	/* Collect the per-cpu share of the available slots. */
+	for_each_possible_cpu(cpu) {
+		percpu_avail += per_cpu_ptr(ctx->cpu, cpu)->reqs_available;
+	}
+
+	inflight = ctx->nr_events - 1;
+	inflight -= atomic_read(&ctx->reqs_available) + percpu_avail;
+	inflight -= ctx->completed_events;
+
+	spin_unlock_irqrestore(&ctx->completion_lock, irq_flags);
+
+	return inflight != 0;
+}
+
+/*
+ * Block until no AIO context of task @p has active requests.
+ *
+ * The contexts are looked up through the ioctx table of @p's mm.  As
+ * soon as a context with active requests is found, the locks are
+ * dropped, the caller sleeps on ctx->wait until has_reqs_active()
+ * turns false, and the scan is restarted from the first slot since it
+ * was left unprotected during the sleep.
+ *
+ * Returns 0 once a complete scan finds no active requests, -EINVAL if
+ * @p is a kernel thread, -ESRCH if @p has no mm, or the non-zero result
+ * of wait_event_interruptible() when the sleep was interrupted.
+ */
+static int ve_aio_wait_inflight_reqs(struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct kioctx_table *table;
+	int ret, i;
+
+	if (p->flags & PF_KTHREAD)
+		return -EINVAL;
+
+	/* Pin the mm_struct so it stays allocated while we sleep below. */
+	task_lock(p);
+	mm = p->mm;
+	if (mm)
+		mmgrab(mm);
+	task_unlock(p);
+	if (!mm)
+		return -ESRCH;
+
+again:
+	spin_lock_irq(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+	/*
+	 * mm->ioctx_table is NULL for an mm that never created an AIO
+	 * context.  Such a task has nothing to wait for, so skip the scan
+	 * instead of dereferencing the NULL table.
+	 */
+	for (i = 0; table && i < table->nr; i++) {
+		struct kioctx *ctx;
+
+		ctx = rcu_dereference(table->table[i]);
+		if (!ctx)
+			continue;
+
+		if (!has_reqs_active(ctx))
+			continue;
+
+		/* Keep a reference on the context across the sleep. */
+		percpu_ref_get(&ctx->users);
+		rcu_read_unlock();
+		spin_unlock_irq(&mm->ioctx_lock);
+
+		ret = wait_event_interruptible(ctx->wait, !has_reqs_active(ctx));
+		percpu_ref_put(&ctx->users);
+
+		if (ret)
+			goto mmdrop;
+		goto again;
+	}
+
+	rcu_read_unlock();
+	spin_unlock_irq(&mm->ioctx_lock);
+	ret = 0;
+mmdrop:
+	mmdrop(mm);
+	return ret;
+}
+
+/*
+ * Dispatch a VE AIO ioctl issued against @task.
+ *
+ * VE_AIO_IOC_WAIT_ACTIVE is the only command handled; it is forwarded
+ * to ve_aio_wait_inflight_reqs() and @arg is not read.  Any other @cmd
+ * yields -EINVAL.
+ */
+int ve_aio_ioctl(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+	if (cmd == VE_AIO_IOC_WAIT_ACTIVE)
+		return ve_aio_wait_inflight_reqs(task);
+
+	return -EINVAL;
+}
+#endif
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2c25b9039a4c..6fb9575976a6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -95,6 +95,7 @@
#include <linux/posix-timers.h>
#include <linux/time_namespace.h>
#include <linux/resctrl.h>
+#include <linux/aio.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -2601,6 +2602,29 @@ static const struct file_operations proc_pid_set_timerslack_ns_operations = {
.release = single_release,
};
+#ifdef CONFIG_VE
+/*
+ * ioctl handler installed in proc_aio_operations: resolve the task the
+ * proc inode belongs to and hand the command over to ve_aio_ioctl().
+ *
+ * Returns -ESRCH when the task cannot be obtained, otherwise the result
+ * of ve_aio_ioctl().
+ */
+static long proc_aio_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct task_struct *tsk = get_proc_task(file_inode(file));
+	int err;
+
+	if (!tsk)
+		return -ESRCH;
+
+	err = ve_aio_ioctl(tsk, cmd, arg);
+
+	/* Drop the task reference taken by get_proc_task(). */
+	put_task_struct(tsk);
+
+	return err;
+}
+
+/* File operations of the "aio" proc entry; only ioctl is provided. */
+static const struct file_operations proc_aio_operations = {
+	.unlocked_ioctl	= proc_aio_ioctl,
+};
+#endif /* CONFIG_VE */
+
static struct dentry *proc_pident_instantiate(struct dentry *dentry,
struct task_struct *task, const void *ptr)
{
@@ -3272,6 +3296,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("timers", S_IRUGO, proc_timers_operations),
#endif
REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_VE
+	/* Same guard as the definition of proc_aio_operations. */
+	REG("aio", S_IRUGO|S_IWUSR, proc_aio_operations),
+#endif
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 4b7a331156ff..ccaaae0db31d 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -12,14 +12,27 @@ typedef int (kiocb_cancel_fn)(struct kiocb *);
#define AIO_MAX_NR_DEFAULT 0x10000
+/*
+ * Argument type encoded in the VE_AIO_IOC_WAIT_ACTIVE ioctl number.
+ */
+struct ve_ioc_arg {
+	aio_context_t	ctx_id;
+	unsigned	val;
+};
+
+#define VE_AIO_IOC_WAIT_ACTIVE	_IOW('a', 1, struct ve_ioc_arg)
+
/* prototypes */
#ifdef CONFIG_AIO
extern void exit_aio(struct mm_struct *mm);
void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
+#ifdef CONFIG_VE
+int ve_aio_ioctl(struct task_struct *, unsigned int, unsigned long);
+#endif
#else
static inline void exit_aio(struct mm_struct *mm) { }
static inline void kiocb_set_cancel_fn(struct kiocb *req,
kiocb_cancel_fn *cancel) { }
+/*
+ * Stub used when CONFIG_AIO is disabled; always returns 0.  It must be
+ * inline: a plain static function defined in a header is emitted into
+ * every includer and triggers "defined but not used" warnings.
+ */
+static inline int ve_aio_ioctl(struct task_struct *task, unsigned int cmd,
+			       unsigned long arg) { return 0; }
#endif /* CONFIG_AIO */
#endif /* __LINUX__AIO_H */
--
2.30.2
More information about the Devel
mailing list