[Devel] [PATCH vz9 26/27] ve/fs/aio: aio_nr & aio_max_nr variables virtualization

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Wed Oct 6 11:57:37 MSK 2021


From: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>

Virtualization of kernel global aio_nr & aio_max_nr variables is required
to isolate containers and ve0 when allocating aio request/events resources.

Each ve and ve0 has its own aio_nr and aio_max_nr values. Function ioctx_alloc
tries to charge the appropriate aio_nr value selected by the ve context.

It's not possible to exhaust aio events resources of one ve from another ve.

Default per-CT aio_max_nr value == 0x10000, including CT0.

https://jira.sw.ru/browse/PSBM-29017

Signed-off-by: Andrey Ryabinin <aryabinin at odin.com>

Reviewed-by: Vladimir Davydov <vdavydov at parallels.com>

==============================

fs-aio-show-real-number-of-aio----------------------------------------

fs/aio: show real number of aio events in fs.aio-nr sysctl

fs.aio-nr accounts number of aio events requested by user via io_setup()
syscall. The kernel usually creates more events than was requested.
CRIU doesn't care about the number of requested events, it cares only
about created events. So while restoring the process CRIU requests
in io_setup() the number of actually created events. This leads
to inconsistent value of fs.aio-nr after the restore.

Let's show in fs.aio-nr a number of created events, not requested.

https://jira.sw.ru/browse/PSBM-47209

Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>

+++
fs/aio-nr: fix decrement of aio-nr

Commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr sysctl")
changed only incrementing of fs.aio-nr counter. It failed to update
decrement path which leads to constant growing of fs.aio-nr value.

Fixes commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr
sysctl").

https://jira.sw.ru/browse/PSBM-47209

Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>

+++
Ported to VZ8:
ve->aio_nr now incremented by ctx->nr_events (really allocated
io events) as in ms kernel
https://jira.sw.ru/browse/PSBM-123159

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>

(cherry-picked from vz8 commit 333272f33de2 ("ve/fs/aio:
aio_nr & aio_max_nr variables virtualization"))

Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/aio.c            | 45 ++++++++++++++++++++++++---------------------
 include/linux/aio.h |  6 ++----
 include/linux/ve.h  |  6 ++++++
 kernel/sysctl.c     | 16 ++++++++--------
 kernel/ve/ve.c      |  7 +++++++
 5 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 76ce0cc3ee4e..d2e99e348b7a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -31,6 +31,7 @@
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
+#include <linux/ve.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
@@ -162,6 +163,7 @@ struct kioctx {
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
+	struct ve_struct	*ve;
 
 	unsigned		id;
 };
@@ -217,12 +219,6 @@ struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };
 
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;		/* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
@@ -582,12 +578,14 @@ static void free_ioctx(struct work_struct *work)
 {
 	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
 					  free_rwork);
+	struct ve_struct *ve = ctx->ve;
 	pr_debug("freeing %p\n", ctx);
 
 	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
 	percpu_ref_exit(&ctx->users);
+	put_ve(ve);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -684,14 +682,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	}
 }
 
-static void aio_nr_sub(unsigned nr)
+static void aio_nr_sub(struct kioctx *ctx, unsigned nr)
 {
-	spin_lock(&aio_nr_lock);
-	if (WARN_ON(aio_nr - nr > aio_nr))
-		aio_nr = 0;
+	struct ve_struct *ve = ctx->ve;
+
+	spin_lock(&ve->aio_nr_lock);
+	if (WARN_ON(ve->aio_nr - nr > ve->aio_nr))
+		ve->aio_nr = 0;
 	else
-		aio_nr -= nr;
-	spin_unlock(&aio_nr_lock);
+		ve->aio_nr -= nr;
+	spin_unlock(&ve->aio_nr_lock);
 }
 
 /* ioctx_alloc
@@ -701,6 +701,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
+	struct ve_struct *ve = get_exec_env();
 	int err = -ENOMEM;
 
 	/*
@@ -727,7 +728,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
+	if (!nr_events || (unsigned long)max_reqs > ve->aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -735,6 +736,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-ENOMEM);
 
 	ctx->max_reqs = max_reqs;
+	ctx->ve = get_ve(ve);
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -766,15 +768,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		ctx->req_batch = 1;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + ctx->max_reqs > aio_max_nr ||
-	    aio_nr + ctx->max_reqs < aio_nr) {
-		spin_unlock(&aio_nr_lock);
+	spin_lock(&ve->aio_nr_lock);
+	if (ve->aio_nr + ctx->max_reqs > ve->aio_max_nr ||
+	    ve->aio_nr + ctx->max_reqs < ve->aio_nr) {
+		spin_unlock(&ve->aio_nr_lock);
 		err = -EAGAIN;
 		goto err_ctx;
 	}
-	aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	ve->aio_nr += ctx->max_reqs;
+	spin_unlock(&ve->aio_nr_lock);
 
 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
 	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
@@ -791,13 +793,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	return ctx;
 
 err_cleanup:
-	aio_nr_sub(ctx->max_reqs);
+	aio_nr_sub(ctx, ctx->max_reqs);
 err_ctx:
 	atomic_set(&ctx->dead, 1);
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	aio_free_ring(ctx);
 err:
+	put_ve(ctx->ve);
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
@@ -838,7 +841,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 	 * -EAGAIN with no ioctxs actually in use (as far as userspace
 	 *  could tell).
 	 */
-	aio_nr_sub(ctx->max_reqs);
+	aio_nr_sub(ctx, ctx->max_reqs);
 
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index b83e68dd006f..4b7a331156ff 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -10,6 +10,8 @@ struct mm_struct;
 
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
+#define AIO_MAX_NR_DEFAULT	0x10000
+
 /* prototypes */
 #ifdef CONFIG_AIO
 extern void exit_aio(struct mm_struct *mm);
@@ -20,8 +22,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
 #endif /* __LINUX__AIO_H */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index c9e823f2c5c0..3d5a1dc2ed0d 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -89,6 +89,12 @@ struct ve_struct {
 
 	struct list_head	devmnt_list;
 	struct mutex		devmnt_mutex;
+
+#ifdef CONFIG_AIO
+	spinlock_t		aio_nr_lock;
+	unsigned long		aio_nr;
+	unsigned long		aio_max_nr;
+#endif
 };
 
 struct ve_devmnt {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 10c92dccf575..49656fd84639 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3369,17 +3369,17 @@ static struct ctl_table fs_table[] = {
 #ifdef CONFIG_AIO
 	{
 		.procname	= "aio-nr",
-		.data		= &aio_nr,
-		.maxlen		= sizeof(aio_nr),
-		.mode		= 0444,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 	{
 		.procname	= "aio-max-nr",
-		.data		= &aio_max_nr,
-		.maxlen		= sizeof(aio_max_nr),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_max_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index f129f16b8c27..aad35ae983b4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/ve.h>
+#include <linux/aio.h>
 #include <linux/errno.h>
 #include <linux/rcupdate.h>
 #include <linux/init_task.h>
@@ -706,6 +707,12 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	INIT_LIST_HEAD(&ve->devmnt_list);
 	mutex_init(&ve->devmnt_mutex);
 
+#ifdef CONFIG_AIO
+	spin_lock_init(&ve->aio_nr_lock);
+	ve->aio_nr = 0;
+	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+#endif
+
 	return &ve->css;
 
 err_vdso:
-- 
2.30.2



More information about the Devel mailing list