[Devel] [PATCH RHEL8 COMMIT] ve/fs/aio: aio_nr & aio_max_nr variables virtualization

Konstantin Khorenko khorenko at virtuozzo.com
Tue Dec 22 17:49:02 MSK 2020


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
------>
commit f33a6bd814d31dd056682c49cf8baa3490eae5c0
Author: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Date:   Tue Dec 22 17:49:02 2020 +0300

    ve/fs/aio: aio_nr & aio_max_nr variables virtualization
    
    Virtualization of the kernel-global aio_nr & aio_max_nr variables is
    required to isolate containers and ve0 from each other when allocating
    aio request/event resources.
    
    Each ve, including ve0, has its own aio_nr and aio_max_nr values.
    ioctx_alloc() charges the aio_nr of the ve selected by the calling
    context.
    
    This makes it impossible to exhaust the aio events resources of one ve
    from another ve.
    
    The default per-CT aio_max_nr value is 0x10000, including CT0.
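    
    For illustration only (not part of the patch), a minimal userspace
    sketch, assuming the raw io_setup()/io_destroy() syscalls and the
    default per-CT limit: run inside a container, it keeps creating aio
    contexts until the container-local limit is reached and io_setup()
    returns EAGAIN, while other containers and ve0 stay unaffected.
    
    /* Illustrative sketch, not part of the patch: exhaust only this
     * container's aio quota. */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/aio_abi.h>
    
    #define MAX_CTXS 4096
    
    int main(void)
    {
            aio_context_t ctxs[MAX_CTXS];
            int i, created = 0;
    
            for (i = 0; i < MAX_CTXS; i++) {
                    aio_context_t ctx = 0;
    
                    /* Each call charges this ve's aio_nr; once the per-CT
                     * aio_max_nr is reached the kernel returns EAGAIN. */
                    if (syscall(SYS_io_setup, 128, &ctx) < 0) {
                            printf("io_setup failed after %d contexts: %s\n",
                                   created, strerror(errno));
                            break;
                    }
                    ctxs[created++] = ctx;
            }
    
            for (i = 0; i < created; i++)
                    syscall(SYS_io_destroy, ctxs[i]);
    
            return 0;
    }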
    
    https://jira.sw.ru/browse/PSBM-29017
    
    Signed-off-by: Andrey Ryabinin <aryabinin at odin.com>
    
    Reviewed-by: Vladimir Davydov <vdavydov at parallels.com>
    
    ==============================
    
    fs-aio-show-real-number-of-aio----------------------------------------
    
    fs/aio: show real number of aio events in fs.aio-nr sysctl
    
    fs.aio-nr accounts the number of aio events requested by the user via
    the io_setup() syscall. The kernel usually creates more events than were
    requested. CRIU doesn't care about the number of requested events; it
    cares only about the created ones. So while restoring a process, CRIU
    passes to io_setup() the number of actually created events. This leads
    to an inconsistent fs.aio-nr value after the restore.
    
    Let's show in fs.aio-nr the number of created events, not the requested
    one.
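    
    Again for illustration only (an assumed userspace sketch, not part of
    this patch): request a single event and compare it with how much
    fs.aio-nr grows. With the behaviour described above, the delta reflects
    the ring the kernel actually created, which is exactly the value CRIU
    reads and passes back to io_setup() on restore.
    
    /* Illustrative sketch, not part of the patch: requested vs. accounted
     * aio events. */
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/aio_abi.h>
    
    static unsigned long read_aio_nr(void)
    {
            unsigned long val = 0;
            FILE *f = fopen("/proc/sys/fs/aio-nr", "r");
    
            if (f) {
                    if (fscanf(f, "%lu", &val) != 1)
                            val = 0;
                    fclose(f);
            }
            return val;
    }
    
    int main(void)
    {
            aio_context_t ctx = 0;
            unsigned long before = read_aio_nr();
    
            /* Request just one event; the kernel sizes the ring up. */
            if (syscall(SYS_io_setup, 1, &ctx) < 0) {
                    perror("io_setup");
                    return 1;
            }
            printf("requested 1, fs.aio-nr grew by %lu\n",
                   read_aio_nr() - before);
    
            syscall(SYS_io_destroy, ctx);
            return 0;
    }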
    
    https://jira.sw.ru/browse/PSBM-47209
    
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    +++
    fs/aio-nr: fix decrement of aio-nr
    
    Commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr
    sysctl") changed only the increment of the fs.aio-nr counter. It failed
    to update the decrement path, which leads to constant growth of the
    fs.aio-nr value.
    
    Fixes commit 280363c ("fs/aio: show real number of aio events in
    fs.aio-nr sysctl").
    
    https://jira.sw.ru/browse/PSBM-47209
    
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    
    Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    +++
    Ported to VZ8:
    ve->aio_nr is now incremented by ctx->nr_events (the number of actually
    allocated io events), as in the mainstream kernel.
    https://jira.sw.ru/browse/PSBM-123159
    
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
 fs/aio.c            | 45 ++++++++++++++++++++++++---------------------
 include/linux/aio.h |  6 ++----
 include/linux/ve.h  |  6 ++++++
 kernel/sysctl.c     | 16 ++++++++--------
 kernel/ve/ve.c      |  7 +++++++
 5 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index e81c8583e055..492f1a8b7661 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -30,6 +30,7 @@
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
+#include <linux/ve.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
@@ -155,6 +156,7 @@ struct kioctx {
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
+	struct ve_struct	*ve;
 
 	unsigned		id;
 };
@@ -187,12 +189,6 @@ struct aio_kiocb {
 	struct eventfd_ctx	*ki_eventfd;
 };
 
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;		/* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
@@ -555,12 +551,14 @@ static void free_ioctx(struct work_struct *work)
 {
 	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
 					  free_rwork);
+	struct ve_struct *ve = ctx->ve;
 	pr_debug("freeing %p\n", ctx);
 
 	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
 	percpu_ref_exit(&ctx->users);
+	put_ve(ve);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -657,14 +655,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	}
 }
 
-static void aio_nr_sub(unsigned nr)
+static void aio_nr_sub(struct kioctx *ctx, unsigned nr)
 {
-	spin_lock(&aio_nr_lock);
-	if (WARN_ON(aio_nr - nr > aio_nr))
-		aio_nr = 0;
+	struct ve_struct *ve = ctx->ve;
+
+	spin_lock(&ve->aio_nr_lock);
+	if (WARN_ON(ve->aio_nr - nr > ve->aio_nr))
+		ve->aio_nr = 0;
 	else
-		aio_nr -= nr;
-	spin_unlock(&aio_nr_lock);
+		ve->aio_nr -= nr;
+	spin_unlock(&ve->aio_nr_lock);
 }
 
 /* ioctx_alloc
@@ -674,6 +674,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
+	struct ve_struct *ve = get_exec_env();
 	int err = -ENOMEM;
 
 	/*
@@ -700,7 +701,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
+	if (!nr_events || (unsigned long)max_reqs > ve->aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -708,6 +709,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-ENOMEM);
 
 	ctx->max_reqs = max_reqs;
+	ctx->ve = get_ve(ve);
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -739,15 +741,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		ctx->req_batch = 1;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + ctx->max_reqs > aio_max_nr ||
-	    aio_nr + ctx->max_reqs < aio_nr) {
-		spin_unlock(&aio_nr_lock);
+	spin_lock(&ve->aio_nr_lock);
+	if (ve->aio_nr + ctx->max_reqs > ve->aio_max_nr ||
+	    ve->aio_nr + ctx->max_reqs < ve->aio_nr) {
+		spin_unlock(&ve->aio_nr_lock);
 		err = -EAGAIN;
 		goto err_ctx;
 	}
-	aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	ve->aio_nr += ctx->max_reqs;
+	spin_unlock(&ve->aio_nr_lock);
 
 	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
 	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
@@ -764,13 +766,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	return ctx;
 
 err_cleanup:
-	aio_nr_sub(ctx->max_reqs);
+	aio_nr_sub(ctx, ctx->max_reqs);
 err_ctx:
 	atomic_set(&ctx->dead, 1);
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	aio_free_ring(ctx);
 err:
+	put_ve(ctx->ve);
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
 	percpu_ref_exit(&ctx->reqs);
@@ -811,7 +814,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 	 * -EAGAIN with no ioctxs actually in use (as far as userspace
 	 *  could tell).
 	 */
-	aio_nr_sub(ctx->max_reqs);
+	aio_nr_sub(ctx, ctx->max_reqs);
 
 	if (ctx->mmap_size)
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index b83e68dd006f..4b7a331156ff 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -10,6 +10,8 @@ struct mm_struct;
 
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
+#define AIO_MAX_NR_DEFAULT	0x10000
+
 /* prototypes */
 #ifdef CONFIG_AIO
 extern void exit_aio(struct mm_struct *mm);
@@ -20,8 +22,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
 #endif /* CONFIG_AIO */
 
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
 #endif /* __LINUX__AIO_H */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index ab8da4dceec1..103d0a9044fc 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -99,6 +99,12 @@ struct ve_struct {
 
 	struct list_head	devmnt_list;
 	struct mutex		devmnt_mutex;
+
+#ifdef CONFIG_AIO
+	spinlock_t		aio_nr_lock;
+	unsigned long		aio_nr;
+	unsigned long		aio_max_nr;
+#endif
 };
 
 struct ve_devmnt {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 512f5abe34b8..63401eda6dd5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1817,17 +1817,17 @@ static struct ctl_table fs_table[] = {
 #ifdef CONFIG_AIO
 	{
 		.procname	= "aio-nr",
-		.data		= &aio_nr,
-		.maxlen		= sizeof(aio_nr),
-		.mode		= 0444,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 	{
 		.procname	= "aio-max-nr",
-		.data		= &aio_max_nr,
-		.maxlen		= sizeof(aio_max_nr),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_max_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 65a1ea27b738..b83b2b66a875 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/ve.h>
+#include <linux/aio.h>
 #include <linux/errno.h>
 #include <linux/rcupdate.h>
 #include <linux/init_task.h>
@@ -648,6 +649,12 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	INIT_LIST_HEAD(&ve->devmnt_list);
 	mutex_init(&ve->devmnt_mutex);
 
+#ifdef CONFIG_AIO
+	spin_lock_init(&ve->aio_nr_lock);
+	ve->aio_nr = 0;
+	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+#endif
+
 	return &ve->css;
 
 err_vdso:
