[Devel] [PATCH RHEL8 COMMIT] ve/fs/aio: aio_nr & aio_max_nr variables virtualization
Konstantin Khorenko
khorenko at virtuozzo.com
Tue Dec 22 17:49:02 MSK 2020
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
------>
commit f33a6bd814d31dd056682c49cf8baa3490eae5c0
Author: Stanislav Kinsburskiy <skinsbursky at virtuozzo.com>
Date: Tue Dec 22 17:49:02 2020 +0300
ve/fs/aio: aio_nr & aio_max_nr variables virtualization
Virtualization of the kernel-global aio_nr and aio_max_nr variables is required
to isolate containers and ve0 from each other when allocating aio request/event
resources.
Each ve, including ve0, has its own aio_nr and aio_max_nr values, and
ioctx_alloc() charges the aio_nr of the ve selected by the current ve context.
Thus it is not possible to exhaust the aio events resource of one ve from
another ve.
The default per-CT aio_max_nr value is 0x10000, including for CT0.
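For illustration only (not part of the patch), a minimal userspace sketch of
the per-CT limit, assuming the default aio_max_nr of 0x10000 quoted above:

/* Illustrative sketch: exercises the per-CT fs.aio-max-nr limit.
 * Inside a container, requesting more events than the CT's own
 * aio_max_nr fails with EAGAIN, independently of what other CTs
 * or the host have allocated. */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int main(void)
{
	aio_context_t ctx = 0;

	/* Within the per-CT limit: expected to succeed. */
	if (syscall(SYS_io_setup, 128, &ctx) == 0) {
		printf("io_setup(128) ok\n");
		syscall(SYS_io_destroy, ctx);
	}

	/* Above the per-CT default of 0x10000: expected to fail
	 * with EAGAIN, leaving other CTs unaffected. */
	ctx = 0;
	if (syscall(SYS_io_setup, 0x10001, &ctx) < 0)
		printf("io_setup(0x10001): %s\n", strerror(errno));
	return 0;
}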
https://jira.sw.ru/browse/PSBM-29017
Signed-off-by: Andrey Ryabinin <aryabinin at odin.com>
Reviewed-by: Vladimir Davydov <vdavydov at parallels.com>
==============================
fs-aio-show-real-number-of-aio----------------------------------------
fs/aio: show real number of aio events in fs.aio-nr sysctl
fs.aio-nr accounts the number of aio events requested by the user via the
io_setup() syscall. The kernel usually creates more events than were requested.
CRIU doesn't care about the number of requested events, it cares only
about the created events. So while restoring a process, CRIU passes to
io_setup() the number of actually created events. This leads to an
inconsistent fs.aio-nr value after the restore.
Let's show in fs.aio-nr the number of created events, not the requested one.
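A minimal sketch of how the new accounting can be observed from userspace
(illustrative only, not part of the patch); it simply compares fs.aio-nr
before and after an io_setup() call:

/* Illustrative sketch: after this patch the fs.aio-nr delta reflects
 * the events the kernel actually created, which may exceed the count
 * passed to io_setup(); restoring with io_setup(delta) therefore
 * reproduces the same fs.aio-nr value. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

static unsigned long read_aio_nr(void)
{
	unsigned long nr = 0;
	FILE *f = fopen("/proc/sys/fs/aio-nr", "r");

	if (f) {
		if (fscanf(f, "%lu", &nr) != 1)
			nr = 0;
		fclose(f);
	}
	return nr;
}

int main(void)
{
	aio_context_t ctx = 0;
	unsigned long before = read_aio_nr();

	if (syscall(SYS_io_setup, 1, &ctx) == 0) {
		printf("requested 1, fs.aio-nr grew by %lu\n",
		       read_aio_nr() - before);
		syscall(SYS_io_destroy, ctx);
	}
	return 0;
}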
https://jira.sw.ru/browse/PSBM-47209
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
+++
fs/aio-nr: fix decrement of aio-nr
Commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr sysctl")
changed only the incrementing of the fs.aio-nr counter. It failed to update
the decrement path, which leads to constant growth of the fs.aio-nr value.
Fixes commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr
sysctl").
https://jira.sw.ru/browse/PSBM-47209
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Acked-by: Kirill Tkhai <ktkhai at virtuozzo.com>
+++
Ported to VZ8:
ve->aio_nr is now incremented by ctx->nr_events (the number of actually
allocated io events), as in the mainstream kernel.
https://jira.sw.ru/browse/PSBM-123159
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
fs/aio.c | 45 ++++++++++++++++++++++++---------------------
include/linux/aio.h | 6 ++----
include/linux/ve.h | 6 ++++++
kernel/sysctl.c | 16 ++++++++--------
kernel/ve/ve.c | 7 +++++++
5 files changed, 47 insertions(+), 33 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index e81c8583e055..492f1a8b7661 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -30,6 +30,7 @@
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/aio.h>
+#include <linux/ve.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/security.h>
@@ -155,6 +156,7 @@ struct kioctx {
struct page *internal_pages[AIO_RING_PAGES];
struct file *aio_ring_file;
+ struct ve_struct *ve;
unsigned id;
};
@@ -187,12 +189,6 @@ struct aio_kiocb {
struct eventfd_ctx *ki_eventfd;
};
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr; /* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
@@ -555,12 +551,14 @@ static void free_ioctx(struct work_struct *work)
{
struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
free_rwork);
+ struct ve_struct *ve = ctx->ve;
pr_debug("freeing %p\n", ctx);
aio_free_ring(ctx);
free_percpu(ctx->cpu);
percpu_ref_exit(&ctx->reqs);
percpu_ref_exit(&ctx->users);
+ put_ve(ve);
kmem_cache_free(kioctx_cachep, ctx);
}
@@ -657,14 +655,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
}
}
-static void aio_nr_sub(unsigned nr)
+static void aio_nr_sub(struct kioctx *ctx, unsigned nr)
{
- spin_lock(&aio_nr_lock);
- if (WARN_ON(aio_nr - nr > aio_nr))
- aio_nr = 0;
+ struct ve_struct *ve = ctx->ve;
+
+ spin_lock(&ve->aio_nr_lock);
+ if (WARN_ON(ve->aio_nr - nr > ve->aio_nr))
+ ve->aio_nr = 0;
else
- aio_nr -= nr;
- spin_unlock(&aio_nr_lock);
+ ve->aio_nr -= nr;
+ spin_unlock(&ve->aio_nr_lock);
}
/* ioctx_alloc
@@ -674,6 +674,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
{
struct mm_struct *mm = current->mm;
struct kioctx *ctx;
+ struct ve_struct *ve = get_exec_env();
int err = -ENOMEM;
/*
@@ -700,7 +701,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-EINVAL);
}
- if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
+ if (!nr_events || (unsigned long)max_reqs > ve->aio_max_nr)
return ERR_PTR(-EAGAIN);
ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -708,6 +709,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ERR_PTR(-ENOMEM);
ctx->max_reqs = max_reqs;
+ ctx->ve = get_ve(ve);
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
@@ -739,15 +741,15 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
ctx->req_batch = 1;
/* limit the number of system wide aios */
- spin_lock(&aio_nr_lock);
- if (aio_nr + ctx->max_reqs > aio_max_nr ||
- aio_nr + ctx->max_reqs < aio_nr) {
- spin_unlock(&aio_nr_lock);
+ spin_lock(&ve->aio_nr_lock);
+ if (ve->aio_nr + ctx->max_reqs > ve->aio_max_nr ||
+ ve->aio_nr + ctx->max_reqs < ve->aio_nr) {
+ spin_unlock(&ve->aio_nr_lock);
err = -EAGAIN;
goto err_ctx;
}
- aio_nr += ctx->max_reqs;
- spin_unlock(&aio_nr_lock);
+ ve->aio_nr += ctx->max_reqs;
+ spin_unlock(&ve->aio_nr_lock);
percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */
@@ -764,13 +766,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
return ctx;
err_cleanup:
- aio_nr_sub(ctx->max_reqs);
+ aio_nr_sub(ctx, ctx->max_reqs);
err_ctx:
atomic_set(&ctx->dead, 1);
if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);
aio_free_ring(ctx);
err:
+ put_ve(ctx->ve);
mutex_unlock(&ctx->ring_lock);
free_percpu(ctx->cpu);
percpu_ref_exit(&ctx->reqs);
@@ -811,7 +814,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
* -EAGAIN with no ioctxs actually in use (as far as userspace
* could tell).
*/
- aio_nr_sub(ctx->max_reqs);
+ aio_nr_sub(ctx, ctx->max_reqs);
if (ctx->mmap_size)
vm_munmap(ctx->mmap_base, ctx->mmap_size);
diff --git a/include/linux/aio.h b/include/linux/aio.h
index b83e68dd006f..4b7a331156ff 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -10,6 +10,8 @@ struct mm_struct;
typedef int (kiocb_cancel_fn)(struct kiocb *);
+#define AIO_MAX_NR_DEFAULT 0x10000
+
/* prototypes */
#ifdef CONFIG_AIO
extern void exit_aio(struct mm_struct *mm);
@@ -20,8 +22,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req,
kiocb_cancel_fn *cancel) { }
#endif /* CONFIG_AIO */
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
#endif /* __LINUX__AIO_H */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index ab8da4dceec1..103d0a9044fc 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -99,6 +99,12 @@ struct ve_struct {
struct list_head devmnt_list;
struct mutex devmnt_mutex;
+
+#ifdef CONFIG_AIO
+ spinlock_t aio_nr_lock;
+ unsigned long aio_nr;
+ unsigned long aio_max_nr;
+#endif
};
struct ve_devmnt {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 512f5abe34b8..63401eda6dd5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1817,17 +1817,17 @@ static struct ctl_table fs_table[] = {
#ifdef CONFIG_AIO
{
.procname = "aio-nr",
- .data = &aio_nr,
- .maxlen = sizeof(aio_nr),
- .mode = 0444,
- .proc_handler = proc_doulongvec_minmax,
+ .data = &ve0.aio_nr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0444 | S_ISVTX,
+ .proc_handler = proc_doulongvec_minmax_virtual,
},
{
.procname = "aio-max-nr",
- .data = &aio_max_nr,
- .maxlen = sizeof(aio_max_nr),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
+ .data = &ve0.aio_max_nr,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644 | S_ISVTX,
+ .proc_handler = proc_doulongvec_minmax_virtual,
},
#endif /* CONFIG_AIO */
#ifdef CONFIG_INOTIFY_USER
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 65a1ea27b738..b83b2b66a875 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/ve.h>
+#include <linux/aio.h>
#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/init_task.h>
@@ -648,6 +649,12 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
INIT_LIST_HEAD(&ve->devmnt_list);
mutex_init(&ve->devmnt_mutex);
+#ifdef CONFIG_AIO
+ spin_lock_init(&ve->aio_nr_lock);
+ ve->aio_nr = 0;
+ ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+#endif
+
return &ve->css;
err_vdso: