[Devel] [PATCH RHEL9 COMMIT] mm/memflags/procfs: Allow userspace to set kernel memory allocation flags

Konstantin Khorenko khorenko at virtuozzo.com
Fri Mar 24 19:41:43 MSK 2023


The commit is pushed to "branch-rh9-5.14.0-162.18.1.vz9.19.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-162.18.1.vz9.19.2
------>
commit 55d2e8ecb5d7b26b5bb8056f60c1ee99ae68a654
Author: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
Date:   Mon Mar 20 18:47:00 2023 +0200

    mm/memflags/procfs: Allow userspace to set kernel memory allocation flags
    
    Currently there is no way to hint the kernel to avoid triggering
    page reclaim. Such a hint is useful for networked file systems,
    which can deadlock in the synchronous reclaim path, and for streaming
    workloads, which can see unexpected jitter when a synchronous reclaim
    is done. To aid userspace, add an interface that lets a task set the
    PF_MEMALLOC, PF_MEMALLOC_NOIO, PF_MEMALLOC_NOFS and PF_MEMALLOC_PIN
    flags on itself.
    
    Reading from /proc/self/memalloc_flags returns the currently set flags.
    Writing to /proc/self/memalloc_flags sets the flags.
    The flag values used are defined in the kernel header include/linux/sched.h.
    
    https://jira.vzint.dev/browse/PSBM-141577
    Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
    
    khorenko@: in our particular case, FUSE-based vStorage processes will
    set those flags on themselves in order to avoid possible deadlocks.
    
    Feature: vStorage
---
 fs/proc/base.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0b900343aef6..a2b4f09aaa56 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1360,6 +1360,81 @@ static const struct file_operations proc_oom_score_adj_operations = {
 	.llseek		= default_llseek,
 };
 
+#define MEMALLOC_FLAGS_MASK (PF_MEMALLOC | PF_MEMALLOC_NOFS | \
+			     PF_MEMALLOC_NOIO | PF_MEMALLOC_PIN) /* the only task->flags bits the memalloc_flags file reports or changes */
+
+static ssize_t memalloc_flags_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	unsigned int pflags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;	/* no task could be obtained for this proc inode */
+
+	pflags = READ_ONCE(task->flags) & MEMALLOC_FLAGS_MASK;	/* expose only the memalloc bits, hide all other flags */
+	put_task_struct(task);	/* value is snapshotted; the task is not touched again */
+	len = snprintf(buffer, sizeof(buffer), "%u\n", pflags);	/* unsigned decimal followed by a newline */
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t memalloc_flags_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF] = {0};	/* zero-filled so the copied text is always NUL-terminated */
+	int memalloc_flags;
+	int err = -ESRCH;	/* result if no task was found */
+	unsigned int pflags;
+
+	if (!task)
+		goto out;
+
+	if (!ve_is_super(get_exec_env())) {	/* NOTE(review): presumably "caller is not in the host VE" - confirm */
+		err = -EPERM;
+		goto out;
+	}
+	/*
+	 * Only the task itself may change its own flags. The update
+	 * below is an unlocked read-modify-write of task->flags, so
+	 * allowing task != current would require synchronization
+	 * against concurrent updates; reject that case instead.
+	 */
+	if (task != current) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	count = min(count, sizeof(buffer) -1);	/* leave room for the NUL; longer input is silently truncated */
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &memalloc_flags);	/* base 0: radix is taken from the input's prefix */
+	if (err)
+		goto out;
+	if (memalloc_flags & ~MEMALLOC_FLAGS_MASK) {	/* refuse any bit outside the memalloc set */
+		err = -EINVAL;
+		goto out;
+	}
+
+	pflags = READ_ONCE(task->flags) & ~MEMALLOC_FLAGS_MASK;	/* keep all other flags, drop the old memalloc bits */
+	WRITE_ONCE(task->flags, pflags | memalloc_flags);	/* the written value replaces the memalloc bits, it is not OR-ed in */
+out:
+	if (task)
+		put_task_struct(task);
+	return err < 0 ? err : count;	/* on success report the (possibly clamped) number of bytes consumed */
+}
+
+static const struct file_operations proc_memalloc_flags_operations = {	/* handlers behind the "memalloc_flags" proc entries */
+	.read		= memalloc_flags_read,
+	.write		= memalloc_flags_write,
+	.llseek		= default_llseek,
+};
+
 #ifdef CONFIG_AUDIT
 #define TMPBUFLEN 11
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -3400,6 +3475,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	ONE("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
 #ifdef CONFIG_AUDIT
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
@@ -3749,6 +3825,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	ONE("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
 #ifdef CONFIG_AUDIT
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),


More information about the Devel mailing list