[Devel] [PATCH vz7] /proc/self/memalloc_flags: allow userspace to set kernel memory allocation flags

Alexander Atanasov alexander.atanasov at virtuozzo.com
Mon Mar 20 12:20:30 MSK 2023


Currently there is no way for userspace to control whether page reclaim
can be done. Such control is useful for networked file systems, which can
deadlock in the synchronous reclaim path, and for streaming, which can get
unexpected jitter when a synchronous reclaim is done.

To improve this, add an interface to set the PF_MEMALLOC and
PF_MEMALLOC_NOIO flags on the current process.

Reading from /proc/self/memalloc_flags returns the current flags.
Writing to /proc/self/memalloc_flags sets the flags.
The flag values used are defined in the kernel header include/linux/sched.h.

https://jira.sw.ru/browse/PSBM-141577
Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
---
 fs/proc/base.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index eafb5574c543..c776dd8b3f33 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1425,6 +1425,81 @@ static const struct file_operations proc_oom_score_adj_operations = {
 	.llseek		= default_llseek,
 };
 
+#define MEMALLOC_FLAGS_MASK (PF_MEMALLOC | PF_MEMALLOC_NOIO)
+/* Report the task's MEMALLOC_FLAGS_MASK bits of ->flags as decimal text. */
+static ssize_t memalloc_flags_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	unsigned int pflags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;	/* no task behind this proc inode */
+
+	pflags = READ_ONCE(task->flags) & MEMALLOC_FLAGS_MASK;
+	put_task_struct(task);	/* flags are sampled, task is not used below */
+	len = snprintf(buffer, sizeof(buffer), "%u\n", pflags); /* unsigned */
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+static ssize_t memalloc_flags_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file_inode(file));
+	char buffer[PROC_NUMBUF];
+	int memalloc_flags;	/* value parsed from the user's buffer */
+	int err = -ESRCH;	/* returned when no task was found */
+	unsigned int pflags;
+
+	if (!task)
+		goto out;
+
+	if (get_exec_env() != get_ve0()) {	/* writes are refused outside ve0 */
+		err = -EPERM;
+		goto out;
+	}
+
+	/*
+	 * Only the task itself may change its memalloc flags: the update
+	 * below is a plain read-modify-write of task->flags, which is
+	 * expected to be modified only by current.
+	 */
+	if (task != current) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)	/* clamp so the zeroed last byte stays NUL */
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	err = kstrtoint(strstrip(buffer), 0, &memalloc_flags);	/* base 0: radix from prefix */
+	if (err)
+		goto out;
+	if (memalloc_flags & ~MEMALLOC_FLAGS_MASK) {	/* bits outside the mask are rejected */
+		err = -EINVAL;
+		goto out;
+	}
+
+	pflags = READ_ONCE(task->flags) & ~MEMALLOC_FLAGS_MASK;	/* keep unrelated bits */
+	WRITE_ONCE(task->flags, pflags | memalloc_flags);
+out:
+	if (task)
+		put_task_struct(task);
+	return err < 0 ? err : count;	/* on success, the possibly clamped count */
+}
+
+static const struct file_operations proc_memalloc_flags_operations = {
+	.read		= memalloc_flags_read,	/* reports the masked flags */
+	.write		= memalloc_flags_write,	/* sets the flags on current */
+	.llseek		= default_llseek,
+};
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -3132,6 +3207,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	INF("oom_score",  S_IRUGO, proc_oom_score),
 	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
@@ -3499,6 +3575,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
 	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
+	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
-- 
2.31.1



More information about the Devel mailing list