[Devel] [PATCH vz7] /proc/self/memflags: allow userspace to set kernel memory allocation flags

Konstantin Khorenko khorenko at virtuozzo.com
Mon Mar 20 13:06:10 MSK 2023


Nikolay, please review the patch.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 20.03.2023 10:20, Alexander Atanasov wrote:
> Currently there is no way to control whether page reclaim can be done.
> Such control can be useful for networked file systems, which can deadlock
> in the synchronous reclaim path, and for streaming, which can get
> unexpected jitter when a synchronous reclaim is done.
> 
> To improve this, add an interface to set the PF_MEMALLOC and
> PF_MEMALLOC_NOIO flags on the current process.
> 
> Reading from /proc/self/memalloc_flags returns the current flags.
> Writing to /proc/self/memalloc_flags sets the flags.
> The flag values used are defined in the kernel header include/linux/sched.h.
> 
> https://jira.sw.ru/browse/PSBM-141577
> Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
> ---
>   fs/proc/base.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 77 insertions(+)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index eafb5574c543..c776dd8b3f33 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1425,6 +1425,81 @@ static const struct file_operations proc_oom_score_adj_operations = {
>   	.llseek		= default_llseek,
>   };
>   
> +#define MEMALLOC_FLAGS_MASK (PF_MEMALLOC | PF_MEMALLOC_NOIO)
> +
> +static ssize_t memalloc_flags_read(struct file *file, char __user *buf,
> +					size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task = get_proc_task(file_inode(file));
> +	char buffer[PROC_NUMBUF];
> +	unsigned int pflags;
> +	size_t len;
> +
> +	if (!task)
> +		return -ESRCH;
> +
> +	pflags = READ_ONCE(task->flags) & MEMALLOC_FLAGS_MASK;
> +	put_task_struct(task);
> +	len = snprintf(buffer, sizeof(buffer), "%d\n", pflags);
> +	return simple_read_from_buffer(buf, count, ppos, buffer, len);
> +}
> +static ssize_t memalloc_flags_write(struct file *file, const char __user *buf,
> +					size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task = get_proc_task(file_inode(file));
> +	char buffer[PROC_NUMBUF];
> +	int memalloc_flags;
> +	int err = -ESRCH;
> +	unsigned int pflags;
> +
> +	if (!task)
> +		goto out;
> +
> +	if (get_exec_env() != get_ve0()) {
> +		err = -EPERM;
> +		goto out;
> +	}
> +
> +	/*
> +	 * Potential issue here if task != current
> +	 * concurrent setting of flags need R/W_ONCE
> +	 * but flags are expected to change only from current
> +	 */
> +	if (task != current) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	memset(buffer, 0, sizeof(buffer));
> +	if (count > sizeof(buffer) - 1)
> +		count = sizeof(buffer) - 1;
> +	if (copy_from_user(buffer, buf, count)) {
> +		err = -EFAULT;
> +		goto out;
> +	}
> +
> +	err = kstrtoint(strstrip(buffer), 0, &memalloc_flags);
> +	if (err)
> +		goto out;
> +	if (memalloc_flags & ~MEMALLOC_FLAGS_MASK) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	pflags = READ_ONCE(task->flags) & ~MEMALLOC_FLAGS_MASK;
> +	WRITE_ONCE(task->flags, pflags | memalloc_flags);
> +out:
> +	if (task)
> +		put_task_struct(task);
> +	return err < 0 ? err : count;
> +}
> +
> +static const struct file_operations proc_memalloc_flags_operations = {
> +	.read		= memalloc_flags_read,
> +	.write		= memalloc_flags_write,
> +	.llseek		= default_llseek,
> +};
> +
>   #ifdef CONFIG_AUDITSYSCALL
>   #define TMPBUFLEN 21
>   static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
> @@ -3132,6 +3207,7 @@ static const struct pid_entry tgid_base_stuff[] = {
>   	INF("oom_score",  S_IRUGO, proc_oom_score),
>   	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
>   	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
> +	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
>   #ifdef CONFIG_AUDITSYSCALL
>   	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
>   	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> @@ -3499,6 +3575,7 @@ static const struct pid_entry tid_base_stuff[] = {
>   	INF("oom_score", S_IRUGO, proc_oom_score),
>   	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
>   	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
> +	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
>   #ifdef CONFIG_AUDITSYSCALL
>   	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
>   	REG("sessionid",  S_IRUGO, proc_sessionid_operations),


More information about the Devel mailing list