[Devel] [PATCH vz9] /proc/self/memflags: allow userspace to set kernel memory allocation flags

nb nikolay.borisov at virtuozzo.com
Mon Mar 20 13:43:59 MSK 2023



On 20.03.23 г. 11:20 ч., Alexander Atanasov wrote:
> Currently there is no way to hint the kernel to avoid triggering
> page reclaims. This can be useful in networked file systems,
> which can deadlock in the synchronous reclaim path and streaming
> which can get unexpected jitter when a synchronouse reclaim is done.
> To aid the userspace add interface to set PF_MEMALLOC, PF_MEMALLOC_NOIO,
> PF_MEMALLOC_NOFS, PF_MEMALLOC_PIN flags on self.
> 
> Reading from /proc/self/memflags returns current set flags.
> Writing to /proc/self/memflags sets the flags.
> Flag values used are defined in the kernel header include/linux/sched.h.
> 
> https://jira.sw.ru/browse/PSBM-141577
> Signed-off-by: Alexander Atanasov <alexander.atanasov at virtuozzo.com>
> ---
>   fs/proc/base.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 78 insertions(+)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 0b900343aef6..8101601cc188 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -1360,6 +1360,82 @@ static const struct file_operations proc_oom_score_adj_operations = {
>   	.llseek		= default_llseek,
>   };
>   
> +#define MEMALLOC_FLAGS_MASK (PF_MEMALLOC | PF_MEMALLOC_NOFS | \
> +			     PF_MEMALLOC_NOIO | PF_MEMALLOC_PIN)
> +
> +static ssize_t memalloc_flags_read(struct file *file, char __user *buf,
> +					size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task = get_proc_task(file_inode(file));
> +	char buffer[PROC_NUMBUF];
> +	unsigned int pflags;
> +	size_t len;
> +
> +	if (!task)
> +		return -ESRCH;

nit: Looking at other functions in fs/proc/base.c that use get_proc_task 
they seem to return ENOENT if NULL is returned. For the sake of 
consistency I'd say change ESRCH to ENOENT.

> +
> +	pflags = READ_ONCE(task->flags) & MEMALLOC_FLAGS_MASK;
> +	put_task_struct(task);
> +	len = snprintf(buffer, sizeof(buffer), "%d\n", pflags);

Since this is really holding a mask perhaps we want an unsigned integer 
or even better a hex value. Sure, non of the values in 
MEMALLOC_FLAGS_MASK change bit 31 but again, let's try to be as simple 
as possible". So making it a %x is the best IMO.

> +	return simple_read_from_buffer(buf, count, ppos, buffer, len);
> +}
> +
> +static ssize_t memalloc_flags_write(struct file *file, const char __user *buf,
> +					size_t count, loff_t *ppos)
> +{
> +	struct task_struct *task = get_proc_task(file_inode(file));
> +	char buffer[PROC_NUMBUF];
> +	int memalloc_flags;
> +	int err = -ESRCH;

Ditto

> +	unsigned int pflags;
> +
> +	if (!task)
> +		goto out;

nit: just return which then obviates the need to have a check for "if 
task) under the out label, it simplifies the code overall.

> +
> +	if (get_exec_env() != get_ve0()) {

nit: !ve_is_super(get_exec_env()), not that it would make any material 
difference, but it looks somewhat more idiomatic to me.

> +		err = -EPERM;
> +		goto out;
> +	}
> +	/*
> +	 * Potential issue here if task != current
> +	 * concurrent setting of flags need R/W_ONCE
> +	 * but flags are expected to change only from current
> +	 */
> +	if (task != current) {
> +		err = -EINVAL;
> +		goto out;
> +	}

nit: R/W_ONCE is used to prevent certain compiler optimisations and 
ensure that certain accesses are atomic in the face of relaxed memory 
ordering. I.e they are not a synchronization primitive. So the comment 
should be rephrased to say that we can't guarantee 
race-free/synchronized access to the flags.


> +
> +	memset(buffer, 0, sizeof(buffer));

nit: if you define the buffer as char buffer = {0} then the compiler 
would generate teh code, one less line to read :D


> +	if (count > sizeof(buffer) - 1)
> +		count = sizeof(buffer) - 1;

nit: count = min(count, sizeof(buffer) -1)

> +	if (copy_from_user(buffer, buf, count)) {
> +		err = -EFAULT;
> +		goto out;
> +	}
> +
> +	err = kstrtoint(strstrip(buffer), 0, &memalloc_flags);
> +	if (err)
> +		goto out;
> +	if (memalloc_flags & ~MEMALLOC_FLAGS_MASK) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	pflags = READ_ONCE(task->flags) & ~MEMALLOC_FLAGS_MASK;
> +	WRITE_ONCE(task->flags, pflags | memalloc_flags);
> +out:
> +	if (task)
> +		put_task_struct(task);
> +	return err < 0 ? err : count;
> +}
> +
> +static const struct file_operations proc_memalloc_flags_operations = {
> +	.read		= memalloc_flags_read,
> +	.write		= memalloc_flags_write,
> +	.llseek		= default_llseek,
> +};
> +
>   #ifdef CONFIG_AUDIT
>   #define TMPBUFLEN 11
>   static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
> @@ -3400,6 +3476,7 @@ static const struct pid_entry tgid_base_stuff[] = {
>   	ONE("oom_score",  S_IRUGO, proc_oom_score),
>   	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
>   	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
> +	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
>   #ifdef CONFIG_AUDIT
>   	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
>   	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
> @@ -3749,6 +3826,7 @@ static const struct pid_entry tid_base_stuff[] = {
>   	ONE("oom_score", S_IRUGO, proc_oom_score),
>   	REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adj_operations),
>   	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
> +	REG("memalloc_flags", S_IRUGO|S_IWUSR, proc_memalloc_flags_operations),
>   #ifdef CONFIG_AUDIT
>   	REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
>   	REG("sessionid",  S_IRUGO, proc_sessionid_operations),


More information about the Devel mailing list