[Devel] [PATCH vz10 5/7] fs/kernfs, ve: fix ve_perms_map use-after-free, annotate it __rcu

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Tue Jun 30 15:08:39 MSK 2026



On 6/28/26 11:26, Mirian Shilakadze wrote:
> The per-VE sysfs permission readers (kernfs_ve_permission,
> kernfs_d_visible) read kn->ve_perms_map with a plain load and pass it to
> kmapset_get_value(), whose rcu_read_lock is internal, so the pointer is
> loaded outside any RCU section. A concurrent ve.sysfs_permissions write
> (kernfs_perms_set) swaps the pointer and drops the old map, which kmapset
> frees via kfree_rcu.

> On a preemptible kernel the reader can be scheduled
> out between the load and the internal rcu_read_lock, the grace period
> completes, the old map is freed, and the reader dereferences freed memory.

Note: Even on non-preemptible kernel if thread didn't mark itself with
rcu_read_lock kernel does not give any guaranty not to run rcu callbacks
concurrently, and can free the objects used right under us.

> 
> Mark kn->ve_perms_map __rcu and route every access through the proper
> accessor: rcu_dereference under rcu_read_lock in the two readers,
> rcu_assign_pointer on the publish in kernfs_perms_set (replacing the plain
> swap), rcu_dereference_protected for the writer (serialised by
> sysfs_perms_mutex in the caller) and for the seq paths (held under
> kernfs_rwsem), rcu_access_pointer for the NULL check, and RCU_INIT_POINTER
> for the root setup. The annotation lets sparse and lockdep catch future
> misuse.
> 
> Take the readers' rcu_read_lock() before the preceding kernfs_parent()
> namespace walk too. kernfs_parent() dereferences the rcu-protected parent
> pointer, so that walk must run inside the same rcu read-side section.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-136480
> Fixes: cd9dd4ead86a ("ve/kernfs: implement ve-based permissions")
> Signed-off-by: Mirian Shilakadze <mirian.shilakadze at virtuozzo.com>
> ---
>  fs/kernfs/ve.c         | 79 +++++++++++++++++++++++++++++++-----------
>  include/linux/kernfs.h |  2 +-
>  2 files changed, 59 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
> index 66bb4b534f48..94f3533bbdd9 100644
> --- a/fs/kernfs/ve.c
> +++ b/fs/kernfs/ve.c
> @@ -16,6 +16,7 @@
>  
>  #include <linux/ve.h>
>  #include <linux/kmapset.h>
> +#include <linux/rcupdate.h>
>  #include <linux/kernfs-ve.h>
>  
>  #include "kernfs-internal.h"
> @@ -33,12 +34,14 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
>  			 struct kmapset_set *perms_set)
>  {
>  	struct kernfs_node *kn = root->kn;
> +	struct kmapset_map *map;
>  
>  	kmapset_init_set(perms_set);
> -	kn->ve_perms_map = kmapset_new(perms_set);
> -	if (!kn->ve_perms_map)
> +	map = kmapset_new(perms_set);
> +	if (!map)
>  		return -ENOMEM;
> -	kmapset_commit(kn->ve_perms_map);
> +	kmapset_commit(map);
> +	RCU_INIT_POINTER(kn->ve_perms_map, map);
>  
>  	root->ve_perms_set = perms_set;
>  	return 0;
> @@ -46,7 +49,8 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
>  
>  int kernfs_ve_allowed(struct kernfs_node *kn)
>  {
> -	return !kn->ve_perms_map || ve_is_super(get_exec_env());
> +	return !rcu_access_pointer(kn->ve_perms_map) ||
> +	       ve_is_super(get_exec_env());
>  }
>  
>  static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
> @@ -63,17 +67,22 @@ int kernfs_ve_permission(struct kernfs_node *kn,
>  	if (kernfs_ve_allowed(kn))
>  		return 0;
>  
> +	rcu_read_lock();
>  	/* Entries with namespace tag and their sub-entries always visible */
>  	while (tmp_kn) {
> -		if (tmp_kn->ns)
> +		if (tmp_kn->ns) {
> +			rcu_read_unlock();
>  			return 0;
> +		}
>  		tmp_kn = kernfs_parent(tmp_kn);
>  	}
>  
>  	if (kernfs_type(kn) == KERNFS_LINK)
>  		kn = kn->symlink.target_kn;
>  
> -	perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
> +	perm = kmapset_get_value(rcu_dereference(kn->ve_perms_map),
> +				 kernfs_info_perms_key(info));
> +	rcu_read_unlock();
>  	if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
>  		return 0;
>  
> @@ -90,27 +99,35 @@ void kernfs_get_ve_perms(struct kernfs_node *kn)
>  
>  	kms = kmapset_new(root->ve_perms_set);
>  	if (kms)
> -		kn->ve_perms_map = kmapset_commit(kms);
> +		rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(kms));
>  }
>  
>  void kernfs_put_ve_perms(struct kernfs_node *kn)
>  {
> -	if (kn->ve_perms_map)
> -		kmapset_put(kn->ve_perms_map);
> +	/* teardown path, no concurrent users of kn */
> +	struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
> +							    true);
> +
> +	if (map)
> +		kmapset_put(map);
>  }
>  
>  bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
>  {
>  	struct kernfs_node *tmp_kn = kn;
> +	bool visible;
>  
>  
>  	if (kernfs_ve_allowed(kn))
>  		return true;
>  
> +	rcu_read_lock();
>  	/* Entries with namespace tag and their sub-entries always visible */
>  	while (tmp_kn) {
> -		if (tmp_kn->ns)
> +		if (tmp_kn->ns) {
> +			rcu_read_unlock();
>  			return true;
> +		}
>  		tmp_kn = kernfs_parent(tmp_kn);
>  	}
>  
> @@ -118,8 +135,10 @@ bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
>  	if (kernfs_type(kn) == KERNFS_LINK)
>  		kn = kn->symlink.target_kn;
>  
> -	return !!kmapset_get_value(kn->ve_perms_map,
> -				   kernfs_info_perms_key(info));
> +	visible = !!kmapset_get_value(rcu_dereference(kn->ve_perms_map),
> +				      kernfs_info_perms_key(info));
> +	rcu_read_unlock();
> +	return visible;
>  }
>  
>  #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
> @@ -146,15 +165,22 @@ static struct kernfs_node *kernfs_next_recursive(struct kernfs_node *kn)
>  static bool kernfs_perms_shown(struct ve_struct *ve, struct kernfs_node *kn,
>  			       struct kmapset_key *key)
>  {
> +	/*
> +	 * The caller's perms mutex serialises against the map writer. The
> +	 * lockdep check below names kernfs_rwsem (also held, for the tree
> +	 * walk) because that mutex is private to the caller.
> +	 */
> +	struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
> +				lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));

I don't understand why we would put some lock unrelated to ->ve_perms_map update
path here, let's go with "true". And add more concise comment like:

/* Protected against ve_perms_apply() with sysfs_perms_mutex */

Please don't use ambiguous "caller's perms mutex", just name the lock as it is
in current tree. It would've grately decreased the review time.

note: Same applies to other lockdep_is_held places.

>  	bool ret;
>  
> -	if (!kn->ve_perms_map)
> +	if (!map)
>  		return false;
>  	if (ve_is_super(ve))
> -		return kn->ve_perms_map->default_value != 0;
> +		return map->default_value != 0;
>  	/* kmapset_lookup() walks an rcu list, so keep the section to the lookup. */
>  	rcu_read_lock();
> -	ret = kmapset_lookup(kn->ve_perms_map, key) != NULL;
> +	ret = kmapset_lookup(map, key) != NULL;
>  	rcu_read_unlock();
>  	return ret;
>  }
> @@ -198,14 +224,22 @@ int kernfs_perms_show(struct seq_file *m, void *v, struct kmapset_key *key)
>  {
>  	struct ve_struct *ve = css_to_ve(seq_css(m));
>  	struct kernfs_node *kn = v;
> +	struct kmapset_map *map;
>  	char *buf;
>  	size_t size, len, off;
>  	int mask;
>  
> +	/*
> +	 * The caller's perms mutex serialises against the map writer. The
> +	 * lockdep check below names kernfs_rwsem (also held, for the tree
> +	 * walk) because that mutex is private to the caller.
> +	 */
> +	map = rcu_dereference_protected(kn->ve_perms_map,
> +			lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
>  	if (ve_is_super(ve))
> -		mask = kn->ve_perms_map->default_value;
> +		mask = map->default_value;
>  	else
> -		mask = kmapset_get_value(kn->ve_perms_map, key);
> +		mask = kmapset_get_value(map, key);
>  
>  	size = seq_get_buf(m, &buf);
>  	if (size) {
> @@ -246,7 +280,7 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
>  		     struct kernfs_node *root, struct kmapset_key *key)
>  {
>  	struct kernfs_node *kn = root, *nkn;
> -	struct kmapset_map *map = NULL;
> +	struct kmapset_map *map = NULL, *old;
>  	char *sep = path, *dname;
>  	int ret;
>  
> @@ -272,7 +306,8 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
>  	} while (sep);
>  
>  	ret = -ENOMEM;
> -	map = kmapset_dup(kn->ve_perms_map);
> +	/* the caller (fs/sysfs/ve.c) holds sysfs_perms_mutex vs other writers */
> +	map = kmapset_dup(rcu_dereference_protected(kn->ve_perms_map, true));
>  	if (!map)
>  		goto out_put;
>  
> @@ -286,8 +321,10 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
>  	}
>  
>  	if (!ret) {
> -		map = kmapset_commit(map);
> -		swap(map, kn->ve_perms_map);
> +		old = rcu_dereference_protected(kn->ve_perms_map, true);
> +		rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(map));
> +		kmapset_put(old);
> +		map = NULL;
>  	}
>  
>  out_put:
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index d0c6f0f17a58..06e750b6a88f 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -232,7 +232,7 @@ struct kernfs_node {
>  
>  	struct rcu_head		rcu;
>  #ifdef CONFIG_VE
> -	struct kmapset_map	*ve_perms_map;
> +	struct kmapset_map __rcu *ve_perms_map;
>  #endif
>  };
>  

-- 
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.



More information about the Devel mailing list