[Devel] [PATCH vz10 5/7] fs/kernfs, ve: fix ve_perms_map use-after-free, annotate it __rcu

Mirian Shilakadze mirian.shilakadze at virtuozzo.com
Sun Jun 28 12:26:03 MSK 2026


The per-VE sysfs permission readers (kernfs_ve_permission,
kernfs_d_visible) read kn->ve_perms_map with a plain load and pass it to
kmapset_get_value(), whose rcu_read_lock is internal, so the pointer is
loaded outside any RCU section. A concurrent ve.sysfs_permissions write
(kernfs_perms_set) swaps the pointer and drops the old map, which kmapset
frees via kfree_rcu. On a preemptible kernel the reader can be scheduled
out between the load and the internal rcu_read_lock, the grace period
completes, the old map is freed, and the reader dereferences freed memory.

Mark kn->ve_perms_map __rcu and route every access through the proper
accessor: rcu_dereference under rcu_read_lock in the two readers,
rcu_assign_pointer on the publish in kernfs_perms_set (replacing the plain
swap), rcu_dereference_protected for the writer (serialised by
sysfs_perms_mutex in the caller) and for the seq paths (held under
kernfs_rwsem), rcu_access_pointer for the NULL check, and RCU_INIT_POINTER
for the root setup. The annotation lets sparse and lockdep catch future
misuse.

Take the readers' rcu_read_lock() before the preceding kernfs_parent()
namespace walk too. kernfs_parent() dereferences the rcu-protected parent
pointer, so that walk must run inside the same rcu read-side section.

https://virtuozzo.atlassian.net/browse/VSTOR-136480
Fixes: cd9dd4ead86a ("ve/kernfs: implement ve-based permissions")
Signed-off-by: Mirian Shilakadze <mirian.shilakadze at virtuozzo.com>
---
 fs/kernfs/ve.c         | 79 +++++++++++++++++++++++++++++++-----------
 include/linux/kernfs.h |  2 +-
 2 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
index 66bb4b534f48..94f3533bbdd9 100644
--- a/fs/kernfs/ve.c
+++ b/fs/kernfs/ve.c
@@ -16,6 +16,7 @@
 
 #include <linux/ve.h>
 #include <linux/kmapset.h>
+#include <linux/rcupdate.h>
 #include <linux/kernfs-ve.h>
 
 #include "kernfs-internal.h"
@@ -33,12 +34,14 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
 			 struct kmapset_set *perms_set)
 {
 	struct kernfs_node *kn = root->kn;
+	struct kmapset_map *map;
 
 	kmapset_init_set(perms_set);
-	kn->ve_perms_map = kmapset_new(perms_set);
-	if (!kn->ve_perms_map)
+	map = kmapset_new(perms_set);
+	if (!map)
 		return -ENOMEM;
-	kmapset_commit(kn->ve_perms_map);
+	kmapset_commit(map);
+	RCU_INIT_POINTER(kn->ve_perms_map, map);
 
 	root->ve_perms_set = perms_set;
 	return 0;
@@ -46,7 +49,8 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
 
 int kernfs_ve_allowed(struct kernfs_node *kn)
 {
-	return !kn->ve_perms_map || ve_is_super(get_exec_env());
+	return !rcu_access_pointer(kn->ve_perms_map) ||
+	       ve_is_super(get_exec_env());
 }
 
 static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
@@ -63,17 +67,22 @@ int kernfs_ve_permission(struct kernfs_node *kn,
 	if (kernfs_ve_allowed(kn))
 		return 0;
 
+	rcu_read_lock();
 	/* Entries with namespace tag and their sub-entries always visible */
 	while (tmp_kn) {
-		if (tmp_kn->ns)
+		if (tmp_kn->ns) {
+			rcu_read_unlock();
 			return 0;
+		}
 		tmp_kn = kernfs_parent(tmp_kn);
 	}
 
 	if (kernfs_type(kn) == KERNFS_LINK)
 		kn = kn->symlink.target_kn;
 
-	perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
+	perm = kmapset_get_value(rcu_dereference(kn->ve_perms_map),
+				 kernfs_info_perms_key(info));
+	rcu_read_unlock();
 	if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 		return 0;
 
@@ -90,27 +99,35 @@ void kernfs_get_ve_perms(struct kernfs_node *kn)
 
 	kms = kmapset_new(root->ve_perms_set);
 	if (kms)
-		kn->ve_perms_map = kmapset_commit(kms);
+		rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(kms));
 }
 
 void kernfs_put_ve_perms(struct kernfs_node *kn)
 {
-	if (kn->ve_perms_map)
-		kmapset_put(kn->ve_perms_map);
+	/* teardown path, no concurrent users of kn */
+	struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
+							    true);
+
+	if (map)
+		kmapset_put(map);
 }
 
 bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
 {
 	struct kernfs_node *tmp_kn = kn;
+	bool visible;
 
 
 	if (kernfs_ve_allowed(kn))
 		return true;
 
+	rcu_read_lock();
 	/* Entries with namespace tag and their sub-entries always visible */
 	while (tmp_kn) {
-		if (tmp_kn->ns)
+		if (tmp_kn->ns) {
+			rcu_read_unlock();
 			return true;
+		}
 		tmp_kn = kernfs_parent(tmp_kn);
 	}
 
@@ -118,8 +135,10 @@ bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
 	if (kernfs_type(kn) == KERNFS_LINK)
 		kn = kn->symlink.target_kn;
 
-	return !!kmapset_get_value(kn->ve_perms_map,
-				   kernfs_info_perms_key(info));
+	visible = !!kmapset_get_value(rcu_dereference(kn->ve_perms_map),
+				      kernfs_info_perms_key(info));
+	rcu_read_unlock();
+	return visible;
 }
 
 #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -146,15 +165,22 @@ static struct kernfs_node *kernfs_next_recursive(struct kernfs_node *kn)
 static bool kernfs_perms_shown(struct ve_struct *ve, struct kernfs_node *kn,
 			       struct kmapset_key *key)
 {
+	/*
+	 * The caller's perms mutex serialises against the map writer. The
+	 * lockdep check below names kernfs_rwsem (also held, for the tree
+	 * walk) because that mutex is private to the caller.
+	 */
+	struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
+				lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
 	bool ret;
 
-	if (!kn->ve_perms_map)
+	if (!map)
 		return false;
 	if (ve_is_super(ve))
-		return kn->ve_perms_map->default_value != 0;
+		return map->default_value != 0;
 	/* kmapset_lookup() walks an rcu list, so keep the section to the lookup. */
 	rcu_read_lock();
-	ret = kmapset_lookup(kn->ve_perms_map, key) != NULL;
+	ret = kmapset_lookup(map, key) != NULL;
 	rcu_read_unlock();
 	return ret;
 }
@@ -198,14 +224,22 @@ int kernfs_perms_show(struct seq_file *m, void *v, struct kmapset_key *key)
 {
 	struct ve_struct *ve = css_to_ve(seq_css(m));
 	struct kernfs_node *kn = v;
+	struct kmapset_map *map;
 	char *buf;
 	size_t size, len, off;
 	int mask;
 
+	/*
+	 * The caller's perms mutex serialises against the map writer. The
+	 * lockdep check below names kernfs_rwsem (also held, for the tree
+	 * walk) because that mutex is private to the caller.
+	 */
+	map = rcu_dereference_protected(kn->ve_perms_map,
+			lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
 	if (ve_is_super(ve))
-		mask = kn->ve_perms_map->default_value;
+		mask = map->default_value;
 	else
-		mask = kmapset_get_value(kn->ve_perms_map, key);
+		mask = kmapset_get_value(map, key);
 
 	size = seq_get_buf(m, &buf);
 	if (size) {
@@ -246,7 +280,7 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
 		     struct kernfs_node *root, struct kmapset_key *key)
 {
 	struct kernfs_node *kn = root, *nkn;
-	struct kmapset_map *map = NULL;
+	struct kmapset_map *map = NULL, *old;
 	char *sep = path, *dname;
 	int ret;
 
@@ -272,7 +306,8 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
 	} while (sep);
 
 	ret = -ENOMEM;
-	map = kmapset_dup(kn->ve_perms_map);
+	/* the caller (fs/sysfs/ve.c) holds sysfs_perms_mutex vs other writers */
+	map = kmapset_dup(rcu_dereference_protected(kn->ve_perms_map, true));
 	if (!map)
 		goto out_put;
 
@@ -286,8 +321,10 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
 	}
 
 	if (!ret) {
-		map = kmapset_commit(map);
-		swap(map, kn->ve_perms_map);
+		old = rcu_dereference_protected(kn->ve_perms_map, true);
+		rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(map));
+		kmapset_put(old);
+		map = NULL;
 	}
 
 out_put:
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index d0c6f0f17a58..06e750b6a88f 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -232,7 +232,7 @@ struct kernfs_node {
 
 	struct rcu_head		rcu;
 #ifdef CONFIG_VE
-	struct kmapset_map	*ve_perms_map;
+	struct kmapset_map __rcu *ve_perms_map;
 #endif
 };
 
-- 
2.43.0



More information about the Devel mailing list