[Devel] [PATCH vz10 5/7] fs/kernfs, ve: fix ve_perms_map use-after-free, annotate it __rcu
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Tue Jun 30 15:08:39 MSK 2026
On 6/28/26 11:26, Mirian Shilakadze wrote:
> The per-VE sysfs permission readers (kernfs_ve_permission,
> kernfs_d_visible) read kn->ve_perms_map with a plain load and pass it to
> kmapset_get_value(), whose rcu_read_lock is internal, so the pointer is
> loaded outside any RCU section. A concurrent ve.sysfs_permissions write
> (kernfs_perms_set) swaps the pointer and drops the old map, which kmapset
> frees via kfree_rcu.
> On a preemptible kernel the reader can be scheduled
> out between the load and the internal rcu_read_lock, the grace period
> completes, the old map is freed, and the reader dereferences freed memory.
Note: Even on non-preemptible kernel if thread didn't mark itself with
rcu_read_lock kernel does not give any guaranty not to run rcu callbacks
concurrently, and can free the objects used right under us.
>
> Mark kn->ve_perms_map __rcu and route every access through the proper
> accessor: rcu_dereference under rcu_read_lock in the two readers,
> rcu_assign_pointer on the publish in kernfs_perms_set (replacing the plain
> swap), rcu_dereference_protected for the writer (serialised by
> sysfs_perms_mutex in the caller) and for the seq paths (held under
> kernfs_rwsem), rcu_access_pointer for the NULL check, and RCU_INIT_POINTER
> for the root setup. The annotation lets sparse and lockdep catch future
> misuse.
>
> Take the readers' rcu_read_lock() before the preceding kernfs_parent()
> namespace walk too. kernfs_parent() dereferences the rcu-protected parent
> pointer, so that walk must run inside the same rcu read-side section.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-136480
> Fixes: cd9dd4ead86a ("ve/kernfs: implement ve-based permissions")
> Signed-off-by: Mirian Shilakadze <mirian.shilakadze at virtuozzo.com>
> ---
> fs/kernfs/ve.c | 79 +++++++++++++++++++++++++++++++-----------
> include/linux/kernfs.h | 2 +-
> 2 files changed, 59 insertions(+), 22 deletions(-)
>
> diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
> index 66bb4b534f48..94f3533bbdd9 100644
> --- a/fs/kernfs/ve.c
> +++ b/fs/kernfs/ve.c
> @@ -16,6 +16,7 @@
>
> #include <linux/ve.h>
> #include <linux/kmapset.h>
> +#include <linux/rcupdate.h>
> #include <linux/kernfs-ve.h>
>
> #include "kernfs-internal.h"
> @@ -33,12 +34,14 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
> struct kmapset_set *perms_set)
> {
> struct kernfs_node *kn = root->kn;
> + struct kmapset_map *map;
>
> kmapset_init_set(perms_set);
> - kn->ve_perms_map = kmapset_new(perms_set);
> - if (!kn->ve_perms_map)
> + map = kmapset_new(perms_set);
> + if (!map)
> return -ENOMEM;
> - kmapset_commit(kn->ve_perms_map);
> + kmapset_commit(map);
> + RCU_INIT_POINTER(kn->ve_perms_map, map);
>
> root->ve_perms_set = perms_set;
> return 0;
> @@ -46,7 +49,8 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
>
> int kernfs_ve_allowed(struct kernfs_node *kn)
> {
> - return !kn->ve_perms_map || ve_is_super(get_exec_env());
> + return !rcu_access_pointer(kn->ve_perms_map) ||
> + ve_is_super(get_exec_env());
> }
>
> static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
> @@ -63,17 +67,22 @@ int kernfs_ve_permission(struct kernfs_node *kn,
> if (kernfs_ve_allowed(kn))
> return 0;
>
> + rcu_read_lock();
> /* Entries with namespace tag and their sub-entries always visible */
> while (tmp_kn) {
> - if (tmp_kn->ns)
> + if (tmp_kn->ns) {
> + rcu_read_unlock();
> return 0;
> + }
> tmp_kn = kernfs_parent(tmp_kn);
> }
>
> if (kernfs_type(kn) == KERNFS_LINK)
> kn = kn->symlink.target_kn;
>
> - perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
> + perm = kmapset_get_value(rcu_dereference(kn->ve_perms_map),
> + kernfs_info_perms_key(info));
> + rcu_read_unlock();
> if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
> return 0;
>
> @@ -90,27 +99,35 @@ void kernfs_get_ve_perms(struct kernfs_node *kn)
>
> kms = kmapset_new(root->ve_perms_set);
> if (kms)
> - kn->ve_perms_map = kmapset_commit(kms);
> + rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(kms));
> }
>
> void kernfs_put_ve_perms(struct kernfs_node *kn)
> {
> - if (kn->ve_perms_map)
> - kmapset_put(kn->ve_perms_map);
> + /* teardown path, no concurrent users of kn */
> + struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
> + true);
> +
> + if (map)
> + kmapset_put(map);
> }
>
> bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
> {
> struct kernfs_node *tmp_kn = kn;
> + bool visible;
>
>
> if (kernfs_ve_allowed(kn))
> return true;
>
> + rcu_read_lock();
> /* Entries with namespace tag and their sub-entries always visible */
> while (tmp_kn) {
> - if (tmp_kn->ns)
> + if (tmp_kn->ns) {
> + rcu_read_unlock();
> return true;
> + }
> tmp_kn = kernfs_parent(tmp_kn);
> }
>
> @@ -118,8 +135,10 @@ bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
> if (kernfs_type(kn) == KERNFS_LINK)
> kn = kn->symlink.target_kn;
>
> - return !!kmapset_get_value(kn->ve_perms_map,
> - kernfs_info_perms_key(info));
> + visible = !!kmapset_get_value(rcu_dereference(kn->ve_perms_map),
> + kernfs_info_perms_key(info));
> + rcu_read_unlock();
> + return visible;
> }
>
> #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
> @@ -146,15 +165,22 @@ static struct kernfs_node *kernfs_next_recursive(struct kernfs_node *kn)
> static bool kernfs_perms_shown(struct ve_struct *ve, struct kernfs_node *kn,
> struct kmapset_key *key)
> {
> + /*
> + * The caller's perms mutex serialises against the map writer. The
> + * lockdep check below names kernfs_rwsem (also held, for the tree
> + * walk) because that mutex is private to the caller.
> + */
> + struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
> + lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
I don't understand why we would put some lock unrelated to ->ve_perms_map update
path here, let's go with "true". And add more concise comment like:
/* Protected against ve_perms_apply() with sysfs_perms_mutex */
Please don't use ambiguous "caller's perms mutex", just name the lock as it is
in current tree. It would've grately decreased the review time.
note: Same applies to other lockdep_is_held places.
> bool ret;
>
> - if (!kn->ve_perms_map)
> + if (!map)
> return false;
> if (ve_is_super(ve))
> - return kn->ve_perms_map->default_value != 0;
> + return map->default_value != 0;
> /* kmapset_lookup() walks an rcu list, so keep the section to the lookup. */
> rcu_read_lock();
> - ret = kmapset_lookup(kn->ve_perms_map, key) != NULL;
> + ret = kmapset_lookup(map, key) != NULL;
> rcu_read_unlock();
> return ret;
> }
> @@ -198,14 +224,22 @@ int kernfs_perms_show(struct seq_file *m, void *v, struct kmapset_key *key)
> {
> struct ve_struct *ve = css_to_ve(seq_css(m));
> struct kernfs_node *kn = v;
> + struct kmapset_map *map;
> char *buf;
> size_t size, len, off;
> int mask;
>
> + /*
> + * The caller's perms mutex serialises against the map writer. The
> + * lockdep check below names kernfs_rwsem (also held, for the tree
> + * walk) because that mutex is private to the caller.
> + */
> + map = rcu_dereference_protected(kn->ve_perms_map,
> + lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
> if (ve_is_super(ve))
> - mask = kn->ve_perms_map->default_value;
> + mask = map->default_value;
> else
> - mask = kmapset_get_value(kn->ve_perms_map, key);
> + mask = kmapset_get_value(map, key);
>
> size = seq_get_buf(m, &buf);
> if (size) {
> @@ -246,7 +280,7 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
> struct kernfs_node *root, struct kmapset_key *key)
> {
> struct kernfs_node *kn = root, *nkn;
> - struct kmapset_map *map = NULL;
> + struct kmapset_map *map = NULL, *old;
> char *sep = path, *dname;
> int ret;
>
> @@ -272,7 +306,8 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
> } while (sep);
>
> ret = -ENOMEM;
> - map = kmapset_dup(kn->ve_perms_map);
> + /* the caller (fs/sysfs/ve.c) holds sysfs_perms_mutex vs other writers */
> + map = kmapset_dup(rcu_dereference_protected(kn->ve_perms_map, true));
> if (!map)
> goto out_put;
>
> @@ -286,8 +321,10 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
> }
>
> if (!ret) {
> - map = kmapset_commit(map);
> - swap(map, kn->ve_perms_map);
> + old = rcu_dereference_protected(kn->ve_perms_map, true);
> + rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(map));
> + kmapset_put(old);
> + map = NULL;
> }
>
> out_put:
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index d0c6f0f17a58..06e750b6a88f 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -232,7 +232,7 @@ struct kernfs_node {
>
> struct rcu_head rcu;
> #ifdef CONFIG_VE
> - struct kmapset_map *ve_perms_map;
> + struct kmapset_map __rcu *ve_perms_map;
> #endif
> };
>
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.
More information about the Devel
mailing list