[Devel] [PATCH vz10 5/7] fs/kernfs, ve: fix ve_perms_map use-after-free, annotate it __rcu
Mirian Shilakadze
mirian.shilakadze at virtuozzo.com
Sun Jun 28 12:26:03 MSK 2026
The per-VE sysfs permission readers (kernfs_ve_permission,
kernfs_d_visible) read kn->ve_perms_map with a plain load and pass it to
kmapset_get_value(), whose rcu_read_lock is internal, so the pointer is
loaded outside any RCU section. A concurrent ve.sysfs_permissions write
(kernfs_perms_set) swaps the pointer and drops the old map, which kmapset
frees via kfree_rcu. On a preemptible kernel the reader can be scheduled
out between the load and the internal rcu_read_lock, the grace period
completes, the old map is freed, and the reader dereferences freed memory.
Mark kn->ve_perms_map __rcu and route every access through the proper
accessor: rcu_dereference under rcu_read_lock in the two readers,
rcu_assign_pointer on the publish in kernfs_perms_set (replacing the plain
swap), rcu_dereference_protected for the writer (serialised by
sysfs_perms_mutex in the caller) and for the seq paths (held under
kernfs_rwsem), rcu_access_pointer for the NULL check, and RCU_INIT_POINTER
for the root setup. The annotation lets sparse and lockdep catch future
misuse.
Take the readers' rcu_read_lock() before the preceding kernfs_parent()
namespace walk too. kernfs_parent() dereferences the rcu-protected parent
pointer, so that walk must run inside the same rcu read-side section.
https://virtuozzo.atlassian.net/browse/VSTOR-136480
Fixes: cd9dd4ead86a ("ve/kernfs: implement ve-based permissions")
Signed-off-by: Mirian Shilakadze <mirian.shilakadze at virtuozzo.com>
---
fs/kernfs/ve.c | 79 +++++++++++++++++++++++++++++++-----------
include/linux/kernfs.h | 2 +-
2 files changed, 59 insertions(+), 22 deletions(-)
diff --git a/fs/kernfs/ve.c b/fs/kernfs/ve.c
index 66bb4b534f48..94f3533bbdd9 100644
--- a/fs/kernfs/ve.c
+++ b/fs/kernfs/ve.c
@@ -16,6 +16,7 @@
#include <linux/ve.h>
#include <linux/kmapset.h>
+#include <linux/rcupdate.h>
#include <linux/kernfs-ve.h>
#include "kernfs-internal.h"
@@ -33,12 +34,14 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
struct kmapset_set *perms_set)
{
struct kernfs_node *kn = root->kn;
+ struct kmapset_map *map;
kmapset_init_set(perms_set);
- kn->ve_perms_map = kmapset_new(perms_set);
- if (!kn->ve_perms_map)
+ map = kmapset_new(perms_set);
+ if (!map)
return -ENOMEM;
- kmapset_commit(kn->ve_perms_map);
+ kmapset_commit(map);
+ RCU_INIT_POINTER(kn->ve_perms_map, map);
root->ve_perms_set = perms_set;
return 0;
@@ -46,7 +49,8 @@ int kernfs_init_ve_perms(struct kernfs_root *root,
int kernfs_ve_allowed(struct kernfs_node *kn)
{
- return !kn->ve_perms_map || ve_is_super(get_exec_env());
+ return !rcu_access_pointer(kn->ve_perms_map) ||
+ ve_is_super(get_exec_env());
}
static struct kmapset_key *kernfs_info_perms_key(struct kernfs_super_info *info)
@@ -63,17 +67,22 @@ int kernfs_ve_permission(struct kernfs_node *kn,
if (kernfs_ve_allowed(kn))
return 0;
+ rcu_read_lock();
/* Entries with namespace tag and their sub-entries always visible */
while (tmp_kn) {
- if (tmp_kn->ns)
+ if (tmp_kn->ns) {
+ rcu_read_unlock();
return 0;
+ }
tmp_kn = kernfs_parent(tmp_kn);
}
if (kernfs_type(kn) == KERNFS_LINK)
kn = kn->symlink.target_kn;
- perm = kmapset_get_value(kn->ve_perms_map, kernfs_info_perms_key(info));
+ perm = kmapset_get_value(rcu_dereference(kn->ve_perms_map),
+ kernfs_info_perms_key(info));
+ rcu_read_unlock();
if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
return 0;
@@ -90,27 +99,35 @@ void kernfs_get_ve_perms(struct kernfs_node *kn)
kms = kmapset_new(root->ve_perms_set);
if (kms)
- kn->ve_perms_map = kmapset_commit(kms);
+ rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(kms));
}
void kernfs_put_ve_perms(struct kernfs_node *kn)
{
- if (kn->ve_perms_map)
- kmapset_put(kn->ve_perms_map);
+ /* teardown path, no concurrent users of kn */
+ struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
+ true);
+
+ if (map)
+ kmapset_put(map);
}
bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
{
struct kernfs_node *tmp_kn = kn;
+ bool visible;
if (kernfs_ve_allowed(kn))
return true;
+ rcu_read_lock();
/* Entries with namespace tag and their sub-entries always visible */
while (tmp_kn) {
- if (tmp_kn->ns)
+ if (tmp_kn->ns) {
+ rcu_read_unlock();
return true;
+ }
tmp_kn = kernfs_parent(tmp_kn);
}
@@ -118,8 +135,10 @@ bool kernfs_d_visible(struct kernfs_node *kn, struct kernfs_super_info *info)
if (kernfs_type(kn) == KERNFS_LINK)
kn = kn->symlink.target_kn;
- return !!kmapset_get_value(kn->ve_perms_map,
- kernfs_info_perms_key(info));
+ visible = !!kmapset_get_value(rcu_dereference(kn->ve_perms_map),
+ kernfs_info_perms_key(info));
+ rcu_read_unlock();
+ return visible;
}
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -146,15 +165,22 @@ static struct kernfs_node *kernfs_next_recursive(struct kernfs_node *kn)
static bool kernfs_perms_shown(struct ve_struct *ve, struct kernfs_node *kn,
struct kmapset_key *key)
{
+ /*
+ * The caller's perms mutex serialises against the map writer. The
+ * lockdep check below names kernfs_rwsem (also held, for the tree
+ * walk) because that mutex is private to the caller.
+ */
+ struct kmapset_map *map = rcu_dereference_protected(kn->ve_perms_map,
+ lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
bool ret;
- if (!kn->ve_perms_map)
+ if (!map)
return false;
if (ve_is_super(ve))
- return kn->ve_perms_map->default_value != 0;
+ return map->default_value != 0;
/* kmapset_lookup() walks an rcu list, so keep the section to the lookup. */
rcu_read_lock();
- ret = kmapset_lookup(kn->ve_perms_map, key) != NULL;
+ ret = kmapset_lookup(map, key) != NULL;
rcu_read_unlock();
return ret;
}
@@ -198,14 +224,22 @@ int kernfs_perms_show(struct seq_file *m, void *v, struct kmapset_key *key)
{
struct ve_struct *ve = css_to_ve(seq_css(m));
struct kernfs_node *kn = v;
+ struct kmapset_map *map;
char *buf;
size_t size, len, off;
int mask;
+ /*
+ * The caller's perms mutex serialises against the map writer. The
+ * lockdep check below names kernfs_rwsem (also held, for the tree
+ * walk) because that mutex is private to the caller.
+ */
+ map = rcu_dereference_protected(kn->ve_perms_map,
+ lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem));
if (ve_is_super(ve))
- mask = kn->ve_perms_map->default_value;
+ mask = map->default_value;
else
- mask = kmapset_get_value(kn->ve_perms_map, key);
+ mask = kmapset_get_value(map, key);
size = seq_get_buf(m, &buf);
if (size) {
@@ -246,7 +280,7 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
struct kernfs_node *root, struct kmapset_key *key)
{
struct kernfs_node *kn = root, *nkn;
- struct kmapset_map *map = NULL;
+ struct kmapset_map *map = NULL, *old;
char *sep = path, *dname;
int ret;
@@ -272,7 +306,8 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
} while (sep);
ret = -ENOMEM;
- map = kmapset_dup(kn->ve_perms_map);
+ /* the caller (fs/sysfs/ve.c) holds sysfs_perms_mutex vs other writers */
+ map = kmapset_dup(rcu_dereference_protected(kn->ve_perms_map, true));
if (!map)
goto out_put;
@@ -286,8 +321,10 @@ int kernfs_perms_set(char *path, struct ve_struct *ve, int mask,
}
if (!ret) {
- map = kmapset_commit(map);
- swap(map, kn->ve_perms_map);
+ old = rcu_dereference_protected(kn->ve_perms_map, true);
+ rcu_assign_pointer(kn->ve_perms_map, kmapset_commit(map));
+ kmapset_put(old);
+ map = NULL;
}
out_put:
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index d0c6f0f17a58..06e750b6a88f 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -232,7 +232,7 @@ struct kernfs_node {
struct rcu_head rcu;
#ifdef CONFIG_VE
- struct kmapset_map *ve_perms_map;
+ struct kmapset_map __rcu *ve_perms_map;
#endif
};
--
2.43.0
More information about the Devel
mailing list