[Devel] [PATCH vz10 7/7] fs/proc, ve: add per-VE ve.proc_permissions
Mirian Shilakadze
mirian.shilakadze at virtuozzo.com
Sun Jun 28 12:26:05 MSK 2026
Add a per-VE allowlist of /proc paths, configured through the cgroup file
ve.proc_permissions, the procfs counterpart of ve.sysfs_permissions.
Each proc_dir_entry gains a kmapset map keyed by VE (proc_perms_key on
ve_struct), so the single shared proc tree yields per-VE answers. The
filesystem-agnostic leaf logic is reused from fs/ve_perms.c; this commit
adds the proc tree walk, the locking, and the VFS hooks: visibility checks
in proc_lookup_de()/proc_readdir_de() and a .permission inode op.
Paths are written relative to the proc root, as for sysfs, one rule per
line in the form "path mask", where mask is r/w/x, or - to remove the
entry. The host (ve0) is unaffected and an empty list exposes nothing
extra. The lock-free readers load the map under RCU against the writer's
copy-on-write swap, and the seq read is serialised against the writer by
proc_perms_mutex.
Signed-off-by: Mirian Shilakadze <mirian.shilakadze at virtuozzo.com>
---
fs/proc/Makefile | 1 +
fs/proc/generic.c | 48 ++++++-
fs/proc/inode.c | 2 +
fs/proc/internal.h | 25 ++++
fs/proc/root.c | 1 +
fs/proc/ve.c | 345 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/ve.h | 1 +
kernel/ve/ve.c | 7 +
8 files changed, 423 insertions(+), 7 deletions(-)
create mode 100644 fs/proc/ve.c
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7b4db9c56e6a..61a999c03663 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -11,6 +11,7 @@ proc-$(CONFIG_MMU) := task_mmu.o
proc-y += inode.o root.o base.o generic.o array.o \
fd.o
+proc-$(CONFIG_VE) += ve.o
proc-$(CONFIG_TTY) += proc_tty.o
proc-y += cmdline.o
proc-y += consoles.o
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e8fd7c2d1c3a..791c38c49a86 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -30,7 +30,7 @@
#include "internal.h"
-static DEFINE_RWLOCK(proc_subdir_lock);
+DEFINE_RWLOCK(proc_subdir_lock);
struct kmem_cache *proc_dir_entry_cache __ro_after_init;
@@ -65,9 +65,9 @@ static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
subdir_node);
}
-static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
- const char *name,
- unsigned int len)
+struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+ const char *name,
+ unsigned int len)
{
struct rb_node *node = dir->subdir.rb_node;
@@ -120,6 +120,31 @@ static bool proc_in_container(struct super_block *sb)
return !ve_is_super(get_exec_env());
}
+/*
+ * An entry is shown inside a VE when it is published to every container
+ * (S_ISVTX set in its mode) or when proc_d_visible() allows it for the
+ * current VE.
+ */
+static bool pde_visible_to_ve(struct proc_dir_entry *de)
+{
+	if (de->mode & S_ISVTX)
+		return true;
+
+	return proc_d_visible(de);
+}
+
+#ifdef CONFIG_VE
+/*
+ * .permission inode op for proc files and directories.
+ *
+ * Inside a container, entries that are not globally published (S_ISVTX)
+ * must first pass the per-VE check; a failure there is returned as is.
+ * Everything else, and every entry that passed, goes through
+ * generic_permission().
+ */
+static int proc_iop_permission(struct mnt_idmap *idmap, struct inode *inode,
+			       int mask)
+{
+	struct proc_dir_entry *de = PDE(inode);
+	int err;
+
+	if (!proc_in_container(inode->i_sb) || (de->mode & S_ISVTX))
+		return generic_permission(idmap, inode, mask);
+
+	/*
+	 * No MAY_NOT_BLOCK special-casing: proc_ve_permission() is a lockless
+	 * rcu kmapset lookup and generic_permission() handles rcu-walk itself,
+	 * so this is safe to run during rcu-walk.
+	 */
+	err = proc_ve_permission(de, mask);
+	if (err)
+		return err;
+
+	return generic_permission(idmap, inode, mask);
+}
+#endif
+
static int proc_notify_change(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *iattr)
{
@@ -167,6 +192,9 @@ static int proc_getattr(struct mnt_idmap *idmap,
static const struct inode_operations proc_file_inode_operations = {
.setattr = proc_notify_change,
+#ifdef CONFIG_VE
+ .permission = proc_iop_permission,
+#endif
};
/*
@@ -264,7 +292,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
read_lock(&proc_subdir_lock);
de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
if (de) {
- if (in_container && !(de->mode & S_ISVTX)) {
+ if (in_container && !pde_visible_to_ve(de)) {
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
}
@@ -317,7 +345,7 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
read_unlock(&proc_subdir_lock);
return 0;
}
- if (!in_container || (de->mode & S_ISVTX)) {
+ if (!in_container || pde_visible_to_ve(de)) {
if (!i)
break;
i--;
@@ -328,7 +356,7 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
do {
struct proc_dir_entry *next;
- if (in_container && !(de->mode & S_ISVTX)) {
+ if (in_container && !pde_visible_to_ve(de)) {
de = pde_subdir_next(de);
continue;
}
@@ -389,6 +417,9 @@ static const struct inode_operations proc_dir_inode_operations = {
.lookup = proc_lookup,
.getattr = proc_getattr,
.setattr = proc_notify_change,
+#ifdef CONFIG_VE
+ .permission = proc_iop_permission,
+#endif
};
/* returns the registered entry, or frees dp and returns NULL on failure */
@@ -413,6 +444,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
out_free_inum:
proc_free_inum(dp->low_ino);
out_free_entry:
+ proc_put_ve_perms(dp);
pde_free(dp);
return NULL;
}
@@ -471,6 +503,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
ent->nlink = nlink;
ent->subdir = RB_ROOT;
refcount_set(&ent->refcnt, 1);
+ proc_get_ve_perms(ent);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
@@ -498,6 +531,7 @@ struct proc_dir_entry *proc_symlink_mode(const char *name, umode_t mode,
ent->proc_iops = &proc_link_inode_operations;
ent = proc_register(parent, ent);
} else {
+ proc_put_ve_perms(ent);
pde_free(ent);
ent = NULL;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 5d1a75408aa4..ac943a9768f4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -272,6 +272,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
spin_lock(&de->pde_unload_lock);
}
spin_unlock(&de->pde_unload_lock);
+
+ proc_put_ve_perms(de);
}
static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 77a517f91821..4a28da7d5dee 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -17,6 +17,7 @@
struct ctl_table_header;
struct mempolicy;
+struct kmapset_map;
/*
* This is not completely implemented yet. The idea is to
@@ -64,6 +65,9 @@ struct proc_dir_entry {
umode_t mode;
u8 flags;
u8 namelen;
+#ifdef CONFIG_VE
+ struct kmapset_map __rcu *ve_perms_map; /* per-VE r/w/x mask for this node */
+#endif
char inline_name[];
} __randomize_layout;
@@ -102,6 +106,27 @@ static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
+extern rwlock_t proc_subdir_lock;
+struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+ const char *name, unsigned int len);
+
+#ifdef CONFIG_VE
+/* Per-VE /proc permission hooks, implemented in fs/proc/ve.c. */
+void proc_init_ve_perms(void);
+void proc_get_ve_perms(struct proc_dir_entry *de);
+void proc_put_ve_perms(struct proc_dir_entry *de);
+bool proc_d_visible(struct proc_dir_entry *de);
+int proc_ve_permission(struct proc_dir_entry *de, int mask);
+#else
+/*
+ * !CONFIG_VE stubs: no maps are created or dropped, no entry is reported
+ * as per-VE visible, and the per-VE permission check always passes.
+ */
+static inline void proc_init_ve_perms(void) { }
+static inline void proc_get_ve_perms(struct proc_dir_entry *de) { }
+static inline void proc_put_ve_perms(struct proc_dir_entry *de) { }
+static inline bool proc_d_visible(struct proc_dir_entry *de) { return false; }
+static inline int proc_ve_permission(struct proc_dir_entry *de, int mask)
+{
+ return 0;
+}
+#endif
+
union proc_op {
int (*proc_get_link)(struct dentry *, struct path *);
int (*proc_show)(struct seq_file *m,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 3f61de56ffff..9e0c5bf87602 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -297,6 +297,7 @@ static struct file_system_type proc_fs_type = {
void __init proc_root_init(void)
{
proc_init_kmemcache();
+ proc_init_ve_perms();
set_proc_pid_nlink();
proc_self_init();
proc_thread_self_init();
diff --git a/fs/proc/ve.c b/fs/proc/ve.c
new file mode 100644
index 000000000000..10106c2a4e53
--- /dev/null
+++ b/fs/proc/ve.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-VE /proc permissions (ve.proc_permissions), the procfs counterpart of
+ * the sysfs ve.sysfs_permissions mechanism. Each proc_dir_entry carries a
+ * kmapset map keyed by VE, so the single shared proc tree gives per-VE
+ * answers. The filesystem agnostic leaf logic lives in fs/ve_perms.c, this
+ * file owns the proc tree walk and the locking.
+ *
+ * Copyright (c) 2026 Virtuozzo International GmbH. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/cgroup.h>
+#include <linux/ve.h>
+#include <linux/kmapset.h>
+#include <linux/ve-perms.h>
+#include <linux/proc_fs.h>
+
+#include "internal.h"
+
+/* Set holding every per-entry permission map; VE keys are unlinked from it. */
+struct kmapset_set proc_ve_perms_set;
+
+/*
+ * Written once by proc_init_ve_perms() during boot and only read afterwards,
+ * hence __ro_after_init. While it is false proc_get_ve_perms() attaches no
+ * map to new entries.
+ */
+static bool proc_ve_perms_inited __ro_after_init;
+
+/*
+ * Taken by the cgroup file write handler, held from seq start to seq stop
+ * on read, and taken by proc_put_ve_perms() while it clears an entry's map
+ * pointer.
+ */
+static DEFINE_MUTEX(proc_perms_mutex);
+
+/*
+ * Boot-time setup: initialise the kmapset set, give proc_root its own
+ * committed map, then enable map creation for entries created later.
+ * When the root map cannot be allocated proc_root is left without one.
+ */
+void __init proc_init_ve_perms(void)
+{
+	struct kmapset_map *root_map;
+
+	kmapset_init_set(&proc_ve_perms_set);
+
+	root_map = kmapset_new(&proc_ve_perms_set);
+	if (root_map) {
+		root_map = kmapset_commit(root_map);
+		RCU_INIT_POINTER(proc_root.ve_perms_map, root_map);
+	}
+
+	proc_ve_perms_inited = true;
+}
+
+/*
+ * Attach a fresh, committed permission map to @de.
+ *
+ * Does nothing before proc_init_ve_perms() has run. On allocation failure
+ * the entry keeps a NULL map and a one-time warning is logged.
+ */
+void proc_get_ve_perms(struct proc_dir_entry *de)
+{
+	struct kmapset_map *fresh;
+
+	if (!proc_ve_perms_inited)
+		return;
+
+	fresh = kmapset_new(&proc_ve_perms_set);
+	if (!fresh) {
+		pr_warn_once("proc: no ve_perms_map for %s, hidden from containers\n",
+			     de->name);
+		return;
+	}
+
+	rcu_assign_pointer(de->ve_perms_map, kmapset_commit(fresh));
+}
+
+/*
+ * Drop the node's permission map. kmapset_put() can sleep (it takes the
+ * kmapset set mutex on the last reference), so every caller must be in process
+ * context. Registered entries drop the map from proc_entry_rundown() when they
+ * are removed. Entries that never reach the tree drop it on their creation
+ * error path (proc_register(), proc_symlink_mode()). pde_free() therefore
+ * never touches the map and stays safe to run from the .free_inode RCU
+ * callback, which is atomic.
+ *
+ * The pointer is cleared under proc_perms_mutex so that this cannot interleave
+ * with a proc_perms_set() on a registered entry. The unlocked NULL test is only
+ * a fast path; the value actually dropped is the one exchanged under the mutex
+ * and it is checked again, so two racing callers cannot both put the same map
+ * or hand NULL to kmapset_put().
+ */
+void proc_put_ve_perms(struct proc_dir_entry *de)
+{
+	struct kmapset_map *map;
+
+	/* Atomic-safe fast path: already dropped at rundown, or never set. */
+	if (!rcu_access_pointer(de->ve_perms_map))
+		return;
+
+	mutex_lock(&proc_perms_mutex);
+	map = rcu_replace_pointer(de->ve_perms_map, NULL,
+				  lockdep_is_held(&proc_perms_mutex));
+	mutex_unlock(&proc_perms_mutex);
+
+	/* Somebody else may have cleared it between the fast path and the lock. */
+	if (map)
+		kmapset_put(map);
+}
+
+/*
+ * Tell whether @de is visible to the VE of the current task.
+ *
+ * The host VE sees everything. For any other VE the answer comes from
+ * ve_perms_visible() on the entry's map; an entry without a map is hidden.
+ */
+bool proc_d_visible(struct proc_dir_entry *de)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct kmapset_map *map;
+	bool visible = false;
+
+	if (ve_is_super(ve))
+		return true;
+
+	/*
+	 * proc_perms_set() may swap the map pointer at any time and the old
+	 * map is freed through kfree_rcu(), so the load and the lookup have
+	 * to sit inside one rcu read-side section.
+	 */
+	rcu_read_lock();
+	map = rcu_dereference(de->ve_perms_map);
+	if (map)
+		visible = ve_perms_visible(map, &ve->proc_perms_key);
+	rcu_read_unlock();
+
+	return visible;
+}
+
+/*
+ * Per-VE access check for @de against the requested @mask.
+ *
+ * Returns 0 for the host VE, -EACCES when the entry has no map, and the
+ * ve_perms_check() result otherwise. The map is only touched inside an rcu
+ * read-side section.
+ */
+int proc_ve_permission(struct proc_dir_entry *de, int mask)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct kmapset_map *map;
+	int ret = -EACCES;
+
+	if (ve_is_super(ve))
+		return 0;
+
+	rcu_read_lock();
+	map = rcu_dereference(de->ve_perms_map);
+	if (map)
+		ret = ve_perms_check(map, &ve->proc_perms_key, mask);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Apply @mask for @ve to the proc entry named by @path.
+ *
+ * @path is relative to the proc root and is split in place at every '/'.
+ * One trailing '/' is accepted, which matches how directories are printed
+ * when the file is read back. Reached from proc_perms_write() through
+ * proc_perms_line(), i.e. with proc_perms_mutex held.
+ *
+ * Returns the ve_perms_apply() result, or:
+ *   -EINVAL  empty path, leading '/', or an empty component inside the path
+ *   -ENOENT  a path component does not exist
+ *   -EPERM   the entry carries no permission map
+ */
+static int proc_perms_set(char *path, struct ve_struct *ve, int mask)
+{
+	struct proc_dir_entry *de, *nde;
+	char *sep = path, *dname;
+	int ret = 0;
+
+	read_lock(&proc_subdir_lock);
+	de = &proc_root;
+	pde_get(de);
+	do {
+		dname = sep;
+
+		sep = strchr(sep, '/');
+		if (sep)
+			*sep++ = 0;
+
+		if (!*dname) {
+			/*
+			 * An empty component is fine only at the very end of
+			 * the path ("dir/"). If another '/' followed it, as in
+			 * "a//b", stopping here would silently apply the mask
+			 * to "a" instead of the node the caller named.
+			 */
+			if (sep)
+				ret = -EINVAL;
+			break;
+		}
+
+		nde = pde_subdir_find(de, dname, strlen(dname));
+		if (!nde) {
+			ret = -ENOENT;
+			break;
+		}
+		/* Pin the child before dropping the reference on its parent. */
+		pde_get(nde);
+		pde_put(de);
+		de = nde;
+	} while (sep);
+	read_unlock(&proc_subdir_lock);
+
+	if (ret)
+		goto out;
+
+	/* an empty path walks to nothing, reject it */
+	if (de == &proc_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!rcu_access_pointer(de->ve_perms_map)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	ret = ve_perms_apply(&de->ve_perms_map, &ve->proc_perms_key,
+			     ve_is_super(ve), mask);
+out:
+	pde_put(de);
+	return ret;
+}
+
+/*
+ * Handle one rule line: let ve_perms_parse() extract the mask, then apply it
+ * with proc_perms_set(), which receives @line as the path.
+ * NOTE(review): presumably ve_perms_parse() cuts the mask off @line in place
+ * so that only the path remains - confirm against fs/ve_perms.c.
+ */
+static int proc_perms_line(struct ve_struct *ve, char *line)
+{
+	int mask;
+	int err = ve_perms_parse(line, &mask);
+
+	return err ? err : proc_perms_set(line, ve, mask);
+}
+
+/*
+ * Next entry after @de in a parent-before-children walk of the proc tree:
+ * the first child if there is one, otherwise the next sibling of @de or of
+ * the closest ancestor that has one. Returns NULL once the walk climbs back
+ * to the entry that is its own parent without finding anything.
+ */
+static struct proc_dir_entry *proc_next_recursive(struct proc_dir_entry *de)
+{
+	struct rb_node *next = rb_first(&de->subdir);
+
+	/* No child: try the sibling at each level while climbing up. */
+	while (!next && de->parent != de) {
+		next = rb_next(&de->subdir_node);
+		de = de->parent;
+	}
+
+	if (!next)
+		return NULL;
+	return rb_entry(next, struct proc_dir_entry, subdir_node);
+}
+
+/*
+ * Tell whether @de should be listed when the permission file of @ve is read.
+ * Entries without a map are never listed; for the rest ve_perms_shown()
+ * decides.
+ *
+ * The map pointer is loaded exactly once, inside the rcu read-side section,
+ * so the NULL test covers the very pointer that is handed to
+ * ve_perms_shown() (same pattern as proc_d_visible()).
+ */
+static bool proc_perms_shown(struct ve_struct *ve, struct proc_dir_entry *de)
+{
+	struct kmapset_map *map;
+	bool shown = false;
+
+	/* ve_perms_shown calls kmapset_lookup, an rcu list walk, guard it. */
+	rcu_read_lock();
+	map = rcu_dereference(de->ve_perms_map);
+	if (map)
+		shown = ve_perms_shown(map, &ve->proc_perms_key,
+				       ve_is_super(ve));
+	rcu_read_unlock();
+
+	return shown;
+}
+
+/*
+ * seq_file start: take proc_perms_mutex, then proc_subdir_lock for reading,
+ * and walk the tree from proc_root to the (*ppos)-th entry that is listed
+ * for this VE. Both locks stay held until proc_perms_stop().
+ *
+ * Returns that entry, or NULL when fewer entries are listed.
+ */
+static void *proc_perms_start(struct seq_file *m, loff_t *ppos)
+	__acquires(&proc_subdir_lock)
+{
+	struct ve_struct *ve = css_to_ve(seq_css(m));
+	struct proc_dir_entry *de = &proc_root;
+	loff_t skip = *ppos;
+
+	mutex_lock(&proc_perms_mutex);
+	read_lock(&proc_subdir_lock);
+
+	while (de) {
+		/* Only listed entries count towards the position. */
+		if (proc_perms_shown(ve, de)) {
+			if (!skip)
+				break;
+			skip--;
+		}
+		de = proc_next_recursive(de);
+	}
+	return de;
+}
+
+/*
+ * seq_file next: bump the position and return the first entry after @v, in
+ * tree-walk order, that is listed for this VE, or NULL at the end.
+ */
+static void *proc_perms_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct ve_struct *ve = css_to_ve(seq_css(m));
+	struct proc_dir_entry *cur;
+
+	++*ppos;
+	for (cur = proc_next_recursive(v); cur; cur = proc_next_recursive(cur)) {
+		if (proc_perms_shown(ve, cur))
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * seq_file stop: release what proc_perms_start() took, in reverse order -
+ * first the proc_subdir_lock read side, then proc_perms_mutex.
+ */
+static void proc_perms_stop(struct seq_file *m, void *v)
+ __releases(&proc_subdir_lock)
+{
+ read_unlock(&proc_subdir_lock);
+ mutex_unlock(&proc_perms_mutex);
+}
+
+/*
+ * seq_file show: print one entry as its path relative to the proc root
+ * followed by whatever ve_perms_emit() writes for the mask.
+ *
+ * Runs between proc_perms_start() and proc_perms_stop(), so proc_perms_mutex
+ * and the proc_subdir_lock read side are held. @v is an entry that
+ * proc_perms_shown() accepted, which implies it has a map.
+ *
+ * Always returns 0; a path that does not fit is reported through
+ * seq_commit(m, -1) instead.
+ */
+static int proc_perms_show(struct seq_file *m, void *v)
+{
+ struct ve_struct *ve = css_to_ve(seq_css(m));
+ struct proc_dir_entry *de = v;
+ struct kmapset_map *map;
+ char *buf;
+ size_t size, len, off;
+ int mask;
+
+ /* Stable without rcu: the mutex named in the lockdep condition is held. */
+ map = rcu_dereference_protected(de->ve_perms_map,
+ lockdep_is_held(&proc_perms_mutex));
+ /* The host VE reads the map's default value, any other VE its own key. */
+ if (ve_is_super(ve))
+ mask = map->default_value;
+ else
+ mask = kmapset_get_value(map, &ve->proc_perms_key);
+
+ size = seq_get_buf(m, &buf);
+ if (size) {
+ /*
+ * Build the path right to left at the tail of the free buffer
+ * space: start with the entry itself and prepend one ancestor per
+ * iteration until proc_root is reached.
+ */
+ off = size;
+ do {
+ len = strlen(de->name);
+ /* Not enough room left: flag it via seq_commit(-1) and bail out. */
+ if (len >= off) {
+ seq_commit(m, -1);
+ return 0;
+ }
+ /* Every directory name is followed by a '/'. */
+ if (S_ISDIR(de->mode))
+ buf[--off] = '/';
+ off -= len;
+ memcpy(buf + off, de->name, len);
+ de = de->parent;
+ } while (de && de != &proc_root);
+ /* Slide the finished path to the front of the buffer and commit it. */
+ memmove(buf, buf + off, size - off);
+ seq_commit(m, size - off);
+ }
+
+ ve_perms_emit(m, mask);
+ return 0;
+}
+
+/*
+ * Write handler of the permission files.
+ *
+ * @buf is cut into lines in place. Leading whitespace is skipped, lines that
+ * start with '#' are ignored, every other line goes to proc_perms_line().
+ * Processing stops at the first line that fails and its error is returned;
+ * nothing is rolled back here for the lines handled before it. The whole
+ * loop runs under proc_perms_mutex.
+ *
+ * Returns @nbytes on success. A buffer that holds no rule line at all
+ * (empty, whitespace or comments only) yields -EINVAL, because the initial
+ * error value is never replaced.
+ */
+static ssize_t proc_perms_write(struct kernfs_open_file *of, char *buf,
+				size_t nbytes, loff_t off)
+{
+	struct ve_struct *ve = css_to_ve(of_css(of));
+	char *cursor = buf;
+	char *line;
+	int ret = -EINVAL;
+
+	mutex_lock(&proc_perms_mutex);
+	while (cursor) {
+		line = skip_spaces(cursor);
+		if (!*line)
+			break;
+
+		/* Terminate this line and remember where the next one starts. */
+		cursor = strchr(line, '\n');
+		if (cursor)
+			*cursor++ = '\0';
+
+		if (*line == '#')
+			continue;
+
+		ret = proc_perms_line(ve, line);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&proc_perms_mutex);
+
+	return ret ? ret : nbytes;
+}
+
+/*
+ * Control files added to the ve cgroup controller. Both entries use the same
+ * seq and write handlers and differ only in name and placement flag:
+ * "default_proc_permissions" carries CFTYPE_ONLY_ON_ROOT, "proc_permissions"
+ * carries CFTYPE_NOT_ON_ROOT.
+ * NOTE(review): presumably the root cgroup maps to the host VE, for which the
+ * handlers operate on each map's default value - confirm.
+ */
+static struct cftype proc_ve_cftypes[] = {
+ {
+ .name = "default_proc_permissions",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_start = proc_perms_start,
+ .seq_next = proc_perms_next,
+ .seq_stop = proc_perms_stop,
+ .seq_show = proc_perms_show,
+ .write = proc_perms_write,
+ },
+ {
+ .name = "proc_permissions",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_start = proc_perms_start,
+ .seq_next = proc_perms_next,
+ .seq_stop = proc_perms_stop,
+ .seq_show = proc_perms_show,
+ .write = proc_perms_write,
+ },
+ { },
+};
+
+/*
+ * Register the proc permission control files with the ve cgroup controller.
+ * Only ever run as an initcall, so it is placed in the init section.
+ */
+static int __init init_proc_ve_perms(void)
+{
+	return cgroup_add_cftypes(&ve_cgrp_subsys, proc_ve_cftypes);
+}
+module_init(init_proc_ve_perms);
diff --git a/include/linux/ve.h b/include/linux/ve.h
index b037f60225bb..cba827260d07 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -69,6 +69,7 @@ struct ve_struct {
int fsync_enable;
struct kmapset_key sysfs_perms_key;
+ struct kmapset_key proc_perms_key;
atomic_t netns_avail_nr;
int netns_max_nr;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e58ffb22da87..d8ef28eedabd 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -44,6 +44,9 @@
#include "../sched/sched.h" /* For css_tg() */
extern struct kmapset_set sysfs_ve_perms_set;
+#ifdef CONFIG_PROC_FS
+extern struct kmapset_set proc_ve_perms_set;
+#endif
static struct kmem_cache *ve_cachep;
@@ -771,6 +774,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
init_rwsem(&ve->op_sem);
INIT_LIST_HEAD(&ve->ve_list);
kmapset_init_key(&ve->sysfs_perms_key);
+ kmapset_init_key(&ve->proc_perms_key);
atomic_set(&ve->arp_neigh_nr, 0);
atomic_set(&ve->nd_neigh_nr, 0);
@@ -866,6 +870,9 @@ static void ve_destroy(struct cgroup_subsys_state *css)
free_ve_devmnts(ve);
kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
+#ifdef CONFIG_PROC_FS
+ kmapset_unlink(&ve->proc_perms_key, &proc_ve_perms_set);
+#endif
ve_log_destroy(ve);
ve_free_vdso(ve);
mntput(ve->devtmpfs_mnt);
--
2.43.0
More information about the Devel
mailing list