[Devel] [PATCH vz10 7/7] fs/proc, ve: add per-VE ve.proc_permissions

Mirian Shilakadze mirian.shilakadze at virtuozzo.com
Sun Jun 28 12:26:05 MSK 2026


Add a per-VE allowlist of /proc paths, managed through the cgroup file
ve.proc_permissions, the procfs counterpart of ve.sysfs_permissions.
Each proc_dir_entry gains a kmapset map keyed by VE (proc_perms_key on
ve_struct), so the single shared proc tree yields per-VE answers. The
filesystem-agnostic leaf logic is reused from fs/ve_perms.c; this commit
adds the proc tree walk, the locking, and the VFS hooks: visibility checks
in proc_lookup_de()/proc_readdir_de() and a .permission inode op.

Paths are written relative to the proc root, as with sysfs. Each line has
the form "path mask", where mask is r/w/x, or "-" to remove an entry. The
host (ve0) is unaffected, and an empty list exposes nothing extra. The
lock-free readers load the map under RCU, which protects them against the
writer's copy-on-write swap; the seq read is serialised against the writer
by proc_perms_mutex.

Signed-off-by: Mirian Shilakadze <mirian.shilakadze at virtuozzo.com>
---
 fs/proc/Makefile   |   1 +
 fs/proc/generic.c  |  48 ++++++-
 fs/proc/inode.c    |   2 +
 fs/proc/internal.h |  25 ++++
 fs/proc/root.c     |   1 +
 fs/proc/ve.c       | 345 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/ve.h |   1 +
 kernel/ve/ve.c     |   7 +
 8 files changed, 423 insertions(+), 7 deletions(-)
 create mode 100644 fs/proc/ve.c

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7b4db9c56e6a..61a999c03663 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -11,6 +11,7 @@ proc-$(CONFIG_MMU)	:= task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
 		fd.o
+proc-$(CONFIG_VE)	+= ve.o
 proc-$(CONFIG_TTY)      += proc_tty.o
 proc-y	+= cmdline.o
 proc-y	+= consoles.o
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e8fd7c2d1c3a..791c38c49a86 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -30,7 +30,7 @@
 
 #include "internal.h"
 
-static DEFINE_RWLOCK(proc_subdir_lock);
+DEFINE_RWLOCK(proc_subdir_lock);
 
 struct kmem_cache *proc_dir_entry_cache __ro_after_init;
 
@@ -65,9 +65,9 @@ static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir)
 			     subdir_node);
 }
 
-static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
-					      const char *name,
-					      unsigned int len)
+struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+				       const char *name,
+				       unsigned int len)
 {
 	struct rb_node *node = dir->subdir.rb_node;
 
@@ -120,6 +120,31 @@ static bool proc_in_container(struct super_block *sb)
 	return !ve_is_super(get_exec_env());
 }
 
+/* Visible to the current VE: globally published (S_ISVTX) or per-VE allowed. */
+static bool pde_visible_to_ve(struct proc_dir_entry *de)
+{
+	return (de->mode & S_ISVTX) || proc_d_visible(de);
+}
+
+#ifdef CONFIG_VE
+static int proc_iop_permission(struct mnt_idmap *idmap, struct inode *inode,
+			       int mask)
+{
+	struct proc_dir_entry *de = PDE(inode);
+	int ret = 0;
+
+	/*
+	 * Runs safely during rcu-walk: proc_ve_permission() is a lockless rcu
+	 * kmapset lookup and generic_permission() copes with rcu-walk on its
+	 * own, so MAY_NOT_BLOCK needs no special handling here.
+	 */
+	if (proc_in_container(inode->i_sb) && !(de->mode & S_ISVTX))
+		ret = proc_ve_permission(de, mask);
+
+	return ret ? ret : generic_permission(idmap, inode, mask);
+}
+#endif
+
 static int proc_notify_change(struct mnt_idmap *idmap,
 			      struct dentry *dentry, struct iattr *iattr)
 {
@@ -167,6 +192,9 @@ static int proc_getattr(struct mnt_idmap *idmap,
 
 static const struct inode_operations proc_file_inode_operations = {
 	.setattr	= proc_notify_change,
+#ifdef CONFIG_VE
+	.permission	= proc_iop_permission,
+#endif
 };
 
 /*
@@ -264,7 +292,7 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
 	read_lock(&proc_subdir_lock);
 	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
 	if (de) {
-		if (in_container && !(de->mode & S_ISVTX)) {
+		if (in_container && !pde_visible_to_ve(de)) {
 			read_unlock(&proc_subdir_lock);
 			return ERR_PTR(-ENOENT);
 		}
@@ -317,7 +345,7 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
 			read_unlock(&proc_subdir_lock);
 			return 0;
 		}
-		if (!in_container || (de->mode & S_ISVTX)) {
+		if (!in_container || pde_visible_to_ve(de)) {
 			if (!i)
 				break;
 			i--;
@@ -328,7 +356,7 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
 	do {
 		struct proc_dir_entry *next;
 
-		if (in_container && !(de->mode & S_ISVTX)) {
+		if (in_container && !pde_visible_to_ve(de)) {
 			de = pde_subdir_next(de);
 			continue;
 		}
@@ -389,6 +417,9 @@ static const struct inode_operations proc_dir_inode_operations = {
 	.lookup		= proc_lookup,
 	.getattr	= proc_getattr,
 	.setattr	= proc_notify_change,
+#ifdef CONFIG_VE
+	.permission	= proc_iop_permission,
+#endif
 };
 
 /* returns the registered entry, or frees dp and returns NULL on failure */
@@ -413,6 +444,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
 out_free_inum:
 	proc_free_inum(dp->low_ino);
 out_free_entry:
+	proc_put_ve_perms(dp);
 	pde_free(dp);
 	return NULL;
 }
@@ -471,6 +503,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
 	ent->nlink = nlink;
 	ent->subdir = RB_ROOT;
 	refcount_set(&ent->refcnt, 1);
+	proc_get_ve_perms(ent);
 	spin_lock_init(&ent->pde_unload_lock);
 	INIT_LIST_HEAD(&ent->pde_openers);
 	proc_set_user(ent, (*parent)->uid, (*parent)->gid);
@@ -498,6 +531,7 @@ struct proc_dir_entry *proc_symlink_mode(const char *name, umode_t mode,
 			ent->proc_iops = &proc_link_inode_operations;
 			ent = proc_register(parent, ent);
 		} else {
+			proc_put_ve_perms(ent);
 			pde_free(ent);
 			ent = NULL;
 		}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 5d1a75408aa4..ac943a9768f4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -272,6 +272,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
 		spin_lock(&de->pde_unload_lock);
 	}
 	spin_unlock(&de->pde_unload_lock);
+
+	proc_put_ve_perms(de);
 }
 
 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 77a517f91821..4a28da7d5dee 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -17,6 +17,7 @@
 
 struct ctl_table_header;
 struct mempolicy;
+struct kmapset_map;
 
 /*
  * This is not completely implemented yet. The idea is to
@@ -64,6 +65,9 @@ struct proc_dir_entry {
 	umode_t mode;
 	u8 flags;
 	u8 namelen;
+#ifdef CONFIG_VE
+	struct kmapset_map __rcu *ve_perms_map;	/* per-VE r/w/x mask for this node */
+#endif
 	char inline_name[];
 } __randomize_layout;
 
@@ -102,6 +106,27 @@ static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
 extern struct kmem_cache *proc_dir_entry_cache;
 void pde_free(struct proc_dir_entry *pde);
 
+extern rwlock_t proc_subdir_lock;
+struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir,
+				       const char *name, unsigned int len);
+
+#ifdef CONFIG_VE
+void proc_init_ve_perms(void);
+void proc_get_ve_perms(struct proc_dir_entry *de);
+void proc_put_ve_perms(struct proc_dir_entry *de);
+bool proc_d_visible(struct proc_dir_entry *de);
+int proc_ve_permission(struct proc_dir_entry *de, int mask);
+#else
+static inline void proc_init_ve_perms(void) { }
+static inline void proc_get_ve_perms(struct proc_dir_entry *de) { }
+static inline void proc_put_ve_perms(struct proc_dir_entry *de) { }
+static inline bool proc_d_visible(struct proc_dir_entry *de) { return false; }
+static inline int proc_ve_permission(struct proc_dir_entry *de, int mask)
+{
+	return 0;
+}
+#endif
+
 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_show)(struct seq_file *m,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 3f61de56ffff..9e0c5bf87602 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -297,6 +297,7 @@ static struct file_system_type proc_fs_type = {
 void __init proc_root_init(void)
 {
 	proc_init_kmemcache();
+	proc_init_ve_perms();
 	set_proc_pid_nlink();
 	proc_self_init();
 	proc_thread_self_init();
diff --git a/fs/proc/ve.c b/fs/proc/ve.c
new file mode 100644
index 000000000000..10106c2a4e53
--- /dev/null
+++ b/fs/proc/ve.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Per-VE /proc permissions (ve.proc_permissions), the procfs counterpart of
+ *  the sysfs ve.sysfs_permissions mechanism. Each proc_dir_entry carries a
+ *  kmapset map keyed by VE, so the single shared proc tree gives per-VE
+ *  answers. The filesystem-agnostic leaf logic lives in fs/ve_perms.c; this
+ *  file owns the proc tree walk and the locking.
+ *
+ *  Copyright (c) 2026 Virtuozzo International GmbH. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/cgroup.h>
+#include <linux/ve.h>
+#include <linux/kmapset.h>
+#include <linux/ve-perms.h>
+#include <linux/proc_fs.h>
+
+#include "internal.h"
+
+struct kmapset_set proc_ve_perms_set;
+
+static bool proc_ve_perms_inited;
+
+static DEFINE_MUTEX(proc_perms_mutex);
+
+void __init proc_init_ve_perms(void)
+{
+	struct kmapset_map *map;
+
+	kmapset_init_set(&proc_ve_perms_set);
+	map = kmapset_new(&proc_ve_perms_set);
+	if (map)
+		RCU_INIT_POINTER(proc_root.ve_perms_map, kmapset_commit(map));
+	proc_ve_perms_inited = true;
+}
+
+void proc_get_ve_perms(struct proc_dir_entry *de)
+{
+	struct kmapset_map *map;
+
+	if (!proc_ve_perms_inited)
+		return;
+
+	map = kmapset_new(&proc_ve_perms_set);
+	if (map)
+		rcu_assign_pointer(de->ve_perms_map, kmapset_commit(map));
+	else
+		pr_warn_once("proc: no ve_perms_map for %s, hidden from containers\n",
+			     de->name);
+}
+
+/*
+ * Drop the node's permission map. kmapset_put() can sleep (it takes the
+ * kmapset set mutex on the last reference), so every caller must be in process
+ * context. Registered entries drop the map from proc_entry_rundown() when they
+ * are removed. Entries that never reach the tree drop it on their creation error
+ * path (proc_register(), proc_symlink_mode()). pde_free() therefore never
+ * touches the map and stays safe to run from the .free_inode RCU callback, which
+ * is atomic. The mutex here serialises against a concurrent proc_perms_set() on
+ * a registered entry. The NULL fast path skips it when there is nothing to drop.
+ */
+void proc_put_ve_perms(struct proc_dir_entry *de)
+{
+	struct kmapset_map *map;
+
+	/* Atomic-safe fast path: already dropped at rundown, or never set. */
+	if (!rcu_access_pointer(de->ve_perms_map))
+		return;
+
+	/* Serialise against a concurrent proc_perms_set() on this entry. */
+	mutex_lock(&proc_perms_mutex);
+	map = rcu_dereference_protected(de->ve_perms_map,
+				       lockdep_is_held(&proc_perms_mutex));
+	rcu_assign_pointer(de->ve_perms_map, NULL);
+	mutex_unlock(&proc_perms_mutex);
+	kmapset_put(map);
+}
+
+bool proc_d_visible(struct proc_dir_entry *de)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct kmapset_map *map;
+	bool visible;
+
+	if (ve_is_super(ve))
+		return true;
+
+	/*
+	 * proc_perms_set() can swap this map pointer concurrently and free the
+	 * old map through kfree_rcu(). Hold rcu across both the load and the
+	 * lookup so the map cannot be freed under us.
+	 */
+	rcu_read_lock();
+	map = rcu_dereference(de->ve_perms_map);
+	visible = map && ve_perms_visible(map, &ve->proc_perms_key);
+	rcu_read_unlock();
+	return visible;
+}
+
+int proc_ve_permission(struct proc_dir_entry *de, int mask)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct kmapset_map *map;
+	int ret;
+
+	if (ve_is_super(ve))
+		return 0;
+
+	rcu_read_lock();
+	map = rcu_dereference(de->ve_perms_map);
+	ret = map ? ve_perms_check(map, &ve->proc_perms_key, mask) : -EACCES;
+	rcu_read_unlock();
+	return ret;
+}
+
+static int proc_perms_set(char *path, struct ve_struct *ve, int mask)
+{
+	struct proc_dir_entry *de, *nde;
+	char *sep = path, *dname;
+	int ret = 0;
+
+	read_lock(&proc_subdir_lock);
+	de = &proc_root;
+	pde_get(de);
+	do {
+		dname = sep;
+
+		sep = strchr(sep, '/');
+		if (sep)
+			*sep++ = 0;
+
+		if (!*dname)
+			break;
+
+		nde = pde_subdir_find(de, dname, strlen(dname));
+		if (!nde) {
+			read_unlock(&proc_subdir_lock);
+			ret = -ENOENT;
+			goto out;
+		}
+		pde_get(nde);
+		pde_put(de);
+		de = nde;
+	} while (sep);
+	read_unlock(&proc_subdir_lock);
+
+	/* empty or leading-slash path walks to nothing, reject it */
+	if (de == &proc_root) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!rcu_access_pointer(de->ve_perms_map)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	ret = ve_perms_apply(&de->ve_perms_map, &ve->proc_perms_key,
+			     ve_is_super(ve), mask);
+out:
+	pde_put(de);
+	return ret;
+}
+
+static int proc_perms_line(struct ve_struct *ve, char *line)
+{
+	int mask, ret;
+
+	ret = ve_perms_parse(line, &mask);
+	if (ret)
+		return ret;
+	return proc_perms_set(line, ve, mask);
+}
+
+static struct proc_dir_entry *proc_next_recursive(struct proc_dir_entry *de)
+{
+	struct rb_node *node;
+
+	node = rb_first(&de->subdir);
+	if (node)
+		return rb_entry(node, struct proc_dir_entry, subdir_node);
+
+	while (de->parent != de) {
+		node = rb_next(&de->subdir_node);
+		if (node)
+			return rb_entry(node, struct proc_dir_entry,
+					subdir_node);
+		de = de->parent;
+	}
+	return NULL;
+}
+
+static bool proc_perms_shown(struct ve_struct *ve, struct proc_dir_entry *de)
+{
+	bool shown;
+
+	if (!rcu_access_pointer(de->ve_perms_map))
+		return false;
+
+	/* ve_perms_shown calls kmapset_lookup, an rcu list walk, guard it. */
+	rcu_read_lock();
+	shown = ve_perms_shown(rcu_dereference(de->ve_perms_map),
+			       &ve->proc_perms_key, ve_is_super(ve));
+	rcu_read_unlock();
+	return shown;
+}
+
+static void *proc_perms_start(struct seq_file *m, loff_t *ppos)
+	__acquires(&proc_subdir_lock)
+{
+	struct ve_struct *ve = css_to_ve(seq_css(m));
+	struct proc_dir_entry *de;
+	loff_t pos = *ppos;
+
+	mutex_lock(&proc_perms_mutex);
+	read_lock(&proc_subdir_lock);
+	for (de = &proc_root; de; de = proc_next_recursive(de)) {
+		if (proc_perms_shown(ve, de) && !pos--)
+			break;
+	}
+	return de;
+}
+
+static void *proc_perms_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct ve_struct *ve = css_to_ve(seq_css(m));
+	struct proc_dir_entry *de = v;
+
+	(*ppos)++;
+	while ((de = proc_next_recursive(de))) {
+		if (proc_perms_shown(ve, de))
+			break;
+	}
+	return de;
+}
+
+static void proc_perms_stop(struct seq_file *m, void *v)
+	__releases(&proc_subdir_lock)
+{
+	read_unlock(&proc_subdir_lock);
+	mutex_unlock(&proc_perms_mutex);
+}
+
+static int proc_perms_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = css_to_ve(seq_css(m));
+	struct proc_dir_entry *de = v;
+	struct kmapset_map *map;
+	char *buf;
+	size_t size, len, off;
+	int mask;
+
+	map = rcu_dereference_protected(de->ve_perms_map,
+				       lockdep_is_held(&proc_perms_mutex));
+	if (ve_is_super(ve))
+		mask = map->default_value;
+	else
+		mask = kmapset_get_value(map, &ve->proc_perms_key);
+
+	size = seq_get_buf(m, &buf);
+	if (size) {
+		off = size;
+		do {
+			len = strlen(de->name);
+			if (len >= off) {
+				seq_commit(m, -1);
+				return 0;
+			}
+			if (S_ISDIR(de->mode))
+				buf[--off] = '/';
+			off -= len;
+			memcpy(buf + off, de->name, len);
+			de = de->parent;
+		} while (de && de != &proc_root);
+		memmove(buf, buf + off, size - off);
+		seq_commit(m, size - off);
+	}
+
+	ve_perms_emit(m, mask);
+	return 0;
+}
+
+static ssize_t proc_perms_write(struct kernfs_open_file *of, char *buf,
+				size_t nbytes, loff_t off)
+{
+	struct ve_struct *ve = css_to_ve(of_css(of));
+	char *line, *next = buf;
+	int ret = -EINVAL;
+
+	mutex_lock(&proc_perms_mutex);
+	do {
+		line = skip_spaces(next);
+		if (!*line)
+			break;
+
+		next = strchr(line, '\n');
+		if (next)
+			*next++ = '\0';
+
+		if (*line != '#') {
+			ret = proc_perms_line(ve, line);
+			if (ret)
+				break;
+		}
+	} while (next);
+	mutex_unlock(&proc_perms_mutex);
+
+	return ret ? ret : nbytes;
+}
+
+static struct cftype proc_ve_cftypes[] = {
+	{
+		.name		= "default_proc_permissions",
+		.flags		= CFTYPE_ONLY_ON_ROOT,
+		.seq_start	= proc_perms_start,
+		.seq_next	= proc_perms_next,
+		.seq_stop	= proc_perms_stop,
+		.seq_show	= proc_perms_show,
+		.write		= proc_perms_write,
+	},
+	{
+		.name		= "proc_permissions",
+		.flags		= CFTYPE_NOT_ON_ROOT,
+		.seq_start	= proc_perms_start,
+		.seq_next	= proc_perms_next,
+		.seq_stop	= proc_perms_stop,
+		.seq_show	= proc_perms_show,
+		.write		= proc_perms_write,
+	},
+	{ },
+};
+
+static int init_proc_ve_perms(void)
+{
+	return cgroup_add_cftypes(&ve_cgrp_subsys, proc_ve_cftypes);
+}
+module_init(init_proc_ve_perms);
diff --git a/include/linux/ve.h b/include/linux/ve.h
index b037f60225bb..cba827260d07 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -69,6 +69,7 @@ struct ve_struct {
 	int			fsync_enable;
 
 	struct kmapset_key	sysfs_perms_key;
+	struct kmapset_key	proc_perms_key;
 
 	atomic_t		netns_avail_nr;
 	int			netns_max_nr;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e58ffb22da87..d8ef28eedabd 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -44,6 +44,9 @@
 #include "../sched/sched.h"		/* For css_tg() */
 
 extern struct kmapset_set sysfs_ve_perms_set;
+#ifdef CONFIG_PROC_FS
+extern struct kmapset_set proc_ve_perms_set;
+#endif
 
 static struct kmem_cache *ve_cachep;
 
@@ -771,6 +774,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	init_rwsem(&ve->op_sem);
 	INIT_LIST_HEAD(&ve->ve_list);
 	kmapset_init_key(&ve->sysfs_perms_key);
+	kmapset_init_key(&ve->proc_perms_key);
 
 	atomic_set(&ve->arp_neigh_nr, 0);
 	atomic_set(&ve->nd_neigh_nr, 0);
@@ -866,6 +870,9 @@ static void ve_destroy(struct cgroup_subsys_state *css)
 	free_ve_devmnts(ve);
 
 	kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
+#ifdef CONFIG_PROC_FS
+	kmapset_unlink(&ve->proc_perms_key, &proc_ve_perms_set);
+#endif
 	ve_log_destroy(ve);
 	ve_free_vdso(ve);
 	mntput(ve->devtmpfs_mnt);
-- 
2.43.0



More information about the Devel mailing list