[Devel] [PATCH VZ10 v2 1/2] ve/bpf: Add VE_FEATURE_BPF to allow bpf device cgroup programs per VE
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Fri Mar 27 19:51:22 MSK 2026
When feature is enabled for VE:
* Allow bpf_prog_query(BPF_CGROUP_DEVICE) with restrictions:
a) CAP_NET/SYS_ADMIN of VE only
b) BPF_F_QUERY_EFFECTIVE prohibited
c) Cgroup must be a descendant of VE root cgroup
This gives Docker information about device control programs attached to
cgroups.
Note: We don't allow BPF_PROG_GET_FD_BY_ID so Docker does not gain any
control of those programs. Docker is ok with getting EPERM from
BPF_PROG_GET_FD_BY_ID (to be able to run in more restricted
environments), so we are fine. Docker recreates cgroups for its
container on restart, so in reality there should be no leftovers it can
find there anyway.
* Allow bpf_prog_load(BPF_PROG_TYPE_CGROUP_DEVICE) with restrictions:
a) CAP_NET/SYS_ADMIN of VE only
b) Instruction count limited by 4096
c) Unaligned access not allowed
d) Similar to kernel.unprivileged_bpf_disabled
This gives Docker the ability to actually load device control programs.
Note: All the capability checks with fallback to CAP_SYS_ADMIN are
similar to original capability checks in the original code path, with an
exception that they are now relative to ve.
Note: Restrictions similar to kernel.unprivileged_bpf_disabled give us
hope that those programs will be less prone to verifier targeted
exploits than if we allow it without restrictions.
https://virtuozzo.atlassian.net/browse/VSTOR-126504
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
--
Possible problems:
- Limited bpf JIT memory budget. We can later add bpf program count
limits per VE to prevent one container from consuming all JIT memory on
the system by creating excessive numbers of programs.
- The cgroup_bpf_query() uses cgroup_mutex, and verifier uses
bpf_verifier_lock. Both are global locks thus allowing to take those in
containers can lead to lock contention.
- If Docker's device controllers will start to use maps this would
not be enough and we would need to patch more bpf checks.
- Mainstream code does not prohibit attaching (bpf_prog_attach) programs
via any capable() checks. And a container is now able to load eBPF
programs, so it can also attach them anywhere on cgroupfs. Only
restriction is that container does not see cgroups of other containers
and host, unless explicitly leaked by host.
---
include/uapi/linux/vzcalluser.h | 1 +
kernel/bpf/syscall.c | 77 +++++++++++++++++++++++++++++----
2 files changed, 70 insertions(+), 8 deletions(-)
diff --git a/include/uapi/linux/vzcalluser.h b/include/uapi/linux/vzcalluser.h
index b04594d31666..000e3ee107ad 100644
--- a/include/uapi/linux/vzcalluser.h
+++ b/include/uapi/linux/vzcalluser.h
@@ -48,6 +48,7 @@ struct vzctl_ve_configure {
#define VE_FEATURE_BRIDGE (1ULL << 7)
#define VE_FEATURE_NFSD (1ULL << 8)
#define VE_FEATURE_TIME (1ULL << 9)
+#define VE_FEATURE_BPF (1ULL << 10)
#define VE_FEATURES_OLD (VE_FEATURE_SYSFS)
#define VE_FEATURES_DEF (VE_FEATURE_SYSFS | VE_FEATURE_DEF_PERMS)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2c901fc67570..a77632ee69e6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -36,6 +36,8 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <linux/vzcalluser.h>
+#include <linux/ve.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -1217,6 +1219,12 @@ static bool bpf_net_capable(void)
return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
}
+static bool ve_bpf_capable(int cap)
+{
+ return feature_capable(VE_FEATURE_BPF, cap) ||
+ (cap != CAP_SYS_ADMIN && feature_capable(VE_FEATURE_BPF, CAP_SYS_ADMIN));
+}
+
#define BPF_MAP_CREATE_LAST_FIELD map_token_fd
/* called via syscall */
static int map_create(union bpf_attr *attr)
@@ -2705,22 +2713,34 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
* object creation success. Even with unprivileged BPF disabled,
* capability checks are still carried out for these
* and other operations.
+ *
+ * Intent for ve_bpf_capable() check is to allow the same handling of
+ * BPF_PROG_TYPE_CGROUP_DEVICE as is done for mainstream unprivileged
+ * BPF programs in unprivileged_bpf_disabled == false case, but make it
+ * on per-container basis.
*/
- if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
- goto put_token;
+ if (!bpf_cap) {
+ if (type == BPF_PROG_TYPE_CGROUP_DEVICE) {
+ if (!ve_bpf_capable(CAP_BPF))
+ goto put_token;
+ } else if (type == BPF_PROG_TYPE_SOCKET_FILTER ||
+ type == BPF_PROG_TYPE_CGROUP_SKB) {
+ if (sysctl_unprivileged_bpf_disabled)
+ goto put_token;
+ } else {
+ goto put_token;
+ }
+ }
if (attr->insn_cnt == 0 ||
attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
err = -E2BIG;
goto put_token;
}
- if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
- type != BPF_PROG_TYPE_CGROUP_SKB &&
- !bpf_cap)
- goto put_token;
if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
- goto put_token;
+ if (type != BPF_PROG_TYPE_CGROUP_DEVICE || !ve_bpf_capable(CAP_NET_ADMIN))
+ goto put_token;
if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
goto put_token;
@@ -4195,10 +4215,51 @@ static int bpf_prog_detach(const union bpf_attr *attr)
#define BPF_PROG_QUERY_LAST_FIELD query.revision
+static bool ve_bpf_prog_query_cgroup_device_allowed(const union bpf_attr *attr)
+{
+ struct cgroup *cgrp, *ve_root;
+ bool ret = false;
+
+ /*
+ * Prohibit getting information about programs attached to ancestor
+ * cgroups in VE. To avoid VE processes peeking into host programs.
+ */
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+ return false;
+
+ /* Allow only BPF_CGROUP_DEVICE programs in VE. */
+ if (attr->query.attach_type != BPF_CGROUP_DEVICE)
+ return false;
+
+ /* Allow only VE net or sys admin to query bpf programs. */
+ if (!ve_bpf_capable(CAP_NET_ADMIN))
+ return false;
+
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return false;
+
+ /* Allow only query on non-root cgroups belonging to current VE */
+ rcu_read_lock();
+ ve_root = cgroup_ve_root1(cgrp);
+ if (!ve_root || ve_root == cgrp)
+ goto denied;
+
+ if (rcu_dereference(ve_root->ve_owner) != get_exec_env())
+ goto denied;
+
+ ret = true;
+denied:
+ rcu_read_unlock();
+ cgroup_put(cgrp);
+ return ret;
+}
+
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
- if (!bpf_net_capable())
+ if (!bpf_net_capable() &&
+ !ve_bpf_prog_query_cgroup_device_allowed(attr))
return -EPERM;
if (CHECK_ATTR(BPF_PROG_QUERY))
return -EINVAL;
--
2.53.0
More information about the Devel
mailing list