[Devel] [PATCH RHEL10 COMMIT] ve/bpf: Limit number of BPF programs loadable per-VE

Thu Jun 18 19:58:26 MSK 2026

The commit is pushed to "branch-rh10-6.12.0-211.16.1.12.x.vz10-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh10-6.12.0-211.16.1.12.3.vz10
------>
commit 0ffe385175dc3369e6303e821149b930149c45c9
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Fri May 29 16:42:11 2026 +0200

    ve/bpf: Limit number of BPF programs loadable per-VE
    
    Without a per-VE cap a single container could exhaust the system-wide
    bpf JIT memory budget by loading excessive numbers of CGROUP_DEVICE
    programs via the VE_FEATURE_BPF path.
    
    Add bpf_prog_avail_nr / bpf_prog_max_nr counters to ve_struct and
    enforce them in bpf_prog_load() for non-bpf-capable callers loading
    CGROUP_DEVICE programs.
    
    Lifetime note: A BPF program loaded in a VE takes a reference to that
    VE. When the container is stopped, all open fds to the BPF program are
    closed, and when the container manager removes the container cgroups
    the BPF program is released, which in turn drops the reference to the
    VE.
    
    Default max number note: This is somewhat similar to ve.netif_max_nr:
    there, each docker container creates two veths, and I also observe that
    each docker container loads two BPF programs (one by dockerd, one by
    systemd). So let's use the same default value (256).
    
    https://virtuozzo.atlassian.net/browse/VSTOR-131947
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    Feature: ve: allow BPF in Containers
    
    --
    v2: Also put load_ve definition under CONFIG_VE.
---
 include/linux/bpf.h  |  8 ++++++++
 include/linux/ve.h   |  4 ++++
 kernel/bpf/core.c    |  8 ++++++++
 kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++
 kernel/ve/ve.c       |  5 +++++
 5 files changed, 62 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5643c6e196b9f..1e30bb0867834 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -57,6 +57,7 @@ struct cgroup;
 struct bpf_token;
 struct user_namespace;
 struct super_block;
+struct ve_struct;
 struct inode;
 
 extern struct idr btf_idr;
@@ -1654,6 +1655,13 @@ struct bpf_prog_aux {
 	void *security;
 #endif
 	struct bpf_token *token;
+#ifdef CONFIG_VE
+	/* VE that loaded the program via VE_FEATURE_BPF path and against whose
+	 * bpf_prog_avail_nr counter the program is accounted. NULL for programs
+	 * loaded through the regular (non VE-restricted) path.
+	 */
+	struct ve_struct *owner_ve;
+#endif
 	struct bpf_prog_offload *offload;
 	struct btf *btf;
 	struct bpf_func_info *func_info;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 7c5515548ad23..b037f60225bb0 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -76,6 +76,9 @@ struct ve_struct {
 	atomic_t		netif_avail_nr;
 	int			netif_max_nr;
 
+	atomic_t		bpf_prog_avail_nr;
+	int			bpf_prog_max_nr;
+
 	atomic64_t		_uevent_seqnum;
 
 	int			_randomize_va_space;
@@ -130,6 +133,7 @@ extern int nr_ve;
 
 #define NETNS_MAX_NR_DEFAULT	256	/* number of net-namespaces per-VE */
 #define NETIF_MAX_NR_DEFAULT	256	/* number of net-interfaces per-VE */
+#define BPF_PROG_MAX_NR_DEFAULT	256	/* number of loaded BPF progs per-VE */
 
 extern unsigned int sysctl_ve_mount_nr;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 986455d06d368..5503ec305a846 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -38,6 +38,7 @@
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
 #include <linux/execmem.h>
+#include <linux/ve.h>
 
 #include <asm/barrier.h>
 #include <linux/unaligned.h>
@@ -2938,6 +2939,13 @@ void bpf_prog_free(struct bpf_prog *fp)
 	if (aux->dst_prog)
 		bpf_prog_put(aux->dst_prog);
 	bpf_token_put(aux->token);
+#ifdef CONFIG_VE
+	if (aux->owner_ve) {
+		atomic_inc(&aux->owner_ve->bpf_prog_avail_nr);
+		put_ve(aux->owner_ve);
+		aux->owner_ve = NULL;
+	}
+#endif
 	INIT_WORK(&aux->work, bpf_prog_free_deferred);
 	schedule_work(&aux->work);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 318f7bbf69a78..ff2a51c59f047 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2798,6 +2798,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	struct bpf_prog *prog, *dst_prog = NULL;
 	struct btf *attach_btf = NULL;
 	struct bpf_token *token = NULL;
+#ifdef CONFIG_VE
+	struct ve_struct *load_ve = NULL;
+#endif
 	bool bpf_cap;
 	int err;
 	char license[128];
@@ -2879,6 +2882,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
 		goto put_token;
 
+#ifdef CONFIG_VE
+	/* Restrict the number of BPF programs that can be loaded via the
+	 * VE-allowed path. Without this, a single container could exhaust
+	 * the system-wide bpf JIT memory budget by loading excessive
+	 * numbers of CGROUP_DEVICE programs.
+	 */
+	if (!bpf_cap && type == BPF_PROG_TYPE_CGROUP_DEVICE) {
+		load_ve = get_exec_env();
+		if (atomic_dec_if_positive(&load_ve->bpf_prog_avail_nr) < 0) {
+			load_ve = NULL;
+			err = -ENOSPC;
+			goto put_token;
+		}
+	}
+#endif
+
 	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
 	 * or btf, we need to check which one it is
 	 */
@@ -2944,6 +2963,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 	prog->aux->dev_bound = !!attr->prog_ifindex;
 	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
 
+#ifdef CONFIG_VE
+	/* Hand the avail_nr slot reservation over to the prog. bpf_prog_free()
+	 * will release it via put_ve + counter increment.
+	 */
+	if (load_ve) {
+		prog->aux->owner_ve = get_ve(load_ve);
+		load_ve = NULL;
+	}
+#endif
+
 	/* move token into prog->aux, reuse taken refcnt */
 	prog->aux->token = token;
 	token = NULL;
@@ -3067,6 +3096,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
 		btf_put(prog->aux->attach_btf);
 	bpf_prog_free(prog);
 put_token:
+#ifdef CONFIG_VE
+	/* The load_ve is non-NULL only if we decremented bpf_prog_avail_nr
+	 * but did not hand the reservation off to the prog yet (i.e. failure
+	 * happened before bpf_prog_alloc()). Roll back the counter.
+	 */
+	if (load_ve)
+		atomic_inc(&load_ve->bpf_prog_avail_nr);
+#endif
 	bpf_token_put(token);
 	return err;
 }
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 7485ee4ad009a..9938d87c8462c 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -66,6 +66,8 @@ struct ve_struct ve0 = {
 	.netns_max_nr		= INT_MAX,
 	.netif_avail_nr		= ATOMIC_INIT(INT_MAX),
 	.netif_max_nr		= INT_MAX,
+	.bpf_prog_avail_nr	= ATOMIC_INIT(INT_MAX),
+	.bpf_prog_max_nr	= INT_MAX,
 	.fsync_enable		= FSYNC_FILTERED,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
@@ -746,6 +748,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	atomic_set(&ve->netif_avail_nr, NETIF_MAX_NR_DEFAULT);
 	ve->netif_max_nr = NETIF_MAX_NR_DEFAULT;
 
+	atomic_set(&ve->bpf_prog_avail_nr, BPF_PROG_MAX_NR_DEFAULT);
+	ve->bpf_prog_max_nr = BPF_PROG_MAX_NR_DEFAULT;
+
 	err = ve_log_init(ve);
 	if (err)
 		goto err_log;