[Devel] [PATCH v2 VZ10 1/2] ve/bpf: Limit number of BPF programs loadable per-VE

Thu Jun 18 19:41:01 MSK 2026

I will apply the patch, but I would appreciate it if you added a WARN_ONCE
for the case when some CT hits the limit of available bpf programs.

This will ease issue investigation and at some point might hint to us that we need to increase the default, for example.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 5/29/26 16:42, Pavel Tikhomirov wrote:
> Without a per-VE cap a single container could exhaust the system-wide
> bpf JIT memory budget by loading excessive numbers of CGROUP_DEVICE
> programs via the VE_FEATURE_BPF path.
> 
> Add bpf_prog_avail_nr / bpf_prog_max_nr counters to ve_struct and
> enforce them in bpf_prog_load() for non-bpf-capable callers loading
> CGROUP_DEVICE programs.
> 
> Lifetime note: A BPF program loaded in a VE takes a reference to the ve. When
> the container is stopped, all open fds to the BPF program are closed,
> and when the container manager removes the container cgroups the BPF program
> is released, thus releasing the reference to the VE.
> 
> Default max number note: It is somewhat similar to ve.netif_max_nr: there,
> each docker container creates two veths, and I also observe that each docker
> container loads two bpf programs (one by dockerd, one by systemd). So
> let's use the same number.
> 
> https://virtuozzo.atlassian.net/browse/VSTOR-131947
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> Feature: ve: allow BPF in Containers
> 
> --
> v2: Also put load_ve definition under CONFIG_VE.
> ---
>  include/linux/bpf.h  |  8 ++++++++
>  include/linux/ve.h   |  4 ++++
>  kernel/bpf/core.c    |  8 ++++++++
>  kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++
>  kernel/ve/ve.c       |  5 +++++
>  5 files changed, 62 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 80175c7a21c27..0212806d5efc2 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -56,6 +56,7 @@ struct cgroup;
>  struct bpf_token;
>  struct user_namespace;
>  struct super_block;
> +struct ve_struct;
>  struct inode;
>  
>  extern struct idr btf_idr;
> @@ -1522,6 +1523,13 @@ struct bpf_prog_aux {
>  	void *security;
>  #endif
>  	struct bpf_token *token;
> +#ifdef CONFIG_VE
> +	/* VE that loaded the program via VE_FEATURE_BPF path and against whose
> +	 * bpf_prog_avail_nr counter the program is accounted. NULL for programs
> +	 * loaded through the regular (non VE-restricted) path.
> +	 */
> +	struct ve_struct *owner_ve;
> +#endif
>  	struct bpf_prog_offload *offload;
>  	struct btf *btf;
>  	struct bpf_func_info *func_info;
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 224acf012821f..88b4d531c466e 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -76,6 +76,9 @@ struct ve_struct {
>  	atomic_t		netif_avail_nr;
>  	int			netif_max_nr;
>  
> +	atomic_t		bpf_prog_avail_nr;
> +	int			bpf_prog_max_nr;
> +
>  	atomic64_t		_uevent_seqnum;
>  
>  	int			_randomize_va_space;
> @@ -149,6 +152,7 @@ extern int nr_ve;
>  
>  #define NETNS_MAX_NR_DEFAULT	256	/* number of net-namespaces per-VE */
>  #define NETIF_MAX_NR_DEFAULT	256	/* number of net-interfaces per-VE */
> +#define BPF_PROG_MAX_NR_DEFAULT	256	/* number of loaded BPF progs per-VE */
>  
>  extern unsigned int sysctl_ve_mount_nr;
>  
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 4de8774458aca..7aaf73180fcdc 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -38,6 +38,7 @@
>  #include <linux/bpf_mem_alloc.h>
>  #include <linux/memcontrol.h>
>  #include <linux/execmem.h>
> +#include <linux/ve.h>
>  
>  #include <asm/barrier.h>
>  #include <linux/unaligned.h>
> @@ -2828,6 +2829,13 @@ void bpf_prog_free(struct bpf_prog *fp)
>  	if (aux->dst_prog)
>  		bpf_prog_put(aux->dst_prog);
>  	bpf_token_put(aux->token);
> +#ifdef CONFIG_VE
> +	if (aux->owner_ve) {
> +		atomic_inc(&aux->owner_ve->bpf_prog_avail_nr);
> +		put_ve(aux->owner_ve);
> +		aux->owner_ve = NULL;
> +	}
> +#endif
>  	INIT_WORK(&aux->work, bpf_prog_free_deferred);
>  	schedule_work(&aux->work);
>  }
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 0475a72c93c06..481ae62429097 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2663,6 +2663,9 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>  	struct bpf_prog *prog, *dst_prog = NULL;
>  	struct btf *attach_btf = NULL;
>  	struct bpf_token *token = NULL;
> +#ifdef CONFIG_VE
> +	struct ve_struct *load_ve = NULL;
> +#endif
>  	bool bpf_cap;
>  	int err;
>  	char license[128];
> @@ -2744,6 +2747,22 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>  	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
>  		goto put_token;
>  
> +#ifdef CONFIG_VE
> +	/* Restrict the number of BPF programs that can be loaded via the
> +	 * VE-allowed path. Without this, a single container could exhaust
> +	 * the system-wide bpf JIT memory budget by loading excessive
> +	 * numbers of CGROUP_DEVICE programs.
> +	 */
> +	if (!bpf_cap && type == BPF_PROG_TYPE_CGROUP_DEVICE) {
> +		load_ve = get_exec_env();
> +		if (atomic_dec_if_positive(&load_ve->bpf_prog_avail_nr) < 0) {
> +			load_ve = NULL;
> +			err = -ENOSPC;
> +			goto put_token;
> +		}
> +	}
> +#endif
> +
>  	/* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog
>  	 * or btf, we need to check which one it is
>  	 */
> @@ -2809,6 +2828,16 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>  	prog->aux->dev_bound = !!attr->prog_ifindex;
>  	prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
>  
> +#ifdef CONFIG_VE
> +	/* Hand the avail_nr slot reservation over to the prog. bpf_prog_free()
> +	 * will release it via put_ve + counter increment.
> +	 */
> +	if (load_ve) {
> +		prog->aux->owner_ve = get_ve(load_ve);
> +		load_ve = NULL;
> +	}
> +#endif
> +
>  	/* move token into prog->aux, reuse taken refcnt */
>  	prog->aux->token = token;
>  	token = NULL;
> @@ -2932,6 +2961,14 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>  		btf_put(prog->aux->attach_btf);
>  	bpf_prog_free(prog);
>  put_token:
> +#ifdef CONFIG_VE
> +	/* The load_ve is non-NULL only if we decremented bpf_prog_avail_nr
> +	 * but did not hand the reservation off to the prog yet (i.e. failure
> +	 * happened before bpf_prog_alloc()). Roll back the counter.
> +	 */
> +	if (load_ve)
> +		atomic_inc(&load_ve->bpf_prog_avail_nr);
> +#endif
>  	bpf_token_put(token);
>  	return err;
>  }
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 198c82f010cc1..48da546117bb7 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -76,6 +76,8 @@ struct ve_struct ve0 = {
>  	.netns_max_nr		= INT_MAX,
>  	.netif_avail_nr		= ATOMIC_INIT(INT_MAX),
>  	.netif_max_nr		= INT_MAX,
> +	.bpf_prog_avail_nr	= ATOMIC_INIT(INT_MAX),
> +	.bpf_prog_max_nr	= INT_MAX,
>  	.fsync_enable		= FSYNC_FILTERED,
>  	._randomize_va_space	=
>  #ifdef CONFIG_COMPAT_BRK
> @@ -983,6 +985,9 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
>  	atomic_set(&ve->netif_avail_nr, NETIF_MAX_NR_DEFAULT);
>  	ve->netif_max_nr = NETIF_MAX_NR_DEFAULT;
>  
> +	atomic_set(&ve->bpf_prog_avail_nr, BPF_PROG_MAX_NR_DEFAULT);
> +	ve->bpf_prog_max_nr = BPF_PROG_MAX_NR_DEFAULT;
> +
>  	err = ve_log_init(ve);
>  	if (err)
>  		goto err_log;