[Devel] [PATCH VZ10 v3 1/2] ve/bpf: Add VE_FEATURE_BPF to allow bpf device cgroup programs per VE

Thu Apr 9 15:55:26 MSK 2026

Reviewed-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>

On 4/9/26 2:47 PM, Pavel Tikhomirov wrote:
> When feature is enabled for VE:
>
> * Allow bpf_prog_query(BPF_CGROUP_DEVICE) with restrictions:
>
>    a) CAP_NET/SYS_ADMIN of VE only
>    b) BPF_F_QUERY_EFFECTIVE prohibited
>    c) Cgroup must be a descendant of VE root cgroup
>
> This gives Docker information about device control programs attached to
> cgroups.
>
> Note: We don't allow BPF_PROG_GET_FD_BY_ID so Docker does not gain any
> control of those programs. Docker is ok with getting EPERM from
> BPF_PROG_GET_FD_BY_ID (to be able to run in more restricted
> environments), so we are fine. Docker recreates cgroups for its
> container on restart, so in reality there should be no leftovers it can
> find there anyway.
>
> * Allow bpf_prog_load(BPF_PROG_TYPE_CGROUP_DEVICE) with restrictions:
>
>    a) CAP_NET/SYS_ADMIN of VE only
>    b) Instruction count limited by 4096
>    c) Unaligned access not allowed
>    d) Similar to kernel.unprivileged_bpf_disabled
>
> This gives Docker the ability to actually load device control programs.
>
> Note: All the capability checks with fallback to CAP_SYS_ADMIN are
> similar to original capability checks in the original code path, with an
> exception that they are now relative to ve.
>
> Note: Restrictions similar to kernel.unprivileged_bpf_disabled give us
> hope that those programs will be less prone to verifier targeted
> exploits than if we allow it without restrictions.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-126504
> Signed-off-by: Pavel Tikhomirov<ptikhomirov at virtuozzo.com>
>
> --
> Possible problems:
>
> - Limited bpf JIT memory budget. We can later add bpf program count
> limits per VE to prevent one container from consuming all JIT memory on
> the system by creating excessive numbers of programs.
>
> - The cgroup_bpf_query() uses cgroup_mutex, and the verifier uses
> bpf_verifier_lock. Both are global locks, so allowing containers to
> take them can lead to lock contention.
>
> - If Docker's device controllers start to use maps, this will not be
> enough and we will need to patch more bpf checks.
>
> - Mainstream code does not prohibit attaching (bpf_prog_attach) programs
> via any capable() checks. And a container is now able to load eBPF
> programs, so it can also attach them anywhere on cgroupfs. The only
> restriction is that a container does not see cgroups of other containers
> or the host, unless explicitly leaked by the host.
> --
> v3: report bad cgroup fd from ve_bpf_prog_query_cgroup_device_allowed
> ---
>   include/uapi/linux/vzcalluser.h |  1 +
>   kernel/bpf/syscall.c            | 91 +++++++++++++++++++++++++++++----
>   2 files changed, 83 insertions(+), 9 deletions(-)
>
> diff --git a/include/uapi/linux/vzcalluser.h b/include/uapi/linux/vzcalluser.h
> index b04594d31666..000e3ee107ad 100644
> --- a/include/uapi/linux/vzcalluser.h
> +++ b/include/uapi/linux/vzcalluser.h
> @@ -48,6 +48,7 @@ struct vzctl_ve_configure {
>   #define VE_FEATURE_BRIDGE	(1ULL << 7)
>   #define VE_FEATURE_NFSD		(1ULL << 8)
>   #define VE_FEATURE_TIME		(1ULL << 9)
> +#define VE_FEATURE_BPF		(1ULL << 10)
>   
>   #define VE_FEATURES_OLD		(VE_FEATURE_SYSFS)
>   #define VE_FEATURES_DEF		(VE_FEATURE_SYSFS | VE_FEATURE_DEF_PERMS)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 2c901fc67570..b75a34589880 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -36,6 +36,8 @@
>   #include <linux/rcupdate_trace.h>
>   #include <linux/memcontrol.h>
>   #include <linux/trace_events.h>
> +#include <linux/vzcalluser.h>
> +#include <linux/ve.h>
>   
>   #include <net/netfilter/nf_bpf_link.h>
>   #include <net/netkit.h>
> @@ -1217,6 +1219,12 @@ static bool bpf_net_capable(void)
>   	return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
>   }
>   
> +static bool ve_bpf_capable(int cap)
> +{
> +	return feature_capable(VE_FEATURE_BPF, cap) ||
> +	       (cap != CAP_SYS_ADMIN && feature_capable(VE_FEATURE_BPF, CAP_SYS_ADMIN));
> +}
> +
>   #define BPF_MAP_CREATE_LAST_FIELD map_token_fd
>   /* called via syscall */
>   static int map_create(union bpf_attr *attr)
> @@ -2705,22 +2713,34 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
>   	 * object creation success. Even with unprivileged BPF disabled,
>   	 * capability checks are still carried out for these
>   	 * and other operations.
> +	 *
> +	 * Intent for ve_bpf_capable() check is to allow the same handling of
> +	 * BPF_PROG_TYPE_CGROUP_DEVICE as is done for mainstream unprivileged
> +	 * BPF programs in unprivileged_bpf_disabled == false case, but make it
> +	 * on per-container basis.
>   	 */
> -	if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
> -		goto put_token;
> +	if (!bpf_cap) {
> +		if (type == BPF_PROG_TYPE_CGROUP_DEVICE) {
> +			if (!ve_bpf_capable(CAP_BPF))
> +				goto put_token;
> +		} else if (type == BPF_PROG_TYPE_SOCKET_FILTER ||
> +			   type == BPF_PROG_TYPE_CGROUP_SKB) {
> +			if (sysctl_unprivileged_bpf_disabled)
> +				goto put_token;
> +		} else {
> +			goto put_token;
> +		}
> +	}
>   
>   	if (attr->insn_cnt == 0 ||
>   	    attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
>   		err = -E2BIG;
>   		goto put_token;
>   	}
> -	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
> -	    type != BPF_PROG_TYPE_CGROUP_SKB &&
> -	    !bpf_cap)
> -		goto put_token;
>   
>   	if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
> -		goto put_token;
> +		if (type != BPF_PROG_TYPE_CGROUP_DEVICE || !ve_bpf_capable(CAP_NET_ADMIN))
> +			goto put_token;
>   	if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
>   		goto put_token;
>   
> @@ -4195,11 +4215,64 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>   
>   #define BPF_PROG_QUERY_LAST_FIELD query.revision
>   
> +/*
> + * Returns 0 when prog query is allowed and -errno otherwise: -EBADF when
> + * target_fd is not a usable cgroup fd, -EPERM for every other denial.
> + * Must return int, not bool: a bool would turn every -errno into "true"
> + * and the caller's "ret < 0" check would never deny anything.
> + * Return 0 only when we are sure that the caller is allowed to query the
> + * program, as by returning 0 we override bpf_net_capable() failed check!
> + */
> +static int ve_bpf_prog_query_cgroup_device_allowed(const union bpf_attr *attr)
> +{
> +	struct cgroup *cgrp, *ve_root;
> +	int ret = -EPERM;
> +
> +	/*
> +	 * Prohibit getting information about programs attached to ancestor
> +	 * cgroups in VE. To avoid VE processes peeking into host programs.
> +	 */
> +	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
> +		return ret;
> +
> +	/* Allow only BPF_CGROUP_DEVICE programs in VE. */
> +	if (attr->query.attach_type != BPF_CGROUP_DEVICE)
> +		return ret;
> +
> +	/* Allow only VE net or sys admin to query bpf programs. */
> +	if (!ve_bpf_capable(CAP_NET_ADMIN))
> +		return ret;
> +
> +	cgrp = cgroup_get_from_fd(attr->query.target_fd);
> +	if (IS_ERR(cgrp))
> +		return -EBADF;
> +
> +	/* Allow only query on non-root cgroups belonging to current VE */
> +	rcu_read_lock();
> +	ve_root = cgroup_ve_root1(cgrp);
> +	if (!ve_root || ve_root == cgrp)
> +		goto denied;
> +
> +	if (rcu_dereference(ve_root->ve_owner) != get_exec_env())
> +		goto denied;
> +
> +	ret = 0;
> +denied:
> +	rcu_read_unlock();
> +	cgroup_put(cgrp);
> +	return ret;
> +}
> +
>   static int bpf_prog_query(const union bpf_attr *attr,
>   			  union bpf_attr __user *uattr)
>   {
> -	if (!bpf_net_capable())
> -		return -EPERM;
> +	if (!bpf_net_capable()) {
> +		int ret;
> +
> +		ret = ve_bpf_prog_query_cgroup_device_allowed(attr);
> +		if (ret < 0)
> +			return ret;
> +	}
>   	if (CHECK_ATTR(BPF_PROG_QUERY))
>   		return -EINVAL;
>   	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
> -- 2.53.0

-- 
Best regards, Vasileios Almpanis
Software Developer, Virtuozzo.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openvz.org/pipermail/devel/attachments/20260409/5f9628d2/attachment-0001.html>