[Devel] [PATCH VZ10 v3 1/2] ve/bpf: Add VE_FEATURE_BPF to allow bpf device cgroup programs per VE
Vasileios Almpanis
vasileios.almpanis at virtuozzo.com
Thu Apr 9 15:55:26 MSK 2026
Reviewed-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>
On 4/9/26 2:47 PM, Pavel Tikhomirov wrote:
> When feature is enabled for VE:
>
> * Allow bpf_prog_query(BPF_CGROUP_DEVICE) with restrictions:
>
> a) CAP_NET/SYS_ADMIN of VE only
> b) BPF_F_QUERY_EFFECTIVE prohibited
> c) Cgroup must be a descendant of VE root cgroup
>
> This gives Docker information about device control programs attached to
> cgroups.
>
> Note: We don't allow BPF_PROG_GET_FD_BY_ID so Docker does not gain any
> control of those programs. Docker is ok with getting EPERM from
> BPF_PROG_GET_FD_BY_ID (to be able to run in more restricted
> environments), so we are fine. Docker recreates cgroups for its
> container on restart, so in reality there should be no leftovers it can
> find there anyway.
>
> * Allow bpf_prog_load(BPF_PROG_TYPE_CGROUP_DEVICE) with restrictions:
>
> a) CAP_NET/SYS_ADMIN of VE only
> b) Instruction count limited by 4096
> c) Unaligned access not allowed
> d) Similar to kernel.unprivileged_bpf_disabled
>
> This gives Docker the ability to actually load device control programs.
>
> Note: All the capability checks with fallback to CAP_SYS_ADMIN are
> similar to the capability checks in the original code path, with the
> exception that they are now relative to the VE.
>
> Note: Restrictions similar to kernel.unprivileged_bpf_disabled give us
> hope that those programs will be less prone to verifier targeted
> exploits than if we allow it without restrictions.
>
> https://virtuozzo.atlassian.net/browse/VSTOR-126504
> Signed-off-by: Pavel Tikhomirov<ptikhomirov at virtuozzo.com>
>
> --
> Possible problems:
>
> - Limited bpf JIT memory budget. We can later add bpf program count
> limits per VE to prevent one container from consuming all JIT memory on
> the system by creating excessive numbers of programs.
>
> - The cgroup_bpf_query() uses cgroup_mutex, and the verifier uses
> bpf_verifier_lock. Both are global locks, so letting containers take
> them can lead to lock contention.
>
> - If Docker's device controllers start to use maps, this will not be
> enough and we will need to patch more bpf checks.
>
> - Mainstream code does not prohibit attaching (bpf_prog_attach) programs
> via any capable() checks. And a container is now able to load eBPF
> programs, so it can also attach them anywhere on cgroupfs. The only
> restriction is that a container does not see cgroups of other containers
> and the host, unless explicitly leaked by the host.
> --
> v3: report bad cgroup fd from ve_bpf_prog_query_cgroup_device_allowed
> ---
> include/uapi/linux/vzcalluser.h | 1 +
> kernel/bpf/syscall.c | 91 +++++++++++++++++++++++++++++----
> 2 files changed, 83 insertions(+), 9 deletions(-)
>
> diff --git a/include/uapi/linux/vzcalluser.h b/include/uapi/linux/vzcalluser.h
> index b04594d31666..000e3ee107ad 100644
> --- a/include/uapi/linux/vzcalluser.h
> +++ b/include/uapi/linux/vzcalluser.h
> @@ -48,6 +48,7 @@ struct vzctl_ve_configure {
> #define VE_FEATURE_BRIDGE (1ULL << 7)
> #define VE_FEATURE_NFSD (1ULL << 8)
> #define VE_FEATURE_TIME (1ULL << 9)
> +#define VE_FEATURE_BPF (1ULL << 10)
>
> #define VE_FEATURES_OLD (VE_FEATURE_SYSFS)
> #define VE_FEATURES_DEF (VE_FEATURE_SYSFS | VE_FEATURE_DEF_PERMS)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 2c901fc67570..b75a34589880 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -36,6 +36,8 @@
> #include <linux/rcupdate_trace.h>
> #include <linux/memcontrol.h>
> #include <linux/trace_events.h>
> +#include <linux/vzcalluser.h>
> +#include <linux/ve.h>
>
> #include <net/netfilter/nf_bpf_link.h>
> #include <net/netkit.h>
> @@ -1217,6 +1219,12 @@ static bool bpf_net_capable(void)
> return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
> }
>
> +static bool ve_bpf_capable(int cap)
> +{
> + return feature_capable(VE_FEATURE_BPF, cap) ||
> + (cap != CAP_SYS_ADMIN && feature_capable(VE_FEATURE_BPF, CAP_SYS_ADMIN));
> +}
> +
> #define BPF_MAP_CREATE_LAST_FIELD map_token_fd
> /* called via syscall */
> static int map_create(union bpf_attr *attr)
> @@ -2705,22 +2713,34 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
> * object creation success. Even with unprivileged BPF disabled,
> * capability checks are still carried out for these
> * and other operations.
> + *
> + * Intent for ve_bpf_capable() check is to allow the same handling of
> + * BPF_PROG_TYPE_CGROUP_DEVICE as is done for mainstream unprivileged
> + * BPF programs in unprivileged_bpf_disabled == false case, but make it
> + * on per-container basis.
> */
> - if (sysctl_unprivileged_bpf_disabled && !bpf_cap)
> - goto put_token;
> + if (!bpf_cap) {
> + if (type == BPF_PROG_TYPE_CGROUP_DEVICE) {
> + if (!ve_bpf_capable(CAP_BPF))
> + goto put_token;
> + } else if (type == BPF_PROG_TYPE_SOCKET_FILTER ||
> + type == BPF_PROG_TYPE_CGROUP_SKB) {
> + if (sysctl_unprivileged_bpf_disabled)
> + goto put_token;
> + } else {
> + goto put_token;
> + }
> + }
>
> if (attr->insn_cnt == 0 ||
> attr->insn_cnt > (bpf_cap ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) {
> err = -E2BIG;
> goto put_token;
> }
> - if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
> - type != BPF_PROG_TYPE_CGROUP_SKB &&
> - !bpf_cap)
> - goto put_token;
>
> if (is_net_admin_prog_type(type) && !bpf_token_capable(token, CAP_NET_ADMIN))
> - goto put_token;
> + if (type != BPF_PROG_TYPE_CGROUP_DEVICE || !ve_bpf_capable(CAP_NET_ADMIN))
> + goto put_token;
> if (is_perfmon_prog_type(type) && !bpf_token_capable(token, CAP_PERFMON))
> goto put_token;
>
> @@ -4195,11 +4215,64 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>
> #define BPF_PROG_QUERY_LAST_FIELD query.revision
>
> +/*
> + * Returns 0 when prog query is allowed and -errno otherwise; the return type
> + * must be int, as bool would turn -EPERM into true and defeat the caller's
> + * "ret < 0" check. We return -EPERM, except for a bad target_fd, which is
> + * reported as -EBADF to make the problem clear to the user.
> + * It is imperative to return 0 only when the caller is surely allowed to
> + * query the program, as returning 0 overrides the failed bpf_net_capable()!
> + */
> +static int ve_bpf_prog_query_cgroup_device_allowed(const union bpf_attr *attr)
> +{
> + struct cgroup *cgrp, *ve_root;
> + int ret = -EPERM;
> +
> + /*
> + * Prohibit getting information about programs attached to ancestor
> + * cgroups in VE, to avoid VE processes peeking into host programs.
> + */
> + if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
> + return ret;
> +
> + /* Allow only BPF_CGROUP_DEVICE programs in VE. */
> + if (attr->query.attach_type != BPF_CGROUP_DEVICE)
> + return ret;
> +
> + /* Allow only VE net or sys admin to query bpf programs. */
> + if (!ve_bpf_capable(CAP_NET_ADMIN))
> + return ret;
> +
> + cgrp = cgroup_get_from_fd(attr->query.target_fd);
> + if (IS_ERR(cgrp))
> + return -EBADF;
> +
> + /* Allow only query on non-root cgroups belonging to current VE */
> + rcu_read_lock();
> + ve_root = cgroup_ve_root1(cgrp);
> + if (!ve_root || ve_root == cgrp)
> + goto denied;
> +
> + if (rcu_dereference(ve_root->ve_owner) != get_exec_env())
> + goto denied;
> +
> + ret = 0;
> +denied:
> + rcu_read_unlock();
> + cgroup_put(cgrp);
> + return ret;
> +}
> +
> static int bpf_prog_query(const union bpf_attr *attr,
> union bpf_attr __user *uattr)
> {
> - if (!bpf_net_capable())
> - return -EPERM;
> + if (!bpf_net_capable()) {
> + int ret;
> +
> + ret = ve_bpf_prog_query_cgroup_device_allowed(attr);
> + if (ret < 0)
> + return ret;
> + }
> if (CHECK_ATTR(BPF_PROG_QUERY))
> return -EINVAL;
> if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
> -- 2.53.0
--
Best regards, Vasileios Almpanis
Software Developer, Virtuozzo.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openvz.org/pipermail/devel/attachments/20260409/5f9628d2/attachment-0001.html>
More information about the Devel
mailing list