[Devel] [PATCH VZ10 03/12] ve: Introduce VE namespace

Konstantin Khorenko khorenko at virtuozzo.com
Mon Dec 1 20:04:52 MSK 2025


On 11/24/25 12:20, Pavel Tikhomirov wrote:
...
> +static struct ve_namespace *clone_ve_ns(struct user_namespace *user_ns,
> +					struct ve_namespace *old_ns)
> +{
> +	struct ve_namespace *ns;
> +	struct ucounts *ucounts;
> +	int err;
> +
> +	ucounts = inc_ve_namespaces(user_ns);
> +	if (!ucounts)
> +		return ERR_PTR(-ENOSPC);
> +
> +	err = -ENOMEM;
> +	ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
> +	if (!ns)
> +		goto err_dec_ucount;
> +
> +	refcount_set(&ns->ns.count, 1);
> +
> +	err = ns_alloc_inum(&ns->ns);
> +	if (err)
> +		goto err_free_ns;
> +
> +	ns->ucounts = ucounts;
> +	ns->ns.ops = &ve_ns_operations;
> +	ns->user_ns = get_user_ns(user_ns);
> +
> +	/*
> +	 * VE namespace links to current ve cgroup
> +	 * FIXME it should be a 1:1 link
> +	 */
> +	ns->ve = get_ve(css_to_ve(current->cgroups->subsys[ve_cgrp_id]));

   Complete Scenario: When `current->cgroups->subsys[ve_cgrp_id]` Can Be NULL


   Step 1: Creating a cgroup hierarchy WITHOUT VE subsystem


      1 │# Administrator creates a new cgroup hierarchy without VE subsystem
      2 │mkdir /sys/fs/cgroup/myhierarchy
      3 │mount -t cgroup2 none /sys/fs/cgroup/myhierarchy
      4 │# VE subsystem is NOT enabled in this hierarchy

   Result: root->subsys_mask & (1UL << ve_cgrp_id) == 0

   Step 2: Process migrates into this cgroup


      1 │// kernel/cgroup/cgroup.c:find_existing_css_set()
      2 │static struct css_set *find_existing_css_set(...)
      3 │{
      4 │    for_each_subsys(ss, i) {
      5 │        if (root->subsys_mask & (1UL << i)) {
      6 │            // VE subsystem is NOT enabled, this block does NOT execute
      7 │            template[i] = cgroup_e_css_by_mask(cgrp, ss);
      8 │        } else {
      9 │            // Uses old css from old_cset
     10 │            template[i] = old_cset->subsys[i];
     11 │        }
     12 │    }
     13 │}

   If old_cset->subsys[ve_cgrp_id] was already NULL (e.g., process was created in this hierarchy), then
   template[ve_cgrp_id] = NULL.

   Step 3: Creating a new css_set


      1 │// kernel/cgroup/cgroup.c:find_css_set() (line 1257)
      2 │cset = kzalloc(sizeof(*cset), GFP_KERNEL);  // Zeroes memory
      3 │// ...
      4 │memcpy(cset->subsys, template, sizeof(cset->subsys));  // Copies template

   Result: cset->subsys[ve_cgrp_id] = NULL

   Step 4: Process receives this css_set


      1 │// Process now has:
      2 │current->cgroups = cset;  // where cset->subsys[ve_cgrp_id] == NULL


   Step 5: Process calls `clone()` with `CLONE_NEWVE`


      1 │// Userspace:
      2 │pid = clone(child_func, stack, CLONE_NEWVE | SIGCHLD, NULL);


   Step 6: Kernel calls `copy_ve_ns()`


      1 │// kernel/fork.c:copy_process() (line 2391)
      2 │retval = copy_ve_ns(clone_flags, p);


      1 │// kernel/ve/ve_namespace.c:copy_ve_ns() (line 67)
      2 │int copy_ve_ns(unsigned long flags, struct task_struct *p)
      3 │{
      4 │    // ...
      5 │    if (!(flags & CLONE_NEWVE)) {
      6 │        get_ve_ns(old_ve_ns);
      7 │        return 0;
      8 │    }
      9 │
     10 │    if (!ns_capable(user_ns, CAP_SYS_ADMIN))
     11 │        return -EPERM;
     12 │
     13 │    new_ve_ns = clone_ve_ns(user_ns, p->ve_ns);  // <-- Call
     14 │    // ...
     15 │}


   Step 7: `clone_ve_ns()` called with NULL


      1 │// kernel/ve/ve_namespace.c:clone_ve_ns() (line 57)
      2 │ns->ve = get_ve(css_to_ve(current->cgroups->subsys[ve_cgrp_id]));
      3 │//                                      ^^^^^^^^^^^^^^^^^^^^^^^^^
      4 │//                                      THIS IS NULL!


   Step 8: Call breakdown


      1 │// include/linux/ve.h:css_to_ve() (line 198)
      2 │static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css)
      3 │{
      4 │    return css ? container_of(css, struct ve_struct, css) : NULL;
      5 │    //     ^^^^
      6 │    //     css == NULL, returns NULL
      7 │}
      8 │
      9 │// kernel/ve/ve.c:get_ve() (line 115)
     10 │struct ve_struct *get_ve(struct ve_struct *ve)
     11 │{
     12 │    if (ve)  // ve == NULL, this block does NOT execute
     13 │        css_get(&ve->css);
     14 │    return ve;  // Returns NULL
     15 │}

   Result: ns->ve = NULL — this is a problem.
   ---

   When This Can Happen

   1. Process created in a cgroup hierarchy without VE subsystem:


      1 │   // When creating a process in such hierarchy:
      2 │   // init_css_set may not have ve_cgrp_id if VE subsystem was not initialized
      3 │   // or was disabled in this hierarchy

   2. Process migrated to a cgroup without VE subsystem:


      1 │   # Process was in VE cgroup, then migrated:
      2 │   echo $PID > /sys/fs/cgroup/myhierarchy/cgroup.procs
      3 │   # where myhierarchy does not have VE subsystem

   3. VE subsystem disabled in hierarchy:


      1 │   // kernel/cgroup/cgroup.c:cgroup_apply_control()
      2 │   // VE subsystem can be disabled dynamically
      3 │   root->subsys_mask &= ~(1 << ve_cgrp_id);

   ---

   Why This Is a Problem

   1. ns->ve = NULL — violates structure invariant.
   2. Subsequent accesses to ns->ve can cause NULL pointer dereference.
   3. Inconsistency: process has task_ve (via get_exec_env()), but namespace is not linked to a VE.

   ---

> +
> +	return ns;
> +err_free_ns:
> +	kfree(ns);
> +err_dec_ucount:
> +	dec_ve_namespaces(ucounts);
> +	return ERR_PTR(err);
> +}
...


More information about the Devel mailing list