[Devel] [PATCH RHEL9 COMMIT] ve/cgroup: fix cgroup file active count vs cgroup_mutex deadlock

Mon Aug 15 20:29:09 MSK 2022

The commit is pushed to "branch-rh9-5.14.0-70.13.1.vz9.16.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-70.13.1.vz9.16.8
------>
commit e20f5d90491b4ed73d0be7d443b80909077cb684
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Mon Jul 4 13:35:05 2022 +0300

    ve/cgroup: fix cgroup file active count vs cgroup_mutex deadlock
    
    Any lock B taken in write callback of kernfs file, for instance ve.state
    cgroup file, gets a lockdep dependency from this kernfs file refcount
    increment A, same is valid for any locks C which in their turn are taken
    from under B in other code paths.
    
    Meaning that if we wait for a refcount A to become released under B (or
    C) while from other thread we hold the refcount A and try to lock B
    (which is held by other thread which tries to lock C) we would get ABBA
    (or ABBCCA) deadlock.
    
    In cgroup_rmdir() we wait for kernfs refcount release under cgroup_mutex
    so the cgroup_mutex can't be in position B or C.
    
    Though we have two places where the above rule is violated:
    
    First, cgroup_mutex is taken in ve_start_container() which is under the
    "ve.state" file kernfs refcount held, which puts the lock in position B.
    
    Second, cgroup_mutex is taken in cgroup_unmark_ve_roots() which is under
    ve->op_sem mutex, and ve->op_sem mutex in it's turn is taken from the
    ve_state_write() which is also under "ve.state" file kernfs refcount
    held, which puts the lock in position C.
    
    So we need to move cgroup_mutex both from under the refcount and from
    under ve->op_sem. We do this by splitting the actual cgroup file
    structure modification code (which requires cgroup_mutex) into separate
    functions which on container start called through task_work to remove
    dependencies and on container stop called before taking ve->op_sem lock.
    
    https://jira.sw.ru/browse/PSBM-140700
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    Reviewed-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
    
    Feature: cgroup: per-CT cgroup release_agent
---
 include/linux/cgroup.h |   2 +
 include/linux/ve.h     |   1 +
 kernel/cgroup/cgroup.c | 157 ++++++++++++++++++++++++++++++++++---------------
 kernel/ve/ve.c         |  26 +++++---
 4 files changed, 129 insertions(+), 57 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d2586a8397f3..9b4d11c97aec 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -891,6 +891,8 @@ void cgroup1_release_agent(struct work_struct *work);
 #ifdef CONFIG_VE
 extern int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
+int ve_release_agent_setup(struct ve_struct *ve);
+void ve_release_agent_teardown(struct ve_struct *ve);
 struct ve_struct *cgroup_ve_owner(struct cgroup *cgrp);
 #endif
 
diff --git a/include/linux/ve.h b/include/linux/ve.h
index d64ed3a7d3a4..18b0e30366b7 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -113,6 +113,7 @@ struct ve_struct {
 	/* Should take rcu_read_lock and check ve->is_running before queue */
 	struct workqueue_struct	*wq;
 	struct work_struct	release_agent_work;
+	struct callback_head	ve_release_agent_setup_head;
 
 	/*
 	 * List of data, private for each root cgroup in
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ac66df6bfd59..deb2178ac6cb 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2094,43 +2094,101 @@ static inline bool ve_check_root_cgroups(struct css_set *cset)
 	return false;
 }
 
-static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
-			   struct cftype *cft, bool activate);
-
 int cgroup_mark_ve_roots(struct ve_struct *ve)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 	struct cgroup *cgrp;
-	struct cftype *cft;
-	int err = 0;
-
-	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
-	BUG_ON(!cft || cft->file_offset);
-
-	mutex_lock(&cgroup_mutex);
-	spin_lock_irq(&css_set_lock);
 
 	/*
-	 * We can safely use ve->ve_ns without rcu_read_lock here, as we are
-	 * always called _after_ ve_grab_context under ve->op_sem, so we've
-	 * just set ve_ns and nobody else can modify it under us.
+	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
+	 * locking as we do it from container init at container start after
+	 * ve_grab_context and only container init can tear those down.
 	 */
-	cset = rcu_dereference_protected(ve->ve_ns,
-			lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
+	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
+	BUG_ON(!cset);
 
+	spin_lock_irq(&css_set_lock);
 	if (ve_check_root_cgroups(cset)) {
 		spin_unlock_irq(&css_set_lock);
-		mutex_unlock(&cgroup_mutex);
 		return -EINVAL;
 	}
 
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		cgrp = link->cgrp;
+
+		if (!is_virtualized_cgroup(cgrp))
+			continue;
+
+		rcu_assign_pointer(cgrp->ve_owner, ve);
+		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+	}
+	spin_unlock_irq(&css_set_lock);
+
+	/* FIXME: implies cpu cgroup in cset, which is not always right */
+	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
+	synchronize_rcu();
+	return 0;
+}
+
+void cgroup_unmark_ve_roots(struct ve_struct *ve)
+{
+	struct cgrp_cset_link *link;
+	struct css_set *cset;
+	struct cgroup *cgrp;
+
+	/*
+	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
+	 * locking as we do it from container init at container start after
+	 * ve_grab_context and only container init can tear those down.
+	 */
+	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
+	BUG_ON(!cset);
+
+	spin_lock_irq(&css_set_lock);
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+		cgrp = link->cgrp;
+
+		if (!is_virtualized_cgroup(cgrp))
+			continue;
+
+		/* Set by cgroup_mark_ve_roots */
+		if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
+			continue;
+
+		rcu_assign_pointer(cgrp->ve_owner, NULL);
+		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+	}
+	spin_unlock_irq(&css_set_lock);
+	/* ve_owner == NULL will be visible */
+	synchronize_rcu();
+}
+
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+			   struct cftype *cft, bool activate);
+
+void ve_release_agent_setup_work(struct callback_head *head)
+{
+	struct cgrp_cset_link *link;
+	struct ve_struct *ve;
+	struct css_set *cset;
+	struct cgroup *cgrp;
+	struct cftype *cft;
+	int err;
+
+	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
+	BUG_ON(!cft || cft->file_offset);
+
+	ve = container_of(head, struct ve_struct,
+			  ve_release_agent_setup_head);
+	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
+
 	/*
 	 * We want to traverse the cset->cgrp_links list and
 	 * call cgroup_add_file() function which will call
 	 * memory allocation functions with GFP_KERNEL flag.
-	 * We can't continue to hold css_set_lock, but it's
-	 * safe to hold cgroup_mutex.
+	 * We can't lock css_set_lock, instead it's safe to
+	 * hold cgroup_mutex.
 	 *
 	 * It's safe to hold only cgroup_mutex and traverse
 	 * the cset list just because in all scenarious where
@@ -2144,18 +2202,17 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
 	 * check which prevents any list modifications if someone is still
 	 * holding the refcnt to cset.
 	 * See copy_cgroup_ns() function it's taking refcnt's by get_css_set().
-	 *
 	 */
-	spin_unlock_irq(&css_set_lock);
-
+	mutex_lock(&cgroup_mutex);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		cgrp = link->cgrp;
 
 		if (!is_virtualized_cgroup(cgrp))
 			continue;
 
-		rcu_assign_pointer(cgrp->ve_owner, ve);
-		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+		/* Set by cgroup_mark_ve_roots */
+		if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
+			continue;
 
 		/*
 		 * The cgroupns can hold reference on cgroups which are already
@@ -2171,17 +2228,29 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
 			}
 		}
 	}
-
-	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 	mutex_unlock(&cgroup_mutex);
-	synchronize_rcu();
+}
 
-	if (err)
-		cgroup_unmark_ve_roots(ve);
-	return err;
+int ve_release_agent_setup(struct ve_struct *ve)
+{
+	/*
+	 * Add release_agent files to ve root cgroups via task_work.
+	 * This is to resolve deadlock between cgroup file refcount and
+	 * cgroup_mutex, we can't take cgroup_mutex under the refcount
+	 * or under locks, which are under the refcount (ve->op_sem).
+	 *
+	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset inside task_work
+	 * without extra locking as we do it from container init before exiting
+	 * to userspace just after container start and only container init can
+	 * tear those down.
+	 */
+	init_task_work(&ve->ve_release_agent_setup_head,
+		       ve_release_agent_setup_work);
+	return task_work_add(current, &ve->ve_release_agent_setup_head,
+			     TWA_RESUME);
 }
 
-void cgroup_unmark_ve_roots(struct ve_struct *ve)
+void ve_release_agent_teardown(struct ve_struct *ve)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
@@ -2191,36 +2260,28 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
 	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
 	BUG_ON(!cft || cft->file_offset);
 
-	mutex_lock(&cgroup_mutex);
-
 	/*
-	 * We can safely use ve->ve_ns without rcu_read_lock here, as we are
-	 * always called _before_ ve_drop_context under ve->op_sem, so we
-	 * did not change ve_ns yet and nobody else can modify it under us.
+	 * It's safe to use ve->ve_ns->cgroup_ns->root_cset here without extra
+	 * locking as we do it from container init at container stop before
+	 * ve_drop_context and only container init can tear those down.
 	 */
-	cset = rcu_dereference_protected(ve->ve_ns,
-			lockdep_is_held(&ve->op_sem))->cgroup_ns->root_cset;
-	BUG_ON(!cset);
+	cset = rcu_dereference_protected(ve->ve_ns, 1)->cgroup_ns->root_cset;
 
-	/*
-	 * Traversing @cgrp_links without @css_set_lock is safe here for
-	 * the same reasons as in cgroup_mark_ve_roots().
-	 */
+	mutex_lock(&cgroup_mutex);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		cgrp = link->cgrp;
 
 		if (!is_virtualized_cgroup(cgrp))
 			continue;
 
+		/* Set by cgroup_mark_ve_roots */
+		if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
+			continue;
+
 		if (!cgroup_is_dead(cgrp))
 			cgroup_rm_file(cgrp, cft);
-		rcu_assign_pointer(cgrp->ve_owner, NULL);
-		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
-
 	mutex_unlock(&cgroup_mutex);
-	/* ve_owner == NULL will be visible */
-	synchronize_rcu();
 }
 
 struct cgroup_subsys_state *css_ve_root1(struct cgroup_subsys_state *css)
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index d8c381c02e2c..9ee16b66ba4e 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -754,6 +754,10 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err)
 		goto err_mark_ve;
 
+	err = ve_release_agent_setup(ve);
+	if (err)
+		goto err_release_agent_setup;
+
 	ve->is_running = 1;
 
 	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
@@ -762,6 +766,8 @@ static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_release_agent_setup:
+	cgroup_unmark_ve_roots(ve);
 err_mark_ve:
 	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
@@ -823,26 +829,28 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	struct ve_struct *ve = current->task_ve;
 	struct nsproxy *ve_ns;
 
-	down_write(&ve->op_sem);
-	ve_ns = rcu_dereference_protected(ve->ve_ns, lockdep_is_held(&ve->op_sem));
 	/*
-	 * current->cgroups already switched to init_css_set in cgroup_exit(),
-	 * but current->task_ve still points to our exec ve.
+	 * Check that root container pidns dies and we are here to stop VE
 	 */
-	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
-		goto unlock;
-
-	cgroup_unmark_ve_roots(ve);
+	rcu_read_lock();
+	ve_ns = rcu_dereference(ve->ve_ns);
+	if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns) {
+		rcu_read_unlock();
+		return;
+	}
+	rcu_read_unlock();
 
 	/*
 	 * At this point all userspace tasks in container are dead.
 	 */
+	ve_release_agent_teardown(ve);
+	down_write(&ve->op_sem);
+	cgroup_unmark_ve_roots(ve);
 	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 	ve_list_del(ve);
 	ve_drop_context(ve);
 	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
 	put_ve(ve); /* from ve_start_container() */
-unlock:
 	up_write(&ve->op_sem);
 }