[Devel] [PATCH RHEL9 COMMIT] ve/cgroup: Set release_agent_path for root cgroups separately

Wed Oct 20 11:43:57 MSK 2021

The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.12
------>
commit ee84e05864f9acf498132dbf166e0462d0d5665e
Author: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Date:   Wed Oct 20 11:43:57 2021 +0300

    ve/cgroup: Set release_agent_path for root cgroups separately
    
    Previous patches already switched the release agent calling mechanism
    to run the binary inside the ve. Now we also need to make the binary
    path per-ve, so we store the release agent path in ve_ra_data
    structures on the per-ve list ve->ra_data_list.
    
    Short history of the patch:
    
    Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-83887
    (cherry-picked from vz7 commit f1199bd9589b7c0914343dcc72f49ddaa9b98496)
    https://jira.sw.ru/browse/PSBM-108270
    (cherry picked from vz8 commit 40381d9afaff5618fbe93428f125a7acc2bde56d)
    +++
    ve/cgroup: Add release_agent to each container root cgroup
    (cherry picked from vz8 commit 00bd9c411dba17692408fcdd1769fa90b275313b)
    
    vz9 changes:
    - rework and cleanup patch description
    - don't remove cgroup_kn_lock_live error checking
    - switch to new ve_ra_data_set / ve_ra_data_get_path_locked
    - add BUG_ON if cft->file_offset is set as that can lead to
      cgroup_add_file(NULL,...) crash
    - lock cgroup_mutex as we need it for cgroup_add_file / cgroup_rm_file
      and cgroup_is_dead consistency
    - fixup hunks from 00bd9c411dba are merged in where they apply, but
      without the excess cft_q_node related stuff
    - if for some reason a ve sees the host root cgroup, don't allow the ve
      to write to its release_agent or to show it
    
    Note: ve_ra_data_set() does an allocation, so we do not want to call it
    under rcu_read_lock(); instead we get_ve() and release rcu_read_lock().
    We can use get_ve() (not tryget) because we have just seen ve_owner, so
    the ve is not fully stopped yet and should be populated.
    
    https://jira.sw.ru/browse/PSBM-134002
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    Reviewed-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 include/linux/cgroup-defs.h     |   3 --
 kernel/cgroup/cgroup-internal.h |   1 +
 kernel/cgroup/cgroup-v1.c       | 105 +++++++++++++++++++++++++++-------------
 kernel/cgroup/cgroup.c          |  40 ++++++++++++++-
 4 files changed, 111 insertions(+), 38 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 2035909f7a0a..ef8ebd8a1363 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -539,9 +539,6 @@ struct cgroup_root {
 	/* Hierarchy-specific flags */
 	unsigned int flags;
 
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
 	/* The name for this hierarchy - may be empty */
 	char name[MAX_CGROUP_ROOT_NAMELEN];
 };
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 81bf97510198..c23ea4aa8e19 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -267,6 +267,7 @@ extern const struct proc_ns_operations cgroupns_operations;
 extern struct cftype cgroup1_base_files[];
 extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
 extern const struct fs_parameter_spec cgroup1_fs_parameters[];
+struct cftype *get_cftype_by_name(const char *name);
 
 int proc_cgroupstats_show(struct seq_file *m, void *v);
 bool cgroup1_ssid_disabled(int ssid);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 5b3e8e79cc32..fe781bda5962 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -39,9 +39,6 @@ static bool cgroup_no_v1_named;
  */
 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 
-/* protects cgroup_subsys->release_agent_path */
-static DEFINE_SPINLOCK(release_agent_path_lock);
-
 bool cgroup1_ssid_disabled(int ssid)
 {
 	return cgroup_no_v1_mask & (1 << ssid);
@@ -542,28 +539,62 @@ static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of,
 static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
 					  char *buf, size_t nbytes, loff_t off)
 {
+	struct cgroup *root_cgrp;
+	struct ve_struct *ve = NULL;
 	struct cgroup *cgrp;
-
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+	int ret;
 
 	cgrp = cgroup_kn_lock_live(of->kn, false);
 	if (!cgrp)
 		return -ENODEV;
-	spin_lock(&release_agent_path_lock);
-	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
-		sizeof(cgrp->root->release_agent_path));
-	spin_unlock(&release_agent_path_lock);
+
+	rcu_read_lock(); /* ve_owner is rcu_dereference()d below */
+	root_cgrp = cgroup_ve_root1(cgrp);
+	if (!root_cgrp) {
+		if (ve_is_super(get_exec_env()) && cgrp == &cgrp->root->cgrp)
+			ve = &ve0; /* super ve on the hierarchy's root cgroup */
+	} else {
+		if (root_cgrp == cgrp)
+			ve = rcu_dereference(root_cgrp->ve_owner);
+	}
+	get_ve(ve); /* NOTE(review): ve may be NULL; confirm get_ve() copes */
+	rcu_read_unlock(); /* ve_ra_data_set() allocates, so call it outside RCU */
+
+	if (ve) {
+		ret = ve_ra_data_set(ve, cgrp->root, strstrip(buf));
+	} else {
+		/* The ve is stopped or we have a bad cgrp */
+		ret = -ENODEV;
+	}
+
+	put_ve(ve); /* pairs with get_ve() above; ve may be NULL here too */
 	cgroup_kn_unlock(of->kn);
-	return nbytes;
+	return ret ? : nbytes; /* nonzero ret, otherwise nbytes */
 }
 
 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
 {
+	const char *release_agent = NULL;
+	struct cgroup *root_cgrp;
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct ve_struct *ve = NULL;
+
+	rcu_read_lock(); /* held until the path has been printed */
+	root_cgrp = cgroup_ve_root1(cgrp);
+	if (!root_cgrp) {
+		if (ve_is_super(get_exec_env()) && cgrp == &cgrp->root->cgrp)
+			ve = &ve0; /* super ve on the hierarchy's root cgroup */
+	} else {
+		if (root_cgrp == cgrp)
+			ve = rcu_dereference(root_cgrp->ve_owner);
+	}
+
+	if (ve) /* no ve selected: only the trailing newline is printed */
+		release_agent = ve_ra_data_get_path_locked(ve, cgrp->root);
+	if (release_agent)
+		seq_puts(seq, release_agent);
+	rcu_read_unlock();
 
-	spin_lock(&release_agent_path_lock);
-	seq_puts(seq, cgrp->root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
 	seq_putc(seq, '\n');
 	return 0;
 }
@@ -661,7 +692,7 @@ struct cftype cgroup1_base_files[] = {
 	},
 	{
 		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
 		.seq_show = cgroup_release_agent_show,
 		.write = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX - 1,
@@ -674,6 +705,17 @@ struct cftype cgroup1_base_files[] = {
 	{ }	/* terminate */
 };
 
+struct cftype *get_cftype_by_name(const char *name)
+{
+	struct cftype *cft; /* cursor over cgroup1_base_files[] */
+
+	for (cft = cgroup1_base_files; cft->name[0] != '\0'; cft++) {
+		if (!strcmp(cft->name, name)) /* exact match only */
+			return cft;
+	}
+	return NULL; /* reached the empty-name terminator: no such file */
+}
+
 #define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
 
 /* Display information about each subsystem and each hierarchy */
@@ -827,6 +869,7 @@ void cgroup1_release_agent(struct work_struct *work)
 	spin_lock_irqsave(&ve->release_list_lock, flags);
 	while (!list_empty(&ve->release_list)) {
 		struct cgroup *cgrp;
+		const char *release_agent;
 
 		cgrp = list_entry(ve->release_list.next,
 				  struct cgroup,
@@ -834,15 +877,14 @@ void cgroup1_release_agent(struct work_struct *work)
 		list_del_init(&cgrp->release_list);
 		spin_unlock_irqrestore(&ve->release_list_lock, flags);
 
-		/* snoop agent path and exit early if empty */
-		if (!cgrp->root->release_agent_path[0])
-			goto continue_locked;
-
-		spin_lock(&release_agent_path_lock);
-		strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
-		spin_unlock(&release_agent_path_lock);
-		if (!agentbuf[0])
+		rcu_read_lock();
+		release_agent = ve_ra_data_get_path_locked(ve, cgrp->root);
+		if (!release_agent || !release_agent[0]) {
+			rcu_read_unlock();
 			goto continue_locked;
+		}
+		strlcpy(agentbuf, release_agent, PATH_MAX);
+		rcu_read_unlock();
 
 		ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, ve_cgroup_ns);
 		if (ret < 0 || ret >= PATH_MAX)
@@ -914,6 +956,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
 
 static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
 {
+	const char *release_agent;
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_subsys *ss;
 	int ssid;
@@ -928,12 +971,11 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
 		seq_puts(seq, ",cpuset_v2_mode");
 
-	spin_lock(&release_agent_path_lock);
-	if (strlen(root->release_agent_path))
-		seq_show_option(seq, "release_agent",
-				root->release_agent_path);
-	spin_unlock(&release_agent_path_lock);
-
+	rcu_read_lock();
+	release_agent = ve_ra_data_get_path_locked(get_exec_env(), root);
+	if (release_agent && release_agent[0])
+		seq_show_option(seq, "release_agent", release_agent);
+	rcu_read_unlock();
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
@@ -1144,11 +1186,8 @@ int cgroup1_reconfigure(struct fs_context *fc)
 
 	WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask));
 
-	if (ctx->release_agent) {
-		spin_lock(&release_agent_path_lock);
-		strcpy(root->release_agent_path, ctx->release_agent);
-		spin_unlock(&release_agent_path_lock);
-	}
+	if (ctx->release_agent)
+		ret = ve_ra_data_set(get_exec_env(), root, ctx->release_agent);
 
 	trace_cgroup_remount(root);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1fcb3041abb7..396c0dc98b64 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -68,6 +68,8 @@
 /* let's not notify more than 100 times per second */
 #define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
 
+#define CGROUP_FILENAME_RELEASE_AGENT "release_agent"
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -2050,12 +2052,21 @@ static inline bool ve_check_root_cgroups(struct css_set *cset)
 	return false;
 }
 
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+			   struct cftype *cft, bool activate);
+
 int cgroup_mark_ve_roots(struct ve_struct *ve)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 	struct cgroup *cgrp;
+	struct cftype *cft;
+	int err = 0;
+
+	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
+	BUG_ON(!cft || cft->file_offset);
 
+	mutex_lock(&cgroup_mutex);
 	spin_lock_irq(&css_set_lock);
 
 	/*
@@ -2068,6 +2079,7 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
 
 	if (ve_check_root_cgroups(cset)) {
 		spin_unlock_irq(&css_set_lock);
+		mutex_unlock(&cgroup_mutex);
 		return -EINVAL;
 	}
 
@@ -2079,12 +2091,30 @@ int cgroup_mark_ve_roots(struct ve_struct *ve)
 
 		rcu_assign_pointer(cgrp->ve_owner, ve);
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+
+		/*
+		 * The cgroupns can hold reference on cgroups which are already
+		 * offline with already removed directory on the filesystem. We
+		 * should not add files to them.
+		 */
+		if (!cgroup_is_dead(cgrp)) {
+			err = cgroup_add_file(NULL, cgrp, cft, true);
+			if (err) {
+				pr_warn("failed to add file to VE_ROOT cgroup,"
+					" err:%d\n", err);
+				break;
+			}
+		}
 	}
 
 	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
 	spin_unlock_irq(&css_set_lock);
+	mutex_unlock(&cgroup_mutex);
 	synchronize_rcu();
-	return 0;
+
+	if (err)
+		cgroup_unmark_ve_roots(ve);
+	return err;
 }
 
 void cgroup_unmark_ve_roots(struct ve_struct *ve)
@@ -2092,7 +2122,11 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 	struct cgroup *cgrp;
+	struct cftype *cft;
+
+	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
 
+	mutex_lock(&cgroup_mutex);
 	spin_lock_irq(&css_set_lock);
 
 	/*
@@ -2109,11 +2143,13 @@ void cgroup_unmark_ve_roots(struct ve_struct *ve)
 		if (!is_virtualized_cgroup(cgrp))
 			continue;
 
+		cgroup_rm_file(cgrp, cft);
 		rcu_assign_pointer(cgrp->ve_owner, NULL);
 		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
 	}
 
 	spin_unlock_irq(&css_set_lock);
+	mutex_unlock(&cgroup_mutex);
 	/* ve_owner == NULL will be visible */
 	synchronize_rcu();
 }
@@ -2196,7 +2232,7 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
 
 	root->flags = ctx->flags;
 	if (ctx->release_agent)
-		strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
+		ve_ra_data_set(get_exec_env(), root, ctx->release_agent);
 	if (ctx->name)
 		strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
 	if (ctx->cpuset_clone_children)