[Devel] [PATCH 1/2 v0] cgroup: 'release_agent' property is now per-cgroup instead of per-mount.

Fri Mar 13 13:09:28 MSK 2020

Until now 'release_agent' was a single property for the whole hierarchy
of cgroups in the same mounted subsystem. After mounting a subsystem
this property was represented by a single 'release_agent' file located
in the root cgroup's directory. The container concept introduces multiple
virtual cgroup roots, which breaks the "single mount - single root"
relationship: a single mount can have as many cgroup roots as there are
containers on the system. One example of an end user of the
'release_agent' file is systemd, which sets its own binary by writing to
this file at setup time. The use case is not necessarily limited to
systemd, so other userspace scenarios might want to override
'release_agent' from their containers.

The 'release_agent' field is moved from 'struct cgroupfs_root',
which is a mount-wide cgroup options holder, to 'struct cgroup' itself.
The former logic of showing 'release_agent' only in the root cgroup is
preserved, but is extended to also support virtual roots by adding this
file to a cgroup directory when that cgroup becomes marked as VE_ROOT at
ve_start_container time.

Drawbacks:
1.
Because release_agent is not only a cgroup property but also a cgroup
mount option, userspace can set it in two ways:
 a) echo 'binary_path' > ..../cgroup/release_agent
 b) mount -t cgroup .... -o release_agent=binary_path

The first is totally OK with our solution, the second is not. We are not
changing the mount API for cgroups, so we keep the mount option as a
way of setting release_agent: at mount time we read this mount option
and put it into the release_agent field of the top cgroup, which is the
real root cgroup for this mount.

When remounting a cgroup, the kernel code will try to compose mount
options from the current mount, and for the release agent it will take
the value from the top cgroup.

When the user is on the host and wants to know the mount options, for
example by typing 'mount' on the command line, we read release_agent back
from the top cgroup.

When the user is in a container, we read release_agent from the VE_ROOT
cgroup.

Here is a problem: although there is only one mount, the user can see
different mount options for it. A bad thing will happen if the user
decides to remount the cgroup: they can compose the '-o' options using
the container's release_agent, and if it differs from the host's
(top-cgroup) value, the remount will fail, because remounting with a
different value of release_agent is forbidden.

We accept this problem as OK, because container processes are not allowed
to remount cgroups.

2.
A host user reading the '../cgroup/{VE-CGROUP}/release_agent' file for a
particular container will get the binary path in container-local form, not
the actual path on the host filesystem.
This issue is minor, and no valid reason was found to do it the other way.

https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 include/linux/cgroup.h |   9 ++--
 kernel/cgroup.c        | 127 ++++++++++++++++++++++++++++++++++++++++++-------
 kernel/ve/ve.c         |   8 +++-
 3 files changed, 123 insertions(+), 21 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index aa91e47..cad5b4f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -285,6 +285,12 @@ struct cgroup {
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
+
+	/*
+	 * Per-cgroup path to release agent binary for release
+	 * notifications.
+	 */
+	const char *release_agent_path;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -384,9 +390,6 @@ struct cgroupfs_root {
 	/* IDs for cgroups in this hierarchy */
 	struct ida cgroup_ida;
 
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
 	/* The name for this hierarchy - may be empty */
 	char name[MAX_CGROUP_ROOT_NAMELEN];
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9ca8af9..0b64d88 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1038,8 +1038,14 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 	struct cgroup_subsys *ss;
+	struct cgroup *root_cgrp = &root->top_cgroup;
+
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
 
 	mutex_lock(&cgroup_root_mutex);
+
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
@@ -1050,9 +1056,18 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",xattr");
 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
 		seq_puts(seq, ",cpuset_v2_mode");
-	if (strlen(root->release_agent_path))
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve)) {
+		mutex_lock(&cgroup_mutex);
+		root_cgrp = task_cgroup_from_root(ve->init_task, root);
+		mutex_unlock(&cgroup_mutex);
+	}
+#endif
+	if (root_cgrp && root_cgrp->release_agent_path &&
+		strlen(root_cgrp->release_agent_path)) {
 		seq_show_option(seq, "release_agent",
-				root->release_agent_path);
+				root_cgrp->release_agent_path);
+	}
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
@@ -1275,6 +1290,26 @@ static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 	}
 }
 
+static int cgroup_set_release_agent_locked(struct cgroup *cgrp,
+					   const char *release_agent)
+{
+	size_t len;
+	char *path;
+
+	len = strlen(release_agent);
+	if (len >= PATH_MAX)
+		return -EINVAL;
+
+	path = kstrdup(release_agent, GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	kfree(cgrp->release_agent_path);
+
+	cgrp->release_agent_path = path;
+	return 0;
+}
+
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1331,7 +1366,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	cgroup_populate_dir(cgrp, false, added_mask);
 
 	if (opts.release_agent)
-		strcpy(root->release_agent_path, opts.release_agent);
+		ret = cgroup_set_release_agent_locked(cgrp, opts.release_agent);
  out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1493,7 +1528,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	root->flags = opts->flags;
 	ida_init(&root->cgroup_ida);
 	if (opts->release_agent)
-		strcpy(root->release_agent_path, opts->release_agent);
+		cgroup_set_release_agent_locked(&root->top_cgroup,
+			opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
 	if (opts->cpuset_clone_children)
@@ -1867,6 +1903,30 @@ int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen)
 	return __cgroup_path(cgrp, buf, buflen, !ve_is_super(ve) && !ve->is_pseudosuper);
 }
 
+static inline struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
+{
+	/*
+	 * Find nearest root cgroup, which might be host cgroup root
+	 * or ve cgroup root.
+	 *
+	 *    <host_root_cgroup> -> local_root
+	 *     \                    ^
+	 *      <cgroup>            |
+	 *       \                  |
+	 *        <cgroup>   --->   from here
+	 *        \
+	 *         <ve_root_cgroup> -> local_root
+	 *         \                   ^
+	 *          <cgroup>           |
+	 *          \                  |
+	 *           <cgroup>  --->    from here
+	 */
+	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		cgrp = cgrp->parent;
+
+	return cgrp;
+}
+
 /*
  * Control Group taskset
  */
@@ -2258,19 +2318,16 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
+	int ret;
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
 
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
-	mutex_lock(&cgroup_root_mutex);
-	strcpy(cgrp->root->release_agent_path, buffer);
-	mutex_unlock(&cgroup_root_mutex);
+	ret = cgroup_set_release_agent_locked(cgrp, buffer);
 	mutex_unlock(&cgroup_mutex);
-	return 0;
+	return ret;
 }
 
 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
@@ -2278,7 +2335,8 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 {
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	seq_puts(seq, cgrp->root->release_agent_path);
+	if (cgrp->release_agent_path)
+		seq_puts(seq, cgrp->release_agent_path);
 	seq_putc(seq, '\n');
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -4094,7 +4152,7 @@ static struct cftype files[] = {
 	},
 	{
 		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
 		.read_seq_string = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
@@ -4223,21 +4281,52 @@ static int subgroups_count(struct cgroup *cgroup)
 	return cgrps_count;
 }
 
+static struct cftype *get_cftype_by_name(const char *name)
+{
+	struct cftype *cft;
+	for (cft = files; cft->name[0] != '\0'; cft++) {
+		if (!strcmp(cft->name, name))
+			return cft;
+	}
+	return NULL;
+}
+
+static int cgroup_add_file_on_mark_ve(struct cgroup *ve_root)
+{
+	int err = 0;
+	struct dentry *dir = ve_root->dentry;
+	struct cftype *cft = get_cftype_by_name("release_agent");
+	BUG_ON(!cft);
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	err = cgroup_add_file(ve_root, NULL, cft);
+	mutex_unlock(&dir->d_inode->i_mutex);
+	if (err) {
+		pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+			cft->name, err);
+	}
+	return err;
+}
+
 #ifdef CONFIG_VE
-void cgroup_mark_ve_root(struct ve_struct *ve)
+int cgroup_mark_ve_root(struct ve_struct *ve)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root;
+	int err = 0;
 
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
-
+		err = cgroup_add_file_on_mark_ve(cgrp);
+		if (err)
+			break;
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
 			link_ve_root_cpu_cgroup(cgrp);
 	}
 	mutex_unlock(&cgroup_mutex);
+	return err;
 }
 
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
@@ -4556,6 +4645,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
+	kfree(cgrp->release_agent_path);
+
 	return 0;
 };
 
@@ -5366,6 +5457,7 @@ static void cgroup_release_agent(struct work_struct *work)
 		char *argv[3], *envp[3];
 		int i, err;
 		char *pathbuf = NULL, *agentbuf = NULL;
+		struct cgroup *root_cgrp;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
 						    release_list);
@@ -5374,9 +5466,12 @@ static void cgroup_release_agent(struct work_struct *work)
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
 			goto continue_free;
-		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+		root_cgrp = cgroup_get_local_root(cgrp);
+		if (root_cgrp->release_agent_path)
+			agentbuf = kstrdup(root_cgrp->release_agent_path,
+				GFP_KERNEL);
 		if (!agentbuf)
 			goto continue_free;
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ad3a698..a64b4a7 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -479,7 +479,7 @@ static void ve_drop_context(struct ve_struct *ve)
 
 static const struct timespec zero_time = { };
 
-extern void cgroup_mark_ve_root(struct ve_struct *ve);
+extern int cgroup_mark_ve_root(struct ve_struct *ve);
 
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
@@ -528,7 +528,9 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err < 0)
 		goto err_iterate;
 
-	cgroup_mark_ve_root(ve);
+	err = cgroup_mark_ve_root(ve);
+	if (err)
+		goto err_mark_ve;
 
 	ve->is_running = 1;
 
@@ -538,6 +540,8 @@ static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_mark_ve:
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
 	ve_stop_umh(ve);
 err_umh:
-- 
1.8.3.1