[Devel] [PATCH 1/2 v2] cgroup: 'release_agent' property is now per-cgroup instead of per-mount.

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Mon Mar 23 17:34:39 MSK 2020


Until now, 'release_agent' was a single property for the whole hierarchy
of cgroups in the same mounted subsystem. After mounting a subsystem,
this property was represented by a single 'release_agent' file located
in the root cgroup's directory. The container concept assumes that multiple
virtual cgroup roots are possible. This breaks the single mount - single root
relationship, since a single mount can have as many cgroup roots as there are
containers on the system. An example of an end user of the 'release_agent'
file is systemd, which sets its own binary by writing to this file at setup
time.

The 'release_agent' field is moved from 'struct cgroupfs_root',
which is a mount-wide cgroup options holder, to 'struct cgroup' itself.
The former logic of showing 'release_agent' only in the root cgroup is
preserved, but is extended to also support virtual roots by adding this
file to a cgroup directory when it becomes marked as VE_ROOT at
ve_start_container time.

Drawbacks:
1.
Because release_agent is not only a cgroup property but also a cgroup
mount option, userspace can set it in two ways:
 a) echo 'binary_path' > ..../cgroup/release_agent
 b) mount -t cgroup .... -o release_agent=binary_path

The first is totally OK with our solution, the second is not. We are not
changing the mount API for cgroups, so we leave the mount option as a
way of setting release_agent; at mount time we read this mount option
and put it into the release_agent field of the top cgroup, which is the
real root cgroup for this mount.

When remounting a cgroup, kernel code will try to compose mount options
from the current mount; for release_agent, it will take the value from
the top cgroup.

When the user is on the host and wants to know the mount options, for
example by typing 'mount' in the command line, we read release_agent
back from the top cgroup.

When the user is in container, we read release_agent from the VE_ROOT
cgroup.

Here is a problem: although there is only one mount, the user can see
different mount options for it. A bad thing will happen when the user
decides to remount the cgroup: he can compose the '-o' options using the
container's release_agent, and if it differs from the host's (top-cgroup)
value, then the remount will fail, because remounting with a varying
value of release_agent is forbidden.

We consider this problem acceptable, because container processes are not
allowed to remount cgroups.

2.
A host user reading the '../cgroup/{VE-CGROUP}/release_agent' file for a
particular container will get the binary path in container-local form, not
the actual path on the host filesystem.
This issue is minor and no valid reasoning was found to do it the other way.

https://jira.sw.ru/browse/PSBM-83887
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 include/linux/cgroup.h |  14 ++++-
 kernel/cgroup.c        | 166 +++++++++++++++++++++++++++++++++++++++++++------
 kernel/ve/ve.c         |   8 ++-
 3 files changed, 165 insertions(+), 23 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index aa91e47..5e289fc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -210,6 +210,11 @@ struct cgroup_name {
 	char name[];
 };
 
+struct cgroup_rcu_string {
+	struct rcu_head rcu_head;
+	char val[];
+};
+
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
@@ -285,6 +290,12 @@ struct cgroup {
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
 	u64 subgroups_limit;
+
+	/*
+	 * Per-cgroup path to release agent binary for release
+	 * notifications.
+	 */
+	struct cgroup_rcu_string __rcu *release_agent_path;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -384,9 +395,6 @@ struct cgroupfs_root {
 	/* IDs for cgroups in this hierarchy */
 	struct ida cgroup_ida;
 
-	/* The path to use for release notifications. */
-	char release_agent_path[PATH_MAX];
-
 	/* The name for this hierarchy - may be empty */
 	char name[MAX_CGROUP_ROOT_NAMELEN];
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9ca8af9..7af2487 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -191,6 +191,15 @@ inline int cgroup_is_removed(const struct cgroup *cgrp)
 	return test_bit(CGRP_REMOVED, &cgrp->flags);
 }
 
+static inline const char *cgroup_release_agent_path(const struct cgroup *cgrp)
+{
+	struct cgroup_rcu_string *str;
+	str = rcu_dereference(cgrp->release_agent_path);
+	if (str)
+		return str->val;
+	return NULL;
+}
+
 /**
  * cgroup_is_descendant - test ancestry
  * @cgrp: the cgroup to be tested
@@ -772,6 +781,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
 	return inode;
 }
 
+static struct cgroup_rcu_string *cgroup_rcu_strdup(const char *str, int len)
+{
+	struct cgroup_rcu_string *result;
+
+	result = kmalloc(sizeof(*result) + len + 1, GFP_KERNEL);
+	if (!result)
+		return NULL;
+	strcpy(result->val, str);
+	return result;
+}
+
 static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 {
 	struct cgroup_name *name;
@@ -1036,10 +1056,17 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 
 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 {
+	const char *release_agent;
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
 	struct cgroup_subsys *ss;
+	struct cgroup *root_cgrp = &root->top_cgroup;
+
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
 
 	mutex_lock(&cgroup_root_mutex);
+
 	for_each_subsys(root, ss)
 		seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
@@ -1050,9 +1077,18 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",xattr");
 	if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
 		seq_puts(seq, ",cpuset_v2_mode");
-	if (strlen(root->release_agent_path))
-		seq_show_option(seq, "release_agent",
-				root->release_agent_path);
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve)) {
+		mutex_lock(&cgroup_mutex);
+		root_cgrp = task_cgroup_from_root(ve->init_task, root);
+		mutex_unlock(&cgroup_mutex);
+	}
+#endif
+	rcu_read_lock();
+	release_agent = cgroup_release_agent_path(root_cgrp);
+	if (release_agent && release_agent[0])
+		seq_show_option(seq, "release_agent", release_agent);
+	rcu_read_unlock();
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
@@ -1275,6 +1311,24 @@ static void drop_parsed_module_refcounts(unsigned long subsys_mask)
 	}
 }
 
+static int cgroup_set_release_agent_locked(struct cgroup *cgrp,
+					   const char *release_agent)
+{
+	struct cgroup_rcu_string *new_path, *old_path;
+	lockdep_assert_held(&cgroup_mutex);
+
+	new_path = cgroup_rcu_strdup(release_agent, strlen(release_agent));
+	if (!new_path)
+		return -ENOMEM;
+
+	old_path = cgrp->release_agent_path;
+	rcu_assign_pointer(cgrp->release_agent_path, new_path);
+
+	if (old_path)
+		kfree_rcu(old_path, rcu_head);
+	return 0;
+}
+
 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 {
 	int ret = 0;
@@ -1331,7 +1385,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	cgroup_populate_dir(cgrp, false, added_mask);
 
 	if (opts.release_agent)
-		strcpy(root->release_agent_path, opts.release_agent);
+		ret = cgroup_set_release_agent_locked(cgrp, opts.release_agent);
  out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1474,6 +1528,7 @@ static int cgroup_test_super(struct super_block *sb, void *data)
 
 static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 {
+	int ret;
 	struct cgroupfs_root *root;
 
 	if (!opts->subsys_mask && !opts->none)
@@ -1487,13 +1542,17 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 		kfree(root);
 		return ERR_PTR(-ENOMEM);
 	}
+	if (opts->release_agent) {
+		ret = cgroup_set_release_agent_locked(&root->top_cgroup,
+			opts->release_agent);
+		if (ret)
+			return ERR_PTR(ret);
+	}
 	init_cgroup_root(root);
 
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
 	ida_init(&root->cgroup_ida);
-	if (opts->release_agent)
-		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
 	if (opts->cpuset_clone_children)
@@ -1867,6 +1926,31 @@ int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen)
 	return __cgroup_path(cgrp, buf, buflen, !ve_is_super(ve) && !ve->is_pseudosuper);
 }
 
+static inline struct cgroup *cgroup_get_local_root(struct cgroup *cgrp)
+{
+	/*
+	 * Find nearest root cgroup, which might be host cgroup root
+	 * or ve cgroup root.
+	 *
+	 *    <host_root_cgroup> -> local_root
+	 *     \                    ^
+	 *      <cgroup>            |
+	 *       \                  |
+	 *        <cgroup>   --->   from here
+	 *        \
+	 *         <ve_root_cgroup> -> local_root
+	 *         \                   ^
+	 *          <cgroup>           |
+	 *          \                  |
+	 *           <cgroup>  --->    from here
+	 */
+	lockdep_assert_held(&cgroup_mutex);
+	while (cgrp->parent && !test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		cgrp = cgrp->parent;
+
+	return cgrp;
+}
+
 /*
  * Control Group taskset
  */
@@ -2258,27 +2342,29 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
-	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
-
+	int ret;
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
 
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
-	mutex_lock(&cgroup_root_mutex);
-	strcpy(cgrp->root->release_agent_path, buffer);
-	mutex_unlock(&cgroup_root_mutex);
+	ret = cgroup_set_release_agent_locked(cgrp, buffer);
 	mutex_unlock(&cgroup_mutex);
-	return 0;
+	return ret;
 }
 
 static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 				     struct seq_file *seq)
 {
+	const char *release_agent;
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	seq_puts(seq, cgrp->root->release_agent_path);
+	rcu_read_lock();
+	release_agent = cgroup_release_agent_path(cgrp);
+	if (release_agent)
+		seq_puts(seq, release_agent);
+	rcu_read_unlock();
 	seq_putc(seq, '\n');
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -4094,7 +4180,7 @@ static struct cftype files[] = {
 	},
 	{
 		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
 		.read_seq_string = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
@@ -4223,21 +4309,52 @@ static int subgroups_count(struct cgroup *cgroup)
 	return cgrps_count;
 }
 
+static struct cftype *get_cftype_by_name(const char *name)
+{
+	struct cftype *cft;
+	for (cft = files; cft->name[0] != '\0'; cft++) {
+		if (!strcmp(cft->name, name))
+			return cft;
+	}
+	return NULL;
+}
+
+static int cgroup_add_file_on_mark_ve(struct cgroup *ve_root)
+{
+	int err;
+	struct dentry *dir = ve_root->dentry;
+	struct cftype *cft = get_cftype_by_name("release_agent");
+	BUG_ON(!cft);
+
+	mutex_lock(&dir->d_inode->i_mutex);
+	err = cgroup_add_file(ve_root, NULL, cft);
+	mutex_unlock(&dir->d_inode->i_mutex);
+	if (err) {
+		pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
+			cft->name, err);
+	}
+	return err;
+}
+
 #ifdef CONFIG_VE
-void cgroup_mark_ve_root(struct ve_struct *ve)
+int cgroup_mark_ve_root(struct ve_struct *ve)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root;
+	int err = 0;
 
 	mutex_lock(&cgroup_mutex);
 	for_each_active_root(root) {
 		cgrp = task_cgroup_from_root(ve->init_task, root);
 		set_bit(CGRP_VE_ROOT, &cgrp->flags);
-
+		err = cgroup_add_file_on_mark_ve(cgrp);
+		if (err)
+			break;
 		if (test_bit(cpu_cgroup_subsys_id, &root->subsys_mask))
 			link_ve_root_cpu_cgroup(cgrp);
 	}
 	mutex_unlock(&cgroup_mutex);
+	return err;
 }
 
 struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
@@ -4477,6 +4594,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	struct cgroup *child;
 	bool empty;
+	struct cgroup_rcu_string *release_agent;
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
@@ -4556,6 +4674,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
+	release_agent = cgrp->release_agent_path;
+	RCU_INIT_POINTER(cgrp->release_agent_path, NULL);
+	if (release_agent)
+		kfree_rcu(release_agent, rcu_head);
+
 	return 0;
 };
 
@@ -5365,7 +5488,9 @@ static void cgroup_release_agent(struct work_struct *work)
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i, err;
+		const char *release_agent;
 		char *pathbuf = NULL, *agentbuf = NULL;
+		struct cgroup *root_cgrp;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
 						    release_list);
@@ -5374,9 +5499,14 @@ static void cgroup_release_agent(struct work_struct *work)
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
-		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
+		if (__cgroup_path(cgrp, pathbuf, PAGE_SIZE, true) < 0)
 			goto continue_free;
-		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+		root_cgrp = cgroup_get_local_root(cgrp);
+		rcu_read_lock();
+		release_agent = cgroup_release_agent_path(root_cgrp);
+		if (release_agent)
+			agentbuf = kstrdup(release_agent, GFP_KERNEL);
+		rcu_read_unlock();
 		if (!agentbuf)
 			goto continue_free;
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ad3a698..a64b4a7 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -479,7 +479,7 @@ static void ve_drop_context(struct ve_struct *ve)
 
 static const struct timespec zero_time = { };
 
-extern void cgroup_mark_ve_root(struct ve_struct *ve);
+extern int cgroup_mark_ve_root(struct ve_struct *ve);
 
 /* under ve->op_sem write-lock */
 static int ve_start_container(struct ve_struct *ve)
@@ -528,7 +528,9 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err < 0)
 		goto err_iterate;
 
-	cgroup_mark_ve_root(ve);
+	err = cgroup_mark_ve_root(ve);
+	if (err)
+		goto err_mark_ve;
 
 	ve->is_running = 1;
 
@@ -538,6 +540,8 @@ static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_mark_ve:
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
 	ve_stop_umh(ve);
 err_umh:
-- 
1.8.3.1



More information about the Devel mailing list