[Devel] [PATCH VZ8 v3 11/14] ve/cgroup: added release_agent to each container root cgroup.

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Tue Feb 16 11:44:33 MSK 2021


Each container will now have access to its own cgroup release_agent
file.
Creation:
  Normally all cgroup files are created during a call to cgroup_create,
  by the cgroup_populate_dir function. It decides once which cgroup
  files to create, and they immediately become visible to userspace as
  filesystem objects.
  Due to the specifics of the container creation process, it is not
  possible to use the same code for 'release_agent' file creation. For
  a VE to start operating, a set of ordinary cgroups is first created
  for each subsystem, and only then are these newly created cgroups
  converted to "virtual roots". So at the time cgroup_create is
  executed, there is no way to know whether or not the "release_agent"
  file should be created. This information only becomes available at
  the "conversion" step, i.e. the 'cgroup_mark_ve_roots' function. As
  the file is created dynamically in a live cgroup, a rather delicate
  locking sequence is required in the new code:
    - each new "virtual root" cgroup has to gain a "release_agent" file,
      so each cgroup's directory needs to be locked during the
      insertion via cgroup->dentry->d_inode->i_mutex.
    - d_inode->i_mutex has an ordering dependency with cgroup_mutex
      (see cgroup_mount/cgroup_remount). They cannot be locked in the
      order {lock(cgroup_mutex), lock(inode->i_mutex)}.
    - to collect the list of cgroups that need to become virtual roots,
      we need cgroup_mutex to iterate the active roots.
    - to resolve the above conflict we first collect the list of all
      virtual root cgroups under cgroup_mutex, release it, and only
      then insert "release_agent" into each root under
      inode->i_mutex.
    - to collect that list of cgroups on the stack we use
      cgroup->cft_q_node, added specifically for this purpose and
      protected by its own cgroup_cft_mutex (see the sketch below).
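
  A rough sketch of that collect-then-insert ordering (a simplified
  illustration only; helper names like collect_ve_root_cgroups and
  add_release_agent_file are made up for clarity, and the actual VZ8
  code in the diff below uses css_set_lock and kernfs rather than
  i_mutex):

      LIST_HEAD(pending);
      struct cgroup *cgrp, *tmp;

      /* Phase 1: collect candidate roots while the cgroup locks are held. */
      mutex_lock(&cgroup_cft_mutex);
      mutex_lock(&cgroup_mutex);
      collect_ve_root_cgroups(ve, &pending);   /* queues via cgrp->cft_q_node */
      mutex_unlock(&cgroup_mutex);

      /* Phase 2: insert "release_agent" under each root's i_mutex only;
       * cgroup_mutex is already dropped, so the i_mutex vs cgroup_mutex
       * ordering used by cgroup_mount/cgroup_remount is never violated. */
      list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
              mutex_lock(&cgrp->dentry->d_inode->i_mutex);
              add_release_agent_file(cgrp);
              mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
              list_del_init(&cgrp->cft_q_node);
      }
      mutex_unlock(&cgroup_cft_mutex);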

Destruction:
  Destruction is done in the reverse order of the above, within
  cgroup_unmark_ve_roots.
  After the file is removed we must also reject further writes through
  file descriptors that were opened before the VE and its cgroups were
  destroyed. This is achieved by checking whether the cgroup passed to
  cgroup_file_write is still a host or virtual root. A condensed view
  of that check follows below.
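
  A condensed view of the write-side check (taken from the
  cgroup_release_agent_write() hunk in this patch):

      cgrp = cgroup_kn_lock_live(of->kn, false);
      if (!cgrp)
              return -ENODEV;

      /* Once cgroup_unmark_ve_roots() has run, this cgroup is no longer
       * its own local root, so late writers are turned away. */
      root_cgrp = cgroup_get_local_root(cgrp);
      if (root_cgrp != cgrp) {
              ret = -EPERM;
              goto out;
      }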

https://jira.sw.ru/browse/PSBM-83887

Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
+++
cgroup: add missing dput() in cgroup_unmark_ve_roots()

cgroup_unmark_ve_roots() calls dget() on the cgroup's dentry but does
not have the corresponding dput() call. This leads to leaked cgroups.

Add missing dput() to fix this.
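
The pattern being restored (illustrative only; this pairing belongs to
the older dentry-based variant of the patch, while the kernfs-based
VZ8 code below does not take dentry references):

    /* cgroup_mark_ve_roots(): pin the directory dentry of the VE root */
    dget(cgrp->dentry);

    /* cgroup_unmark_ve_roots(): drop the reference taken at mark time */
    dput(cgrp->dentry);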

https://jira.sw.ru/browse/PSBM-107328
Fixes: 1ac69e183447 ("ve/cgroup: added release_agent to each container root cgroup.")

(Cherry-picked from 4a1635024df1bae4f4809a3bc445f0cf64d4acf4)
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 include/linux/cgroup-defs.h     |   3 +
 include/linux/cgroup.h          |   2 +-
 include/linux/ve.h              |   4 +-
 kernel/cgroup/cgroup-internal.h |   1 +
 kernel/cgroup/cgroup-v1.c       |  37 +++++++++-
 kernel/cgroup/cgroup.c          | 127 ++++++++++++++++++++++++++------
 kernel/ve/ve.c                  |  42 ++++++++---
 7 files changed, 179 insertions(+), 37 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 57ee48874404..be7d0f599179 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -451,6 +451,9 @@ struct cgroup {
 	 */
 	struct list_head cset_links;
 
+	/* Used by cgroup_mark_ve_roots/cgroup_unmark_ve_roots */
+	struct list_head cft_q_node;
+
 	/*
 	 * Linked list running through all cgroups that can
 	 * potentially be reaped by the release agent. Protected by
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6693cd36fd82..8f0d057abb25 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -900,7 +900,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 void cgroup1_release_agent(struct work_struct *work);
 
 #ifdef CONFIG_VE
-extern void cgroup_mark_ve_root(struct ve_struct *ve);
+int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
 struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 65c19f2b9b98..7cef4b39847e 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -145,11 +145,13 @@ extern int nr_ve;
 void ve_add_to_release_list(struct cgroup *cgrp);
 void ve_rm_from_release_list(struct cgroup *cgrp);
 
-int ve_set_release_agent_path(struct ve_struct *ve, struct cgroup *cgroot,
+int ve_set_release_agent_path(struct cgroup *cgroot,
 	const char *release_agent);
 
 const char *ve_get_release_agent_path(struct cgroup *cgrp_root);
 
+void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp);
+
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
 
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index be0cd157d4dc..112bd917e99d 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -227,6 +227,7 @@ extern const struct proc_ns_operations cgroupns_operations;
  */
 extern struct cftype cgroup1_base_files[];
 extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
+struct cftype *get_cftype_by_name(const char *name);
 
 int proc_cgroupstats_show(struct seq_file *m, void *v);
 bool cgroup1_ssid_disabled(int ssid);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 31585ab907a3..46be2f688503 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -557,13 +557,34 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
 	struct cgroup *root_cgrp;
 
 	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
+	/*
+	 * The call to cgroup_get_local_root is a way to make sure
+	 * that the cgrp passed in is a valid "virtual root" cgroup.
+	 * If it is not, it means that cgroup_unmark_ve_roots has
+	 * already cleared the CGRP_VE_ROOT flag from this cgroup
+	 * while the file was still open in another process, and we
+	 * should not allow further writes via that file.
+	 */
 	root_cgrp = cgroup_get_local_root(cgrp);
 	BUG_ON(!root_cgrp);
+
+	if (root_cgrp != cgrp) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	if (root_cgrp->ve_owner)
-		ret = ve_set_release_agent_path(root_cgrp, buffer);
+		ret = ve_set_release_agent_path(root_cgrp, strstrip(buf));
 	else
 		ret = -ENODEV;
 
+	if (!ret)
+		ret = nbytes;
+
+out:
 	cgroup_kn_unlock(of->kn);
 	return ret;
 }
@@ -680,7 +701,7 @@ struct cftype cgroup1_base_files[] = {
 	},
 	{
 		.name = "release_agent",
-		.flags = CFTYPE_ONLY_ON_ROOT,
+		.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_VE_WRITABLE,
 		.seq_show = cgroup_release_agent_show,
 		.write = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX - 1,
@@ -693,6 +714,18 @@ struct cftype cgroup1_base_files[] = {
 	{ }	/* terminate */
 };
 
+struct cftype *get_cftype_by_name(const char *name)
+{
+	struct cftype *cft;
+
+	for (cft = cgroup1_base_files; cft->name[0] != '\0'; cft++) {
+		if (!strcmp(cft->name, name))
+			return cft;
+	}
+	return NULL;
+}
+
+
 #define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
 
 /* Display information about each subsystem and each hierarchy */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7b4d303b9d7e..8aea78f07b5b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -70,6 +70,8 @@
 /* let's not notify more than 100 times per second */
 #define CGROUP_FILE_NOTIFY_MIN_INTV	DIV_ROUND_UP(HZ, 100)
 
+#define CGROUP_FILENAME_RELEASE_AGENT "release_agent"
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -1921,55 +1923,137 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 	return 0;
 }
 
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+			   struct cftype *cft, bool activate);
+
 #ifdef CONFIG_VE
-void cgroup_mark_ve_root(struct ve_struct *ve)
+int cgroup_mark_ve_roots(struct ve_struct *ve)
 {
+	int err = 0;
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
-	struct cgroup *cgrp;
+	struct cgroup *cgrp, *tmp;
+	struct cftype *cft;
+	struct cgroup_subsys_state *cpu_css;
+	LIST_HEAD(pending);
 
-	spin_lock_irq(&css_set_lock);
+	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
+	BUG_ON(!cft);
 
+	/*
+	 * Locking scheme:
+	 * collect the right cgroups into a list under the locks below
+	 * and act on them later without these locks:
+	 * css_set_lock - to iterate cset->cgrp_links
+	 *
+	 * cgroup_add_file cannot be called under the css_set_lock
+	 * spinlock, because it eventually calls __kernfs_new_node,
+	 * which performs allocations.
+	 */
+
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
+
 	cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
-	if (WARN_ON(!cset))
-		goto unlock;
+	if (WARN_ON(!cset)) {
+		rcu_read_unlock();
+		return -ENODEV;
+	}
 
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		cgrp = link->cgrp;
-		rcu_assign_pointer(cgrp->ve_owner, ve);
-		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+
+		/*
+		 * At container start, vzctl creates special cgroups to serve
+		 * as virtualized cgroup roots. They are bind-mounted on top
+		 * of the original cgroup mount point in the container
+		 * namespace. Not all cgroup mounts undergo this procedure,
+		 * so skip cgroup mounts that are not virtualized.
+		 */
+		if (!is_virtualized_cgroup(cgrp))
+			continue;
+
+		list_add_tail(&cgrp->cft_q_node, &pending);
 	}
-	link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);
-unlock:
+	cpu_css = cset->subsys[cpu_cgrp_id];
 	rcu_read_unlock();
-
 	spin_unlock_irq(&css_set_lock);
+
+	/*
+	 * Reads of cgrp->ve_owner are protected by RCU.
+	 * Writes to cgrp->ve_owner are protected by ve->op_sem, taken
+	 * by the caller. The only race that would require cgroup_mutex
+	 * is two separate VEs having the same cgroups in their css
+	 * sets, but that should be prevented elsewhere.
+	 */
+	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
+		rcu_assign_pointer(cgrp->ve_owner, ve);
+		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+
+		if (!cgroup_is_dead(cgrp)) {
+			err = cgroup_add_file(NULL, cgrp, cft, true);
+			if (err) {
+				pr_warn("failed to add file to VE_ROOT cgroup,"
+					" err:%d\n", err);
+				break;
+			}
+		}
+		list_del_init(&cgrp->cft_q_node);
+	}
+	link_ve_root_cpu_cgroup(cpu_css);
 	synchronize_rcu();
+
+	if (err) {
+		pr_warn("failed to mark cgroups as VE_ROOT,"
+			" err:%d, falling back\n", err);
+		cgroup_unmark_ve_roots(ve);
+	}
+	return err;
 }
 
 void cgroup_unmark_ve_roots(struct ve_struct *ve)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
-	struct cgroup *cgrp;
+	struct cgroup *cgrp, *tmp;
+	struct cftype *cft;
+	LIST_HEAD(pending);
+
+	cft = get_cftype_by_name(CGROUP_FILENAME_RELEASE_AGENT);
 
 	spin_lock_irq(&css_set_lock);
 
 	rcu_read_lock();
 	cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
-	if (WARN_ON(!cset))
-		goto unlock;
+	if (WARN_ON(!cset)) {
+		rcu_read_unlock();
+		spin_unlock_irq(&css_set_lock);
+		return;
+	}
 
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		cgrp = link->cgrp;
-		rcu_assign_pointer(cgrp->ve_owner, NULL);
-		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+
+		/*
+		 * For this line see comments in
+		 * cgroup_mark_ve_roots
+		 */
+		if (!is_virtualized_cgroup(cgrp))
+			continue;
+
+		BUG_ON(cgrp->ve_owner != ve);
+
+		list_add_tail(&cgrp->cft_q_node, &pending);
 	}
-unlock:
 	rcu_read_unlock();
-
 	spin_unlock_irq(&css_set_lock);
+
+	list_for_each_entry_safe(cgrp, tmp, &pending, cft_q_node) {
+		kernfs_remove_by_name(cgrp->kn, CGROUP_FILENAME_RELEASE_AGENT);
+		rcu_assign_pointer(cgrp->ve_owner, NULL);
+		clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+		list_del_init(&cgrp->cft_q_node);
+	}
 	/* ve_owner == NULL will be visible */
 	synchronize_rcu();
 
@@ -2022,6 +2106,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->self.children);
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
+	INIT_LIST_HEAD(&cgrp->cft_q_node);
+	INIT_LIST_HEAD(&cgrp->release_list);
 	mutex_init(&cgrp->pidlist_mutex);
 	cgrp->self.cgroup = cgrp;
 	cgrp->self.flags |= CSS_ONLINE;
@@ -2048,10 +2134,8 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts)
 	idr_init(&root->cgroup_idr);
 
 	root->flags = opts->flags;
-	if (opts.release_agent) {
-		ret = ve_set_release_agent_path(root_cgrp,
-			opts.release_agent);
-	}
+	if (opts->release_agent)
+		ve_set_release_agent_path(cgrp, opts->release_agent);
 
 	if (opts->name)
 		strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN);
@@ -2259,6 +2343,7 @@ static void cgroup_kill_sb(struct super_block *sb)
 		percpu_ref_kill(&root->cgrp.self.refcnt);
 	cgroup_put(&root->cgrp);
 	kernfs_kill_sb(sb);
+	ve_cleanup_per_cgroot_data(NULL, &root->cgrp);
 }
 
 struct file_system_type cgroup_fs_type = {
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index d704c77f9bc7..e8945e10e070 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -311,12 +311,13 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot)
 	struct per_cgroot_data *data;
 	struct cgroup_rcu_string *str;
 	struct ve_struct *ve;
+	unsigned long flags;
 
 	ve = rcu_dereference(cgroot->ve_owner);
 	if (!ve)
 		return NULL;
 
-	raw_spin_lock(&ve->per_cgroot_list_lock);
+	spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
 
 	data = per_cgroot_data_find_locked(&ve->per_cgroot_list, cgroot);
 	if (data) {
@@ -324,7 +325,7 @@ const char *ve_get_release_agent_path(struct cgroup *cgroot)
 		if (str)
 			result = str->val;
 	}
-	raw_spin_unlock(&ve->per_cgroot_list_lock);
+	spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
 	return result;
 }
 
@@ -638,7 +639,9 @@ static int ve_start_container(struct ve_struct *ve)
 	if (err < 0)
 		goto err_iterate;
 
-	cgroup_mark_ve_root(ve);
+	err = cgroup_mark_ve_roots(ve);
+	if (err)
+		goto err_mark_ve;
 
 	ve->is_running = 1;
 
@@ -648,6 +651,8 @@ static int ve_start_container(struct ve_struct *ve)
 
 	return 0;
 
+err_mark_ve:
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
 err_iterate:
 	ve_workqueue_stop(ve);
 err_workqueue:
@@ -662,22 +667,35 @@ static int ve_start_container(struct ve_struct *ve)
 	return err;
 }
 
-static void ve_per_cgroot_free(struct ve_struct *ve)
+static inline void per_cgroot_data_free(struct per_cgroot_data *data)
+{
+	struct cgroup_rcu_string *release_agent = data->release_agent_path;
+
+	RCU_INIT_POINTER(data->release_agent_path, NULL);
+	if (release_agent)
+		kfree_rcu(release_agent, rcu_head);
+	kfree(data);
+}
+
+void ve_cleanup_per_cgroot_data(struct ve_struct *ve, struct cgroup *cgrp)
 {
 	struct per_cgroot_data *data, *saved;
 	unsigned long flags;
-	struct cgroup_rcu_string *release_agent;
+
+	BUG_ON(!ve && !cgrp);
+	rcu_read_lock();
+	if (!ve)
+		ve = cgroup_get_ve_owner(cgrp);
 
 	spin_lock_irqsave(&ve->per_cgroot_list_lock, flags);
 	list_for_each_entry_safe(data, saved, &ve->per_cgroot_list, list) {
-		release_agent = data->release_agent_path;
-		RCU_INIT_POINTER(data->release_agent_path, NULL);
-		if (release_agent)
-			kfree_rcu(release_agent, rcu_head);
-		list_del_init(&data->list);
-		kfree(data);
+		if (!cgrp || data->cgroot == cgrp) {
+			list_del_init(&data->list);
+			per_cgroot_data_free(data);
+		}
 	}
 	spin_unlock_irqrestore(&ve->per_cgroot_list_lock, flags);
+	rcu_read_unlock();
 }
 
 void ve_stop_ns(struct pid_namespace *pid_ns)
@@ -736,7 +754,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 
 	ve_workqueue_stop(ve);
 
-	ve_per_cgroot_free(ve);
+	ve_cleanup_per_cgroot_data(ve, NULL);
 
 	/*
 	 * At this point all userspace tasks in container are dead.
-- 
2.27.0


