[Devel] [PATCH RHEL9 COMMIT] ve/cgroup: Private per-cgroup-root data container

Konstantin Khorenko khorenko at virtuozzo.com
Wed Oct 20 11:43:56 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.12
------>
commit 8e4204ebcd344fd05675b06b0f490aaf31beee1a
Author: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Date:   Wed Oct 20 11:43:56 2021 +0300

    ve/cgroup: Private per-cgroup-root data container
    
    To save release agent binary path per-ve we need to have some place to
    store it, ve root cgroups are already linked to ve via cgroup->ve_owner
    and ve->ve_ns->cgroup_ns->root_cset, so we can put release agent data to
    ve. Also we need to have one path per cgroup_root as it previously was
    for host, so we put paths to list.
    
    One should use ve_ra_data_get_path_locked under rcu_read_lock to get
    consistent release agent data associated with ve+cgroot pair, as it's
    freed under call_rcu. To change release agent path ve_ra_data_set should
    be used which takes care about proper locking.
    
    Short history of the patch:
    
    https://jira.sw.ru/browse/PSBM-83887
    Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
    
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    (cherry picked from vz7 commit 4a98f07102fd248ad4218a07b5ec5ec90da10288)
    +++
    cgroup: Add rcu node string wrapper for in-cgroup usage
    (cherry picked from vz7 commit e828803a5d776125c9c329f194aff74fb4ec181a)
    https://jira.sw.ru/browse/PSBM-108270
    (cherry picked from vz8 commit b013035950978e64b0134774f62088dd583d0969)
    (cherry picked from vz8 commit 02a34b97095633d5213f7562069ab9ff6a29baf1)
    +++
    cgroup/ve: Pass cgroup_root to ve_set(get)_release_agent
    (cherry picked from vz8 commit b2b3c2a73206273257383d8b7dcc98057cd92768)
    
    vz9 changes:
    - rework it quite a lot
    - rename per_cgroot_data to ve_ra_data as we use it for release agent
      only and it would help simplifying code a bit
    - don't allocate release agent string separately but put it into data
      and protect the data itself with rcu, this way we can fix the race we
      had on vz8 where we get data from list under lock, then release the
      lock and data can be freed under us by ve stopping so we have
      potential use after free
    - merge replacing cgroup -> cgroup_root in per_cgroot_data
    - on cgroup_kill_sb cleanup ra data for all online ve cgroups (not just
      ve0, we can have stopped ve cgroups with data still set), lifetime of
      ra data is now while ve cgroup directory exists and while cgroup root
      exits
    - move cgroot_ve_cleanup_ra_data to cgroup_destroy_root where we
      actually don't need root data anymore
    
    https://jira.sw.ru/browse/PSBM-134002
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    Reviewed-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 include/linux/ve.h     |  14 +++++
 kernel/cgroup/cgroup.c |   1 +
 kernel/ve/ve.c         | 135 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 150 insertions(+)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index 25dee1b4d4e9..79717884d2b8 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -117,6 +117,13 @@ struct ve_struct {
 	struct workqueue_struct	*wq;
 	struct work_struct	release_agent_work;
 
+	/*
+	 * List of data, private for each root cgroup in
+	 * ve's css_set.
+	 */
+	struct list_head	ra_data_list;
+	spinlock_t		ra_data_lock;
+
 	struct vfsmount		*devtmpfs_mnt;
 };
 
@@ -143,6 +150,13 @@ extern unsigned int sysctl_ve_mount_nr;
 #ifdef CONFIG_VE
 void ve_add_to_release_list(struct cgroup *cgrp);
 void ve_rm_from_release_list(struct cgroup *cgrp);
+
+const char *ve_ra_data_get_path_locked(struct ve_struct *ve,
+				       struct cgroup_root *cgroot);
+int ve_ra_data_set(struct ve_struct *ve, struct cgroup_root *cgroot,
+		   const char *release_agent);
+void cgroot_ve_cleanup_ra_data(struct cgroup_root *cgroot);
+
 extern struct ve_struct *get_ve(struct ve_struct *ve);
 extern void put_ve(struct ve_struct *ve);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index c0c92a4cf791..1fcb3041abb7 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1392,6 +1392,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 
 	cgroup_rstat_exit(cgrp);
 	kernfs_destroy_root(root->kf_root);
+	cgroot_ve_cleanup_ra_data(root);
 	cgroup_free_root(root);
 }
 
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 9c593e93a600..557a14f216c4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -40,6 +40,16 @@
 #include "../fs/mount.h"
 #include "../cgroup/cgroup-internal.h" /* For cgroup_task_count() */
 
+struct ve_ra_data {
+	struct list_head list;
+	struct rcu_head rcu;
+	/*
+	 * data is related to this cgroup
+	 */
+	struct cgroup_root *cgroot;
+	char *release_agent_path;
+};
+
 extern struct kmapset_set sysfs_ve_perms_set;
 
 static struct kmem_cache *ve_cachep;
@@ -79,6 +89,8 @@ struct ve_struct ve0 = {
 	.release_list		= LIST_HEAD_INIT(ve0.release_list),
 	.release_agent_work	= __WORK_INITIALIZER(ve0.release_agent_work,
 					cgroup1_release_agent),
+	.ra_data_list	= LIST_HEAD_INIT(ve0.ra_data_list),
+	.ra_data_lock	= __SPIN_LOCK_UNLOCKED(ve0.ra_data_lock),
 };
 EXPORT_SYMBOL(ve0);
 
@@ -281,6 +293,125 @@ int nr_threads_ve(struct ve_struct *ve)
 }
 EXPORT_SYMBOL(nr_threads_ve);
 
+static struct ve_ra_data *alloc_ve_ra_data(struct cgroup_root *cgroot,
+					   const char *str)
+{
+	struct ve_ra_data *data;
+	size_t buflen;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	/* Don't allow more than page */
+	buflen = min(strlen(str) + 1, PAGE_SIZE);
+
+	data->release_agent_path = kmalloc(buflen, GFP_KERNEL);
+	if (!data->release_agent_path) {
+		kfree(data);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	INIT_LIST_HEAD(&data->list);
+	data->cgroot = cgroot;
+
+	if (strlcpy(data->release_agent_path, str, buflen) >= buflen) {
+		kfree(data->release_agent_path);
+		kfree(data);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return data;
+}
+
+static void free_ve_ra_data(struct rcu_head *head)
+{
+	struct ve_ra_data *data = container_of(head, struct ve_ra_data, rcu);
+
+	kfree(data->release_agent_path);
+	kfree(data);
+}
+
+/*
+ * Either rcu_read_lock or ve->ra_data_lock
+ * should be held so that data is not freed under us.
+ */
+static struct ve_ra_data *ve_ra_data_find_locked(struct ve_struct *ve,
+						 struct cgroup_root *cgroot)
+{
+	struct list_head *ve_ra_data_list = &ve->ra_data_list;
+	struct ve_ra_data *data;
+
+	list_for_each_entry_rcu(data, ve_ra_data_list, list) {
+		if (data->cgroot == cgroot)
+			return data;
+	}
+
+	return NULL;
+}
+
+const char *ve_ra_data_get_path_locked(struct ve_struct *ve,
+				       struct cgroup_root *cgroot)
+{
+	struct ve_ra_data *data;
+
+	data = ve_ra_data_find_locked(ve, cgroot);
+
+	return data ? data->release_agent_path : NULL;
+}
+
+int ve_ra_data_set(struct ve_struct *ve, struct cgroup_root *cgroot,
+		   const char *release_agent)
+{
+	struct ve_ra_data *data, *other_data;
+	unsigned long flags;
+
+	data = alloc_ve_ra_data(cgroot, release_agent);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	spin_lock_irqsave(&ve->ra_data_lock, flags);
+	other_data =  ve_ra_data_find_locked(ve, cgroot);
+	if (other_data) {
+		list_del_rcu(&other_data->list);
+		call_rcu(&other_data->rcu, free_ve_ra_data);
+	}
+
+	list_add_rcu(&data->list, &ve->ra_data_list);
+	spin_unlock_irqrestore(&ve->ra_data_lock, flags);
+
+	return 0;
+}
+
+static void ve_cleanup_ra_data(struct ve_struct *ve, struct cgroup_root *cgroot)
+{
+	struct ve_ra_data *data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ve->ra_data_lock, flags);
+	list_for_each_entry_rcu(data, &ve->ra_data_list, list) {
+		if (cgroot && data->cgroot != cgroot)
+			continue;
+
+		list_del_rcu(&data->list);
+		call_rcu(&data->rcu, free_ve_ra_data);
+	}
+	spin_unlock_irqrestore(&ve->ra_data_lock, flags);
+}
+
+void cgroot_ve_cleanup_ra_data(struct cgroup_root *cgroot)
+{
+	struct cgroup_subsys_state *css;
+	struct ve_struct *ve;
+
+	rcu_read_lock();
+	css_for_each_descendant_pre(css, &ve0.css) {
+		ve = css_to_ve(css);
+		ve_cleanup_ra_data(ve, cgroot);
+	}
+	rcu_read_unlock();
+}
+
 struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id)
 {
 	struct cgroup_subsys_state *css;
@@ -792,6 +923,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	INIT_WORK(&ve->release_agent_work, cgroup1_release_agent);
 	spin_lock_init(&ve->release_list_lock);
 	INIT_LIST_HEAD(&ve->release_list);
+	spin_lock_init(&ve->ra_data_lock);
+	INIT_LIST_HEAD(&ve->ra_data_list);
 
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
@@ -882,6 +1015,8 @@ static void ve_offline(struct cgroup_subsys_state *css)
 
 	kfree(ve->ve_name);
 	ve->ve_name = NULL;
+
+	ve_cleanup_ra_data(ve, NULL);
 }
 
 static void ve_devmnt_free(struct ve_devmnt *devmnt)


More information about the Devel mailing list