[Devel] [PATCH RHEL7 COMMIT] ve/cgroups: Drop virtualization code, v5

Thu May 7 14:44:15 PDT 2015

The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.4.11
------>
commit 7f018a3da1e259083cbd37dc5f1198b17c775f7b
Author: Cyrill Gorcunov <gorcunov at odin.com>
Date:   Fri May 8 01:44:15 2015 +0400

    ve/cgroups: Drop virtualization code, v5
    
    Here we rip out all the virtualization code we introduced into the kernel to
    behave close to rhel6.
    
    Because we're trying a new concept (bindmounting from the node) it is
    no longer needed.
    
    Now some details:
    
     - drop cgroup_show_path -- we don't hide VEID in /proc/self/cgroup output,
       it doesn't break criu so no need to carry it, same applies to changes
       in cgroup_path;
    
     - because we drop virtualization of systemd -- disable creation of new
       hierarchies in container: we don't support it, nor do we need it. The
       primary reason why we allowed new hierarchies in container was that
       CRIU used to run the restore procedure inside VE, but now we initiate
       restore from VE0, so we can safely disable new hierarchies;
    
     - in cgroup_addrm_files go back to former RHEL7 code; if we need something
       special here it must be reviewed carefully and separately;
    
     - no need to hide /proc/cgroups in VE, there is no sensitive info present.
    
    v2:
     - take into account commits 38f039db6e023ac14517219ad6f674633c4e99ca
       and c2ac6df22b20389ae2d0af49c436b00ff3243e89 removing cgroup_is_disposable,
       cgroup_kernel_destroy, ve::ve_cgroup_head.
    
     - drop CGRP_WEAK, CGRP_SELF_DESTRUCTION and CGRP_VE_TOP_CGROUP_VIRTUAL flags,
       which implies the cgroups are no longer auto-cleaned up but user-space tools
       (read vzctl and friends) should handle cgroups removal
    
     - because we're moving to native cgroups code we don't virtualize release
       agent anymore
    
     - still cgroup::cgroup_ve member is needed because we're using it
       all over the code
    
    v3:
     - move back ve_offline, we need to free ve id
    
    v4:
    - use native call_usermodehelper in release_agent execution, we don't
      virtualize cgroups, but I kept error code and pr_warn so it would
      be easier to identify problems if any ever occur
    - drop cgroup::cgroup_ve member, no longer used
    - drop unused cgroup_kernel_destroy
    
    v5:
     - disable mounting of cgroups inside VE
     - disable modifying toplevel bindmount cgroup
       files from inside of container, except ve cgroup,
       where we need to write "START" to kick container to
       run (probably we will need more control here for
       "restore" via CRIU case, haven't investigated it
       yet)
     - drop redundant @cgrp from ve_offline
    
    Signed-off-by: Cyrill Gorcunov <gorcunov at odin.com>
    Acked-by: Vladimir Davydov <vdavydov at odin.com>
    
    CC: Konstantin Khorenko <khorenko at odin.com>
    CC: Pavel Emelyanov <xemul at odin.com>
    CC: Andrey Vagin <avagin at odin.com>
---
 include/linux/cgroup.h  |  17 +---
 include/linux/ve.h      |   1 -
 kernel/bc/beancounter.c |  11 +--
 kernel/cgroup.c         | 252 +++++++++++-------------------------------------
 kernel/ve/ve.c          |  10 --
 kernel/ve/vecalls.c     |   6 +-
 6 files changed, 63 insertions(+), 234 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f6c6105..a7b6941 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -46,7 +46,6 @@ struct cgroup_sb_opts {
 enum cgroup_open_flags {
 	CGRP_CREAT	= 0x0001,	/* create if not found */
 	CGRP_EXCL	= 0x0002,	/* fail if already exist */
-	CGRP_WEAK	= 0x0004,	/* arm cgroup self-destruction */
 };
 
 struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts);
@@ -56,7 +55,6 @@ struct cgroup *cgroup_kernel_open(struct cgroup *parent,
 int cgroup_kernel_remove(struct cgroup *parent, const char *name);
 int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk);
 void cgroup_kernel_close(struct cgroup *cgrp);
-void cgroup_kernel_destroy(struct cgroup *cgrp);
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
@@ -190,10 +188,6 @@ enum {
 	CGRP_CPUSET_CLONE_CHILDREN,
 	/* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
 	CGRP_SANE_BEHAVIOR,
-	CGRP_SELF_DESTRUCTION,
-
-	/* container virtualization */
-	CGRP_VE_TOP_CGROUP_VIRTUAL,
 };
 
 struct cgroup_name {
@@ -241,13 +235,6 @@ struct cgroup {
 
 	struct cgroupfs_root *root;
 
-	/* The path to use for release notifications. */
-	char *release_agent;
-
-	/* Owner VE for fake cgroup hierarchy */
-	struct ve_struct *cgroup_ve;
-	struct list_head cgroup_ve_list;
-
 	/*
 	 * List of cg_cgroup_links pointing at css_sets with
 	 * tasks in this cgroup. Protected by css_set_lock
@@ -325,7 +312,6 @@ enum {
 
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-	CGRP_ROOT_VIRTUAL	= (1 << 3), /* VE-based root-cgroup virtualization */
 };
 
 /*
@@ -369,6 +355,9 @@ struct cgroupfs_root {
 	/* IDs for cgroups in this hierarchy */
 	struct ida cgroup_ida;
 
+	/* The path to use for release notifications. */
+	char release_agent_path[PATH_MAX];
+
 	/* The name for this hierarchy - may be empty */
 	char name[MAX_CGROUP_ROOT_NAMELEN];
 };
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 833a731..03e90e4 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -124,7 +124,6 @@ struct ve_struct {
 	struct mutex		sync_mutex;
 
 	struct kmapset_key	ve_sysfs_perms;
-	struct list_head	ve_cgroup_head;
 #if IS_ENABLED(CONFIG_DEVTMPFS)
 	struct path		devtmpfs_root;
 #endif
diff --git a/kernel/bc/beancounter.c b/kernel/bc/beancounter.c
index 76936e0..28dfe43 100644
--- a/kernel/bc/beancounter.c
+++ b/kernel/bc/beancounter.c
@@ -109,8 +109,7 @@ static int ub_mem_cgroup_attach_task(struct user_beancounter *ub,
 	struct cgroup *cg;
 	int ret;
 
-	cg = cgroup_kernel_open(mem_cgroup_root,
-				CGRP_CREAT|CGRP_WEAK, ub->ub_name);
+	cg = cgroup_kernel_open(mem_cgroup_root, CGRP_CREAT, ub->ub_name);
 	if (IS_ERR(cg))
 		return PTR_ERR(cg);
 
@@ -134,8 +133,7 @@ static int ub_blkio_cgroup_attach_task(struct user_beancounter *ub,
 	if (!ubc_ioprio)
 		return 0;
 
-	cg = cgroup_kernel_open(blkio_cgroup_root,
-				CGRP_CREAT|CGRP_WEAK, ub->ub_name);
+	cg = cgroup_kernel_open(blkio_cgroup_root, CGRP_CREAT, ub->ub_name);
 	if (IS_ERR(cg))
 		return PTR_ERR(cg);
 
@@ -152,8 +150,7 @@ static int ub_cgroup_attach_task(struct user_beancounter *ub,
 	int ret;
 
 	if (ub != get_ub0()) {
-		cg = cgroup_kernel_open(ub_cgroup_root,
-					CGRP_CREAT|CGRP_WEAK, ub->ub_name);
+		cg = cgroup_kernel_open(ub_cgroup_root, CGRP_CREAT, ub->ub_name);
 		if (IS_ERR(cg))
 			return PTR_ERR(cg);
 	} else
@@ -398,7 +395,7 @@ struct user_beancounter *get_beancounter_by_name(const char *name, int create)
 		if (IS_ERR(cg))
 			return NULL;
 		if (!cg) {
-			cg = cgroup_kernel_open(ub_cgroup_root, CGRP_CREAT|CGRP_WEAK, name);
+			cg = cgroup_kernel_open(ub_cgroup_root, CGRP_CREAT, name);
 			if (IS_ERR_OR_NULL(cg))
 				return NULL;
 			pr_warn_once("Allocating UB with syslimits is deprecated!\n");
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 396365e..f881f69 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -256,19 +256,12 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 }
 EXPORT_SYMBOL_GPL(cgroup_is_descendant);
 
-static int cgroup_is_disposable(const struct cgroup *cgrp)
-{
-	return (cgrp->flags & ((1 << CGRP_NOTIFY_ON_RELEASE) |
-				(1 << CGRP_SELF_DESTRUCTION))) > 0;
-}
-
 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
 	const int bits =
 		(1 << CGRP_RELEASABLE) |
-		(1 << CGRP_NOTIFY_ON_RELEASE) |
-		(1 << CGRP_SELF_DESTRUCTION);
-	return (cgrp->flags & bits) > (1 << CGRP_RELEASABLE);
+		(1 << CGRP_NOTIFY_ON_RELEASE);
+	return (cgrp->flags & bits) == bits;
 }
 
 static int notify_on_release(const struct cgroup *cgrp)
@@ -422,7 +415,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 		 */
 		rcu_read_lock();
 		if (atomic_dec_and_test(&cgrp->count) &&
-		    cgroup_is_disposable(cgrp)) {
+		    notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -854,7 +847,6 @@ static void cgroup_free_fn(struct work_struct *work)
 		ss->css_free(cgrp);
 
 	cgrp->root->number_of_cgroups--;
-	kfree(cgrp->release_agent);
 	mutex_unlock(&cgroup_mutex);
 
 	/*
@@ -1095,7 +1087,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 {
 	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
-	struct cgroup *top_cgrp;
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_root_mutex);
@@ -1108,14 +1099,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (root->flags & CGRP_ROOT_XATTR)
 		seq_puts(seq, ",xattr");
 
-	/* bindmount to attribute file? */
-	if (!S_ISDIR(dentry->d_inode->i_mode))
-		dentry = dentry->d_parent;
-	top_cgrp = dentry->d_fsdata;
-	/* release_agent is stored on top cgroup */
-	top_cgrp = &top_cgrp->root->top_cgroup;
-	if (top_cgrp->release_agent)
-		seq_printf(seq, ",release_agent=%s", top_cgrp->release_agent);
+	if (strlen(root->release_agent_path))
+		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
@@ -1125,18 +1110,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	return 0;
 }
 
-static int cgroup_show_path(struct seq_file *m, struct dentry *root)
-{
-	struct ve_struct *ve = get_exec_env();
-	struct cgroup *cgrp = __d_cgrp(root);
-
-	if (!ve_is_super(ve) && test_bit(CGRP_VE_TOP_CGROUP_VIRTUAL, &cgrp->flags))
-		seq_puts(m, "/");
-	else
-		seq_dentry(m, root, " \t\n\\");
-	return 0;
-}
-
 /*
  * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
  * with cgroup_mutex held to protect the subsys[] array. This function takes
@@ -1299,26 +1272,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	if (!opts->subsys_mask && !opts->name)
 		return -EINVAL;
 
-	/* virtualize 'systemd' hierarchy */
-	if (!ve_is_super(get_exec_env()) && !opts->subsys_mask && opts->name && !strcmp(opts->name, "systemd"))
-		set_bit(CGRP_ROOT_VIRTUAL, &opts->flags);
-
-	/* forbid non-virtualized hierarchies in containers */
-	if (!ve_is_super(get_exec_env()) && !test_bit(CGRP_ROOT_VIRTUAL, &opts->flags)) {
-		WARN_ONCE(1, "Allow non-virtualized hierarchies for CRIU sake\n");
-		/*
-		 * FIXME
-		 *
-		 * We need to somehow limit this ability for CRIU only, because
-		 * we've to run restore procedure from inside of VE cgroup
-		 * (otherwise a number of get_exec_env() in network code
-		 * won't work as needed).
-		 *
-		 *   -- cyrillos
-		 */
-		/* return opts->subsys_mask ? -ENOENT : -EPERM; */
-	}
-
 	/*
 	 * Grab references on all the modules we'll need, so the subsystems
 	 * don't dance around before rebind_subsystems attaches them. This may
@@ -1422,11 +1375,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	/* re-populate subsystem files */
 	cgroup_populate_dir(cgrp, false, added_mask);
 
-	if (opts.release_agent) {
-		kfree(cgrp->release_agent);
-		cgrp->release_agent = opts.release_agent;
-		opts.release_agent = NULL;
-	}
+	if (opts.release_agent)
+		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
 	kfree(opts.release_agent);
 	kfree(opts.name);
@@ -1441,7 +1391,6 @@ static const struct super_operations cgroup_ops = {
 	.drop_inode = generic_delete_inode,
 	.show_options = cgroup_show_options,
 	.remount_fs = cgroup_remount,
-	.show_path = cgroup_show_path,
 };
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1451,7 +1400,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->allcg_node);
-	INIT_LIST_HEAD(&cgrp->cgroup_ve_list);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
@@ -1540,6 +1488,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
 	ida_init(&root->cgroup_ida);
+	if (opts->release_agent)
+		strcpy(root->release_agent_path, opts->release_agent);
 	if (opts->name)
 		strcpy(root->name, opts->name);
 	if (opts->cpuset_clone_children)
@@ -1621,7 +1571,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	struct cgroupfs_root *new_root;
 	struct inode *inode;
-	struct dentry *root_dentry;
 
 	/* First find the desired set of subsystems */
 	if (!(flags & MS_KERNMOUNT)) {
@@ -1636,6 +1585,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	if (ret)
 		goto out_err;
 
+#ifdef CONFIG_VE
+	/*
+	 * Cgroups mounting from inside of VE is not allowed
+	 * until we get some iron prove that we are to.
+	 */
+	if (!ve_is_super(get_exec_env())) {
+		ret = -EACCES;
+		goto out_err;
+	}
+#endif
+
 	/*
 	 * Allocate a new cgroup root. We may not need it if we're
 	 * reusing an existing hierarchy.
@@ -1727,12 +1687,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
 
-		if (!test_bit(CGRP_ROOT_VIRTUAL, &opts.flags)) {
-			root_cgrp->release_agent = opts.release_agent;
-			root_cgrp->cgroup_ve = get_exec_env();
-			opts.release_agent = NULL;
-		}
-
 		cred = override_creds(&init_cred);
 		cgroup_populate_dir(root_cgrp, true, root->subsys_mask);
 		revert_creds(cred);
@@ -1760,40 +1714,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		drop_parsed_module_refcounts(opts.subsys_mask);
 	}
 
-	if (!test_bit(CGRP_ROOT_VIRTUAL, &opts.flags)) {
-		root_dentry = dget(sb->s_root);
-	} else {
-		struct ve_struct *ve = get_exec_env();
-		struct cgroup *top_cgrp;
-
-		top_cgrp = cgroup_kernel_open(&root->top_cgroup, 0, ve->ve_name);
-		ret = PTR_ERR(top_cgrp);
-		if (IS_ERR(top_cgrp))
-			goto drop_new_super;
-
-		/* create fake root-cgroup in virtualized hierarchy */
-		if (top_cgrp == NULL) {
-			top_cgrp = cgroup_kernel_open(&root->top_cgroup, CGRP_CREAT, ve->ve_name);
-			ret = PTR_ERR(top_cgrp);
-			if (IS_ERR(top_cgrp))
-				goto drop_new_super;
-
-			mutex_lock(&cgroup_mutex);
-			top_cgrp->cgroup_ve = ve;
-			top_cgrp->release_agent = opts.release_agent;
-			opts.release_agent = NULL;
-			set_bit(CGRP_VE_TOP_CGROUP_VIRTUAL, &top_cgrp->flags);
-			mutex_unlock(&cgroup_mutex);
-		}
-
-		/* mount it as bindmount to fist-level fake root-cgroup */
-		root_dentry = dget(top_cgrp->dentry);
-		cgroup_kernel_close(top_cgrp);
-	}
-
 	kfree(opts.release_agent);
 	kfree(opts.name);
-	return root_dentry;
+	return dget(sb->s_root);
 
  unlock_drop:
 	mutex_unlock(&cgroup_root_mutex);
@@ -1881,7 +1804,6 @@ static struct kobject *cgroup_kobj;
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
-	struct ve_struct *ve = get_exec_env();
 	int ret = -ENAMETOOLONG;
 	char *start;
 
@@ -1899,16 +1821,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		const char *name = cgroup_name(cgrp);
 		int len;
 
-		/* Hide fake root-cgroup in virtualized hierarchy */
-		if (!ve_is_super(ve) && test_bit(CGRP_VE_TOP_CGROUP_VIRTUAL, &cgrp->flags)) {
-			if (*start != '/') {
-				if (--start < buf)
-					goto out;
-				*start = '/';
-			}
-			break;
-		}
-
 		len = strlen(name);
 		if ((start -= len) < buf)
 			goto out;
@@ -2319,23 +2231,16 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
-	char *release_agent;
+	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
 
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
 
-	release_agent = kstrdup(buffer, GFP_KERNEL);
-	if (!release_agent)
-		return -ENOMEM;
-
-	if (!cgroup_lock_live_group(cgrp)) {
-		kfree(release_agent);
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
 
 	mutex_lock(&cgroup_root_mutex);
-	kfree(cgrp->release_agent);
-	cgrp->release_agent = release_agent;
+	strcpy(cgrp->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -2346,8 +2251,7 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
 {
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	if (cgrp->release_agent)
-		seq_puts(seq, cgrp->release_agent);
+	seq_puts(seq, cgrp->root->release_agent_path);
 	seq_putc(seq, '\n');
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -2437,6 +2341,30 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
+#ifdef CONFIG_VE
+	/*
+	 * In a sake of Docker we might bindmount cgroups so
+	 * that they would look like
+	 *
+	 * Node				Container
+	 * /sys/fs/cgroup/memory/CTID	/sys/fs/cgroup/memory
+	 *
+	 * but we should not allow to modify these toplevel
+	 * cgroups, only nested ones, because toplevel carries
+	 * container's resource limits/settings and etc.
+	 *
+	 * Same time ve cgroup should be writable during
+	 * container startup (to modify @ve.state entry which
+	 * kick container to run), but once ve is up and running
+	 * userspace from ve0 should *never* bindmount it
+	 * inside a container FS.
+	 */
+	if (!ve_is_super(get_exec_env())			&&
+	    !(cgrp->root->subsys_mask & (1UL << ve_subsys_id))	&&
+	    (!cgrp->parent || !cgrp->parent->parent))
+		return -EACCES;
+#endif
+
 	if (cgroup_is_removed(cgrp))
 		return -ENODEV;
 	if (cft->write)
@@ -2843,9 +2771,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		/* does cft->flags tell us to skip this file on @cgrp? */
 		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
 			continue;
-		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && &cgrp->root->top_cgroup == cgrp)
+		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
 			continue;
-		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && &cgrp->root->top_cgroup != cgrp)
+		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
 			continue;
 
 		if (is_add) {
@@ -4077,23 +4005,6 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 	return 0;
 }
 
-static u64 cgroup_read_self_destruction(struct cgroup *cgrp,
-		struct cftype *cft)
-{
-	return test_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
-}
-
-static int cgroup_write_self_destruction(struct cgroup *cgrp,
-		struct cftype *cft, u64 val)
-{
-	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
-	if (val)
-		set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
-	else
-		clear_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
-	return 0;
-}
-
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -4142,11 +4053,6 @@ static struct cftype files[] = {
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
 	},
-	{
-		.name = "self_destruction",
-		.read_u64 = cgroup_read_self_destruction,
-		.write_u64 = cgroup_write_self_destruction,
-	},
 	{ }	/* terminate */
 };
 
@@ -4313,23 +4219,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	cgrp->parent = parent;
 	cgrp->root = parent->root;
 
-	if (test_bit(CGRP_ROOT_VIRTUAL, &root->flags) && parent == &root->top_cgroup) {
-		cgrp->cgroup_ve = get_exec_env();
-		list_add(&cgrp->cgroup_ve_list, &cgrp->cgroup_ve->ve_cgroup_head);
-	} else {
-		cgrp->cgroup_ve = parent->cgroup_ve;
-		list_add(&cgrp->cgroup_ve_list, &parent->cgroup_ve_list);
-	}
-
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 
-	if (test_bit(CGRP_SELF_DESTRUCTION, &parent->flags))
-		set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
-
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css;
 
@@ -4394,7 +4289,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	return 0;
 
 err_free_all:
-	list_del_init(&cgrp->cgroup_ve_list);
 	for_each_subsys(root, ss) {
 		if (cgrp->subsys[ss->subsys_id])
 			ss->css_free(cgrp);
@@ -4475,7 +4369,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
 	list_del_init(&cgrp->allcg_node);
-	list_del(&cgrp->cgroup_ve_list);
 
 	dget(d);
 	cgroup_d_remove_dir(d);
@@ -4926,14 +4819,8 @@ out:
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
 	int i;
-	struct ve_struct *ve = get_exec_env();
 
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
-
-	/* cgset wants to read /proc/cgroups and it's used for starting CT */
-	if (!ve_is_super(ve) && ve->is_running)
-		return 0;
-
 	/*
 	 * ideally we don't want subsystems moving around while we do this.
 	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
@@ -5232,26 +5119,12 @@ static void cgroup_release_agent(struct work_struct *work)
 						    release_list);
 		list_del_init(&cgrp->release_list);
 		raw_spin_unlock(&release_list_lock);
-
-		if (test_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags)) {
-			struct inode *parent = cgrp->dentry->d_parent->d_inode;
-
-			dget(cgrp->dentry);
-			mutex_unlock(&cgroup_mutex);
-			mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
-			vfs_rmdir(parent, cgrp->dentry);
-			mutex_unlock(&parent->i_mutex);
-			dput(cgrp->dentry);
-			mutex_lock(&cgroup_mutex);
-			goto continue_free;
-		}
-
 		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
 		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
 			goto continue_free;
-		agentbuf = kstrdup(cgrp->root->top_cgroup.release_agent, GFP_KERNEL);
+		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
 			goto continue_free;
 
@@ -5270,10 +5143,7 @@ static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-
-		err = call_usermodehelper_ve(cgrp->root->top_cgroup.cgroup_ve,
-					     argv[0], argv, envp,
-					     UMH_WAIT_EXEC);
+		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 		if (err < 0)
 			pr_warn_ratelimited("cgroup release_agent "
 					    "%s %s failed: %d\n",
@@ -5713,14 +5583,10 @@ struct cgroup *cgroup_kernel_open(struct cgroup *parent,
 			ret = -EEXIST;
 		else if (!dentry->d_inode)
 			ret = vfs_mkdir(parent->dentry->d_inode, dentry, 0755);
-		else
-			flags &= ~CGRP_WEAK;
 	}
 	if (!ret && dentry->d_inode) {
 		cgrp = __d_cgrp(dentry);
 		atomic_inc(&cgrp->count);
-		if (flags & CGRP_WEAK)
-			set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
 	} else
 		cgrp = ret ? ERR_PTR(ret) : NULL;
 	dput(dentry);
@@ -5763,19 +5629,9 @@ EXPORT_SYMBOL(cgroup_kernel_attach);
 
 void cgroup_kernel_close(struct cgroup *cgrp)
 {
-	if (!cgroup_is_disposable(cgrp)) {
-		atomic_dec(&cgrp->count);
-	} else if (atomic_dec_and_test(&cgrp->count)) {
+	if (atomic_dec_and_test(&cgrp->count)) {
 		set_bit(CGRP_RELEASABLE, &cgrp->flags);
 		check_for_release(cgrp);
 	}
 }
 EXPORT_SYMBOL(cgroup_kernel_close);
-
-void cgroup_kernel_destroy(struct cgroup *cgrp)
-{
-	set_bit(CGRP_SELF_DESTRUCTION, &cgrp->flags);
-	set_bit(CGRP_RELEASABLE, &cgrp->flags);
-	check_for_release(cgrp);
-}
-EXPORT_SYMBOL(cgroup_kernel_destroy);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 3ef10bc..609ea75 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -709,7 +709,6 @@ do_init:
 	init_rwsem(&ve->op_sem);
 	mutex_init(&ve->sync_mutex);
 	INIT_LIST_HEAD(&ve->devices);
-	INIT_LIST_HEAD(&ve->ve_cgroup_head);
 	INIT_LIST_HEAD(&ve->ve_list);
 	kmapset_init_key(&ve->ve_sysfs_perms);
 
@@ -730,18 +729,9 @@ err_id:
 static void ve_offline(struct cgroup *cg)
 {
 	struct ve_struct *ve = cgroup_ve(cg);
-	struct cgroup *cgrp;
 
 	ve_list_del(ve);
 	veid_free(ve->veid);
-
-	while (!list_empty(&ve->ve_cgroup_head)) {
-		cgrp = list_entry(ve->ve_cgroup_head.prev,
-				struct cgroup, cgroup_ve_list);
-		cgrp->cgroup_ve = NULL;
-		list_del_init(&cgrp->cgroup_ve_list);
-		cgroup_kernel_destroy(cgrp);
-	}
 }
 
 static void ve_destroy(struct cgroup *cg)
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
index f8e3e8d..7c574b3 100644
--- a/kernel/ve/vecalls.c
+++ b/kernel/ve/vecalls.c
@@ -441,14 +441,12 @@ static int do_env_create(envid_t veid, unsigned int flags, u32 class_id,
 
 	legacy_veid_to_name(veid, ve_name);
 
-	ve_cgroup = ve_cgroup_open(ve0.css.cgroup,
-			CGRP_CREAT|CGRP_WEAK|CGRP_EXCL, veid);
+	ve_cgroup = ve_cgroup_open(ve0.css.cgroup, CGRP_CREAT|CGRP_EXCL, veid);
 	err = PTR_ERR(ve_cgroup);
 	if (IS_ERR(ve_cgroup))
 		goto err_ve_cgroup;
 
-	dev_cgroup = ve_cgroup_open(devices_root,
-		CGRP_CREAT|CGRP_WEAK, veid);
+	dev_cgroup = ve_cgroup_open(devices_root, CGRP_CREAT, veid);
 	err = PTR_ERR(dev_cgroup);
 	if (IS_ERR(dev_cgroup))
 		goto err_dev_cgroup;