[Devel] Re: [PATCH 6/7] [RFC] Support multiply-bindable cgroup subsystems
Li Zefan
lizf at cn.fujitsu.com
Mon Mar 16 23:46:28 PDT 2009
Paul Menage wrote:
> [RFC] Support multiply-bindable cgroup subsystems
>
> This patch allows a cgroup subsystem to be marked as bindable on
> multiple cgroup hierarchies independently, when declared in
> cgroup_subsys.h via MULTI_SUBSYS() rather than SUBSYS().
>
> The state for such subsystems cannot be accessed directly from a
> task->cgroups (since there's no unique mapping for a task) but instead
> must be accessed via a particular control group object.
>
> Multiply-bound subsystems are useful in cases where there's no direct
> correspondence between the cgroup configuration and some property of
> the kernel outside of the cgroups subsystem. So this would not be
> applicable to e.g. the CFS cgroup, since there has to a unique mapping
> from a task to its CFS run queue.
>
> As an example, the "debug" subsystem is marked multiply-bindable,
> since it has no state outside the cgroups framework itself.
>
> Example usage:
>
> mount -t cgroup -o name=foo,debug,cpu cgroup /mnt1
> mount -t cgroup -o name=bar,debug,memory cgroup /mnt2
>
> Open issues:
>
> - what should /proc/cgroup show for multiply-bindable subsystems?
> Currently it shows just one entry for each hierarchy the subsystem
> is bound to, which means the subsystem doesn't show up when unbound,
> and is indistinguishable from a regular (singleton) subsystem when
> singly-bound.
>
> - should the bind() callback be extended to support passing more
> information to multiply-bindable subsystems? Or should we just scrap
> the bind() callback since nothing appears to be using it.
>
I agree to scrap it.
> - how to stop checkpatch.pl complaining about the way the
> SUBSYS/MULTI_SUBSYS macros as used to define the cgroup_subsys_id
> enum in cgroup.h?
>
I think we can ignore it.
> Signed-off-by: Paul Menage <menage at google.com>
>
> ---
>
> include/linux/cgroup.h | 37 ++++++++++-
> include/linux/cgroup_subsys.h | 2 -
> kernel/cgroup.c | 137 ++++++++++++++++++++++++++---------------
> 3 files changed, 123 insertions(+), 53 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index adf6739..9b5999d 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -39,10 +39,37 @@ extern int cgroupstats_build(struct cgroupstats *stats,
>
> extern struct file_operations proc_cgroup_operations;
>
> -/* Define the enumeration of all cgroup subsystems */
> -#define SUBSYS(_x) _x ## _subsys_id,
> +/* Define the enumeration of all cgroup subsystems. There are two
/*
* xxx
*/
> + * kinds of subsystems:
> + *
> + * - regular (singleton) subsystems can be only mounted once; these
> + * generally correspond to some system that has substantial state
> + * outside of the cgroups framework, or where the state is being used
> + * to control some external behaviour (e.g. CPU scheduler). Every
> + * task has an associated state pointer (accessed efficiently through
> + * task->cgroups) for each singleton subsystem.
> + *
> + * - multiply-bound subsystems may be mounted on zero or more
> + * hierarchies. Since there's no unique mapping from a task to a
> + * subsystem state pointer for multiply-bound subsystems, the state of
> + * these subsystems cannot be accessed rapidly from a task
> + * pointer. These correspond to subsystems where most or all of the
> + * state is maintained within the subsystem itself, and/or the
> + * subsystem is used for monitoring rather than control.
> + */
> enum cgroup_subsys_id {
> +#define SUBSYS(_x) _x ## _subsys_id,
> +#define MULTI_SUBSYS(_x)
> #include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> + CGROUP_SINGLETON_SUBSYS_COUNT,
> + CGROUP_DUMMY_ENUM_RESET = CGROUP_SINGLETON_SUBSYS_COUNT - 1,
> +#define SUBSYS(_x)
> +#define MULTI_SUBSYS(_x) _x ## _subsys_id,
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> CGROUP_SUBSYS_COUNT
> };
> #undef SUBSYS
> @@ -231,7 +258,7 @@ struct css_set {
> * time). Multi-subsystems don't have an entry in here since
> * there's no unique state for a given task.
> */
> - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
> + struct cgroup_subsys_state *subsys[CGROUP_SINGLETON_SUBSYS_COUNT];
> };
>
> /*
> @@ -419,8 +446,10 @@ struct cgroup_subsys {
> };
>
> #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
> +#define MULTI_SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
> #include <linux/cgroup_subsys.h>
> #undef SUBSYS
> +#undef MULTI_SUBSYS
>
> static inline struct cgroup_subsys_state *cgroup_subsys_state(
> struct cgroup *cgrp, int subsys_id)
> @@ -431,6 +460,8 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
> static inline struct cgroup_subsys_state *task_subsys_state(
> struct task_struct *task, int subsys_id)
> {
> + /* This check is optimized out for most callers */
> + BUG_ON(subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
> return rcu_dereference(task->cgroups->subsys[subsys_id]);
> }
>
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 9c8d31b..f78605e 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -14,7 +14,7 @@ SUBSYS(cpuset)
> /* */
>
> #ifdef CONFIG_CGROUP_DEBUG
> -SUBSYS(debug)
> +MULTI_SUBSYS(debug)
> #endif
>
> /* */
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index fcb8181..942a951 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -52,10 +52,18 @@
> static DEFINE_MUTEX(cgroup_mutex);
>
> /* Generate an array of cgroup subsystem pointers */
> -#define SUBSYS(_x) &_x ## _subsys,
>
> static struct cgroup_subsys *subsys[] = {
> +#define SUBSYS(_x) &_x ## _subsys,
> +#define MULTI_SUBSYS(_x)
> +#include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> +#define SUBSYS(_x)
> +#define MULTI_SUBSYS(_x) &_x ## _subsys,
> #include <linux/cgroup_subsys.h>
> +#undef SUBSYS
> +#undef MULTI_SUBSYS
> };
>
> /*
> @@ -265,7 +273,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
> int index;
> unsigned long tmp = 0UL;
>
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++)
> tmp += (unsigned long)css[i];
> tmp = (tmp >> 16) ^ tmp;
>
> @@ -435,7 +443,7 @@ static struct css_set *find_existing_css_set(
>
> /* Build the set of subsystem state objects that we want to
> * see in the new css_set */
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
> if (root->subsys_bits & (1UL << i)) {
> /* Subsystem is in this hierarchy. So we want
> * the subsystem state from the new
> @@ -529,7 +537,7 @@ static struct css_set *find_css_set(
> struct css_set *oldcg, struct cgroup *cgrp)
> {
> struct css_set *res;
> - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
> + struct cgroup_subsys_state *template[CGROUP_SINGLETON_SUBSYS_COUNT];
>
> struct list_head tmp_cg_links;
>
> @@ -853,6 +861,20 @@ static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
> wake_up_all(&cgroup_rmdir_waitq);
> }
>
> +static void init_cgroup_css(struct cgroup_subsys_state *css,
> + struct cgroup_subsys *ss,
> + struct cgroup *cgrp)
> +{
> + css->cgroup = cgrp;
> + atomic_set(&css->refcnt, 1);
> + css->flags = 0;
> + css->id = NULL;
> + if (cgrp == &cgrp->root->top_cgroup)
> + set_bit(CSS_ROOT, &css->flags);
> + BUG_ON(cgrp->subsys[ss->subsys_id]);
> + cgrp->subsys[ss->subsys_id] = css;
> +}
> +
> static int rebind_subsystems(struct cgroupfs_root *root,
> unsigned long final_bits)
> {
> @@ -863,7 +885,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
> removed_bits = root->subsys_bits & ~final_bits;
> added_bits = final_bits & ~root->subsys_bits;
> /* Check that any added subsystems are currently free */
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
> unsigned long bit = 1UL << i;
> if (!(bit & added_bits))
> continue;
> @@ -883,33 +905,57 @@ static int rebind_subsystems(struct cgroupfs_root *root,
> /* Process each subsystem */
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> struct cgroup_subsys *ss = subsys[i];
> + bool singleton = i < CGROUP_SINGLETON_SUBSYS_COUNT;
> unsigned long bit = 1UL << i;
> if (bit & added_bits) {
> + struct cgroup_subsys_state *css;
> /* We're binding this subsystem to this hierarchy */
> BUG_ON(cgrp->subsys[i]);
> - BUG_ON(!dummytop->subsys[i]);
> - BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
> - BUG_ON(!(rootnode.subsys_bits & bit));
> + if (singleton) {
> + css = dummytop->subsys[i];
> + BUG_ON(!css);
> +
> + BUG_ON(css->cgroup != dummytop);
> + BUG_ON(!(rootnode.subsys_bits & bit));
> + } else {
> + BUG_ON(dummytop->subsys[i]);
> + BUG_ON(rootnode.subsys_bits & bit);
> + css = ss->create(ss, cgrp);
> + if (IS_ERR(css))
> + return PTR_ERR(css);
> + init_cgroup_css(css, ss, cgrp);
> + }
> mutex_lock(&ss->hierarchy_mutex);
> - cgrp->subsys[i] = dummytop->subsys[i];
> - cgrp->subsys[i]->cgroup = cgrp;
> + cgrp->subsys[i] = css;
> + css->cgroup = cgrp;
> if (ss->bind)
> ss->bind(ss, cgrp);
> - rootnode.subsys_bits &= ~bit;
> + if (singleton)
> + rootnode.subsys_bits &= ~bit;
> root->subsys_bits |= bit;
> mutex_unlock(&ss->hierarchy_mutex);
> } else if (bit & removed_bits) {
> + struct cgroup_subsys_state *css;
> /* We're removing this subsystem */
> - BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
> - BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
> - BUG_ON(rootnode.subsys_bits & bit);
> + css = cgrp->subsys[i];
> + BUG_ON(css->cgroup != cgrp);
> + if (singleton) {
> + BUG_ON(css != dummytop->subsys[i]);
> + BUG_ON(rootnode.subsys_bits & bit);
> + }
> mutex_lock(&ss->hierarchy_mutex);
> if (ss->bind)
> ss->bind(ss, dummytop);
> - dummytop->subsys[i]->cgroup = dummytop;
> + if (singleton) {
> + css->cgroup = dummytop;
> + } else {
> + /* Is this safe? */
> + ss->destroy(ss, cgrp);
> + }
> cgrp->subsys[i] = NULL;
> root->subsys_bits &= ~bit;
> - rootnode.subsys_bits |= bit;
> + if (singleton)
> + rootnode.subsys_bits |= bit;
initially we can see all subsystems in /proc/cgroups, but:
# mount -o cpu,debug xxx /mnt
# remount -o cpu xxx /mnt
# umount /mnt
now no root including rootnode contains debug_subsys, so we
won't see debug_subsys in /proc/cgroups.
I think this inconsistency is wrong.
> mutex_unlock(&ss->hierarchy_mutex);
> } else if (bit & final_bits) {
> /* Subsystem state should already exist */
> @@ -974,7 +1020,7 @@ static int parse_cgroupfs_options(char *data,
> /* Add all non-disabled subsystems */
> int i;
> opts->subsys_bits = 0;
> - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> + for (i = 0; i < CGROUP_SINGLETON_SUBSYS_COUNT; i++) {
I don't see why only singleton subsystems are binded when mount
with '-o all'.
> struct cgroup_subsys *ss = subsys[i];
> if (!ss->disabled)
> opts->subsys_bits |= 1ul << i;
> @@ -1040,6 +1086,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> struct cgroupfs_root *root = sb->s_fs_info;
> struct cgroup *cgrp = &root->top_cgroup;
> struct cgroup_sb_opts opts;
> + unsigned long original_bits;
>
> mutex_lock(&cgrp->dentry->d_inode->i_mutex);
> mutex_lock(&cgroup_mutex);
> @@ -1061,9 +1108,13 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
> goto out_unlock;
> }
>
> + original_bits = root->subsys_bits;
> ret = rebind_subsystems(root, opts.subsys_bits);
> - if (ret)
> + if (ret) {
> + int tmp = rebind_subsystems(root, original_bits);
> + BUG_ON(tmp);
> goto out_unlock;
> + }
>
> /* (re)populate subsystem files */
> cgroup_populate_dir(cgrp);
> @@ -1265,16 +1316,15 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
> }
>
> ret = rebind_subsystems(root, opts.subsys_bits);
> - if (ret == -EBUSY) {
> + if (ret) {
> + int tmp = rebind_subsystems(root, 0);
> + BUG_ON(tmp);
> mutex_unlock(&cgroup_mutex);
> mutex_unlock(&inode->i_mutex);
> free_cg_links(&tmp_cg_links);
> goto drop_new_super;
> }
>
> - /* EBUSY should be the only error here */
> - BUG_ON(ret);
> -
> list_add(&root->root_list, &roots);
> root_count++;
>
> @@ -2591,20 +2641,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
> return 0;
> }
>
> -static void init_cgroup_css(struct cgroup_subsys_state *css,
> - struct cgroup_subsys *ss,
> - struct cgroup *cgrp)
> -{
> - css->cgroup = cgrp;
> - atomic_set(&css->refcnt, 1);
> - css->flags = 0;
> - css->id = NULL;
> - if (cgrp == dummytop)
> - set_bit(CSS_ROOT, &css->flags);
> - BUG_ON(cgrp->subsys[ss->subsys_id]);
> - cgrp->subsys[ss->subsys_id] = css;
> -}
> -
> static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
> {
> /* We need to take each hierarchy_mutex in a consistent order */
> @@ -2890,21 +2926,23 @@ again:
> static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
> {
> struct cgroup_subsys_state *css;
> -
> + bool singleton = ss->subsys_id < CGROUP_SINGLETON_SUBSYS_COUNT;
> printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
>
> - /* Create the top cgroup state for this subsystem */
> - css = ss->create(ss, dummytop);
> - /* We don't handle early failures gracefully */
> - BUG_ON(IS_ERR(css));
> - init_cgroup_css(css, ss, dummytop);
> -
> - /* Update the init_css_set to contain a subsys
> - * pointer to this state - since the subsystem is
> - * newly registered, all tasks and hence the
> - * init_css_set is in the subsystem's top cgroup. */
> - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
> + if (singleton) {
> + /* Create the top cgroup state for this subsystem */
> + css = ss->create(ss, dummytop);
> + /* We don't handle early failures gracefully */
> + BUG_ON(IS_ERR(css));
> + init_cgroup_css(css, ss, dummytop);
>
> + /* Update the init_css_set to contain a subsys
> + * pointer to this state - since the subsystem is
> + * newly registered, all tasks and hence the
> + * init_css_set is in the subsystem's top cgroup. */
> + init_css_set.subsys[ss->subsys_id] = css;
> + rootnode.subsys_bits |= 1ULL << ss->subsys_id;
> + }
> need_forkexit_callback |= ss->fork || ss->exit;
>
> /* At system boot, before all subsystems have been
> @@ -2916,7 +2954,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
> lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
> ss->active = 1;
>
tailing blank line
> - rootnode.subsys_bits |= 1ULL << ss->subsys_id;
> }
>
> /**
> @@ -3285,6 +3322,8 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
>
> /* We shouldn't be called by an unregistered subsystem */
> BUG_ON(!subsys->active);
> + /* We can only support singleton subsystems */
> + BUG_ON(subsys->subsys_id >= CGROUP_SINGLETON_SUBSYS_COUNT);
>
> /* First figure out what hierarchy and cgroup we're dealing
> * with, and pin them so we can drop cgroup_mutex */
>
>
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list