[Devel] [RFC] Fix get_exec_env() races

Kirill Tkhai ktkhai at odin.com
Thu Oct 15 03:21:54 PDT 2015


Vova reminded me that we may sleep inside a get_exec_env() section.

So it looks better to use task work here after all.
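
A very rough sketch of what the task_work variant could look like (everything
below is hypothetical and not part of the patch; struct ve_attach_work and
ve_attach_deferred() are made-up names): ve_attach() would queue a callback on
the target task, and the task would switch its own task_ve when it returns to
userspace.

#include <linux/task_work.h>
#include <linux/slab.h>

struct ve_attach_work {
	struct callback_head	cb;
	struct ve_struct	*new_ve;
};

/* Runs in the context of the task being attached, on its way back to
 * userspace, so no get_exec_env() section can be active in that task. */
static void ve_attach_work_fn(struct callback_head *head)
{
	struct ve_attach_work *w = container_of(head, struct ve_attach_work, cb);

	current->task_ve = w->new_ve;
	kfree(w);
}

static int ve_attach_deferred(struct task_struct *task, struct ve_struct *ve)
{
	struct ve_attach_work *w = kmalloc(sizeof(*w), GFP_KERNEL);

	if (!w)
		return -ENOMEM;

	w->new_ve = ve;
	init_task_work(&w->cb, ve_attach_work_fn);

	/* 'true' asks to signal the task so the work runs soon */
	return task_work_add(task, &w->cb, true);
}

The open question with this variant is how long the switch may be delayed if
the target task does not return to userspace for a while, and what to do for
kernel threads, which only run their task works at exit.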

On 15.10.2015 13:02, Kirill Tkhai wrote:
> Since we allow attaching a task other than current to a ve cgroup, there is
> a race everywhere we use get_exec_env(): the task's ve may change right after
> get_exec_env() has been dereferenced, which can cause all sorts of problems.
> I'm sure most of the places using get_exec_env() were not written with the
> assumption that it may change. Also, there are many nested functions, and it
> is impossible to audit every one of them to verify that its input parameters,
> derived from a caller's dereferenced ve, are still valid after the ve has
> changed.
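> 
> To make the race concrete: today get_exec_env() is just a macro reading
> current->task_ve, so any caller that holds on to the result is exposed
> (illustrative pattern only, not a real call site):
> 
> some_caller()
> {
>    struct ve_struct *ve = get_exec_env();  /* == current->task_ve */
> 
>    ...   /* meanwhile ve_attach() moves current to another ve */
> 
>    do_something(ve);                       /* 've' is stale now */
> }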
> 
> I suggest modifying get_exec_env() so that it guarantees the ve's stability.
> It pairs with put_exec_env(), which marks the end of the region where the ve
> must not be changed.
> 
> get_exec_env() may be used in a nested way, so task_struct::ve_attach_lock_depth
> is introduced to allow nesting. The counter looks like a better variant than a
> plain read_lock() in get_exec_env() combined with a write_trylock() loop in
> ve_attach():
> 
> get_exec_env()
> {
>    ...
>    read_lock();
>    ...
> }
> 
> ve_attach()
> {
>    while(!write_trylock())
>       cpu_relax();
> }
> 
> because in that case read_lock() would have absolute priority and we would
> lose all the fairness advantages of queued rwlocks.
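> 
> For illustration, a call site converted to the proposed pair would look
> roughly like this (hypothetical caller, just to show the intended usage):
> 
> some_caller()
> {
>    struct ve_struct *ve;
> 
>    ve = get_exec_env();    /* pins current->task_ve, ve_attach() waits */
>    ...                     /* ve cannot change under us here */
>    put_exec_env();         /* end of the stable section */
> }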
> 
> I also considered variants using RCU and task work, but they seem to be worse.
> 
> Please share your comments.
> 
> ---
>  include/linux/init_task.h |  3 ++-
>  include/linux/sched.h     |  1 +
>  include/linux/ve.h        | 29 +++++++++++++++++++++++++++++
>  include/linux/ve_proto.h  |  1 -
>  kernel/fork.c             |  3 +++
>  kernel/ve/ve.c            |  8 +++++++-
>  6 files changed, 42 insertions(+), 3 deletions(-)
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index d2cbad0..57e0796 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -136,7 +136,8 @@ extern struct task_group root_task_group;
>  #endif
>  
>  #ifdef CONFIG_VE
> -#define	INIT_TASK_VE(tsk) .task_ve = &ve0,
> +#define	INIT_TASK_VE(tsk) .task_ve = &ve0,				\
> +			  .ve_attach_lock_depth = 0
>  #else
>  #define	INIT_TASK_VE(tsk)
>  #endif
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index e1bcabe..948481f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1564,6 +1564,7 @@ struct task_struct {
>  #endif
>  #ifdef CONFIG_VE
>  	struct ve_struct *task_ve;
> +	unsigned int ve_attach_lock_depth;
>  #endif
>  #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
>  	struct memcg_batch_info {
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 86b95c3..3cea73d 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -33,6 +33,7 @@ struct ve_monitor;
>  struct nsproxy;
>  
>  struct ve_struct {
> +	rwlock_t		attach_lock;
>  	struct cgroup_subsys_state	css;
>  
>  	const char		*ve_name;
> @@ -130,6 +131,34 @@ struct ve_struct {
>  #endif
>  };
>  
> +static inline struct ve_struct *get_exec_env(void)
> +{
> +	struct ve_struct *ve;
> +
> +	if (++current->ve_attach_lock_depth > 1)
> +		return current->task_ve;
> +
> +	rcu_read_lock();
> +again:
> +	ve = current->task_ve;
> +	read_lock(&ve->attach_lock);
> +	if (unlikely(current->task_ve != ve)) {
> +		read_unlock(&ve->attach_lock);
> +		goto again;
> +	}
> +	rcu_read_unlock();
> +
> +	return ve;
> +}
> +
> +static inline void put_exec_env(void)
> +{
> +	struct ve_struct *ve = current->task_ve;
> +
> +	if (!--current->ve_attach_lock_depth)
> +		read_unlock(&ve->attach_lock);
> +}
> +
>  struct ve_devmnt {
>  	struct list_head	link;
>  
> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
> index 0f5898e..3deb09e 100644
> --- a/include/linux/ve_proto.h
> +++ b/include/linux/ve_proto.h
> @@ -30,7 +30,6 @@ static inline bool ve_is_super(struct ve_struct *ve)
>  	return ve == &ve0;
>  }
>  
> -#define get_exec_env()		(current->task_ve)
>  #define get_env_init(ve)	(ve->ve_ns->pid_ns->child_reaper)
>  
>  const char *ve_name(struct ve_struct *ve);
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 505fa21..3d7e452 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1439,6 +1439,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	INIT_LIST_HEAD(&p->pi_state_list);
>  	p->pi_state_cache = NULL;
>  #endif
> +#ifdef CONFIG_VE
> +	p->ve_attach_lock_depth = 0;
> +#endif
>  	/*
>  	 * sigaltstack should be cleared when sharing the same VM
>  	 */
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 39a95e8..23833ed 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -640,6 +640,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
>  	ve->meminfo_val = VE_MEMINFO_DEFAULT;
>  
>  do_init:
> +	ve->attach_lock = __RW_LOCK_UNLOCKED(&ve->attach_lock);
>  	init_rwsem(&ve->op_sem);
>  	mutex_init(&ve->sync_mutex);
>  	INIT_LIST_HEAD(&ve->devices);
> @@ -738,8 +739,11 @@ static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>  
>  static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>  {
> +	struct task_struct *task = cgroup_taskset_first(tset);
> +	struct ve_struct *old_ve = task->task_ve;
>  	struct ve_struct *ve = cgroup_ve(cg);
> -	struct task_struct *task;
> +
> +	write_lock_irq(&old_ve->attach_lock);
>  
>  	cgroup_taskset_for_each(task, cg, tset) {
>  		/* this probihibts ptracing of task entered to VE from host system */
> @@ -755,6 +759,8 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>  
>  		task->task_ve = ve;
>  	}
> +
> +	write_unlock_irq(&old_ve->attach_lock);
>  }
>  
>  static int ve_state_read(struct cgroup *cg, struct cftype *cft,
> 


