[Devel] [RFC] Fix get_exec_env() races

Kirill Tkhai ktkhai at odin.com
Thu Oct 15 03:02:34 PDT 2015


Since we allow attaching a non-current task to a ve cgroup, there is a race
in the places where we use get_exec_env(). The task's ve may be changed after
it has dereferenced get_exec_env(), so a lot of problems are possible there.
I'm sure most of the places where we use get_exec_env() were not written under
the assumption that it may change. Also, there are a lot of nested functions and it's
impossible to check every function to verify whether its input parameters, which depend
on a caller's dereferenced ve, have become stale because the ve has changed.

I suggest modifying get_exec_env() so that it guarantees the ve's stability.
It pairs with put_exec_env(), which marks the end of the region where ve modification is
not desirable.

get_exec_env() may be used nested, so here is task_struct::ve_attach_lock_depth,
which allows nesting. The counter looks like a better variant than a plain read_lock()
in get_exec_env() and a write_trylock() loop in ve_attach():

get_exec_env()
{
   ...
   read_lock();
   ...
}

ve_attach()
{
   while(!write_trylock())
      cpu_relax();
}

because in this case the priority of read_lock() would be absolute and we would lose all the
advantages of queued rwlock fairness.

Also, I considered variants using RCU and task work, but they seem to be worse.

Please share your comments.

---
 include/linux/init_task.h |  3 ++-
 include/linux/sched.h     |  1 +
 include/linux/ve.h        | 29 +++++++++++++++++++++++++++++
 include/linux/ve_proto.h  |  1 -
 kernel/fork.c             |  3 +++
 kernel/ve/ve.c            |  8 +++++++-
 6 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d2cbad0..57e0796 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -136,7 +136,8 @@ extern struct task_group root_task_group;
 #endif
 
 #ifdef CONFIG_VE
-#define	INIT_TASK_VE(tsk) .task_ve = &ve0,
+#define	INIT_TASK_VE(tsk) .task_ve = &ve0,				\
+			  .ve_attach_lock_depth = 0
 #else
 #define	INIT_TASK_VE(tsk)
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e1bcabe..948481f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1564,6 +1564,7 @@ struct task_struct {
 #endif
 #ifdef CONFIG_VE
 	struct ve_struct *task_ve;
+	unsigned int ve_attach_lock_depth;
 #endif
 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
 	struct memcg_batch_info {
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 86b95c3..3cea73d 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -33,6 +33,7 @@ struct ve_monitor;
 struct nsproxy;
 
 struct ve_struct {
+	rwlock_t		attach_lock;
 	struct cgroup_subsys_state	css;
 
 	const char		*ve_name;
@@ -130,6 +131,34 @@ struct ve_struct {
 #endif
 };
 
+static inline struct ve_struct *get_exec_env(void)
+{
+	struct ve_struct *ve;
+
+	if (++current->ve_attach_lock_depth > 1)
+		return current->task_ve;
+
+	rcu_read_lock();
+again:
+	ve = current->task_ve;
+	read_lock(&ve->attach_lock);
+	if (unlikely(current->task_ve != ve)) {
+		read_unlock(&ve->attach_lock);
+		goto again;
+	}
+	rcu_read_unlock();
+
+	return ve;
+}
+
+static inline void put_exec_env(void)
+{
+	struct ve_struct *ve = current->task_ve;
+
+	if (!--current->ve_attach_lock_depth)
+		read_unlock(&ve->attach_lock);
+}
+
 struct ve_devmnt {
 	struct list_head	link;
 
diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
index 0f5898e..3deb09e 100644
--- a/include/linux/ve_proto.h
+++ b/include/linux/ve_proto.h
@@ -30,7 +30,6 @@ static inline bool ve_is_super(struct ve_struct *ve)
 	return ve == &ve0;
 }
 
-#define get_exec_env()		(current->task_ve)
 #define get_env_init(ve)	(ve->ve_ns->pid_ns->child_reaper)
 
 const char *ve_name(struct ve_struct *ve);
diff --git a/kernel/fork.c b/kernel/fork.c
index 505fa21..3d7e452 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1439,6 +1439,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->pi_state_list);
 	p->pi_state_cache = NULL;
 #endif
+#ifdef CONFIG_VE
+	p->ve_attach_lock_depth = 0;
+#endif
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 39a95e8..23833ed 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -640,6 +640,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
 
 do_init:
+	ve->attach_lock = __RW_LOCK_UNLOCKED(&ve->attach_lock);
 	init_rwsem(&ve->op_sem);
 	mutex_init(&ve->sync_mutex);
 	INIT_LIST_HEAD(&ve->devices);
@@ -738,8 +739,11 @@ static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 
 static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 {
+	struct task_struct *task = cgroup_taskset_first(tset);
+	struct ve_struct *old_ve = task->task_ve;
 	struct ve_struct *ve = cgroup_ve(cg);
-	struct task_struct *task;
+
+	write_lock_irq(&old_ve->attach_lock);
 
 	cgroup_taskset_for_each(task, cg, tset) {
 		/* this probihibts ptracing of task entered to VE from host system */
@@ -755,6 +759,8 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
 
 		task->task_ve = ve;
 	}
+
+	write_unlock_irq(&old_ve->attach_lock);
 }
 
 static int ve_state_read(struct cgroup *cg, struct cftype *cft,



More information about the Devel mailing list