[Devel] [RFC] Fix get_exec_env() races
Kirill Tkhai
ktkhai at odin.com
Thu Oct 15 03:02:34 PDT 2015
Since we allow to attach a not current task to ve cgroup, there is the race
in the places where we use get_exec_env(). The task's ve may be changed after
it dereferenced get_exec_env(), so a lot of problems are possible there.
I'm sure the most places, where we use get_exec_env(), was not written in an
assumption it may change. Also, there are a lot of nested functions and it's
impossible to check every function to verify if it's input parameters, depending
on a caller's dereferenced ve, are not actual because of ve has been changed.
I'm suggest to use to modify get_exec_env() which will supply ve's stability.
It pairs with put_exec_env() which marks end of area where ve modification is
not desirable.
get_exec_env() may be used nested, so here is task_struct::ve_attach_lock_depth,
which allows nesting. The counter looks a better variant that plain read_lock()
in get_exec_env() and write_trylock() loop in ve_attach():
get_exec_env()
{
...
read_lock();
...
}
ve_attach()
{
while(!write_trylock())
cpu_relax();
}
because this case the priority of read_lock() will be absolute and we lost all
advantages of queued rw locks fairness.
Also I considered variants with using RCU and task work, but they seems to be worse.
Please, your comments.
---
include/linux/init_task.h | 3 ++-
include/linux/sched.h | 1 +
include/linux/ve.h | 29 +++++++++++++++++++++++++++++
include/linux/ve_proto.h | 1 -
kernel/fork.c | 3 +++
kernel/ve/ve.c | 8 +++++++-
6 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d2cbad0..57e0796 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -136,7 +136,8 @@ extern struct task_group root_task_group;
#endif
#ifdef CONFIG_VE
-#define INIT_TASK_VE(tsk) .task_ve = &ve0,
+#define INIT_TASK_VE(tsk) .task_ve = &ve0, \
+ .ve_attach_lock_depth = 0
#else
#define INIT_TASK_VE(tsk)
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e1bcabe..948481f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1564,6 +1564,7 @@ struct task_struct {
#endif
#ifdef CONFIG_VE
struct ve_struct *task_ve;
+ unsigned int ve_attach_lock_depth;
#endif
#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
struct memcg_batch_info {
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 86b95c3..3cea73d 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -33,6 +33,7 @@ struct ve_monitor;
struct nsproxy;
struct ve_struct {
+ rwlock_t attach_lock;
struct cgroup_subsys_state css;
const char *ve_name;
@@ -130,6 +131,34 @@ struct ve_struct {
#endif
};
+static inline struct ve_struct *get_exec_env(void)
+{
+ struct ve_struct *ve;
+
+ if (++current->ve_attach_lock_depth > 1)
+ return current->task_ve;
+
+ rcu_read_lock();
+again:
+ ve = current->task_ve;
+ read_lock(&ve->attach_lock);
+ if (unlikely(current->task_ve != ve)) {
+ read_unlock(&ve->attach_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+
+ return ve;
+}
+
+static inline void put_exec_env(void)
+{
+ struct ve_struct *ve = current->task_ve;
+
+ if (!--current->ve_attach_lock_depth)
+ read_unlock(&ve->attach_lock);
+}
+
struct ve_devmnt {
struct list_head link;
diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
index 0f5898e..3deb09e 100644
--- a/include/linux/ve_proto.h
+++ b/include/linux/ve_proto.h
@@ -30,7 +30,6 @@ static inline bool ve_is_super(struct ve_struct *ve)
return ve == &ve0;
}
-#define get_exec_env() (current->task_ve)
#define get_env_init(ve) (ve->ve_ns->pid_ns->child_reaper)
const char *ve_name(struct ve_struct *ve);
diff --git a/kernel/fork.c b/kernel/fork.c
index 505fa21..3d7e452 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1439,6 +1439,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->pi_state_list);
p->pi_state_cache = NULL;
#endif
+#ifdef CONFIG_VE
+ p->ve_attach_lock_depth = 0;
+#endif
/*
* sigaltstack should be cleared when sharing the same VM
*/
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 39a95e8..23833ed 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -640,6 +640,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
ve->meminfo_val = VE_MEMINFO_DEFAULT;
do_init:
+ ve->attach_lock = __RW_LOCK_UNLOCKED(&ve->attach_lock);
init_rwsem(&ve->op_sem);
mutex_init(&ve->sync_mutex);
INIT_LIST_HEAD(&ve->devices);
@@ -738,8 +739,11 @@ static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
{
+ struct task_struct *task = cgroup_taskset_first(tset);
+ struct ve_struct *old_ve = task->task_ve;
struct ve_struct *ve = cgroup_ve(cg);
- struct task_struct *task;
+
+ write_lock_irq(&old_ve->attach_lock);
cgroup_taskset_for_each(task, cg, tset) {
/* this probihibts ptracing of task entered to VE from host system */
@@ -755,6 +759,8 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
task->task_ve = ve;
}
+
+ write_unlock_irq(&old_ve->attach_lock);
}
static int ve_state_read(struct cgroup *cg, struct cftype *cft,
More information about the Devel
mailing list