[Devel] [PATCH RHEL7 COMMIT] ms/futex: Add another early deadlock detection check

Konstantin Khorenko khorenko at virtuozzo.com
Wed Jun 24 05:32:38 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.17
------>
commit 96fe3d7b30416ed954a71f5dc986f40698e9caf4
Author: Thomas Gleixner <tglx at linutronix.de>
Date:   Wed Jun 24 16:32:38 2015 +0400

    ms/futex: Add another early deadlock detection check
    
    Dave Jones trinity syscall fuzzer exposed an issue in the deadlock
    detection code of rtmutex:
      http://lkml.kernel.org/r/20140429151655.GA14277@redhat.com
    
    That underlying issue has been fixed with a patch to the rtmutex code,
    but the futex code must not call into rtmutex in that case because
        - it can detect that issue early
        - it avoids a different and more complex fixup for backing out
    
    If the user space variable got manipulated to 0x80000000 which means
    no lock holder, but the waiters bit set and an active pi_state in the
    kernel is found we can figure out the recursive locking issue by
    looking at the pi_state owner. If that is the current task, then we
    can safely return -EDEADLK.
    
    The check should have been added in commit 59fa62451 (futex: Handle
    futex_pi OWNER_DIED take over correctly) already, but I did not see
    the above issue caused by user space manipulation back then.
    
    Signed-off-by: Thomas Gleixner <tglx at linutronix.de>
    Cc: Dave Jones <davej at redhat.com>
    Cc: Linus Torvalds <torvalds at linux-foundation.org>
    Cc: Peter Zijlstra <peterz at infradead.org>
    Cc: Darren Hart <darren at dvhart.com>
    Cc: Davidlohr Bueso <davidlohr at hp.com>
    Cc: Steven Rostedt <rostedt at goodmis.org>
    Cc: Clark Williams <williams at redhat.com>
    Cc: Paul McKenney <paulmck at linux.vnet.ibm.com>
    Cc: Lai Jiangshan <laijs at cn.fujitsu.com>
    Cc: Roland McGrath <roland at hack.frob.com>
    Cc: Carlos ODonell <carlos at redhat.com>
    Cc: Jakub Jelinek <jakub at redhat.com>
    Cc: Michael Kerrisk <mtk.manpages at gmail.com>
    Cc: Sebastian Andrzej Siewior <bigeasy at linutronix.de>
    Link: http://lkml.kernel.org/r/20140512201701.097349971@linutronix.de
    Signed-off-by: Thomas Gleixner <tglx at linutronix.de>
    Cc: stable at vger.kernel.org
    (cherry picked from commit 866293ee54227584ffcb4a42f69c1f365974ba7f)
    
    Ports diff-ms-futex-add-another-early-deadlock-detection-check-2
    
    Related to https://jira.sw.ru/browse/PSBM-33650
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
---
 kernel/futex.c | 47 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index f9d7dba..2d93205 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -730,7 +730,8 @@ void exit_pi_state_list(struct task_struct *curr)
 
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
-		union futex_key *key, struct futex_pi_state **ps)
+		union futex_key *key, struct futex_pi_state **ps,
+		struct task_struct *task)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
@@ -771,6 +772,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 					return -EINVAL;
 			}
 
+			/*
+			 * Protect against a corrupted uval. If uval
+			 * is 0x80000000 then pid is 0 and the waiter
+			 * bit is set. So the deadlock check in the
+			 * calling code has failed and we did not fall
+			 * into the check above due to !pid.
+			 */
+			if (task && pi_state->owner == task)
+				return -EDEADLK;
+
 			atomic_inc(&pi_state->refcount);
 			*ps = pi_state;
 
@@ -925,7 +936,7 @@ retry:
 	 * We dont have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
-	ret = lookup_pi_state(uval, hb, key, ps);
+	ret = lookup_pi_state(uval, hb, key, ps, task);
 
 	if (unlikely(ret)) {
 		switch (ret) {
@@ -1337,7 +1348,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  *
  * Return:
  *  0 - failed to acquire the lock atomically;
- *  1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1348,7 +1359,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
 	struct futex_q *top_waiter = NULL;
 	u32 curval;
-	int ret;
+	int ret, vpid;
 
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
@@ -1376,11 +1387,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	 * the contended case or if set_waiters is 1.  The pi_state is returned
 	 * in ps in contended cases.
 	 */
+	vpid = task_pid_vnr(top_waiter->task);
 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
 				   set_waiters);
-	if (ret == 1)
+	if (ret == 1) {
 		requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+		return vpid;
+	}
 	return ret;
 }
 
@@ -1411,7 +1424,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
-	u32 curval2;
 
 	if (requeue_pi) {
 		/*
@@ -1497,16 +1509,25 @@ retry_private:
 		 * At this point the top_waiter has either taken uaddr2 or is
 		 * waiting on it.  If the former, then the pi_state will not
 		 * exist yet, look it up one more time to ensure we have a
-		 * reference to it.
+		 * reference to it. If the lock was taken, ret contains the
+		 * vpid of the top waiter task.
 		 */
-		if (ret == 1) {
+		if (ret > 0) {
 			WARN_ON(pi_state);
 			drop_count++;
 			task_count++;
-			ret = get_futex_value_locked(&curval2, uaddr2);
-			if (!ret)
-				ret = lookup_pi_state(curval2, hb2, &key2,
-						      &pi_state);
+			/*
+			 * If we acquired the lock, then the user
+			 * space value of uaddr2 should be vpid. It
+			 * cannot be changed by the top waiter as it
+			 * is blocked on hb2 lock if it tries to do
+			 * so. If something fiddled with it behind our
+			 * back the pi state lookup might unearth
+			 * it. So we rather use the known value than
+			 * rereading and handing potential crap to
+			 * lookup_pi_state.
+			 */
+			ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
 		}
 
 		switch (ret) {



More information about the Devel mailing list