[Devel] [PATCH RHEL7 COMMIT] ms/x86/spinlocks/paravirt: Fix memory corruption on unlock

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jul 13 18:30:25 MSK 2017


The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.33.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.33.5
------>
commit c133e082fa96099ffb781d59264dc5468871adfe
Author: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com>
Date:   Thu Jul 13 19:30:25 2017 +0400

    ms/x86/spinlocks/paravirt: Fix memory corruption on unlock
    
    The paravirt spinlock code clears the slowpath flag after doing the
    unlock. As explained by Linus, it currently does:
    
                    prev = *lock;
                    add_smp(&lock->tickets.head, TICKET_LOCK_INC);
    
                    /* add_smp() is a full mb() */
    
                    if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
                            __ticket_unlock_slowpath(lock, prev);
    
    which is *exactly* the kind of thing you cannot do with spinlocks:
    once the "add_smp()" has released the spinlock for the fast path,
    you may not access the spinlock any more, because a fast-path locker
    can come in, take the lock, and free the whole data structure.
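    
    To make the race concrete, here is a minimal single-threaded user-space
    model of the old unlock path (illustration only, not part of the patch;
    the toy_* names and constants are assumptions of the model).  The marked
    window is where, on a real SMP kernel, another CPU can take the lock and
    free the object that embeds it, turning the later tail read into a
    use-after-free:
    
        #include <stdint.h>
        #include <stdio.h>
        #include <stdlib.h>
    
        /* Toy layout mirroring the ticket pair; names are illustrative. */
        struct toy_tickets { uint8_t head, tail; };
        struct toy_lock    { struct toy_tickets tickets; };
    
        #define TOY_SLOWPATH_FLAG 1
        #define TOY_LOCK_INC      2
    
        /* Old-style unlock: release first, then read the lock again. */
        static void toy_unlock_old(struct toy_lock *lock)
        {
                lock->tickets.head += TOY_LOCK_INC;   /* lock is free now */
    
                /*
                 * <-- WINDOW: a fast-path locker can take the lock here and
                 *     free the structure embedding it; the read below would
                 *     then touch freed memory.
                 */
                if (lock->tickets.tail & TOY_SLOWPATH_FLAG)
                        printf("would kick a waiter\n");
        }
    
        int main(void)
        {
                struct toy_lock *lock = calloc(1, sizeof(*lock));
    
                if (!lock)
                        return 1;
                lock->tickets.tail = TOY_LOCK_INC;    /* one holder, no waiters */
                toy_unlock_old(lock);
                free(lock);
                return 0;
        }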
    
    Linus suggested that we should not do any writes to the lock after
    unlock(), and that the slowpath clearing can instead be moved to the
    fastpath lock.
    
    So this patch implements the fix with the following two changes (a
    user-space sketch of the combined scheme follows the list):
    
     1. Move the slowpath flag to the head ticket (Oleg):
        Unlocked locks don't care about the slowpath flag; therefore we can keep
        it set after the last unlock and clear it again on the first (try)lock.
        This removes the write after unlock; note that leaving the flag set may
        cause an occasional unnecessary kick.
        By moving the slowpath flag from the tail to the head ticket we also
        avoid the need to access both the head and tail tickets on unlock.
    
     2. Use xadd on unlock, so that the check for an unlock_kick needs no
        read or write after the release (Linus):
        We avoid the read-after-release entirely by using xadd; the previous
        head value it returns already carries the slowpath flag and tells us
        whether we need to do PV kicking of suspended spinners -- on modern
        chips xadd isn't (much) more expensive than an add + load.
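    
    For illustration only, a compact user-space sketch of the combined
    scheme follows, written with the GCC/Clang __atomic builtins; the toy_*
    names, the 16-bit ticket width and the constants are assumptions of the
    model, not the kernel API.  It keeps the slowpath flag in the head
    ticket, lets the next (sole) locker clear it with a cmpxchg, and unlocks
    with a single xadd on head, never touching the lock afterwards:
    
        #include <stdint.h>
        #include <stdio.h>
    
        #define TOY_SLOWPATH_FLAG ((uint16_t)1)
        #define TOY_LOCK_INC      ((uint16_t)2)
    
        union toy_lock {
                uint32_t head_tail;
                struct { uint16_t head, tail; } tickets;
        };
    
        static int toy_tickets_equal(uint16_t a, uint16_t b)
        {
                return !((a ^ b) & ~TOY_SLOWPATH_FLAG);
        }
    
        /* Clear the flag only if we are the sole ticket holder (cf. the
         * kernel's __ticket_check_and_clear_slowpath()). */
        static void toy_check_and_clear_slowpath(union toy_lock *lock,
                                                 uint16_t head)
        {
                union toy_lock old, new;
    
                if (!(head & TOY_SLOWPATH_FLAG))
                        return;
    
                old.tickets.head = head;
                new.tickets.head = head & ~TOY_SLOWPATH_FLAG;
                old.tickets.tail = new.tickets.head + TOY_LOCK_INC;
                new.tickets.tail = old.tickets.tail;
                __atomic_compare_exchange_n(&lock->head_tail, &old.head_tail,
                                            new.head_tail, 0,
                                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
        }
    
        static void toy_lock_acquire(union toy_lock *lock)
        {
                uint16_t tail = __atomic_fetch_add(&lock->tickets.tail,
                                                   TOY_LOCK_INC,
                                                   __ATOMIC_SEQ_CST);
                uint16_t head;
    
                for (;;) {
                        head = __atomic_load_n(&lock->tickets.head,
                                               __ATOMIC_ACQUIRE);
                        if (toy_tickets_equal(head, tail))
                                break;
                        /* The kernel spins SPIN_THRESHOLD times and then
                         * blocks via __ticket_lock_spinning(). */
                }
                toy_check_and_clear_slowpath(lock, head);
        }
    
        static void toy_lock_release(union toy_lock *lock)
        {
                /* One atomic add on head; prev already tells us whether to
                 * kick, so the lock is never touched after the release. */
                uint16_t prev = __atomic_fetch_add(&lock->tickets.head,
                                                   TOY_LOCK_INC,
                                                   __ATOMIC_SEQ_CST);
    
                if (prev & TOY_SLOWPATH_FLAG)
                        printf("kick waiter %u\n",
                               (unsigned)((prev & ~TOY_SLOWPATH_FLAG) +
                                          TOY_LOCK_INC));
        }
    
        int main(void)
        {
                union toy_lock lock = { .head_tail = 0 };
    
                toy_lock_acquire(&lock);
                toy_lock_release(&lock);
                printf("head=%u tail=%u\n", (unsigned)lock.tickets.head,
                       (unsigned)lock.tickets.tail);
                return 0;
        }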
    
    Result:
     setup: 16-core Sandy Bridge (32 CPUs with HT), 8 GB, 16-vCPU guest
     benchmark  overcommit  % improvement
     kernbench      1x         -0.13
     kernbench      2x          0.02
     dbench         1x         -1.77
     dbench         2x         -0.63
    
    [Jeremy: Hinted missing TICKET_LOCK_INC for kick]
    [Oleg: Moved slowpath flag to head, ticket_equals idea]
    [PeterZ: Added detailed changelog]
    
    Suggested-by: Linus Torvalds <torvalds at linux-foundation.org>
    Reported-by: Sasha Levin <sasha.levin at oracle.com>
    Tested-by: Sasha Levin <sasha.levin at oracle.com>
    Signed-off-by: Raghavendra K T <raghavendra.kt at linux.vnet.ibm.com>
    Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
    Reviewed-by: Oleg Nesterov <oleg at redhat.com>
    Cc: Andrew Jones <drjones at redhat.com>
    Cc: Andrew Morton <akpm at linux-foundation.org>
    Cc: Andy Lutomirski <luto at amacapital.net>
    Cc: Boris Ostrovsky <boris.ostrovsky at oracle.com>
    Cc: Christian Borntraeger <borntraeger at de.ibm.com>
    Cc: Christoph Lameter <cl at linux.com>
    Cc: Dave Hansen <dave.hansen at linux.intel.com>
    Cc: Dave Jones <davej at redhat.com>
    Cc: David Vrabel <david.vrabel at citrix.com>
    Cc: Fernando Luis Vázquez Cao <fernando_b1 at lab.ntt.co.jp>
    Cc: Konrad Rzeszutek Wilk <konrad.wilk at oracle.com>
    Cc: Masami Hiramatsu <masami.hiramatsu.pt at hitachi.com>
    Cc: Paolo Bonzini <pbonzini at redhat.com>
    Cc: Paul E. McKenney <paulmck at linux.vnet.ibm.com>
    Cc: Ulrich Obergfell <uobergfe at redhat.com>
    Cc: Waiman Long <Waiman.Long at hp.com>
    Cc: a.ryabinin at samsung.com
    Cc: dave at stgolabs.net
    Cc: hpa at zytor.com
    Cc: jasowang at redhat.com
    Cc: jeremy at goop.org
    Cc: paul.gortmaker at windriver.com
    Cc: riel at redhat.com
    Cc: tglx at linutronix.de
    Cc: waiman.long at hp.com
    Cc: xen-devel at lists.xenproject.org
    Link: http://lkml.kernel.org/r/20150215173043.GA7471@linux.vnet.ibm.com
    Signed-off-by: Ingo Molnar <mingo at kernel.org>
    
    https://jira.sw.ru/browse/PSBM-68212
    
    (cherry picked from commit d6abfdb2022368d8c6c4be3f11a06656601a6cc2)
    combined with:
        78bff1c868 x86/ticketlock: Fix spin_unlock_wait() livelock
        4f9d1382e6 x86/spinlock: Replace ACCESS_ONCE with READ_ONCE
    
    Signed-off-by: Denis Plotnikov <dplotnikov at virtuozzo.com>
    Acked-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 arch/x86/include/asm/spinlock.h | 102 ++++++++++++++++++++--------------------
 arch/x86/kernel/kvm.c           |  13 +++--
 arch/x86/xen/spinlock.c         |  13 +++--
 3 files changed, 68 insertions(+), 60 deletions(-)

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 63140dc..08e7293 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -9,11 +9,6 @@
 #include <asm/paravirt.h>
 #include <asm/bitops.h>
 
-static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
-{
-	return lock.tickets.head == lock.tickets.tail;
-}
-
 /*
  * Your basic SMP spinlocks, allowing only a single CPU anywhere
  *
@@ -51,7 +46,7 @@ static __always_inline bool static_key_false(struct static_key *key);
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
 {
-	set_bit(0, (volatile unsigned long *)&lock->tickets.tail);
+	set_bit(0, (volatile unsigned long *)&lock->tickets.head);
 }
 
 #else  /* !CONFIG_PARAVIRT_SPINLOCKS */
@@ -65,6 +60,31 @@ static inline void __ticket_unlock_kick(arch_spinlock_t *lock,
 }
 
 #endif /* CONFIG_PARAVIRT_SPINLOCKS */
+static inline int  __tickets_equal(__ticket_t one, __ticket_t two)
+{
+	return !((one ^ two) & ~TICKET_SLOWPATH_FLAG);
+}
+
+static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock,
+							__ticket_t head)
+{
+	if (head & TICKET_SLOWPATH_FLAG) {
+		arch_spinlock_t old, new;
+
+		old.tickets.head = head;
+		new.tickets.head = head & ~TICKET_SLOWPATH_FLAG;
+		old.tickets.tail = new.tickets.head + TICKET_LOCK_INC;
+		new.tickets.tail = old.tickets.tail;
+
+		/* try to clear slowpath flag when there are no contenders */
+		cmpxchg(&lock->head_tail, old.head_tail, new.head_tail);
+	}
+}
+
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return __tickets_equal(lock.tickets.head, lock.tickets.tail);
+}
 
 /*
  * Ticket locks are conceptually two parts, one indicating the current head of
@@ -87,18 +107,21 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
 	if (likely(inc.head == inc.tail))
 		goto out;
 
-	inc.tail &= ~TICKET_SLOWPATH_FLAG;
 	for (;;) {
 		unsigned count = SPIN_THRESHOLD;
 
 		do {
-			if (READ_ONCE(lock->tickets.head) == inc.tail)
-				goto out;
+			inc.head = READ_ONCE(lock->tickets.head);
+			if (__tickets_equal(inc.head, inc.tail))
+				goto clear_slowpath;
 			cpu_relax();
 		} while (--count);
 		__ticket_lock_spinning(lock, inc.tail);
 	}
-out:	barrier();	/* make sure nothing creeps before the lock is taken */
+clear_slowpath:
+	__ticket_check_and_clear_slowpath(lock, inc.head);
+out:
+	barrier();	/* make sure nothing creeps before the lock is taken */
 }
 
 static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -106,56 +129,30 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
 	arch_spinlock_t old, new;
 
 	old.tickets = READ_ONCE(lock->tickets);
-	if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
+	if (!__tickets_equal(old.tickets.head, old.tickets.tail))
 		return 0;
 
 	new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT);
+	new.head_tail &= ~TICKET_SLOWPATH_FLAG;
 
 	/* cmpxchg is a full barrier, so nothing can move before it */
 	return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail;
 }
 
-static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock,
-					    arch_spinlock_t old)
-{
-	arch_spinlock_t new;
-
-	BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
-
-	/* Perform the unlock on the "before" copy */
-	old.tickets.head += TICKET_LOCK_INC;
-
-	/* Clear the slowpath flag */
-	new.head_tail = old.head_tail & ~(TICKET_SLOWPATH_FLAG << TICKET_SHIFT);
-
-	/*
-	 * If the lock is uncontended, clear the flag - use cmpxchg in
-	 * case it changes behind our back though.
-	 */
-	if (new.tickets.head != new.tickets.tail ||
-	    cmpxchg(&lock->head_tail, old.head_tail,
-					new.head_tail) != old.head_tail) {
-		/*
-		 * Lock still has someone queued for it, so wake up an
-		 * appropriate waiter.
-		 */
-		__ticket_unlock_kick(lock, old.tickets.head);
-	}
-}
-
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
 	if (TICKET_SLOWPATH_FLAG &&
-	    static_key_false(&paravirt_ticketlocks_enabled)) {
-		arch_spinlock_t prev;
+		static_key_false(&paravirt_ticketlocks_enabled)) {
+		__ticket_t head;
 
-		prev = *lock;
-		add_smp(&lock->tickets.head, TICKET_LOCK_INC);
+		BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS);
 
-		/* add_smp() is a full mb() */
+		head = xadd(&lock->tickets.head, TICKET_LOCK_INC);
 
-		if (unlikely(lock->tickets.tail & TICKET_SLOWPATH_FLAG))
-			__ticket_unlock_slowpath(lock, prev);
+		if (unlikely(head & TICKET_SLOWPATH_FLAG)) {
+			head &= ~TICKET_SLOWPATH_FLAG;
+			__ticket_unlock_kick(lock, (head + TICKET_LOCK_INC));
+		}
 	} else
 		__add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX);
 }
@@ -164,14 +161,15 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return tmp.tail != tmp.head;
+	return !__tickets_equal(tmp.tail, tmp.head);
 }
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
 	struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 
-	return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
+	tmp.head &= ~TICKET_SLOWPATH_FLAG;
+	return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
 }
 #define arch_spin_is_contended	arch_spin_is_contended
 
@@ -183,16 +181,16 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
-	__ticket_t head = ACCESS_ONCE(lock->tickets.head);
+	__ticket_t head = READ_ONCE(lock->tickets.head);
 
 	for (;;) {
-		struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+		struct __raw_tickets tmp = READ_ONCE(lock->tickets);
 		/*
 		 * We need to check "unlocked" in a loop, tmp.head == head
 		 * can be false positive because of overflow.
 		 */
-		if (tmp.head == (tmp.tail & ~TICKET_SLOWPATH_FLAG) ||
-		    tmp.head != head)
+		if (__tickets_equal(tmp.head, tmp.tail) ||
+				!__tickets_equal(tmp.head, head))
 			break;
 
 		cpu_relax();
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index d35c811..74d43f3 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -600,7 +600,7 @@ static inline void check_zero(void)
 	u8 ret;
 	u8 old;
 
-	old = ACCESS_ONCE(zero_stats);
+	old = READ_ONCE(zero_stats);
 	if (unlikely(old)) {
 		ret = cmpxchg(&zero_stats, old, 0);
 		/* This ensures only one fellow resets the stat */
@@ -718,6 +718,7 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	int cpu;
 	u64 start;
 	unsigned long flags;
+	__ticket_t head;
 
 	if (in_nmi())
 		return;
@@ -759,11 +760,15 @@ __visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	 */
 	__ticket_enter_slowpath(lock);
 
+	/* make sure enter_slowpath, which is atomic does not cross the read */
+	smp_mb__after_atomic();
+
 	/*
 	 * check again make sure it didn't become free while
 	 * we weren't looking.
 	 */
-	if (ACCESS_ONCE(lock->tickets.head) == want) {
+	head = READ_ONCE(lock->tickets.head);
+	if (__tickets_equal(head, want)) {
 		add_stats(TAKEN_SLOW_PICKUP, 1);
 		goto out;
 	}
@@ -794,8 +799,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
 	add_stats(RELEASED_SLOW, 1);
 	for_each_cpu(cpu, &waiting_cpus) {
 		const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu);
-		if (ACCESS_ONCE(w->lock) == lock &&
-		    ACCESS_ONCE(w->want) == ticket) {
+		if (READ_ONCE(w->lock) == lock &&
+		    READ_ONCE(w->want) == ticket) {
 			add_stats(RELEASED_SLOW_KICKED, 1);
 			kvm_kick_cpu(cpu);
 			break;
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index ebab852..08fe23b 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -41,7 +41,7 @@ static u8 zero_stats;
 static inline void check_zero(void)
 {
 	u8 ret;
-	u8 old = ACCESS_ONCE(zero_stats);
+	u8 old = READ_ONCE(zero_stats);
 	if (unlikely(old)) {
 		ret = cmpxchg(&zero_stats, old, 0);
 		/* This ensures only one fellow resets the stat */
@@ -112,6 +112,7 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	struct xen_lock_waiting *w = &__get_cpu_var(lock_waiting);
 	int cpu = smp_processor_id();
 	u64 start;
+	__ticket_t head;
 	unsigned long flags;
 
 	/* If kicker interrupts not initialized yet, just spin */
@@ -159,11 +160,15 @@ __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 	 */
 	__ticket_enter_slowpath(lock);
 
+	/* make sure enter_slowpath, which is atomic does not cross the read */
+	smp_mb__after_atomic();
+
 	/*
 	 * check again make sure it didn't become free while
 	 * we weren't looking
 	 */
-	if (ACCESS_ONCE(lock->tickets.head) == want) {
+	head = READ_ONCE(lock->tickets.head);
+	if (__tickets_equal(head, want)) {
 		add_stats(TAKEN_SLOW_PICKUP, 1);
 		goto out;
 	}
@@ -204,8 +209,8 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 		const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu);
 
 		/* Make sure we read lock before want */
-		if (ACCESS_ONCE(w->lock) == lock &&
-		    ACCESS_ONCE(w->want) == next) {
+		if (READ_ONCE(w->lock) == lock &&
+		    READ_ONCE(w->want) == next) {
 			add_stats(RELEASED_SLOW_KICKED, 1);
 			xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
 			break;

