[Devel] [PATCH RHEL7 COMMIT] scsi/eh: fix hang adding ehandler wakeups after decrementing host_busy

Wed Sep 6 19:14:50 MSK 2017

The commit is pushed to "branch-rh7-3.10.0-514.26.1.vz7.35.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.26.1.vz7.35.7
------>
commit 9ad5b119debbea53b7c9967c8b9060cc83a2ffb5
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Wed Sep 6 19:14:50 2017 +0300

    scsi/eh: fix hang adding ehandler wakeups after decrementing host_busy
    
    We have a problem on several our nodes with scsi EH. Imagine such an
    order of execution of two threads:
    
    CPU1 scsi_eh_scmd_add		CPU2 scsi_host_queue_ready
    /* shost->host_busy == 1 initialy */
    
    				if (shost->shost_state == SHOST_RECOVERY)
    					/* does not get here */
    					return 0;
    
    lock(shost->host_lock);
    shost->shost_state = SHOST_RECOVERY;
    
    				busy = shost->host_busy++;
    				/* host->can_queue == 1 initialy, busy == 1 */
    				lock(shost->host_lock) /* wait */
    
    shost->host_failed++;
    /* shost->host_busy == 2, shost->host_failed == 1 */
    call scsi_eh_wakeup(shost) {
    	if (host_busy == host_failed) {
    		/* does not get here */
    		wake_up_process(shost->ehandler)
    	}
    }
    unlock(shost->host_lock)
    
    				/* acquire lock */
    				shost->host_busy--;
    
    Finaly we do not wakeup scsi_error_handler and all other commands
    comming will hang as we are in never ending recovery state as there
    is no one left to wakeup handler.
    
    So scsi disc in these host becomes unresponsive and all bio on node
    hangs. (We trigger these problem when scsi cmnds to DVD drive timeout.)
    
    Main idea of the fix is to try to do wake up every time we decrement
    host_busy or increment host_faild(the latter is already OK).
    
    Now the very *last* one of busy threads getting host_lock after
    decrementing host_busy will see all write operations on host's
    shost_state, host_busy and host_failed completed thanks to implied
    memory barriers on spin_lock/unlock, so at the time of busy==failed
    we will trigger wakeup in at least one thread. (Thats why putting
    recovery and failed check under lock)
    
    Patch sent to mainstream as well:
    https://patchwork.kernel.org/patch/9938917/
    
    https://jira.sw.ru/browse/PSBM-69788
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 drivers/scsi/scsi_lib.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0f55949..ec21118 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -320,13 +320,13 @@ void scsi_device_unbusy(struct scsi_device *sdev)
 	if (starget->can_queue > 0)
 		atomic_dec(&starget->target_busy);
 
+	spin_lock_irqsave(shost->host_lock, flags);
 	if (unlikely(scsi_host_in_recovery(shost) &&
 		     (shost->host_failed || shost->host_eh_scheduled))) {
-		spin_lock_irqsave(shost->host_lock, flags);
 		scsi_debug_log_shost(SCSI_DEVICE_UNBUSY_CALLS_EH_WAKEUP, shost);
 		scsi_eh_wakeup(shost);
-		spin_unlock_irqrestore(shost->host_lock, flags);
 	}
+	spin_unlock_irqrestore(shost->host_lock, flags);
 
 	atomic_dec(&sdev->device_busy);
 }
@@ -1568,6 +1568,13 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
 out_dec:
 	scsi_debug_log_sdev(SCSI_HOST_QUEUE_READY_DEC_HOST_BUSY, sdev);
 	atomic_dec(&shost->host_busy);
+
+	spin_lock_irq(shost->host_lock);
+	if (unlikely(scsi_host_in_recovery(shost) &&
+		     (shost->host_failed || shost->host_eh_scheduled)))
+		scsi_eh_wakeup(shost);
+	spin_unlock_irq(shost->host_lock);
+
 	return 0;
 }
 
@@ -1958,6 +1965,13 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 out_dec_host_busy:
 	scsi_debug_log_sdev(SCSI_QUEUE_RQ_DEC_HOST_BUSY, sdev);
 	atomic_dec(&shost->host_busy);
+
+	spin_lock_irq(shost->host_lock);
+	if (unlikely(scsi_host_in_recovery(shost) &&
+		     (shost->host_failed || shost->host_eh_scheduled)))
+		scsi_eh_wakeup(shost);
+	spin_unlock_irq(shost->host_lock);
+
 out_dec_target_busy:
 	if (scsi_target(sdev)->can_queue > 0)
 		atomic_dec(&scsi_target(sdev)->target_busy);