[Devel] [PATCH RHEL9 COMMIT] fs/fuse kio: always ack RIO_MSG_RDMA_READ_REQ received from csd

Konstantin Khorenko khorenko at virtuozzo.com
Thu Nov 23 14:16:49 MSK 2023


The commit is pushed to "branch-rh9-5.14.0-284.25.1.vz9.30.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-284.25.1.vz9.30.11
------>
commit 536f6f0a533f7932f9314c3464c495437c194c43
Author: Kui Liu <Kui.Liu at acronis.com>
Date:   Thu Nov 23 07:26:11 2023 +0000

    fs/fuse kio: always ack RIO_MSG_RDMA_READ_REQ received from csd
    
    In our userspace RDMA implementation, every RIO_MSG_RDMA_READ_REQ
    msg must be acked strictly in order. However, this rule can be
    broken by a bug in kio, although triggering it takes very abnormal
    hardware behaviour: a WR taking a very long time (>10s) to
    complete.
    
    This happens in a read workload with a large block size, where the
    client needs to issue an RDMA READ wr to pull the data portion of
    a response msg returned by csd. When this operation takes a very
    long time to complete for one msg, it blocks csd from sending out
    the responses to subsequent requests for the whole duration. As a
    result, those requests are killed due to calendar timeout. When
    their responses arrive later in the form of RIO_MSG_RDMA_READ_REQ
    msgs, they are silently ignored because the request msgs are gone,
    and no corresponding RIO_MSG_RDMA_READ_ACK is returned, which
    breaks csd's expectation of ordered acks. Since the rio connection
    is still in working state, a later valid msg exchange triggers the
    BUG_ON check of rb->xid in csd, causing it to crash.
    
    This patch makes sure the client always acks every
    RIO_MSG_RDMA_READ_REQ received, and does so in order, to avoid
    crashing csd. However, it cannot address the performance impact of
    the strange hardware behaviour itself, i.e. a WR taking an
    abnormally long time to complete.
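
    The ordering is enforced with two counters; below is a minimal
    user-space sketch of that gate (rio_gate, submit_rdma_read and
    submit_rdma_read_ack are simplified stand-ins for the patched
    structures and functions, and -EAGAIN means "requeue the job and
    retry later"):

        #include <errno.h>

        struct rio_gate {
                int n_rdma_read_ongoing;     /* READs posted, not yet acked */
                int n_rdma_read_ack_pending; /* acks owed for dropped msgs */
        };

        /* A new RDMA READ may start only once every ack owed for
         * earlier dropped msgs has gone out; otherwise its own ack
         * would overtake them. */
        static int submit_rdma_read(struct rio_gate *g)
        {
                if (g->n_rdma_read_ack_pending)
                        return -EAGAIN;
                g->n_rdma_read_ongoing++;
                return 0;
        }

        /* An ack for a dropped msg may go out only once every RDMA
         * READ in flight has completed and acked, keeping the global
         * ack order. */
        static int submit_rdma_read_ack(struct rio_gate *g)
        {
                if (g->n_rdma_read_ongoing)
                        return -EAGAIN;
                g->n_rdma_read_ack_pending--;
                return 0;
        }

        int main(void)
        {
                struct rio_gate g = { 0, 1 }; /* one ack owed */

                /* submit_rdma_read(&g) would return -EAGAIN here */
                submit_rdma_read_ack(&g);     /* owed ack goes out first */
                return submit_rdma_read(&g);  /* now the READ may start */
        }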
    
    https://pmc.acronis.work/browse/VSTOR-76834
    https://pmc.acronis.work/browse/VSTOR-70758
    https://pmc.acronis.work/browse/VSTOR-60807
    https://pmc.acronis.work/browse/VSTOR-57903
    
    Signed-off-by: Liu Kui <Kui.Liu at acronis.com>
    Acked-by: Alexey Kuznetsov <kuznet at acronis.com>
---
 fs/fuse/kio/pcs/pcs_rdma_io.c | 62 ++++++++++++++++++++++++++++++++++++++-----
 fs/fuse/kio/pcs/pcs_rdma_io.h |  3 +++
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c
index 62d138c8b611..d2646056ff68 100644
--- a/fs/fuse/kio/pcs/pcs_rdma_io.c
+++ b/fs/fuse/kio/pcs/pcs_rdma_io.c
@@ -130,6 +130,8 @@ static void rio_abort(struct pcs_rdmaio *rio, int error);
 static void rio_rx_done(struct rio_cqe *cqe, bool sync_mode);
 static void rio_tx_done(struct rio_cqe *cqe, bool sync_mode);
 static void rio_tx_err_occured(struct rio_cqe *cqe, bool sync_mode);
+static int rio_submit(struct pcs_rdmaio *rio, struct pcs_msg *msg, int type, u64 xid, int status,
+		      bool allow_again);
 
 /* Only called when rio->write_queue is not empty */
 static struct pcs_msg *rio_dequeue_msg(struct pcs_rdmaio *rio)
@@ -424,6 +426,10 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
 	struct pcs_rdma_device *dev = rio->dev;
 	struct rio_tx *tx;
 
+	/* Block until all pending RDMA_READ_ACKs have been sent out, to keep acks in order */
+	if (rio->n_rdma_read_ack_pending)
+		return -EAGAIN;
+
 	tx = RE_NULL(rio_get_tx(dev));
 	if (!tx) {
 		if (allow_again)
@@ -467,6 +473,8 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
 		}
 	}
 
+	rio->n_rdma_read_ongoing++;
+
 	return 0;
 
 fail:
@@ -478,6 +486,21 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
 	return -EIO;
 }
 
+static int rio_submit_rdma_read_ack(struct pcs_rdmaio *rio, u64 xid)
+{
+	int ret;
+
+	/* Can only be sent after all ongoing RDMA_READ_REQs have completed, to keep acks in order */
+	if (rio->n_rdma_read_ongoing)
+		return -EAGAIN;
+
+	ret = rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, 0, true);
+	if (!ret)
+		rio->n_rdma_read_ack_pending--;
+
+	return ret;
+}
+
 static int rio_rdma_read_job_work(struct rio_job *j)
 {
 	struct rio_rdma_read_job *job = container_of(j, struct rio_rdma_read_job, job);
@@ -488,8 +511,16 @@ static int rio_rdma_read_job_work(struct rio_job *j)
 		return 0;
 	}
 
-	return rio_submit_rdma_read(rio, job->msg, job->offset,
-				    &job->rb, true);
+	/*
+	 * If the original request msg has been killed, return an
+	 * RDMA_READ_ACK directly; however, we must wait until all
+	 * previous RDMA_READ_REQs have been acked.
+	 */
+	if (job->msg == PCS_TRASH_MSG)
+		return rio_submit_rdma_read_ack(rio, job->rb.xid);
+	else
+		return rio_submit_rdma_read(rio, job->msg, job->offset,
+					    &job->rb, true);
 }
 
 static void rio_rdma_read_job_destroy(struct rio_job *j)
@@ -766,6 +797,7 @@ static void rio_handle_tx(struct pcs_rdmaio *rio, struct rio_tx *tx, int ok)
 		case TX_SUBMIT_RDMA_READ_ACK:
 			rio_put_tx(rio->dev, tx);
 			rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, !ok, false);
+			rio->n_rdma_read_ongoing--;
 			break;
 		case TX_WAIT_FOR_TX_COMPL:
 		case TX_WAIT_FOR_READ_ACK:
@@ -798,6 +830,7 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
 	u32 msg_size;
 	int offset = rio->hdr_size;
 	struct iov_iter it;
+	struct rio_rdma_read_job *job;
 
 	if (rio->throttled) {
 		*throttle = 1;
@@ -820,6 +853,22 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
 		return err;
 	} else if (msg == PCS_TRASH_MSG) {
 		TRACE("rio drop trash msg: %u, rio: 0x%p\n", msg_size, rio);
+		/*
+		 * We must ack every RDMA_READ_REQ received from our peer, in
+		 * order, even if the msg is going to be dropped.
+		 * A missing ack would result in out-of-order acks to our
+		 * peer, which would cause it to crash.
+		 * So we set up a job to ack this msg; however, it can only be
+		 * sent out after all ongoing RDMA READs complete, and it will
+		 * block further RDMA READs from being issued.
+		 */
+		if (rb) {
+			job = rio_rdma_read_job_alloc(rio, msg, 0, rb);
+			if (!job)
+				return PCS_ERR_NOMEM;
+			rio_post_tx_job(rio, &job->job);
+			rio->n_rdma_read_ack_pending++;
+		}
 		return 0;
 	}
 
@@ -852,12 +901,10 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
 	if (len == msg->size) {
 		msg->done(msg);
 	} else if (rio_submit_rdma_read(rio, msg, offset, rb, true) == -EAGAIN) {
-		struct rio_rdma_read_job *job;
 		job = rio_rdma_read_job_alloc(rio, msg, offset, rb);
 		if (!job)
-			rio_submit_rdma_read(rio, msg, offset, rb, false);
-		else
-			rio_post_tx_job(rio, &job->job);
+			return PCS_ERR_NOMEM;
+		rio_post_tx_job(rio, &job->job);
 	}
 
 	return 0;
@@ -1228,6 +1275,9 @@ struct pcs_rdmaio* pcs_rdma_create(int hdr_size, struct rdma_cm_id *cmid,
 	rio->n_os_credits = 0;
 	rio->n_th_credits = queue_depth / 2;
 
+	rio->n_rdma_read_ongoing = 0;
+	rio->n_rdma_read_ack_pending = 0;
+
 	rio->cmid = cmid;
 
 	INIT_LIST_HEAD(&rio->write_queue);
diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.h b/fs/fuse/kio/pcs/pcs_rdma_io.h
index 18962208e4a2..c5109cbc5fe1 100644
--- a/fs/fuse/kio/pcs/pcs_rdma_io.h
+++ b/fs/fuse/kio/pcs/pcs_rdma_io.h
@@ -90,6 +90,9 @@ struct pcs_rdmaio
 	int n_th_credits;   /* threshold: when to return outstanding
 			     * credits urgently */
 
+	int n_rdma_read_ongoing;	/* number of ongoing RDMA READs */
+	int n_rdma_read_ack_pending;	/* number of RDMA_READ_ACKs yet to be submitted */
+
 	struct pcs_rdma_device *dev;
 	struct rdma_cm_id *cmid;
 	struct ib_cq *cq;

