From b05bc5f05e807d4e790ebfda7156649e5ba52509 Mon Sep 17 00:00:00 2001 From: Liu Kui Date: Thu, 23 Nov 2023 13:07:54 +0800 Subject: [PATCH RHEL9] fs/fuse kio: always ack RIO_MSG_RDMA_READ_REQ received from csd. In our userspace RDMA implementation, it is required that every RIO_MSG_RDMA_READ_REQ msg must be acked strictly in order. However this rule can be broken due to a bug in kio, which is triggered by very abnormal hardware behaviour where it can take a very long time (>10s) for a WR to complete. This happens in a read workload with large block size, where the client needs to issue an RDMA READ wr to pull the data portion of a response msg returned by csd. When this operation takes a very long time to complete for a msg, it will block responses to requests after it from being sent out by csd for as long as it takes. As a result, these requests will be killed due to calendar timeout. However, when these responses arrive later in the form of RIO_MSG_RDMA_READ_REQ msgs, they will be ignored silently due to the missing request msg, without returning the corresponding RIO_MSG_RDMA_READ_ACK back, which breaks the expectation of ordered acks on the side of csd. Since the rio connection is still in working state, a later valid msg exchange will trigger the BUG_ON check of rb->xid in csd, causing it to crash. This patch makes sure the client will always ack every RIO_MSG_RDMA_READ_REQ received, and in order, to avoid crashing csd. However it can't address any performance impact due to the strange hardware behaviour that it takes an abnormally long time for a WR to complete. 
https://pmc.acronis.work/browse/VSTOR-76834 https://pmc.acronis.work/browse/VSTOR-70758 https://pmc.acronis.work/browse/VSTOR-60807 https://pmc.acronis.work/browse/VSTOR-57903 Signed-off-by: Liu Kui --- fs/fuse/kio/pcs/pcs_rdma_io.c | 58 +++++++++++++++++++++++++++++++---- fs/fuse/kio/pcs/pcs_rdma_io.h | 3 ++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c index 62d138c8b611..c78126ab1d79 100644 --- a/fs/fuse/kio/pcs/pcs_rdma_io.c +++ b/fs/fuse/kio/pcs/pcs_rdma_io.c @@ -130,6 +130,8 @@ static void rio_abort(struct pcs_rdmaio *rio, int error); static void rio_rx_done(struct rio_cqe *cqe, bool sync_mode); static void rio_tx_done(struct rio_cqe *cqe, bool sync_mode); static void rio_tx_err_occured(struct rio_cqe *cqe, bool sync_mode); +static int rio_submit(struct pcs_rdmaio *rio, struct pcs_msg *msg, int type, u64 xid, int status, + bool allow_again); /* Only called when rio->write_queue is not empty */ static struct pcs_msg *rio_dequeue_msg(struct pcs_rdmaio *rio) @@ -424,6 +426,10 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg, struct pcs_rdma_device *dev = rio->dev; struct rio_tx *tx; + /* Blocked until after pending RDMA_READ_ACKs are sent out to keep ACK in order */ + if (rio->n_rdma_read_ack_pending) + return -EAGAIN; + tx = RE_NULL(rio_get_tx(dev)); if (!tx) { if (allow_again) @@ -467,6 +473,8 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg, } } + rio->n_rdma_read_ongoing++; + return 0; fail: @@ -478,6 +486,21 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg, return -EIO; } +static int rio_submit_rdma_read_ack(struct pcs_rdmaio *rio, u64 xid) +{ + int ret; + + /* Can only be sent after all ongoing RDMA_READ_REQs complete to keep ack in order */ + if (rio->n_rdma_read_ongoing) + return -EAGAIN; + + ret = rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, 0, true); + if (!ret) + 
rio->n_rdma_read_ack_pending--; + + return ret; +} + static int rio_rdma_read_job_work(struct rio_job *j) { struct rio_rdma_read_job *job = container_of(j, struct rio_rdma_read_job, job); @@ -488,8 +511,15 @@ static int rio_rdma_read_job_work(struct rio_job *j) return 0; } - return rio_submit_rdma_read(rio, job->msg, job->offset, - &job->rb, true); + /* + * Return RDMA_READ_ACK directly if the original request msg had been killed, + * however must wait until all previous RDMA_READ_REQs have been acked. + */ + if (job->msg == PCS_TRASH_MSG) + return rio_submit_rdma_read_ack(rio, job->rb.xid); + else + return rio_submit_rdma_read(rio, job->msg, job->offset, + &job->rb, true); } static void rio_rdma_read_job_destroy(struct rio_job *j) @@ -766,6 +796,7 @@ static void rio_handle_tx(struct pcs_rdmaio *rio, struct rio_tx *tx, int ok) case TX_SUBMIT_RDMA_READ_ACK: rio_put_tx(rio->dev, tx); rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, !ok, false); + rio->n_rdma_read_ongoing--; break; case TX_WAIT_FOR_TX_COMPL: case TX_WAIT_FOR_READ_ACK: @@ -798,6 +829,7 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len, u32 msg_size; int offset = rio->hdr_size; struct iov_iter it; + struct rio_rdma_read_job *job; if (rio->throttled) { *throttle = 1; @@ -820,6 +852,19 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len, return err; } else if (msg == PCS_TRASH_MSG) { TRACE("rio drop trash msg: %u, rio: 0x%p\n", msg_size, rio); + /* + * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped. + * Missing ack will result in out of order ACK to our peer, which will cause it to crash. + * So we setup a job to ack this msg however it can only be sent out after all ongoing RDMA READ + * completes and will block future RDMA READ being issued. 
+ */ + if (rb) { + job = rio_rdma_read_job_alloc(rio, msg, 0, rb); + if (!job) + return PCS_ERR_NOMEM; + rio_post_tx_job(rio, &job->job); + rio->n_rdma_read_ack_pending++; + } return 0; } @@ -852,12 +897,10 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len, if (len == msg->size) { msg->done(msg); } else if (rio_submit_rdma_read(rio, msg, offset, rb, true) == -EAGAIN) { - struct rio_rdma_read_job *job; job = rio_rdma_read_job_alloc(rio, msg, offset, rb); if (!job) - rio_submit_rdma_read(rio, msg, offset, rb, false); - else - rio_post_tx_job(rio, &job->job); + return PCS_ERR_NOMEM; + rio_post_tx_job(rio, &job->job); } return 0; @@ -1228,6 +1271,9 @@ struct pcs_rdmaio* pcs_rdma_create(int hdr_size, struct rdma_cm_id *cmid, rio->n_os_credits = 0; rio->n_th_credits = queue_depth / 2; + rio->n_rdma_read_ongoing = 0; + rio->n_rdma_read_ack_pending = 0; + rio->cmid = cmid; INIT_LIST_HEAD(&rio->write_queue); diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.h b/fs/fuse/kio/pcs/pcs_rdma_io.h index 18962208e4a2..c5109cbc5fe1 100644 --- a/fs/fuse/kio/pcs/pcs_rdma_io.h +++ b/fs/fuse/kio/pcs/pcs_rdma_io.h @@ -90,6 +90,9 @@ struct pcs_rdmaio int n_th_credits; /* threshold: when to return outstanding * credits urgently */ + int n_rdma_read_ongoing; /* number of ongoing RDMA_READ. */ + int n_rdma_read_ack_pending; /* number of RDMA_READ_ACK to be submitted */ + struct pcs_rdma_device *dev; struct rdma_cm_id *cmid; struct ib_cq *cq; -- 2.32.0 (Apple Git-132)