[Devel] [PATCH RHEL9] fs/fuse kio: always ack RIO_MSG_RDMA_READ_REQ received from csd.

Alexey Kuznetsov kuznet at acronis.com
Thu Nov 23 12:30:53 MSK 2023


Ack

On Thu, Nov 23, 2023 at 3:28 PM Kui Liu <Kui.Liu at acronis.com> wrote:
>
> In our userspace RDMA implementation, it is required that every
>
> RIO_MSG_RDMA_READ_REQ msg must be acked strictly in order. However
>
> this rule can be broken due to a bug in kio, which though is
>
> triggered by very abnormal hardware behaviour that it can take
>
> very long time (>10s) for a WR to complete.
>
>
>
> This happens in the read workload with large block size that
>
> the client needs to issue RDMA READ wr to pull the data portion
>
> of a response msg returned by csd. When this operation takes very
>
> long time to complete for a msg, it will block responses to requests
>
> after it from being sent out by csd for as long as it can take.
>
> As a result, these requests will be killed due to calendar timeout.
>
> However when these responses arrive later in the form of RIO_MSG_RDMA_READ_REQ
>
> msg, they will be ignored silently due to the missing request msg without
>
> returning the corresponding RIO_MSG_RDMA_READ_ACK back, thereby breaking the
>
> expectation of ordered ack on the side of csd. Since the rio connection
>
> is still in working state, a later valid msg exchange will trigger
>
> the BUG_ON check of rb->xid in csd, causing it to crash.
>
>
>
> This patch makes sure client will always ack every RIO_MSG_RDMA_READ_REQ
>
> received, and in order, to avoid crashing csd. However it can't address
>
> any performance impact due to the strange hardware behaviour that it
>
> takes an abnormally long time for a WR to complete.
>
>
>
> https://pmc.acronis.work/browse/VSTOR-76834
>
> https://pmc.acronis.work/browse/VSTOR-70758
>
> https://pmc.acronis.work/browse/VSTOR-60807
>
> https://pmc.acronis.work/browse/VSTOR-57903
>
>
>
> Signed-off-by: Liu Kui <Kui.Liu at acronis.com>
>
> ---
>
> fs/fuse/kio/pcs/pcs_rdma_io.c | 58 +++++++++++++++++++++++++++++++----
>
> fs/fuse/kio/pcs/pcs_rdma_io.h |  3 ++
>
> 2 files changed, 55 insertions(+), 6 deletions(-)
>
>
>
> diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c
>
> index 62d138c8b611..c78126ab1d79 100644
>
> --- a/fs/fuse/kio/pcs/pcs_rdma_io.c
>
> +++ b/fs/fuse/kio/pcs/pcs_rdma_io.c
>
> @@ -130,6 +130,8 @@ static void rio_abort(struct pcs_rdmaio *rio, int error);
>
> static void rio_rx_done(struct rio_cqe *cqe, bool sync_mode);
>
> static void rio_tx_done(struct rio_cqe *cqe, bool sync_mode);
>
> static void rio_tx_err_occured(struct rio_cqe *cqe, bool sync_mode);
>
> +static int rio_submit(struct pcs_rdmaio *rio, struct pcs_msg *msg, int type, u64 xid, int status,
>
> +                                    bool allow_again);
>
>
>
> /* Only called when rio->write_queue is not empty */
>
> static struct pcs_msg *rio_dequeue_msg(struct pcs_rdmaio *rio)
>
> @@ -424,6 +426,10 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
>
>                struct pcs_rdma_device *dev = rio->dev;
>
>                struct rio_tx *tx;
>
>
>
> +              /* Blocked until after pending RDMA_READ_ACKs are sent out to keep ACK in order */
>
> +              if (rio->n_rdma_read_ack_pending)
>
> +                              return -EAGAIN;
>
> +
>
>                tx = RE_NULL(rio_get_tx(dev));
>
>                if (!tx) {
>
>                                if (allow_again)
>
> @@ -467,6 +473,8 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
>
>                                }
>
>                }
>
>
>
> +              rio->n_rdma_read_ongoing++;
>
> +
>
>                return 0;
>
>
>
> fail:
>
> @@ -478,6 +486,21 @@ static int rio_submit_rdma_read(struct pcs_rdmaio *rio, struct pcs_msg *msg,
>
>                return -EIO;
>
> }
>
>
>
> +static int rio_submit_rdma_read_ack(struct pcs_rdmaio *rio,  u64 xid)
>
> +{
>
> +              int ret;
>
> +
>
> +              /* Can only be sent after all ongoing RDMA_READ_REQs complete to keep ack in order */
>
> +              if (rio->n_rdma_read_ongoing)
>
> +                              return -EAGAIN;
>
> +
>
> +              ret = rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, 0, true);
>
> +              if (!ret)
>
> +                              rio->n_rdma_read_ack_pending--;
>
> +
>
> +              return ret;
>
> +}
>
> +
>
> static int rio_rdma_read_job_work(struct rio_job *j)
>
> {
>
>                struct rio_rdma_read_job *job = container_of(j, struct rio_rdma_read_job, job);
>
> @@ -488,8 +511,15 @@ static int rio_rdma_read_job_work(struct rio_job *j)
>
>                                return 0;
>
>                }
>
>
>
> -               return rio_submit_rdma_read(rio, job->msg, job->offset,
>
> -                                                                   &job->rb, true);
>
> +              /*
>
> +              * Return RDMA_READ_ACK directly if the original request msg had been killed,
>
> +              * however must wait until all previous RDMA_READ_REQs have been acked.
>
> +              */
>
> +              if (job->msg == PCS_TRASH_MSG)
>
> +                              return rio_submit_rdma_read_ack(rio, job->rb.xid);
>
> +              else
>
> +                              return rio_submit_rdma_read(rio, job->msg, job->offset,
>
> +                                                                                  &job->rb, true);
>
> }
>
>
>
> static void rio_rdma_read_job_destroy(struct rio_job *j)
>
> @@ -766,6 +796,7 @@ static void rio_handle_tx(struct pcs_rdmaio *rio, struct rio_tx *tx, int ok)
>
>                                case TX_SUBMIT_RDMA_READ_ACK:
>
>                                                rio_put_tx(rio->dev, tx);
>
>                                                rio_submit(rio, NULL, SUBMIT_RDMA_READ_ACK, xid, !ok, false);
>
> +                                              rio->n_rdma_read_ongoing--;
>
>                                                break;
>
>                                case TX_WAIT_FOR_TX_COMPL:
>
>                                case TX_WAIT_FOR_READ_ACK:
>
> @@ -798,6 +829,7 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
>
>                u32 msg_size;
>
>                int offset = rio->hdr_size;
>
>                struct iov_iter it;
>
> +              struct rio_rdma_read_job *job;
>
>
>
>                if (rio->throttled) {
>
>                                *throttle = 1;
>
> @@ -820,6 +852,19 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
>
>                                return err;
>
>                } else if (msg == PCS_TRASH_MSG) {
>
>                                TRACE("rio drop trash msg: %u, rio: 0x%p\n", msg_size, rio);
>
> +                              /*
>
> +                              * We must Ack every RDMA_READ_REQ received from our peer in order even it's going to be dropped.
>
> +                              * Missing ack will result in out of order ACK to our peer, which will cause it to crash.
>
> +                              * So we setup a job to ack this msg however it can only be sent out after all ongoing RDMA READ
>
> +                              * completes and will block future RDMA READ being issued.
>
> +                              */
>
> +                              if (rb) {
>
> +                                              job = rio_rdma_read_job_alloc(rio, msg, 0, rb);
>
> +                                              if (!job)
>
> +                                                              return PCS_ERR_NOMEM;
>
> +                                              rio_post_tx_job(rio, &job->job);
>
> +                                              rio->n_rdma_read_ack_pending++;
>
> +                              }
>
>                                return 0;
>
>                }
>
>
>
> @@ -852,12 +897,10 @@ static int rio_handle_rx_immediate(struct pcs_rdmaio *rio, char *buf, int len,
>
>                if (len == msg->size) {
>
>                                msg->done(msg);
>
>                } else if (rio_submit_rdma_read(rio, msg, offset, rb, true) == -EAGAIN) {
>
> -                               struct rio_rdma_read_job *job;
>
>                                job = rio_rdma_read_job_alloc(rio, msg, offset, rb);
>
>                                if (!job)
>
> -                                               rio_submit_rdma_read(rio, msg, offset, rb, false);
>
> -                               else
>
> -                                               rio_post_tx_job(rio, &job->job);
>
> +                                              return PCS_ERR_NOMEM;
>
> +                              rio_post_tx_job(rio, &job->job);
>
>                }
>
>
>
>                return 0;
>
> @@ -1228,6 +1271,9 @@ struct pcs_rdmaio* pcs_rdma_create(int hdr_size, struct rdma_cm_id *cmid,
>
>                rio->n_os_credits = 0;
>
>                rio->n_th_credits = queue_depth / 2;
>
>
>
> +              rio->n_rdma_read_ongoing = 0;
>
> +              rio->n_rdma_read_ack_pending = 0;
>
> +
>
>                rio->cmid = cmid;
>
>
>
>                INIT_LIST_HEAD(&rio->write_queue);
>
> diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.h b/fs/fuse/kio/pcs/pcs_rdma_io.h
>
> index 18962208e4a2..c5109cbc5fe1 100644
>
> --- a/fs/fuse/kio/pcs/pcs_rdma_io.h
>
> +++ b/fs/fuse/kio/pcs/pcs_rdma_io.h
>
> @@ -90,6 +90,9 @@ struct pcs_rdmaio
>
>                int n_th_credits;   /* threshold: when to return outstanding
>
>                                                     * credits urgently */
>
>
>
> +              int n_rdma_read_ongoing;           /* number of ongoing RDMA_READ. */
>
> +              int n_rdma_read_ack_pending; /* number of RDMA_READ_ACK to be submitted */
>
> +
>
>                struct pcs_rdma_device *dev;
>
>                struct rdma_cm_id *cmid;
>
>                struct ib_cq *cq;
>
> --
>
> 2.32.0 (Apple Git-132)
>
> _______________________________________________
> Devel mailing list
> Devel at openvz.org
> https://lists.openvz.org/mailman/listinfo/devel



More information about the Devel mailing list