[Devel] [PATCH v2 VZ9 4/4] vhost-blk: add bounce-buffer for non-aligned requests
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Mon Dec 30 08:39:51 MSK 2024
From: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
We see that virtio-blk requests from Windows Server 2022 are not aligned
to the sector size (512); specifically, the io vectors start at offset
0x8 relative to sector alignment. If we send such unaligned requests
directly to the block device, it refuses to handle them.
QEMU uses a "bounce buffer" technique to mitigate such problems: a
perfectly aligned intermediary buffer is allocated and the request is
converted to use it instead of the original unaligned memory. Do the
same in the vhost-blk driver.
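For illustration only, here is a minimal userspace sketch of the
bounce-buffer idea (not the driver code below; the function name and the
plain pread() are assumptions made for the example):

/*
 * Conceptual sketch: the caller's buffer may be unaligned, so the
 * actual I/O goes through a sector-aligned intermediary buffer.
 */
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define SECTOR_SIZE 512UL
#define SECTOR_MASK (SECTOR_SIZE - 1)

static ssize_t bounced_read(int fd, void *dst, size_t len, off_t offset)
{
    size_t blen = (len + SECTOR_MASK) & ~SECTOR_MASK; /* round up to sectors */
    void *bb;
    ssize_t ret;

    if (posix_memalign(&bb, SECTOR_SIZE, blen))
        return -1;

    ret = pread(fd, bb, blen, offset);      /* aligned I/O on the device */
    if (ret >= 0)
        memcpy(dst, bb, len);               /* copy back to the unaligned buffer */

    free(bb);
    return ret;
}

For a write the copy goes in the other direction, before the aligned I/O
is submitted; in the patch below this is what vhost_blk_move_req_to_bb()
and vhost_blk_move_bb_to_req() do.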
v2:
- the bounce buffer uses a kvec, so the bio does not pin those pages and
  we must not release them either
- write requests with unaligned length are not supported; warn about and
  fail such requests
- move io vectors to per-request storage so they can be used after bio
  completion
- make error handling in vhost_blk_bio_make a bit more explicit
https://virtuozzo.atlassian.net/browse/PSBM-157752
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
Note:
My original plan was to support all kinds of unaligned requests,
including write requests with unaligned length. To emulate such requests
with aligned-only operations we need to pre-read the last (partial)
sector into memory, combine its data with the write data, and do an
aligned write of the combined data.
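For reference, that aligned-only emulation could look roughly like the
sketch below (illustrative only, reusing the SECTOR_SIZE/SECTOR_MASK
definitions from the earlier sketch; this is not the code from the
branch linked below, and it assumes 'offset' itself is sector-aligned):

static ssize_t bounced_unaligned_write(int fd, const void *src, size_t len,
                                       off_t offset)
{
    size_t blen = (len + SECTOR_MASK) & ~SECTOR_MASK;
    size_t tail = len & SECTOR_MASK;    /* bytes in the partial last sector */
    void *bb;
    ssize_t ret;

    if (posix_memalign(&bb, SECTOR_SIZE, blen))
        return -1;

    /* Pre-read the last (partial) sector to preserve the bytes past 'len'. */
    ret = tail ? pread(fd, (char *)bb + blen - SECTOR_SIZE, SECTOR_SIZE,
                       offset + blen - SECTOR_SIZE) : 0;
    if (ret >= 0) {
        memcpy(bb, src, len);                /* combine with the write data */
        ret = pwrite(fd, bb, blen, offset);  /* aligned write of combined data */
    }

    free(bb);
    return ret;
}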
Den says that such requests should not be considered correct and failing
them with an error is fine. Just in case, I have an early (not yet
working, not sure why) version of that saved here:
https://bitbucket.org/virtuozzocore/vzkernel.ptikhomirov/commits/branch/vhost-blk-improvements-4
---
drivers/vhost/blk.c | 148 ++++++++++++++++++++++++++++++++++++++++----
1 file changed, 137 insertions(+), 11 deletions(-)
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
index 0c80967bfeee1..fdb063fafc996 100644
--- a/drivers/vhost/blk.c
+++ b/drivers/vhost/blk.c
@@ -59,6 +59,7 @@ struct vhost_blk_req {
struct iovec *iov;
int iov_nr;
+ void *bb;
atomic_t bio_nr;
struct bio **bio;
@@ -68,7 +69,7 @@ struct vhost_blk_req {
sector_t sector;
int bi_opf;
u16 head;
- long len;
+ size_t len;
int bio_err;
struct vhost_blk *blk;
@@ -153,7 +154,13 @@ static void vhost_blk_req_done(struct bio *bio)
vhost_vq_work_queue(&req->blk_vq->vq, &req->blk_vq->work);
}
- bio_release_pages(bio, !req->bi_opf);
+ /*
+ * Bounce buffer adds kvec to bio (instead of user backed memory),
+ * so there is no reference/pin to bio pages in this case.
+ */
+ if (!req->bb)
+ bio_release_pages(bio, !req->bi_opf);
+
bio_put(bio);
}
@@ -161,6 +168,12 @@ static void vhost_blk_req_cleanup(struct vhost_blk_req *req)
{
if (req->bio != req->inline_bio)
kfree(req->bio);
+
+ if (req->iov != req->blk_vq->iov)
+ kvfree(req->iov);
+
+ if (req->bb)
+ kvfree(req->bb);
}
static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
@@ -184,11 +197,71 @@ static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
return 0;
}
+static inline bool vhost_blk_iov_need_bb(struct vhost_blk_req *req)
+{
+ int i;
+
+ if (req->len & SECTOR_MASK)
+ return true;
+
+ for (i = 0; i < req->iov_nr; i++) {
+ if (((size_t)req->iov[i].iov_base) & SECTOR_MASK)
+ return true;
+ if (req->iov[i].iov_len & SECTOR_MASK)
+ return true;
+ }
+
+ return false;
+}
+
+static int vhost_blk_save_iov_to_req(struct vhost_blk_req *req)
+{
+ struct iovec *iov;
+
+ BUG_ON(req->iov != req->blk_vq->iov);
+
+ iov = kvmalloc(req->iov_nr * sizeof(struct iovec), GFP_KERNEL);
+ if (!iov)
+ return -ENOMEM;
+
+ if (move_iovec(req->iov, iov, req->len, req->iov_nr, req->iov_nr) < 0) {
+ kvfree(iov);
+ return -EINVAL;
+ }
+
+ req->iov = iov;
+ return 0;
+}
+
+
+static int vhost_blk_move_req_to_bb(struct vhost_blk_req *req)
+{
+ struct iov_iter iter;
+
+ iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ if (copy_from_iter(req->bb, req->len, &iter) != req->len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int vhost_blk_move_bb_to_req(struct vhost_blk_req *req)
+{
+ struct iov_iter iter;
+
+ iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ if (copy_to_iter(req->bb, req->len, &iter) != req->len)
+ return -EINVAL;
+
+ return 0;
+}
+
static int vhost_blk_bio_make(struct vhost_blk_req *req,
struct block_device *bdev)
{
int nr_pages, nr_pages_total = 0, bio_nr = 0, ret, i;
struct iov_iter iter;
+ struct kvec kv;
struct bio *bio;
sector_t sector = req->sector;
unsigned long pos = 0;
@@ -196,15 +269,61 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
if (unlikely(req->bi_opf == REQ_OP_FLUSH))
return vhost_blk_bio_make_simple(req, bdev);
- iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ if (req->bi_opf == REQ_OP_WRITE && req->len & SECTOR_MASK) {
+ WARN_ONCE(1, "vhost-blk: write requests with unaligned len"
+ " are not supported, len = %zu", req->len);
+ return -EINVAL;
+ }
+
+ /*
+ * Requests with unaligned io vectors are not supported by the underlying
+ * block device, let's use an aligned intermediary "bounce buffer" to
+ * make such requests work.
+ */
+ if (vhost_blk_iov_need_bb(req)) {
+ size_t len;
+
+ /*
+ * Save io vectors for read request, as we would need to fill
+ * them with read data from bounce buffer after bio completion
+ * (see vhost_blk_move_bb_to_req), and vhost_blk_vq->iov can be
+ * reused by other request at that point.
+ */
+ if (req->bi_opf == REQ_OP_READ) {
+ ret = vhost_blk_save_iov_to_req(req);
+ if (ret)
+ return ret;
+ }
+
+ len = ALIGN(req->len, SECTOR_SIZE);
+ req->bb = kvmalloc(len, GFP_KERNEL);
+ if (!req->bb) {
+ ret = -ENOMEM;
+ goto err_req;
+ }
+
+ if (req->bi_opf == REQ_OP_WRITE) {
+ ret = vhost_blk_move_req_to_bb(req);
+ if (ret)
+ goto err_req;
+ }
+
+ kv.iov_base = req->bb;
+ kv.iov_len = len;
+ iov_iter_kvec(&iter, req->bi_opf, &kv, 1, len);
+ } else {
+ iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ }
nr_pages_total = iov_iter_npages(&iter, INT_MAX);
if (nr_pages_total > NR_INLINE * BIO_MAX_VECS) {
req->bio = kmalloc(((nr_pages_total + BIO_MAX_VECS - 1) /
BIO_MAX_VECS) * sizeof(struct bio *),
GFP_KERNEL);
- if (!req->bio)
- return -ENOMEM;
+ if (!req->bio) {
+ ret = -ENOMEM;
+ goto err_req;
+ }
} else {
req->bio = req->inline_bio;
}
@@ -215,8 +334,10 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
BUG_ON(pos & SECTOR_MASK);
bio = bio_alloc(bdev, nr_pages, req->bi_opf, GFP_KERNEL);
- if (!bio)
- goto fail;
+ if (!bio) {
+ ret = -ENOMEM;
+ goto err_bio;
+ }
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
@@ -225,7 +346,7 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
bio_put(bio);
- goto fail;
+ goto err_bio;
}
req->bio[bio_nr++] = bio;
@@ -237,15 +358,15 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
atomic_set(&req->bio_nr, bio_nr);
return 0;
-fail:
+err_bio:
for (i = 0; i < bio_nr; i++) {
bio_release_pages(req->bio[i], false);
bio_put(req->bio[i]);
}
-
+err_req:
vhost_blk_req_cleanup(req);
- return -ENOMEM;
+ return ret;
}
static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
@@ -329,6 +450,7 @@ static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status);
req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr,
ARRAY_SIZE(blk_vq->iov));
+ req->bb = NULL;
ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr,
ARRAY_SIZE(req->status));
@@ -456,6 +578,10 @@ static void vhost_blk_handle_host_kick(struct vhost_work *work)
if (!blk)
blk = req->blk;
+ if (!req->bio_err && req->bb && req->bi_opf == REQ_OP_READ) {
+ if (vhost_blk_move_bb_to_req(req))
+ req->bio_err = EINVAL;
+ }
vhost_blk_req_cleanup(req);
status = req->bio_err == 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
--
2.47.0