[Devel] [PATCH RHEL9 COMMIT] vhost-blk: add bounce-buffer for non-aligned requests
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jan 9 17:58:36 MSK 2025
The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.3
------>
commit 2ca2bb57be2003501892615ec92fe2009dc19e62
Author: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
Date: Mon Dec 30 13:39:51 2024 +0800
vhost-blk: add bounce-buffer for non-aligned requests
We see that virtio-blk requests from Windows Server 2022 are not aligned
to the sector size (512): specifically, io vectors start at a 0x8 offset
from sector alignment. If we send such unaligned requests directly to
the block device, it refuses to process them.
QEMU mitigates such problems with a "bounce buffer" technique: a
perfectly aligned intermediary buffer is allocated and the request is
converted to use it instead of the original unaligned memory. Do the
same in the vhost-blk driver.
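For illustration only (this is not the driver code; bounced_pread() is a
hypothetical userspace stand-in): a minimal sketch of the same bounce-buffer
idea, reading through an O_DIRECT-style descriptor into a sector-aligned
intermediary buffer and copying the result back into the caller's unaligned
buffer. The file offset is assumed to already be sector-aligned, as the
driver requires.

#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define SECTOR_SIZE 512

static ssize_t bounced_pread(int fd, void *ubuf, size_t len, off_t off)
{
	/* Round the length up to a whole number of sectors. */
	size_t alen = (len + SECTOR_SIZE - 1) & ~(size_t)(SECTOR_SIZE - 1);
	void *bb;
	ssize_t ret;

	/* Sector-aligned intermediary ("bounce") buffer. */
	if (posix_memalign(&bb, SECTOR_SIZE, alen))
		return -1;

	/* The device only sees an aligned buffer and an aligned length. */
	ret = pread(fd, bb, alen, off);
	if (ret > 0)
		memcpy(ubuf, bb, (size_t)ret < len ? (size_t)ret : len);

	free(bb);
	return ret;
}

For a write the copy direction is reversed: fill the bounce buffer from the
unaligned source first, then submit the aligned buffer.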
v2:
- the bounce buffer is attached to the bio via a kvec, so the bio does not
pin those pages and we must not release them either
- write requests with an unaligned length are not supported; add a warning
and fail such requests (alignment test illustrated after this list)
- move io vectors to per-request storage so they can be used after bio
completion
- make error handling in vhost_blk_bio_make a bit more explicit
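For reference, a hedged userspace sketch of the alignment test the patch
relies on (need_bounce() is a hypothetical name mirroring
vhost_blk_iov_need_bb() in the diff below): a request needs the bounce
buffer if its total length, or any io vector base or length, is not a
multiple of the 512-byte sector size.

#include <stdbool.h>
#include <stddef.h>
#include <sys/uio.h>

#define SECTOR_MASK ((size_t)512 - 1)

static bool need_bounce(const struct iovec *iov, int iov_nr, size_t len)
{
	int i;

	/* Total request length must be a whole number of sectors. */
	if (len & SECTOR_MASK)
		return true;

	/* Every vector must start and end on a sector boundary. */
	for (i = 0; i < iov_nr; i++)
		if (((size_t)iov[i].iov_base | iov[i].iov_len) & SECTOR_MASK)
			return true;

	return false;
}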
https://virtuozzo.atlassian.net/browse/PSBM-157752
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
======
Patchset description:
vhost-blk: bounce buffer for unaligned requests
Andrey Zhadchenko (2):
vhost-blk: rework iov and bio handling
vhost-blk: add bounce-buffer for non-aligned requests
David Howells (1):
iov_iter: Add a function to extract a page list from an iterator
Pavel Tikhomirov (1):
vhost-blk: remove excess vhost_blk_req.use_inline
Feature: vhost-blk: in-kernel accelerator for virtio-blk guests
---
drivers/vhost/blk.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 137 insertions(+), 11 deletions(-)
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
index 0c80967bfeee..fdb063fafc99 100644
--- a/drivers/vhost/blk.c
+++ b/drivers/vhost/blk.c
@@ -59,6 +59,7 @@ struct vhost_blk_req {
struct iovec *iov;
int iov_nr;
+ void *bb;
atomic_t bio_nr;
struct bio **bio;
@@ -68,7 +69,7 @@ struct vhost_blk_req {
sector_t sector;
int bi_opf;
u16 head;
- long len;
+ size_t len;
int bio_err;
struct vhost_blk *blk;
@@ -153,7 +154,13 @@ static void vhost_blk_req_done(struct bio *bio)
vhost_vq_work_queue(&req->blk_vq->vq, &req->blk_vq->work);
}
- bio_release_pages(bio, !req->bi_opf);
+ /*
+ * Bounce buffer adds kvec to bio (instead of user backed memory),
+ * so there is no reference/pin to bio pages in this case.
+ */
+ if (!req->bb)
+ bio_release_pages(bio, !req->bi_opf);
+
bio_put(bio);
}
@@ -161,6 +168,12 @@ static void vhost_blk_req_cleanup(struct vhost_blk_req *req)
{
if (req->bio != req->inline_bio)
kfree(req->bio);
+
+ if (req->iov != req->blk_vq->iov)
+ kvfree(req->iov);
+
+ if (req->bb)
+ kvfree(req->bb);
}
static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
@@ -184,11 +197,71 @@ static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
return 0;
}
+inline static bool vhost_blk_iov_need_bb(struct vhost_blk_req *req)
+{
+ int i;
+
+ if (req->len & SECTOR_MASK)
+ return true;
+
+ for (i = 0; i < req->iov_nr; i++) {
+ if (((size_t)req->iov[i].iov_base) & SECTOR_MASK)
+ return true;
+ if (req->iov[i].iov_len & SECTOR_MASK)
+ return true;
+ }
+
+ return false;
+}
+
+static int vhost_blk_save_iov_to_req(struct vhost_blk_req *req)
+{
+ struct iovec *iov;
+
+ BUG_ON(req->iov != req->blk_vq->iov);
+
+ iov = kvmalloc(req->iov_nr * sizeof(struct iovec), GFP_KERNEL);
+ if (!iov)
+ return -ENOMEM;
+
+ if (move_iovec(req->iov, iov, req->len, req->iov_nr, req->iov_nr) < 0) {
+ kvfree(iov);
+ return -EINVAL;
+ }
+
+ req->iov = iov;
+ return 0;
+}
+
+
+static size_t vhost_blk_move_req_to_bb(struct vhost_blk_req *req)
+{
+ struct iov_iter iter;
+
+ iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ if (copy_from_iter(req->bb, req->len, &iter) != req->len)
+ return -EINVAL;
+
+ return 0;
+}
+
+static size_t vhost_blk_move_bb_to_req(struct vhost_blk_req *req)
+{
+ struct iov_iter iter;
+
+ iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ if (copy_to_iter(req->bb, req->len, &iter) != req->len)
+ return -EINVAL;
+
+ return 0;
+}
+
static int vhost_blk_bio_make(struct vhost_blk_req *req,
struct block_device *bdev)
{
int nr_pages, nr_pages_total = 0, bio_nr = 0, ret, i;
struct iov_iter iter;
+ struct kvec kv;
struct bio *bio;
sector_t sector = req->sector;
unsigned long pos = 0;
@@ -196,15 +269,61 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
if (unlikely(req->bi_opf == REQ_OP_FLUSH))
return vhost_blk_bio_make_simple(req, bdev);
- iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ if (req->bi_opf == REQ_OP_WRITE && req->len & SECTOR_MASK) {
+ WARN_ONCE(1, "vhost-blk: write requests with unaligned len"
+ " are not supported, len = %zu", req->len);
+ return -EINVAL;
+ }
+
+ /*
+ * Requests with unaligned io vectors are not supported by underlined
+ * block device, let's use an aligned intermediary "bounce buffer" to
+ * make such requests work.
+ */
+ if (vhost_blk_iov_need_bb(req)) {
+ size_t len;
+
+ /*
+ * Save io vectors for read request, as we would need to fill
+ * them with read data from bounce buffer after bio completion
+ * (see vhost_blk_move_bb_to_req), and vhost_blk_vq->iov can be
+ * reused by other request at that point.
+ */
+ if (req->bi_opf == REQ_OP_READ) {
+ ret = vhost_blk_save_iov_to_req(req);
+ if (ret)
+ return ret;
+ }
+
+ len = ALIGN(req->len, SECTOR_SIZE);
+ req->bb = kvmalloc(len, GFP_KERNEL);
+ if (!req->bb) {
+ ret = -ENOMEM;
+ goto err_req;
+ }
+
+ if (req->bi_opf == REQ_OP_WRITE) {
+ ret = vhost_blk_move_req_to_bb(req);
+ if (ret)
+ goto err_req;
+ }
+
+ kv.iov_base = req->bb;
+ kv.iov_len = len;
+ iov_iter_kvec(&iter, req->bi_opf, &kv, 1, len);
+ } else {
+ iov_iter_init(&iter, req->bi_opf, req->iov, req->iov_nr, req->len);
+ }
nr_pages_total = iov_iter_npages(&iter, INT_MAX);
if (nr_pages_total > NR_INLINE * BIO_MAX_VECS) {
req->bio = kmalloc(((nr_pages_total + BIO_MAX_VECS - 1) /
BIO_MAX_VECS) * sizeof(struct bio *),
GFP_KERNEL);
- if (!req->bio)
- return -ENOMEM;
+ if (!req->bio) {
+ ret = -ENOMEM;
+ goto err_req;
+ }
} else {
req->bio = req->inline_bio;
}
@@ -215,8 +334,10 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
BUG_ON(pos & SECTOR_MASK);
bio = bio_alloc(bdev, nr_pages, req->bi_opf, GFP_KERNEL);
- if (!bio)
- goto fail;
+ if (!bio) {
+ ret = -ENOMEM;
+ goto err_bio;
+ }
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
@@ -225,7 +346,7 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
ret = bio_iov_iter_get_pages(bio, &iter);
if (unlikely(ret)) {
bio_put(bio);
- goto fail;
+ goto err_bio;
}
req->bio[bio_nr++] = bio;
@@ -237,15 +358,15 @@ static int vhost_blk_bio_make(struct vhost_blk_req *req,
atomic_set(&req->bio_nr, bio_nr);
return 0;
-fail:
+err_bio:
for (i = 0; i < bio_nr; i++) {
bio_release_pages(req->bio[i], false);
bio_put(req->bio[i]);
}
-
+err_req:
vhost_blk_req_cleanup(req);
- return -ENOMEM;
+ return ret;
}
static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
@@ -329,6 +450,7 @@ static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status);
req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr,
ARRAY_SIZE(blk_vq->iov));
+ req->bb = NULL;
ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr,
ARRAY_SIZE(req->status));
@@ -456,6 +578,10 @@ static void vhost_blk_handle_host_kick(struct vhost_work *work)
if (!blk)
blk = req->blk;
+ if (!req->bio_err && req->bb && req->bi_opf == REQ_OP_READ) {
+ if (vhost_blk_move_bb_to_req(req))
+ req->bio_err = EINVAL;
+ }
vhost_blk_req_cleanup(req);
status = req->bio_err == 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;