[Devel] [PATCH RH9 v2 01/10] drivers/vhost: vhost-blk accelerator for virtio-blk guests
Andrey Zhadchenko
andrey.zhadchenko at virtuozzo.com
Thu Sep 15 20:10:24 MSK 2022
On 9/15/22 20:05, Andrey Zhadchenko wrote:
>
>
> On 9/15/22 17:28, Pavel Tikhomirov wrote:
>>
>>
>> On 08.09.2022 18:32, Andrey Zhadchenko wrote:
>>> Although QEMU virtio is quite fast, there is still some room for
>>> improvements. Disk latency can be reduced if we handle virtio-blk
>>> requests in the host kernel instead of passing them to QEMU. This
>>> patch adds a vhost-blk kernel module to do so.
>>>
>>> Some test setups:
>>> fio --direct=1 --rw=randread --bs=4k --ioengine=libaio --iodepth=128
>>> QEMU drive options: cache=none
>>> filesystem: xfs
>>>
>>> SSD:
>>>                | randread, IOPS | randwrite, IOPS |
>>> Host           | 95.8k          | 85.3k           |
>>> QEMU virtio    | 57.5k          | 79.4k           |
>>> QEMU vhost-blk | 95.6k          | 84.3k           |
>>>
>>> RAMDISK (vq == vcpu):
>>>                  | randread, IOPS | randwrite, IOPS |
>>> virtio, 1vcpu    | 123k           | 129k            |
>>> virtio, 2vcpu    | 253k (??)      | 250k (??)       |
>>> virtio, 4vcpu    | 158k           | 154k            |
>>> vhost-blk, 1vcpu | 110k           | 113k            |
>>> vhost-blk, 2vcpu | 247k           | 252k            |
>>> vhost-blk, 4vcpu | 576k           | 567k            |
>>>
>>> https://jira.sw.ru/browse/PSBM-139414
>>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>> ---
>>> v2:
>>> - removed unused VHOST_BLK_VQ
>>> - reworked bio handling a bit: now add all pages from a single iov into
>>> a single bio instead of allocating one bio per page
>>> - changed how the sector increment is calculated
>>> - check move_iovec() in vhost_blk_req_handle()
>>> - removed the snprintf check and check the return value of
>>> copy_to_iter more thoroughly for VIRTIO_BLK_ID_BYTES requests
>>> - discard the vq request if vhost_blk_req_handle() returned a negative
>>> code
>>> - forbid changing a nonzero backend in vhost_blk_set_backend(). First
>>> of all, QEMU sets the backend only once. Also, changing the backend
>>> while requests are already running would require much more care in
>>> vhost_blk_handle_guest_kick(), as it does not take any references. If
>>> userspace wants to change the backend that badly, it can always reset
>>> the device.
>>> - removed EXPERIMENTAL from Kconfig
>>>
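Regarding the set-backend rule above, here is a minimal userspace sketch of
how a VMM would attach the disk once. This is a hypothetical illustration,
not QEMU code: it assumes the VHOST_BLK_SET_BACKEND ioctl added by this
patch plus the standard vhost setup ioctls, and omits vring setup and error
handling.

    #define _GNU_SOURCE /* for O_DIRECT */
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/vhost.h>

    static int vhost_blk_attach(const char *disk_path)
    {
            struct vhost_vring_file backend;
            int vhost_fd = open("/dev/vhost-blk", O_RDWR);
            int disk_fd = open(disk_path, O_RDWR | O_DIRECT);

            /* Claim ownership of the vhost device for this process. */
            ioctl(vhost_fd, VHOST_SET_OWNER, NULL);

            /* The patch applies the backend to every vq and returns
             * -EBUSY if a backend is already set, so this is one-shot. */
            backend.index = 0;
            backend.fd = disk_fd;
            return ioctl(vhost_fd, VHOST_BLK_SET_BACKEND, &backend);
    }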
>>> drivers/vhost/Kconfig | 12 +
>>> drivers/vhost/Makefile | 3 +
>>> drivers/vhost/blk.c | 829 +++++++++++++++++++++++++++++++++++++
>>> include/uapi/linux/vhost.h | 5 +
>>> 4 files changed, 849 insertions(+)
>>> create mode 100644 drivers/vhost/blk.c
>>>
>>> diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
>>> index 587fbae06182..e1389bf0c10b 100644
>>> --- a/drivers/vhost/Kconfig
>>> +++ b/drivers/vhost/Kconfig
>>> @@ -89,4 +89,16 @@ config VHOST_CROSS_ENDIAN_LEGACY
>>> If unsure, say "N".
>>> +config VHOST_BLK
>>> + tristate "Host kernel accelerator for virtio-blk"
>>> + depends on BLOCK && EVENTFD
>>> + select VHOST
>>> + default n
>>> + help
>>> + This kernel module can be loaded in the host kernel to accelerate
>>> + guest VMs using the virtio-blk driver.
>>> +
>>> + To compile this driver as a module, choose M here: the module will
>>> + be called vhost_blk.
>>> +
>>> endif
>>> diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
>>> index f3e1897cce85..c76cc4f5fcd8 100644
>>> --- a/drivers/vhost/Makefile
>>> +++ b/drivers/vhost/Makefile
>>> @@ -17,3 +17,6 @@ obj-$(CONFIG_VHOST) += vhost.o
>>> obj-$(CONFIG_VHOST_IOTLB) += vhost_iotlb.o
>>> vhost_iotlb-y := iotlb.o
>>> +
>>> +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
>>> +vhost_blk-y := blk.o
>>> diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
>>> new file mode 100644
>>> index 000000000000..c62b8ae70716
>>> --- /dev/null
>>> +++ b/drivers/vhost/blk.c
>>> @@ -0,0 +1,829 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/*
>>> + * Copyright (C) 2011 Taobao, Inc.
>>> + * Author: Liu Yuan <tailai.ly at taobao.com>
>>> + *
>>> + * Copyright (C) 2012 Red Hat, Inc.
>>> + * Author: Asias He <asias at redhat.com>
>>> + *
>>> + * Copyright (c) 2022 Virtuozzo International GmbH.
>>> + * Author: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>> + *
>>> + * virtio-blk host kernel accelerator.
>>> + */
>>> +
>>> +#include <linux/miscdevice.h>
>>> +#include <linux/module.h>
>>> +#include <linux/vhost.h>
>>> +#include <linux/virtio_blk.h>
>>> +#include <linux/mutex.h>
>>> +#include <linux/file.h>
>>> +#include <linux/kthread.h>
>>> +#include <linux/blkdev.h>
>>> +#include <linux/llist.h>
>>> +
>>> +#include "vhost.h"
>>> +
>>> +enum {
>>> + VHOST_BLK_FEATURES = VHOST_FEATURES |
>>> + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
>>> + (1ULL << VIRTIO_RING_F_EVENT_IDX) |
>>> + (1ULL << VIRTIO_BLK_F_MQ) |
>>> + (1ULL << VIRTIO_BLK_F_FLUSH),
>>> +};
>>> +
>>> +/*
>>> + * Max number of bytes transferred before requeueing the job.
>>> + * Using this limit prevents one virtqueue from starving others.
>>> + */
>>> +#define VHOST_DEV_WEIGHT 0x80000
>>> +
>>> +/*
>>> + * Max number of packets transferred before requeueing the job.
>>> + * Using this limit prevents one virtqueue from starving others with
>>> + * pkts.
>>> + */
>>> +#define VHOST_DEV_PKT_WEIGHT 256
>>> +
>>> +#define VHOST_BLK_VQ_MAX 8
>>> +
>>> +#define VHOST_MAX_METADATA_IOV 1
>>> +
>>> +#define VHOST_BLK_SECTOR_BITS 9
>>> +#define VHOST_BLK_SECTOR_SIZE (1 << VHOST_BLK_SECTOR_BITS)
>>> +#define VHOST_BLK_SECTOR_MASK (VHOST_BLK_SECTOR_SIZE - 1)
>>> +
>>> +struct req_page_list {
>>> + struct page **pages;
>>> + int pages_nr;
>>> +};
>>> +
>>> +#define NR_INLINE 16
>>> +
>>> +struct vhost_blk_req {
>>> + struct req_page_list inline_pl[NR_INLINE];
>>> + struct page *inline_page[NR_INLINE];
>>> + struct bio *inline_bio[NR_INLINE];
>>> + struct req_page_list *pl;
>>> + int during_flush;
>>> + bool use_inline;
>>> +
>>> + struct llist_node llnode;
>>> +
>>> + struct vhost_blk *blk;
>>> +
>>> + struct iovec *iov;
>>> + int iov_nr;
>>> +
>>> + struct bio **bio;
>>> + atomic_t bio_nr;
>>> +
>>> + struct iovec status[VHOST_MAX_METADATA_IOV];
>>> +
>>> + sector_t sector;
>>> + int bi_opf;
>>> + u16 head;
>>> + long len;
>>> + int bio_err;
>>> +
>>> + struct vhost_blk_vq *blk_vq;
>>> +};
>>> +
>>> +struct vhost_blk_vq {
>>> + struct vhost_virtqueue vq;
>>> + struct vhost_blk_req *req;
>>> + struct iovec iov[UIO_MAXIOV];
>>> + struct llist_head llhead;
>>> + struct vhost_work work;
>>> +};
>>> +
>>> +struct vhost_blk {
>>> + wait_queue_head_t flush_wait;
>>> + struct vhost_blk_vq vqs[VHOST_BLK_VQ_MAX];
>>> + atomic_t req_inflight[2];
>>> + spinlock_t flush_lock;
>>> + struct vhost_dev dev;
>>> + int during_flush;
>>> + struct file *backend;
>>> + int index;
>>> +};
>>> +
>>> +static int gen;
>>> +
>>> +static int move_iovec(struct iovec *from, struct iovec *to,
>>> + size_t len, int iov_count_from, int iov_count_to)
>>> +{
>>> + int moved_seg = 0, spent_seg = 0;
>>> + size_t size;
>>> +
>>> + while (len && spent_seg < iov_count_from && moved_seg < iov_count_to) {
>>> + if (from->iov_len == 0) {
>>> + ++from;
>>> + ++spent_seg;
>>> + continue;
>>> + }
>>> + size = min(from->iov_len, len);
>>> + to->iov_base = from->iov_base;
>>> + to->iov_len = size;
>>> + from->iov_len -= size;
>>> + from->iov_base += size;
>>> + len -= size;
>>> + ++from;
>>> + ++to;
>>> + ++moved_seg;
>>> + ++spent_seg;
>>> + }
>>> +
>>> + return len ? -1 : moved_seg;
>>> +}
>>> +
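To make move_iovec() concrete: it copies up to len bytes' worth of segments
from one iovec array to another, consuming the source segments as it goes.
A hypothetical trace, assuming a guest request split into the usual three
descriptors:

    /* vq->iov on arrival:
     *   [0] { hdr,    16 }   struct virtio_blk_outhdr
     *   [1] { data, 4096 }
     *   [2] { status,  1 }
     *
     * move_iovec(vq->iov, hdr_iovec, 16, 3, 1)
     *   -> returns 1; segment [0] becomes { hdr + 16, 0 }, fully consumed
     * move_iovec(vq->iov, req->iov, 4096, 3, UIO_MAXIOV)
     *   -> skips the now-empty [0], returns 1 with the data segment
     * move_iovec(vq->iov, req->status, 1, 3, 1)
     *   -> returns 1 with the status segment; a -1 return at any step
     *      means the descriptors were shorter than the requested length
     */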
>>> +static inline int iov_num_pages(struct iovec *iov)
>>> +{
>>> + return (PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) -
>>> + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
>>> +}
>>> +
>>> +static inline int vhost_blk_set_status(struct vhost_blk_req *req, u8 status)
>>> +{
>>> + struct iov_iter iter;
>>> + int ret;
>>> +
>>> + iov_iter_init(&iter, WRITE, req->status, ARRAY_SIZE(req->status),
>>> + sizeof(status));
>>> + ret = copy_to_iter(&status, sizeof(status), &iter);
>>> + if (ret != sizeof(status)) {
>>> + vq_err(&req->blk_vq->vq, "Failed to write status\n");
>>> + return -EFAULT;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static void vhost_blk_req_done(struct bio *bio)
>>> +{
>>> + struct vhost_blk_req *req = bio->bi_private;
>>> + struct vhost_blk *blk = req->blk;
>>> +
>>> + req->bio_err = blk_status_to_errno(bio->bi_status);
>>> +
>>> + if (atomic_dec_and_test(&req->bio_nr)) {
>>> + llist_add(&req->llnode, &req->blk_vq->llhead);
>>> + vhost_work_queue(&blk->dev, &req->blk_vq->work);
>>> + }
>>> +
>>> + bio_put(bio);
>>> +}
>>> +
>>> +static void vhost_blk_req_umap(struct vhost_blk_req *req)
>>> +{
>>> + struct req_page_list *pl;
>>> + int i, j;
>>> +
>>> + if (req->pl) {
>>> + for (i = 0; i < req->iov_nr; i++) {
>>> + pl = &req->pl[i];
>>> +
>>> + for (j = 0; j < pl->pages_nr; j++) {
>>> + if (!req->bi_opf)
>>> + set_page_dirty_lock(pl->pages[j]);
>>> + put_page(pl->pages[j]);
>>> + }
>>> + }
>>> + }
>>> +
>>> + if (!req->use_inline)
>>> + kfree(req->pl);
>>> +}
>>> +
>>> +static int vhost_blk_bio_make_simple(struct vhost_blk_req *req,
>>> + struct block_device *bdev)
>>> +{
>>> + struct bio *bio;
>>> +
>>> + req->use_inline = true;
>>> + req->pl = NULL;
>>> + req->bio = req->inline_bio;
>>> +
>>> + bio = bio_alloc(GFP_KERNEL, 1);
>>> + if (!bio)
>>> + return -ENOMEM;
>>> +
>>> + bio->bi_iter.bi_sector = req->sector;
>>> + bio_set_dev(bio, bdev);
>>> + bio->bi_private = req;
>>> + bio->bi_end_io = vhost_blk_req_done;
>>> + bio->bi_opf = req->bi_opf;
>>> + req->bio[0] = bio;
>>> +
>>> + atomic_set(&req->bio_nr, 1);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static struct page **vhost_blk_prepare_req(struct vhost_blk_req *req,
>>> + int total_pages, int iov_nr)
>>> +{
>>> + int pl_len, page_len, bio_len;
>>> + void *buf;
>>> +
>>> + req->use_inline = false;
>>> + pl_len = iov_nr * sizeof(req->pl[0]);
>>> + page_len = total_pages * sizeof(struct page *);
>>> + bio_len = total_pages * sizeof(struct bio *);
>>> +
>>> + buf = kmalloc(pl_len + page_len + bio_len, GFP_KERNEL);
>>> + if (!buf)
>>> + return NULL;
>>> +
>>> + req->pl = buf;
>>> + req->bio = buf + pl_len + page_len;
>>> +
>>> + return buf + pl_len;
>>> +}
>>> +
>>> +static int vhost_blk_bio_make(struct vhost_blk_req *req,
>>> + struct block_device *bdev)
>>> +{
>>> + int pages_nr_total, i, j, ret;
>>> + struct iovec *iov = req->iov;
>>> + int iov_nr = req->iov_nr;
>>> + struct page **pages, *page;
>>> + struct bio *bio = NULL;
>>> + int bio_nr = 0;
>>> +
>>> + if (unlikely(req->bi_opf == REQ_OP_FLUSH))
>>> + return vhost_blk_bio_make_simple(req, bdev);
>>> +
>>> + pages_nr_total = 0;
>>> + for (i = 0; i < iov_nr; i++)
>>> + pages_nr_total += iov_num_pages(&iov[i]);
>>> +
>>> + if (pages_nr_total > NR_INLINE) {
>>> + pages = vhost_blk_prepare_req(req, pages_nr_total, iov_nr);
>>> + if (!pages)
>>> + return -ENOMEM;
>>> + } else {
>>> + req->use_inline = true;
>>> + req->pl = req->inline_pl;
>>> + pages = req->inline_page;
>>> + req->bio = req->inline_bio;
>>> + }
>>> +
>>> + req->iov_nr = 0;
>>> + for (i = 0; i < iov_nr; i++) {
>>> + int pages_nr = iov_num_pages(&iov[i]);
>>> + unsigned long iov_base, iov_len;
>>> + struct req_page_list *pl;
>>> +
>>> + iov_base = (unsigned long)iov[i].iov_base;
>>> + iov_len = (unsigned long)iov[i].iov_len;
>>> +
>>> + ret = get_user_pages_fast(iov_base, pages_nr,
>>> + !req->bi_opf, pages);
>>> + if (ret != pages_nr)
>>> + goto fail;
>>> +
>>> + req->iov_nr++;
>>> + pl = &req->pl[i];
>>> + pl->pages_nr = pages_nr;
>>> + pl->pages = pages;
>>> +
>>> + bio = bio_alloc(GFP_KERNEL, pages_nr);
>>> + if (!bio)
>>> + goto fail;
>>> + bio->bi_iter.bi_sector = req->sector;
>>> + bio_set_dev(bio, bdev);
>>> + bio->bi_private = req;
>>> + bio->bi_end_io = vhost_blk_req_done;
>>> + bio->bi_opf = req->bi_opf;
>>> + req->bio[bio_nr++] = bio;
>>> +
>>> + for (j = 0; j < pages_nr; j++) {
>>> + unsigned int off, len, pos;
>>> +
>>> + page = pages[j];
>>> + off = iov_base & ~PAGE_MASK;
>>> + len = PAGE_SIZE - off;
>>> + if (len > iov_len)
>>> + len = iov_len;
>>> +
>>> + if (!bio_add_page(bio, page, len, off))
>>> + goto fail;
>>
>> Let's continue the discussion of my v1 comment here:
>>
>> >> Why do we skip the bio_add_page thing from the original patch here?
>> >> Likely this is not important.
>>
>> > In the original patch the code tried to stuff all pages in a single
>> > bio. I changed this to simplify it when I was developing it. Probably
>> > wise to bring it back.
>>
>> I don't think so: in the original patch the code tried to stuff pages
>> not into _one_ bio but into multiple bios, using as few as possible.
>> What you do in v2 is different and error-prone; I would prefer the
>> approach from the original patch.
>
> I am a bit confused here by the word "original". Is it a reference to
> Asias' patch? Or do you mean you liked v1 more? Let's use "Asias", "v1",
> "v2", etc. from now on.
>
> If we talk about Asias' patch, he tried to stuff all pages into a single
> bio. Code from the Asias patch:
> ...
> while (!bio || bio_add_page(bio, page, len, off) <= 0) {
> bio = bio_alloc(GFP_KERNEL, pages_nr_total);
> ...
>
> pages_nr_total is the total number of pages needed for the whole request.
> bio_add_page() fails only if you try to add more pages than you requested
> with bio_alloc().
> bio is assigned nowhere except here, in the scope of the two for() loops.
> Therefore while() fails for the first time, one bio is allocated, and then
> it never fails. So all pages are stuffed into a single bio.
By writing "while() fails" I actually meant that the condition passes =)
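For concreteness, here is a condensed sketch of the Asias pattern
(paraphrased around the fragment quoted above, with a hypothetical helper
name; sector/offset bookkeeping omitted). A new bio is allocated only when
bio_add_page() refuses a page, and since the first bio is sized for every
page, in practice everything lands in one bio:

    static int pack_pages(struct vhost_blk_req *req, struct block_device *bdev,
                          struct page **pages, int pages_nr_total)
    {
            struct bio *bio = NULL;
            int bio_nr = 0, i;

            for (i = 0; i < pages_nr_total; i++) {
                    /* bio_add_page() only fails when the bio is full, so
                     * the while body runs exactly once here. */
                    while (!bio ||
                           bio_add_page(bio, pages[i], PAGE_SIZE, 0) <= 0) {
                            bio = bio_alloc(GFP_KERNEL, pages_nr_total);
                            if (!bio)
                                    return -ENOMEM;
                            bio->bi_iter.bi_sector = req->sector;
                            bio_set_dev(bio, bdev);
                            bio->bi_private = req;
                            bio->bi_end_io = vhost_blk_req_done;
                            req->bio[bio_nr++] = bio;
                    }
            }
            atomic_set(&req->bio_nr, bio_nr);
            return 0;
    }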
>
> In v1 I allocated one bio per page needed. This is probably a great
> waste of bios!
>
> In v2 I allocated one bio per input iovec in the request. Note: a Linux
> guest always uses only one input iovec; I do not know about others.
>
> As for robustness, I do not really get why v2 is worse than v1. If we are
> under memory pressure, allocating 10 bios will consume more memory than
> allocating one bio with space for 10 pages. As for bio_add_page(), it
> practically never fails unless we try to add more pages than we
> requested. We are not trying to do this :)
>
>>
>> You can see how easy it is to miss the idea behind the code from the
>> code itself, so let's also continue the other discussion:
>>
>> >> Without an explanation almost any code is right; probably you
>> >> intentionally leak oldfile in the patch. How would I know if it is
>> >> intentional or not when there is no commit message explaining what
>> >> the code does? =)
>> >
>> > I get your point. But it is a very hard (and holywar-ish) take. It is
>> > simple (and a must) to document a 10-50-100-line change within a
>> > commit message, especially if it is a specific change or bugfix. But
>> > for a big new module? Probably not: it would take an immense amount
>> > of time and would probably be worthless. And this code also uses a
>> > lot of helpers from vhost, which would probably need explaining too.
>> > Some people may not read an explanation at all, some people may
>> > prefer to understand the code by reading it, and for some people an
>> > explanation may be very useful. And there is no guarantee that any
>> > given description would be clear and concise. For reference, I
>> > personally prefer to explore the code rather than read explanations.
>> > And the answer to "how would I know" is, in my opinion, questions. I
>> > would be glad to answer them!
>>
>> I believe that for complex code a good commit message and probably
>> in-code comments with a complete description are a must, else at some
>> point nobody will understand how to fix the code or what the original
>> intent behind the lines was =)
>>
>>> +
>>> + iov_base += len;
>>> + iov_len -= len;
>>> +
>>> + pos = (iov_base & VHOST_BLK_SECTOR_MASK) + iov_len;
>>> + req->sector += pos >> VHOST_BLK_SECTOR_BITS;
>>> + }
>>> +
>>> + pages += pages_nr;
>>> + }
>>> + atomic_set(&req->bio_nr, bio_nr);
>>> + return 0;
>>> +
>>> +fail:
>>> + for (i = 0; i < bio_nr; i++)
>>> + bio_put(req->bio[i]);
>>> + vhost_blk_req_umap(req);
>>> + return -ENOMEM;
>>> +}
>>> +
>>> +static inline void vhost_blk_bio_send(struct vhost_blk_req *req)
>>> +{
>>> + struct blk_plug plug;
>>> + int i, bio_nr;
>>> +
>>> + bio_nr = atomic_read(&req->bio_nr);
>>> + blk_start_plug(&plug);
>>> + for (i = 0; i < bio_nr; i++)
>>> + submit_bio(req->bio[i]);
>>> +
>>> + blk_finish_plug(&plug);
>>> +}
>>> +
>>> +static int vhost_blk_req_submit(struct vhost_blk_req *req, struct file *file)
>>> +{
>>> +
>>> + struct inode *inode = file->f_mapping->host;
>>> + struct block_device *bdev = I_BDEV(inode);
>>> + int ret;
>>> +
>>> + ret = vhost_blk_bio_make(req, bdev);
>>> + if (ret < 0)
>>> + return ret;
>>> +
>>> + vhost_blk_bio_send(req);
>>> +
>>> + spin_lock(&req->blk->flush_lock);
>>> + req->during_flush = req->blk->during_flush;
>>> + atomic_inc(&req->blk->req_inflight[req->during_flush]);
>>> + spin_unlock(&req->blk->flush_lock);
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static int vhost_blk_req_handle(struct vhost_virtqueue *vq,
>>> + struct virtio_blk_outhdr *hdr,
>>> + u16 head, u16 total_iov_nr,
>>> + struct file *file)
>>> +{
>>> + struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
>>> + struct vhost_blk_vq *blk_vq = container_of(vq, struct vhost_blk_vq, vq);
>>> + unsigned char id[VIRTIO_BLK_ID_BYTES];
>>> + struct vhost_blk_req *req;
>>> + struct iov_iter iter;
>>> + int ret, len;
>>> + u8 status;
>>> +
>>> + req = &blk_vq->req[head];
>>> + req->blk_vq = blk_vq;
>>> + req->head = head;
>>> + req->blk = blk;
>>> + req->sector = hdr->sector;
>>> + req->iov = blk_vq->iov;
>>> +
>>> + req->len = iov_length(vq->iov, total_iov_nr) - sizeof(status);
>>> + req->iov_nr = move_iovec(vq->iov, req->iov, req->len, total_iov_nr,
>>> + ARRAY_SIZE(blk_vq->iov));
>>> +
>>> + ret = move_iovec(vq->iov, req->status, sizeof(status), total_iov_nr,
>>> + ARRAY_SIZE(req->status));
>>> + if (ret < 0 || req->iov_nr < 0)
>>> + return -EINVAL;
>>> +
>>> + switch (hdr->type) {
>>> + case VIRTIO_BLK_T_OUT:
>>> + req->bi_opf = REQ_OP_WRITE;
>>> + ret = vhost_blk_req_submit(req, file);
>>> + break;
>>> + case VIRTIO_BLK_T_IN:
>>> + req->bi_opf = REQ_OP_READ;
>>> + ret = vhost_blk_req_submit(req, file);
>>> + break;
>>> + case VIRTIO_BLK_T_FLUSH:
>>> + req->bi_opf = REQ_OP_FLUSH;
>>> + ret = vhost_blk_req_submit(req, file);
>>> + break;
>>> + case VIRTIO_BLK_T_GET_ID:
>>> + len = snprintf(id, VIRTIO_BLK_ID_BYTES, "vhost-blk%d", blk->index);
>>> + iov_iter_init(&iter, WRITE, req->iov, req->iov_nr, req->len);
>>> + ret = copy_to_iter(id, len, &iter);
>>> + status = ret != len ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
>>> + ret = vhost_blk_set_status(req, status);
>>> + if (ret)
>>> + break;
>>> + vhost_add_used_and_signal(&blk->dev, vq, head, len);
>>> + break;
>>> + default:
>>> + vq_err(vq, "Unsupported request type %d\n", hdr->type);
>>> + status = VIRTIO_BLK_S_UNSUPP;
>>> + ret = vhost_blk_set_status(req, status);
>>> + if (ret)
>>> + break;
>>> + vhost_add_used_and_signal(&blk->dev, vq, head, 0);
>>> + }
>>> +
>>> + return ret;
>>> +}
>>> +
>>> +static void vhost_blk_handle_guest_kick(struct vhost_work *work)
>>> +{
>>> + struct virtio_blk_outhdr hdr;
>>> + struct vhost_blk_vq *blk_vq;
>>> + struct vhost_virtqueue *vq;
>>> + struct iovec hdr_iovec[VHOST_MAX_METADATA_IOV];
>>> + struct vhost_blk *blk;
>>> + struct iov_iter iter;
>>> + int in, out, ret;
>>> + struct file *f;
>>> + int head; /* vhost_get_vq_desc() can return a negative errno */
>>> +
>>> + vq = container_of(work, struct vhost_virtqueue, poll.work);
>>> + blk = container_of(vq->dev, struct vhost_blk, dev);
>>> + blk_vq = container_of(vq, struct vhost_blk_vq, vq);
>>> +
>>> + f = vhost_vq_get_backend(vq);
>>> + if (!f)
>>> + return;
>>> +
>>> + vhost_disable_notify(&blk->dev, vq);
>>> + for (;;) {
>>> + head = vhost_get_vq_desc(vq, vq->iov,
>>> + ARRAY_SIZE(vq->iov),
>>> + &out, &in, NULL, NULL);
>>> + if (unlikely(head < 0))
>>> + break;
>>> +
>>> + if (unlikely(head == vq->num)) {
>>> + if (unlikely(vhost_enable_notify(&blk->dev, vq))) {
>>> + vhost_disable_notify(&blk->dev, vq);
>>> + continue;
>>> + }
>>> + break;
>>> + }
>>> +
>>> + ret = move_iovec(vq->iov, hdr_iovec, sizeof(hdr), in + out,
>>> + ARRAY_SIZE(hdr_iovec));
>>> + if (ret < 0) {
>>> + vq_err(vq, "virtio_blk_hdr is too split!");
>>> + vhost_discard_vq_desc(vq, 1);
>>> + break;
>>> + }
>>> +
>>> + iov_iter_init(&iter, READ, hdr_iovec, ARRAY_SIZE(hdr_iovec),
>>> + sizeof(hdr));
>>> + ret = copy_from_iter(&hdr, sizeof(hdr), &iter);
>>> + if (ret != sizeof(hdr)) {
>>> + vq_err(vq, "Failed to get block header: read %d bytes
>>> instead of %ld!\n",
>>> + ret, sizeof(hdr));
>>> + vhost_discard_vq_desc(vq, 1);
>>> + break;
>>> + }
>>> +
>>> + if (vhost_blk_req_handle(vq, &hdr, head, out + in, f) < 0) {
>>> + vhost_discard_vq_desc(vq, 1);
>>> + break;
>>> + }
>>> +
>>> + if (!llist_empty(&blk_vq->llhead)) {
>>> + vhost_poll_queue(&vq->poll);
>>> + break;
>>> + }
>>> + }
>>> +}
>>> +
>>> +static void vhost_blk_handle_host_kick(struct vhost_work *work)
>>> +{
>>> + struct vhost_blk_vq *blk_vq;
>>> + struct vhost_virtqueue *vq;
>>> + struct vhost_blk_req *req;
>>> + struct llist_node *llnode;
>>> + struct vhost_blk *blk = NULL;
>>> + bool added, zero;
>>> + u8 status;
>>> + int ret;
>>> +
>>> + blk_vq = container_of(work, struct vhost_blk_vq, work);
>>> + vq = &blk_vq->vq;
>>> + llnode = llist_del_all(&blk_vq->llhead);
>>> + added = false;
>>> + while (llnode) {
>>> + req = llist_entry(llnode, struct vhost_blk_req, llnode);
>>> + llnode = llist_next(llnode);
>>> +
>>> + if (!blk)
>>> + blk = req->blk;
>>> +
>>> + vhost_blk_req_umap(req);
>>> +
>>> + status = req->bio_err == 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR;
>>> + ret = vhost_blk_set_status(req, status);
>>> + if (unlikely(ret))
>>> + continue;
>>> +
>>> + vhost_add_used(vq, req->head, req->len);
>>> + added = true;
>>> +
>>> + spin_lock(&req->blk->flush_lock);
>>> + zero = atomic_dec_and_test(
>>> + &req->blk->req_inflight[req->during_flush]);
>>> + if (zero && !req->during_flush)
>>> + wake_up(&blk->flush_wait);
>>> + spin_unlock(&req->blk->flush_lock);
>>> +
>>> + }
>>> +
>>> + if (likely(added))
>>> + vhost_signal(&blk->dev, vq);
>>> +}
>>> +
>>> +static void vhost_blk_flush(struct vhost_blk *blk)
>>> +{
>>> + int i;
>>> +
>>> + spin_lock(&blk->flush_lock);
>>> + blk->during_flush = 1;
>>> + spin_unlock(&blk->flush_lock);
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++)
>>> + vhost_poll_flush(&blk->vqs[i].vq.poll);
>>> + vhost_work_dev_flush(&blk->dev);
>>> + /*
>>> + * Wait until requests fired before the flush are finished.
>>> + * req_inflight[0] is used to track the requests fired before the flush,
>>> + * req_inflight[1] is used to track the requests fired during the flush.
>>> + */
>>> + wait_event(blk->flush_wait, !atomic_read(&blk->req_inflight[0]));
>>> +
>>> + spin_lock(&blk->flush_lock);
>>> + blk->during_flush = 0;
>>> + spin_unlock(&blk->flush_lock);
>>> +}
>>> +
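A short conceptual view of that two-slot accounting (pseudocode mirroring
the code above, not new logic): requests pick their slot under flush_lock,
so the flusher only ever waits for requests that predate the flush.

    /* submit (vhost_blk_req_submit):
     *     spin_lock(&blk->flush_lock);
     *     req->during_flush = blk->during_flush;          // pick a slot
     *     atomic_inc(&blk->req_inflight[req->during_flush]);
     *     spin_unlock(&blk->flush_lock);
     *
     * flush (vhost_blk_flush):
     *     during_flush = 1;            // new requests now land in slot 1
     *     wait_event(..., atomic_read(&req_inflight[0]) == 0);
     *     during_flush = 0;            // slots swap roles back
     */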
>>> +static inline void vhost_blk_drop_backends(struct vhost_blk *blk)
>>> +{
>>> + struct vhost_virtqueue *vq;
>>> + int i;
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
>>> + vq = &blk->vqs[i].vq;
>>> +
>>> + mutex_lock(&vq->mutex);
>>> + vhost_vq_set_backend(vq, NULL);
>>> + mutex_unlock(&vq->mutex);
>>> + }
>>> +}
>>> +
>>> +static int vhost_blk_open(struct inode *inode, struct file *file)
>>> +{
>>> + struct vhost_blk *blk;
>>> + struct vhost_virtqueue **vqs;
>>> + int ret = 0, i = 0;
>>> +
>>> + blk = kvzalloc(sizeof(*blk), GFP_KERNEL);
>>> + if (!blk) {
>>> + ret = -ENOMEM;
>>> + goto out;
>>> + }
>>> +
>>> + vqs = kcalloc(VHOST_BLK_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
>>> + if (!vqs) {
>>> + ret = -ENOMEM;
>>> + goto out_blk;
>>> + }
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
>>> + blk->vqs[i].vq.handle_kick = vhost_blk_handle_guest_kick;
>>> + vqs[i] = &blk->vqs[i].vq;
>>> + }
>>> +
>>> + blk->index = gen++;
>>> +
>>> + atomic_set(&blk->req_inflight[0], 0);
>>> + atomic_set(&blk->req_inflight[1], 0);
>>> + blk->during_flush = 0;
>>> + spin_lock_init(&blk->flush_lock);
>>> + init_waitqueue_head(&blk->flush_wait);
>>> +
>>> + vhost_dev_init(&blk->dev, vqs, VHOST_BLK_VQ_MAX, UIO_MAXIOV,
>>> + VHOST_DEV_WEIGHT, VHOST_DEV_PKT_WEIGHT, true, NULL);
>>> + file->private_data = blk;
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++)
>>> + vhost_work_init(&blk->vqs[i].work, vhost_blk_handle_host_kick);
>>> +
>>> + return ret;
>>> +out_blk:
>>> + kvfree(blk);
>>> +out:
>>> + return ret;
>>> +}
>>> +
>>> +static int vhost_blk_release(struct inode *inode, struct file *f)
>>> +{
>>> + struct vhost_blk *blk = f->private_data;
>>> + int i;
>>> +
>>> + vhost_blk_drop_backends(blk);
>>> + vhost_blk_flush(blk);
>>> + vhost_dev_stop(&blk->dev);
>>> + if (blk->backend)
>>> + fput(blk->backend);
>>> + vhost_dev_cleanup(&blk->dev);
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++)
>>> + kvfree(blk->vqs[i].req);
>>> + kfree(blk->dev.vqs);
>>> + kvfree(blk);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int vhost_blk_set_features(struct vhost_blk *blk, u64 features)
>>> +{
>>> + struct vhost_virtqueue *vq;
>>> + int i;
>>> +
>>> + mutex_lock(&blk->dev.mutex);
>>> + if ((features & (1 << VHOST_F_LOG_ALL)) &&
>>> + !vhost_log_access_ok(&blk->dev)) {
>>> + mutex_unlock(&blk->dev.mutex);
>>> + return -EFAULT;
>>> + }
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
>>> + vq = &blk->vqs[i].vq;
>>> + mutex_lock(&vq->mutex);
>>> + vq->acked_features = features & (VHOST_BLK_FEATURES);
>>> + mutex_unlock(&vq->mutex);
>>> + }
>>> +
>>> + vhost_blk_flush(blk);
>>> + mutex_unlock(&blk->dev.mutex);
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static long vhost_blk_set_backend(struct vhost_blk *blk, int fd)
>>> +{
>>> + struct vhost_virtqueue *vq;
>>> + struct file *file;
>>> + struct inode *inode;
>>> + int ret, i;
>>> +
>>> + mutex_lock(&blk->dev.mutex);
>>> + ret = vhost_dev_check_owner(&blk->dev);
>>> + if (ret)
>>> + goto out_dev;
>>> +
>>> + if (blk->backend) {
>>> + ret = -EBUSY;
>>> + goto out_dev;
>>> + }
>>> +
>>> + file = fget(fd);
>>> + if (!file) { /* fget() returns NULL on failure, not ERR_PTR */
>>> + ret = -EBADF;
>>> + goto out_dev;
>>> + }
>>> +
>>> + inode = file->f_mapping->host;
>>> + if (!S_ISBLK(inode->i_mode)) {
>>> + ret = -EFAULT;
>>> + goto out_file;
>>> + }
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
>>> + vq = &blk->vqs[i].vq;
>>> + if (!vhost_vq_access_ok(vq)) {
>>> + ret = -EFAULT;
>>> + goto out_drop;
>>> + }
>>> +
>>> + mutex_lock(&vq->mutex);
>>> + vhost_vq_set_backend(vq, file);
>>> + ret = vhost_vq_init_access(vq);
>>> + mutex_unlock(&vq->mutex);
>>> + }
>>> +
>>> + blk->backend = file;
>>> +
>>> + mutex_unlock(&blk->dev.mutex);
>>> + return 0;
>>> +
>>> +out_drop:
>>> + vhost_blk_drop_backends(blk);
>>> +out_file:
>>> + fput(file);
>>> +out_dev:
>>> + mutex_unlock(&blk->dev.mutex);
>>> + return ret;
>>> +}
>>> +
>>> +static long vhost_blk_reset_owner(struct vhost_blk *blk)
>>> +{
>>> + struct vhost_iotlb *umem;
>>> + int err, i;
>>> +
>>> + mutex_lock(&blk->dev.mutex);
>>> + err = vhost_dev_check_owner(&blk->dev);
>>> + if (err)
>>> + goto done;
>>> + umem = vhost_dev_reset_owner_prepare();
>>> + if (!umem) {
>>> + err = -ENOMEM;
>>> + goto done;
>>> + }
>>> + vhost_blk_drop_backends(blk);
>>> + if (blk->backend) {
>>> + fput(blk->backend);
>>> + blk->backend = NULL;
>>> + }
>>> + vhost_blk_flush(blk);
>>> + vhost_dev_stop(&blk->dev);
>>> + vhost_dev_reset_owner(&blk->dev, umem);
>>> +
>>> + for (i = 0; i < VHOST_BLK_VQ_MAX; i++) {
>>> + kvfree(blk->vqs[i].req);
>>> + blk->vqs[i].req = NULL;
>>> + }
>>> +
>>> +done:
>>> + mutex_unlock(&blk->dev.mutex);
>>> + return err;
>>> +}
>>> +
>>> +static int vhost_blk_setup(struct vhost_blk *blk, void __user *argp)
>>> +{
>>> + struct vhost_vring_state s;
>>> +
>>> + if (copy_from_user(&s, argp, sizeof(s)))
>>> + return -EFAULT;
>>> +
>>> + if (blk->vqs[s.index].req)
>>> + return 0;
>>> +
>>> + blk->vqs[s.index].req = kvmalloc(sizeof(struct vhost_blk_req) * s.num,
>>> + GFP_KERNEL);
>>> + if (!blk->vqs[s.index].req)
>>> + return -ENOMEM;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
>>> + unsigned long arg)
>>> +{
>>> + struct vhost_blk *blk = f->private_data;
>>> + void __user *argp = (void __user *)arg;
>>> + struct vhost_vring_file backend;
>>> + u64 __user *featurep = argp;
>>> + u64 features;
>>> + int ret;
>>> +
>>> + switch (ioctl) {
>>> + case VHOST_BLK_SET_BACKEND:
>>> + if (copy_from_user(&backend, argp, sizeof(backend)))
>>> + return -EFAULT;
>>> + return vhost_blk_set_backend(blk, backend.fd);
>>> + case VHOST_GET_FEATURES:
>>> + features = VHOST_BLK_FEATURES;
>>> + if (copy_to_user(featurep, &features, sizeof(features)))
>>> + return -EFAULT;
>>> + return 0;
>>> + case VHOST_SET_FEATURES:
>>> + if (copy_from_user(&features, featurep, sizeof(features)))
>>> + return -EFAULT;
>>> + if (features & ~VHOST_BLK_FEATURES)
>>> + return -EOPNOTSUPP;
>>> + return vhost_blk_set_features(blk, features);
>>> + case VHOST_RESET_OWNER:
>>> + return vhost_blk_reset_owner(blk);
>>> + default:
>>> + mutex_lock(&blk->dev.mutex);
>>> + ret = vhost_dev_ioctl(&blk->dev, ioctl, argp);
>>> + if (ret == -ENOIOCTLCMD)
>>> + ret = vhost_vring_ioctl(&blk->dev, ioctl, argp);
>>> + if (!ret && ioctl == VHOST_SET_VRING_NUM)
>>> + ret = vhost_blk_setup(blk, argp);
>>> + vhost_blk_flush(blk);
>>> + mutex_unlock(&blk->dev.mutex);
>>> + return ret;
>>> + }
>>> +}
>>> +
>>> +static const struct file_operations vhost_blk_fops = {
>>> + .owner = THIS_MODULE,
>>> + .open = vhost_blk_open,
>>> + .release = vhost_blk_release,
>>> + .llseek = noop_llseek,
>>> + .unlocked_ioctl = vhost_blk_ioctl,
>>> +};
>>> +
>>> +static struct miscdevice vhost_blk_misc = {
>>> + MISC_DYNAMIC_MINOR,
>>> + "vhost-blk",
>>> + &vhost_blk_fops,
>>> +};
>>> +module_misc_device(vhost_blk_misc);
>>> +
>>> +MODULE_VERSION("0.0.1");
>>> +MODULE_LICENSE("GPL v2");
>>> +MODULE_AUTHOR("Andrey Zhadchenko");
>>> +MODULE_DESCRIPTION("Host kernel accelerator for virtio_blk");
>>> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
>>> index c998860d7bbc..13caf114bcde 100644
>>> --- a/include/uapi/linux/vhost.h
>>> +++ b/include/uapi/linux/vhost.h
>>> @@ -150,4 +150,9 @@
>>> /* Get the valid iova range */
>>> #define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \
>>> struct vhost_vdpa_iova_range)
>>> +
>>> +/* VHOST_BLK specific defines */
>>> +#define VHOST_BLK_SET_BACKEND _IOW(VHOST_VIRTIO, 0xFF, \
>>> + struct vhost_vring_file)
>>> +
>>> #endif
>>