[Devel] [PATCH VZ9 2/3] block/blk-cbt: allow multiple cbts in a single queue
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Thu Jan 23 12:39:21 MSK 2025
On 1/23/25 17:14, Andrey Zhadchenko wrote:
>
>
> On 1/23/25 09:05, Pavel Tikhomirov wrote:
>> General comment:
>>
>> Do we need a restriction on the maximum number of cbts? E.g. in the next
>> patch we allow userspace to read the cbt list; if there are too many cbts,
>> it may not be trivial to allocate a buffer large enough to get them all at
>> once.
>
> Currently the userspace code looks like this:
> int cbt_dump_names(int fd, ...)
> {
> 	struct blk_user_cbt_list cl = {}, *req = NULL;
> 	int ret = -1, i;
> 	char *mem;
>
> 	ret = ioctl(fd, BLKCBTLIST, &cl);
> 	if (ret < 0) {
> 		ploop_err(errno, "failed to count CBT names");
> 		return SYSEXIT_DEVIOC;
> 	}
>
> 	req = malloc(cl.total_count * CBT_NAME_LENGTH + sizeof(cl));
> 	if (!req) {
> 		ploop_err(ENOMEM, "failed to allocate memory");
> 		return SYSEXIT_MALLOC;
> 	}
>
> 	req->count = cl.total_count;
> 	ret = ioctl(fd, BLKCBTLIST, req);
> 	if (ret < 0) {
> 		ploop_err(errno, "failed to list CBT names");
> 		ret = SYSEXIT_DEVIOC;
> 		goto out;
> 	}
>
> 	...
> }
>
> I think it is pretty easy
Yes, but malloc() may fail if the buffer gets too big.
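
To make the restriction idea concrete, here is a rough sketch of what I mean
(CBT_MAX_PER_QUEUE and blk_cbt_count() are made-up names, purely for
illustration): a per-queue cap that cbt_ioc_init() could check while it
already holds cbt_mutex:

/* Illustration only: hypothetical cap on the number of cbts per queue,
 * so a BLKCBTLIST reply always fits into a reasonably sized buffer. */
#define CBT_MAX_PER_QUEUE	64

static unsigned int blk_cbt_count(struct request_queue *q)
{
	struct cbt_info *cbt;
	unsigned int nr = 0;

	/* caller holds cbt_mutex, so a plain list walk is enough */
	list_for_each_entry(cbt, &q->cbt_list, list)
		nr++;

	return nr;
}

	/* in cbt_ioc_init(), after mutex_lock(&cbt_mutex): */
	if (blk_cbt_count(q) >= CBT_MAX_PER_QUEUE) {
		mutex_unlock(&cbt_mutex);
		return -ENOSPC;
	}

With a cap like that, userspace always knows the worst-case buffer size
(CBT_MAX_PER_QUEUE * CBT_NAME_LENGTH plus the header), so the malloc() above
cannot grow without bound.
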
>
>>
>> On 1/20/25 16:34, Andrey Zhadchenko wrote:
>>> Store cbts in a list instead of a single pointer.
>>> Update all APIs to work with user-specified bitmap.
>>>
>>> https://virtuozzo.atlassian.net/browse/VSTOR-96269
>>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>> ---
>>> block/blk-cbt.c | 174 +++++++++++++++++++++-------------------
>>> block/blk-core.c | 2 +
>>> include/linux/blkdev.h | 2 +-
>>> include/uapi/linux/fs.h | 2 +-
>>> 4 files changed, 96 insertions(+), 84 deletions(-)
>>>
>>> diff --git a/block/blk-cbt.c b/block/blk-cbt.c
>>> index 8ae993143098..8b0d07ee757c 100644
>>> --- a/block/blk-cbt.c
>>> +++ b/block/blk-cbt.c
>>> @@ -53,6 +53,7 @@ struct cbt_info {
>>> blkcnt_t snp_block_max;
>>> spinlock_t lock;
>>> + struct list_head list;
>>> };
>>> @@ -112,7 +113,11 @@ static int cbt_page_alloc(struct cbt_info **cbt_pp, unsigned long idx,
>>> if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
>>> cbt->count++;
>>> } else {
>>> - struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
>>> + struct cbt_info *i, *new = NULL;
>>> +
>>> + list_for_each_entry_rcu(i, &cbt->queue->cbt_list, list)
>>> + if (!memcmp(i->name, cbt->name, sizeof(cbt->name)))
>>> + new = i;
>>> spin_unlock_irq(&cbt->lock);
>>> /* was cbt updated ? */
>>> @@ -216,33 +221,26 @@ static int __blk_cbt_set(struct cbt_info *cbt, blkcnt_t block,
>>> return (pages_missed && *pages_missed) ? -EAGAIN : 0;
>>> }
>>> -static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
>>> +static void blk_cbt_add(struct cbt_info *cbt, blkcnt_t start, blkcnt_t len)
>>> {
>>> - struct cbt_info *cbt;
>>> struct cbt_extent *ex;
>>> struct cbt_extent old;
>>> blkcnt_t end;
>>> - /* Check per-cpu cache */
>>> -
>>> - rcu_read_lock();
>>> - cbt = rcu_dereference(q->cbt);
>>> - if (unlikely(!cbt))
>>> - goto out_rcu;
>>> if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
>>> - goto out_rcu;
>>> + return;
>>> end = DIV_ROUND_UP(start + len, 1 << cbt->block_bits);
>>> start >>= cbt->block_bits;
>>> len = end - start;
>>> if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
>>> __blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
>>> - goto out_rcu;
>>> + return;
>>> }
>>> ex = get_cpu_ptr(cbt->cache);
>>> if (ex->start + ex->len == start) {
>>> ex->len += len;
>>> put_cpu_ptr(cbt->cache);
>>> - goto out_rcu;
>>> + return;
>>> }
>>> old = *ex;
>>> ex->start = start;
>>> @@ -251,18 +249,23 @@ static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
>>> if (likely(old.len))
>>> __blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
>>> +}
>>> +
>>> +void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
>>> +{
>>> + struct cbt_info *cbt;
>>> +
>>> + rcu_read_lock();
>>> + if (list_empty(&q->cbt_list) || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
>>> + goto out_rcu;
>>> +
>>> + list_for_each_entry_rcu(cbt, &q->cbt_list, list)
>>> + blk_cbt_add(cbt, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
>>> +
>>> out_rcu:
>>> rcu_read_unlock();
>>> }
>>> -inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
>>> -{
>>> - if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
>>> - return;
>>> -
>>> - blk_cbt_add(q, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
>>> -}
>>> -
>>> static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
>>> 			loff_t size, loff_t blocksize)
>>> {
>>> @@ -272,6 +275,7 @@ static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
>>> if (!cbt)
>>> return ERR_PTR(-ENOMEM);
>>> + INIT_LIST_HEAD(&cbt->list);
>>> cbt->block_bits = ilog2(blocksize);
>>> cbt->block_max = DIV_ROUND_UP(size, blocksize);
>>> spin_lock_init(&cbt->lock);
>>> @@ -365,6 +369,17 @@ static int copy_cbt_to_user(struct page **map, unsigned long size,
>>> return 0;
>>> }
>>> +static struct cbt_info *blk_cbt_find(struct request_queue *q, __u8 *name)
>>> +{
>>> + struct cbt_info *cbt;
>>> +
>>> + list_for_each_entry(cbt, &q->cbt_list, list)
>>> + if (!memcmp(name, cbt->name, CBT_NAME_LENGTH))
>>> + return cbt;
>>> +
>>> + return NULL;
>>> +}
>>> +
>>> static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
>>> struct blk_user_cbt_snap_create __user *arg)
>>> {
>>> @@ -382,8 +397,7 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
>>> return -EFAULT;
>>> mutex_lock(&cbt_mutex);
>>> - cbt = q->cbt;
>>> -
>>> + cbt = blk_cbt_find(q, name);
>>> if (!cbt) {
>>> mutex_unlock(&cbt_mutex);
>>> return -ENOENT;
>>> @@ -392,11 +406,6 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
>>> BUG_ON(!cbt->map);
>>> BUG_ON(!cbt->block_max);
>>> - if (!name || memcmp(name, cbt->name, sizeof(cbt->name))) {
>>> - mutex_unlock(&cbt_mutex);
>>> - return -EINVAL;
>>> - }
>>> -
>>> if (cbt->snp_map) {
>>> mutex_unlock(&cbt_mutex);
>>> return -EBUSY;
>>> @@ -461,7 +470,7 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
>>> int ret;
>>> mutex_lock(&cbt_mutex);
>>> - cbt = q->cbt;
>>> + cbt = blk_cbt_find(q, name);
>>> ret = -ENOENT;
>>> if (!cbt)
>>> @@ -470,10 +479,6 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
>>> BUG_ON(!cbt->map);
>>> BUG_ON(!cbt->block_max);
>>> - ret = -EINVAL;
>>> - if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
>>> - goto out;
>>> -
>>> ret = -ENODEV;
>>> map = cbt->snp_map;
>>> if (!map)
>>> @@ -511,7 +516,7 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
>>> int ret;
>>> mutex_lock(&cbt_mutex);
>>> - cbt = q->cbt;
>>> + cbt = blk_cbt_find(q, name);
>>> ret = -ENOENT;
>>> if (!cbt)
>>> @@ -520,10 +525,6 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
>>> BUG_ON(!cbt->map);
>>> BUG_ON(!cbt->block_max);
>>> - ret = -EINVAL;
>>> - if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
>>> - goto out;
>>> -
>>> map = cbt->snp_map;
>>> block_max = cbt->snp_block_max;
>>> ret = -ENODEV;
>>> @@ -568,33 +569,21 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
>>> return ret;
>>> }
>>> -void blk_cbt_update_size(struct block_device *bdev)
>>> +static void blk_cbt_update_cbt_size(struct cbt_info *cbt, loff_t new_sz)
>>> {
>>> - struct request_queue *q;
>>> - struct cbt_info *new, *cbt;
>>> + struct cbt_info *new;
>>> unsigned long to_cpy, idx;
>>> unsigned bsz;
>>> - loff_t new_sz = i_size_read(bdev->bd_inode);
>>> int in_use = 0;
>>> - if (!bdev->bd_disk || !bdev_get_queue(bdev))
>>> - return;
>>> -
>>> - q = bdev_get_queue(bdev);
>>> - mutex_lock(&cbt_mutex);
>>> - cbt = q->cbt;
>>> - if (!cbt) {
>>> - mutex_unlock(&cbt_mutex);
>>> - return;
>>> - }
>>> bsz = 1 << cbt->block_bits;
>>> if (DIV_ROUND_UP(new_sz, bsz) <= cbt->block_max)
>>> - goto err_mtx;
>>> + return;
>>> - new = do_cbt_alloc(q, cbt->name, new_sz, bsz);
>>> + new = do_cbt_alloc(cbt->queue, cbt->name, new_sz, bsz);
>>> if (IS_ERR(new)) {
>>> set_bit(CBT_ERROR, &cbt->flags);
>>> - goto err_mtx;
>>> + return;
>>> }
>>> to_cpy = NR_PAGES(cbt->block_max);
>>> set_bit(CBT_NOCACHE, &cbt->flags);
>>> @@ -606,15 +595,27 @@ void blk_cbt_update_size(struct block_device *bdev)
>>> if (CBT_PAGE(new, idx))
>>> get_page(CBT_PAGE(new, idx));
>>> }
>>> - rcu_assign_pointer(q->cbt, new);
>>> + list_replace_rcu(&cbt->list, &new->list);
>>> in_use = cbt->count;
>>> spin_unlock_irq(&cbt->lock);
>>> if (!in_use)
>>> call_rcu(&cbt->rcu, &cbt_release_callback);
>>> -err_mtx:
>>> +}
>>> +
>>> +void blk_cbt_update_size(struct block_device *bdev)
>>> +{
>>> + struct request_queue *q;
>>> + struct cbt_info *cbt;
>>> +
>>> + if (!bdev->bd_disk || !bdev_get_queue(bdev))
>>> + return;
>>> +
>>> + q = bdev_get_queue(bdev);
>>> + mutex_lock(&cbt_mutex);
>>> + list_for_each_entry(cbt, &q->cbt_list, list)
>>> + blk_cbt_update_cbt_size(cbt, i_size_read(bdev->bd_inode));
>>> +
>>> mutex_unlock(&cbt_mutex);
>>> -
>>> -
>>> }
>>> static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
>>> @@ -632,16 +633,13 @@ static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __us
>>> q = bdev_get_queue(bdev);
>>> mutex_lock(&cbt_mutex);
>>> - if (q->cbt) {
>>> - ret = -EBUSY;
>>> - goto err_mtx;
>>> - }
>>> +
>>> cbt = do_cbt_alloc(q, ci.ci_name, i_size_read(bdev->bd_inode),
>>> ci.ci_blksize);
>>> if (IS_ERR(cbt))
>>> ret = PTR_ERR(cbt);
>>> else
>>> - rcu_assign_pointer(q->cbt, cbt);
>>> -err_mtx:
>>> + list_add_tail_rcu(&q->cbt_list, &cbt->list);
>>> +
>>> mutex_unlock(&cbt_mutex);
>>> return ret;
>>> }
>>> @@ -667,35 +665,46 @@ static void cbt_release_callback(struct rcu_head *head)
>>> kfree(cbt);
>>> }
>>> -void blk_cbt_release(struct request_queue *q)
>>> +static void blk_cbt_del(struct cbt_info *cbt)
>>> {
>>> - struct cbt_info *cbt;
>>> - int in_use = 0;
>>> unsigned long flags;
>>> + int in_use;
>>> - cbt = q->cbt;
>>> - if (!cbt)
>>> - return;
>>> spin_lock_irqsave(&cbt->lock, flags);
>>> set_bit(CBT_DEAD, &cbt->flags);
>>> - rcu_assign_pointer(q->cbt, NULL);
>>> + list_del_rcu(&cbt->list);
>>> in_use = cbt->count;
>>> spin_unlock_irqrestore(&cbt->lock, flags);
>>> if (!in_use)
>>> call_rcu(&cbt->rcu, &cbt_release_callback);
>>> }
>>> -static int cbt_ioc_stop(struct block_device *bdev)
>>> +void blk_cbt_release(struct request_queue *q)
>>> {
>>> + struct cbt_info *cbt, *tmp;
>>> +
>>> + list_for_each_entry_safe(cbt, tmp, &q->cbt_list, list)
>>> + blk_cbt_del(cbt);
>>> +}
>>> +
>>> +static int cbt_ioc_stop(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
>>> +{
>>> + struct blk_user_cbt_info ci;
>>> struct request_queue *q;
>>> + struct cbt_info *cbt;
>>> +
>>> + if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
>>> + return -EFAULT;
>>> mutex_lock(&cbt_mutex);
>>> q = bdev_get_queue(bdev);
>>> - if(!q->cbt) {
>>> - mutex_unlock(&cbt_mutex);
>>> - return -EINVAL;
>>> - }
>>> - blk_cbt_release(q);
>>> +
>>> + cbt = blk_cbt_find(q, ci.ci_name);
>>> + if (!cbt)
>>> + return -ENOMEM;
>>> +
>>> + blk_cbt_del(cbt);
>>> +
>>> mutex_unlock(&cbt_mutex);
>>> return 0;
>>> }
>>> @@ -839,7 +848,8 @@ static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __use
>>> ret = -EINVAL;
>>> q = bdev_get_queue(bdev);
>>> mutex_lock(&cbt_mutex);
>>> - cbt = q->cbt;
>>> +
>>> + cbt = blk_cbt_find(q, ci.ci_name);
>>> if (!cbt ||
>>> (ci.ci_start >> cbt->block_bits) > cbt->block_max)
>>> goto ioc_get_failed;
>>> @@ -925,7 +935,7 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
>>> ret = -EINVAL;
>>> mutex_lock(&cbt_mutex);
>>> - cbt = q->cbt;
>>> + cbt = blk_cbt_find(q, ci.ci_name);
>>> if (!cbt)
>>> goto ioc_set_failed;
>>> @@ -948,8 +958,8 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
>>> ex.start = cur_ex->ce_physical >> cbt->block_bits;
>>> ex.len = DIV_ROUND_UP(cur_ex->ce_length, 1 << cbt->block_bits);
>>> - if (ex.start > q->cbt->block_max ||
>>> - ex.start + ex.len > q->cbt->block_max ||
>>> + if (ex.start > cbt->block_max ||
>>> + ex.start + ex.len > cbt->block_max ||
>>> ex.len == 0) {
>>> ret = -EINVAL;
>>> break;
>>> @@ -1007,7 +1017,7 @@ int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
>>> case BLKCBTSTART:
>>> return cbt_ioc_init(bdev, ucbt_ioc);
>>> case BLKCBTSTOP:
>>> - return cbt_ioc_stop(bdev);
>>> + return cbt_ioc_stop(bdev, ucbt_ioc);
>>> case BLKCBTSET:
>>> return cbt_ioc_set(bdev, ucbt_ioc, 1);
>>> case BLKCBTCLR:
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 6abad29dd501..e3aa923f399e 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -428,6 +428,8 @@ struct request_queue *blk_alloc_queue(int node_id)
>>> init_waitqueue_head(&q->mq_freeze_wq);
>>> mutex_init(&q->mq_freeze_lock);
>>> + INIT_LIST_HEAD(&q->cbt_list);
>>> +
>>> blkg_init_queue(q);
>>> /*
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index fcbc83909b91..3009964707e6 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -520,7 +520,7 @@ struct request_queue {
>>> struct throtl_data *td;
>>> #endif
>>> #ifdef CONFIG_BLK_DEV_CBT
>>> - struct cbt_info *cbt;
>>> + struct list_head cbt_list;
>>> #endif
>>> struct rcu_head rcu_head;
>>> wait_queue_head_t mq_freeze_wq;
>>> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
>>> index 47efa7ef05a0..667157fe864a 100644
>>> --- a/include/uapi/linux/fs.h
>>> +++ b/include/uapi/linux/fs.h
>>> @@ -170,7 +170,7 @@ struct blk_user_cbt_snap_create {
>>> };
>>> #define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
>>> -#define BLKCBTSTOP _IO(0x12,201)
>>> +#define BLKCBTSTOP _IOR(0x12, 201, struct blk_user_cbt_info)
>>> #define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
>>> #define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
>>> #define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
>>
--
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.