[Devel] [PATCH VZ9 2/3] block/blk-cbt: allow multiple cbts in a single queue

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Thu Jan 23 12:39:21 MSK 2025



On 1/23/25 17:14, Andrey Zhadchenko wrote:
> 
> 
> On 1/23/25 09:05, Pavel Tikhomirov wrote:
>> General comment:
>>
>> Do we need a restriction on the maximum number of cbts? E.g. in the
>> next patch we allow reading the cbt list from userspace; if there are
>> too many cbts it may not be trivial to allocate a buffer large enough
>> to get them all at once.
> 
> Currently the userspace code looks like this:
> int cbt_dump_names(int fd, ...)
> {
>      struct blk_user_cbt_list cl = {}, *req = NULL;
>      int ret = -1, i;
>      char *mem;
> 
>      ret = ioctl(fd, BLKCBTLIST, &cl);
>      if (ret < 0) {
>          ploop_err(errno, "failed to count CBT names");
>          return SYSEXIT_DEVIOC;
>      }
> 
>      req = malloc(cl.total_count * CBT_NAME_LENGTH + sizeof(cl));
>      if (!req) {
>          ploop_err(ENOMEM, "failed to allocate memory");
>          return SYSEXIT_MALLOC;
>      }
> 
>      req->count = cl.total_count;
>      ret = ioctl(fd, BLKCBTLIST, req);
>      if (ret < 0) {
>          ploop_err(errno, "failed to list CBT names");
>          ret = SYSEXIT_DEVIOC;
>          goto out;
>      }
> 
>      ...
> }
> 
> I think it is pretty easy.

Yes, but malloc may fail if the buffer gets too big.
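
Perhaps we should enforce a cap kernel-side, or at least have userspace
refuse absurd counts before allocating. A rough sketch of the userspace
part (CBT_LIST_MAX here is a made-up sanity cap, not something from this
patch set):

    #define CBT_LIST_MAX 4096 /* hypothetical cap, pick a sane value */

    static struct blk_user_cbt_list *cbt_list_alloc(int fd)
    {
        struct blk_user_cbt_list cl = {}, *req;

        /* first pass: ask only for the total count */
        if (ioctl(fd, BLKCBTLIST, &cl) < 0)
            return NULL;

        /* refuse to allocate for an absurd number of cbts */
        if (cl.total_count > CBT_LIST_MAX)
            return NULL;

        req = calloc(1, sizeof(*req) + cl.total_count * CBT_NAME_LENGTH);
        if (!req)
            return NULL;

        /* second pass: fetch the names themselves */
        req->count = cl.total_count;
        if (ioctl(fd, BLKCBTLIST, req) < 0) {
            free(req);
            return NULL;
        }
        return req;
    }

And since cbts may be added between the two calls, userspace would
probably also want to retry if total_count grew in the meantime.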

> 
>>
>> On 1/20/25 16:34, Andrey Zhadchenko wrote:
>>> Store cbts in a list instead of a single pointer.
>>> Update all APIs to work with user-specified bitmap.
>>>
>>> https://virtuozzo.atlassian.net/browse/VSTOR-96269
>>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>> ---
>>>   block/blk-cbt.c         | 174 +++++++++++++++++++++-------------------
>>>   block/blk-core.c        |   2 +
>>>   include/linux/blkdev.h  |   2 +-
>>>   include/uapi/linux/fs.h |   2 +-
>>>   4 files changed, 96 insertions(+), 84 deletions(-)
>>>
>>> diff --git a/block/blk-cbt.c b/block/blk-cbt.c
>>> index 8ae993143098..8b0d07ee757c 100644
>>> --- a/block/blk-cbt.c
>>> +++ b/block/blk-cbt.c
>>> @@ -53,6 +53,7 @@ struct cbt_info {
>>>       blkcnt_t snp_block_max;
>>>       spinlock_t lock;
>>> +    struct list_head list;
>>>   };
>>> @@ -112,7 +113,11 @@ static int cbt_page_alloc(struct cbt_info **cbt_pp, unsigned long idx,
>>>       if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
>>>           cbt->count++;
>>>       } else {
>>> -        struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
>>> +        struct cbt_info *i, *new = NULL;
>>> +
>>> +        list_for_each_entry_rcu(i, &cbt->queue->cbt_list, list)
>>> +            if (!memcmp(i->name, cbt->name, sizeof(cbt->name)))
>>> +                new = i;
>>>           spin_unlock_irq(&cbt->lock);
>>>           /* was cbt updated ? */
>>> @@ -216,33 +221,26 @@ static int __blk_cbt_set(struct cbt_info  *cbt, blkcnt_t block,
>>>       return (pages_missed && *pages_missed) ? -EAGAIN : 0;
>>>   }
>>> -static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
>>> +static void blk_cbt_add(struct cbt_info *cbt, blkcnt_t start, blkcnt_t len)
>>>   {
>>> -    struct cbt_info *cbt;
>>>       struct cbt_extent *ex;
>>>       struct cbt_extent old;
>>>       blkcnt_t end;
>>> -    /* Check per-cpu cache */
>>> -
>>> -    rcu_read_lock();
>>> -    cbt = rcu_dereference(q->cbt);
>>> -    if (unlikely(!cbt))
>>> -        goto out_rcu;
>>>       if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
>>> -        goto out_rcu;
>>> +        return;
>>>       end = DIV_ROUND_UP(start + len, 1 << cbt->block_bits);
>>>       start >>= cbt->block_bits;
>>>       len = end - start;
>>>       if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
>>>           __blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
>>> -        goto out_rcu;
>>> +        return;
>>>       }
>>>       ex = get_cpu_ptr(cbt->cache);
>>>       if (ex->start + ex->len == start) {
>>>           ex->len += len;
>>>           put_cpu_ptr(cbt->cache);
>>> -        goto out_rcu;
>>> +        return;
>>>       }
>>>       old = *ex;
>>>       ex->start = start;
>>> @@ -251,18 +249,23 @@ static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
>>>       if (likely(old.len))
>>>           __blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
>>> +}
>>> +
>>> +void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
>>> +{
>>> +    struct cbt_info *cbt;
>>> +
>>> +    rcu_read_lock();
>>> +    if (list_empty(&q->cbt_list) || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
>>> +        goto out_rcu;
>>> +
>>> +    list_for_each_entry_rcu(cbt, &q->cbt_list, list)
>>> +        blk_cbt_add(cbt, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
>>> +
>>>   out_rcu:
>>>       rcu_read_unlock();
>>>   }
>>> -inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
>>> -{
>>> -    if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
>>> -        return;
>>> -
>>> -    blk_cbt_add(q, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
>>> -}
>>> -
>>>   static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
>>>                        loff_t size, loff_t blocksize)
>>>   {
>>> @@ -272,6 +275,7 @@ static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
>>>       if (!cbt)
>>>           return ERR_PTR(-ENOMEM);
>>> +    INIT_LIST_HEAD(&cbt->list);
>>>       cbt->block_bits = ilog2(blocksize);
>>>       cbt->block_max  = DIV_ROUND_UP(size, blocksize);
>>>       spin_lock_init(&cbt->lock);
>>> @@ -365,6 +369,17 @@ static int copy_cbt_to_user(struct page **map, unsigned long size,
>>>           return 0;
>>>   }
>>> +static struct cbt_info *blk_cbt_find(struct request_queue *q, __u8 *name)
>>> +{
>>> +    struct cbt_info *cbt;
>>> +
>>> +    list_for_each_entry(cbt, &q->cbt_list, list)
>>> +        if (!memcmp(name, cbt->name, CBT_NAME_LENGTH))
>>> +            return cbt;
>>> +
>>> +    return NULL;
>>> +}
>>> +
>>>   static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
>>>                      struct blk_user_cbt_snap_create __user *arg)
>>>   {
>>> @@ -382,8 +397,7 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
>>>           return -EFAULT;
>>>       mutex_lock(&cbt_mutex);
>>> -    cbt = q->cbt;
>>> -
>>> +    cbt = blk_cbt_find(q, name);
>>>       if (!cbt) {
>>>           mutex_unlock(&cbt_mutex);
>>>           return -ENOENT;
>>> @@ -392,11 +406,6 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
>>>       BUG_ON(!cbt->map);
>>>       BUG_ON(!cbt->block_max);
>>> -    if (!name || memcmp(name, cbt->name, sizeof(cbt->name))) {
>>> -        mutex_unlock(&cbt_mutex);
>>> -        return -EINVAL;
>>> -    }
>>> -
>>>       if (cbt->snp_map) {
>>>           mutex_unlock(&cbt_mutex);
>>>           return -EBUSY;
>>> @@ -461,7 +470,7 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
>>>       int ret;
>>>       mutex_lock(&cbt_mutex);
>>> -    cbt = q->cbt;
>>> +    cbt = blk_cbt_find(q, name);
>>>       ret = -ENOENT;
>>>       if (!cbt)
>>> @@ -470,10 +479,6 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
>>>       BUG_ON(!cbt->map);
>>>       BUG_ON(!cbt->block_max);
>>> -    ret = -EINVAL;
>>> -    if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
>>> -        goto out;
>>> -
>>>       ret = -ENODEV;
>>>       map = cbt->snp_map;
>>>       if (!map)
>>> @@ -511,7 +516,7 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
>>>       int ret;
>>>       mutex_lock(&cbt_mutex);
>>> -    cbt = q->cbt;
>>> +    cbt = blk_cbt_find(q, name);
>>>       ret = -ENOENT;
>>>       if (!cbt)
>>> @@ -520,10 +525,6 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
>>>       BUG_ON(!cbt->map);
>>>       BUG_ON(!cbt->block_max);
>>> -    ret = -EINVAL;
>>> -    if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
>>> -        goto out;
>>> -
>>>       map = cbt->snp_map;
>>>       block_max = cbt->snp_block_max;
>>>       ret = -ENODEV;
>>> @@ -568,33 +569,21 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
>>>       return ret;
>>>   }
>>> -void blk_cbt_update_size(struct block_device *bdev)
>>> +static void blk_cbt_update_cbt_size(struct cbt_info *cbt, loff_t new_sz)
>>>   {
>>> -    struct request_queue *q;
>>> -    struct cbt_info *new, *cbt;
>>> +    struct cbt_info *new;
>>>       unsigned long to_cpy, idx;
>>>       unsigned bsz;
>>> -    loff_t new_sz = i_size_read(bdev->bd_inode);
>>>       int in_use = 0;
>>> -    if (!bdev->bd_disk || !bdev_get_queue(bdev))
>>> -        return;
>>> -
>>> -    q = bdev_get_queue(bdev);
>>> -    mutex_lock(&cbt_mutex);
>>> -    cbt = q->cbt;
>>> -    if (!cbt) {
>>> -        mutex_unlock(&cbt_mutex);
>>> -        return;
>>> -    }
>>>       bsz = 1 << cbt->block_bits;
>>>       if (DIV_ROUND_UP(new_sz, bsz) <= cbt->block_max)
>>> -        goto err_mtx;
>>> +        return;
>>> -    new = do_cbt_alloc(q, cbt->name, new_sz, bsz);
>>> +    new = do_cbt_alloc(cbt->queue, cbt->name, new_sz, bsz);
>>>       if (IS_ERR(new)) {
>>>           set_bit(CBT_ERROR, &cbt->flags);
>>> -        goto err_mtx;
>>> +        return;
>>>       }
>>>       to_cpy = NR_PAGES(cbt->block_max);
>>>       set_bit(CBT_NOCACHE, &cbt->flags);
>>> @@ -606,15 +595,27 @@ void blk_cbt_update_size(struct block_device *bdev)
>>>           if (CBT_PAGE(new, idx))
>>>               get_page(CBT_PAGE(new, idx));
>>>       }
>>> -    rcu_assign_pointer(q->cbt, new);
>>> +    list_replace_rcu(&cbt->list, &new->list);
>>>       in_use = cbt->count;
>>>       spin_unlock_irq(&cbt->lock);
>>>       if (!in_use)
>>>           call_rcu(&cbt->rcu, &cbt_release_callback);
>>> -err_mtx:
>>> +}
>>> +
>>> +void blk_cbt_update_size(struct block_device *bdev)
>>> +{
>>> +    struct request_queue *q;
>>> +    struct cbt_info *cbt;
>>> +
>>> +    if (!bdev->bd_disk || !bdev_get_queue(bdev))
>>> +        return;
>>> +
>>> +    q = bdev_get_queue(bdev);
>>> +    mutex_lock(&cbt_mutex);
>>> +    list_for_each_entry(cbt, &q->cbt_list, list)
>>> +        blk_cbt_update_cbt_size(cbt, i_size_read(bdev->bd_inode));
>>> +
>>>       mutex_unlock(&cbt_mutex);
>>> -
>>> -
>>>   }
>>>   static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
>>> @@ -632,16 +633,13 @@ static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __us
>>>       q = bdev_get_queue(bdev);
>>>       mutex_lock(&cbt_mutex);
>>> -    if (q->cbt) {
>>> -        ret = -EBUSY;
>>> -        goto err_mtx;
>>> -    }
>>> +
>>> +    cbt = do_cbt_alloc(q, ci.ci_name, i_size_read(bdev->bd_inode), ci.ci_blksize);
>>>       if (IS_ERR(cbt))
>>>           ret = PTR_ERR(cbt);
>>>       else
>>> -        rcu_assign_pointer(q->cbt, cbt);
>>> -err_mtx:
>>> +        list_add_tail_rcu(&q->cbt_list, &cbt->list);
>>> +
>>>       mutex_unlock(&cbt_mutex);
>>>       return ret;
>>>   }
>>> @@ -667,35 +665,46 @@ static void cbt_release_callback(struct rcu_head *head)
>>>       kfree(cbt);
>>>   }
>>> -void blk_cbt_release(struct request_queue *q)
>>> +static void blk_cbt_del(struct cbt_info *cbt)
>>>   {
>>> -    struct cbt_info *cbt;
>>> -    int in_use = 0;
>>>       unsigned long flags;
>>> +    int in_use;
>>> -    cbt = q->cbt;
>>> -    if (!cbt)
>>> -        return;
>>>       spin_lock_irqsave(&cbt->lock, flags);
>>>       set_bit(CBT_DEAD, &cbt->flags);
>>> -    rcu_assign_pointer(q->cbt, NULL);
>>> +    list_del_rcu(&cbt->list);
>>>       in_use = cbt->count;
>>>       spin_unlock_irqrestore(&cbt->lock, flags);
>>>       if (!in_use)
>>>           call_rcu(&cbt->rcu, &cbt_release_callback);
>>>   }
>>> -static int cbt_ioc_stop(struct block_device *bdev)
>>> +void blk_cbt_release(struct request_queue *q)
>>>   {
>>> +    struct cbt_info *cbt, *tmp;
>>> +
>>> +    list_for_each_entry_safe(cbt, tmp, &q->cbt_list, list)
>>> +        blk_cbt_del(cbt);
>>> +}
>>> +
>>> +static int cbt_ioc_stop(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
>>> +{
>>> +    struct blk_user_cbt_info ci;
>>>       struct request_queue *q;
>>> +    struct cbt_info *cbt;
>>> +
>>> +    if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
>>> +        return -EFAULT;
>>>       mutex_lock(&cbt_mutex);
>>>       q = bdev_get_queue(bdev);
>>> -    if(!q->cbt) {
>>> -        mutex_unlock(&cbt_mutex);
>>> -        return -EINVAL;
>>> -    }
>>> -    blk_cbt_release(q);
>>> +
>>> +    cbt = blk_cbt_find(q, ci.ci_name);
>>> +    if (!cbt)
>>> +        return -ENOMEM;
>>> +
>>> +    blk_cbt_del(cbt);
>>> +
>>>       mutex_unlock(&cbt_mutex);
>>>       return 0;
>>>   }
>>> @@ -839,7 +848,8 @@ static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __use
>>>       ret = -EINVAL;
>>>       q = bdev_get_queue(bdev);
>>>       mutex_lock(&cbt_mutex);
>>> -    cbt = q->cbt;
>>> +
>>> +    cbt = blk_cbt_find(q, ci.ci_name);
>>>       if (!cbt ||
>>>           (ci.ci_start >> cbt->block_bits) > cbt->block_max)
>>>           goto ioc_get_failed;
>>> @@ -925,7 +935,7 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
>>>       ret = -EINVAL;
>>>       mutex_lock(&cbt_mutex);
>>> -    cbt = q->cbt;
>>> +    cbt = blk_cbt_find(q, ci.ci_name);
>>>       if (!cbt)
>>>           goto ioc_set_failed;
>>> @@ -948,8 +958,8 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
>>>           ex.start  = cur_ex->ce_physical >> cbt->block_bits;
>>>           ex.len  = DIV_ROUND_UP(cur_ex->ce_length, 1 << cbt->block_bits);
>>> -        if (ex.start > q->cbt->block_max ||
>>> -            ex.start + ex.len > q->cbt->block_max ||
>>> +        if (ex.start > cbt->block_max ||
>>> +            ex.start + ex.len > cbt->block_max ||
>>>               ex.len == 0) {
>>>               ret = -EINVAL;
>>>               break;
>>> @@ -1007,7 +1017,7 @@ int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
>>>       case BLKCBTSTART:
>>>           return cbt_ioc_init(bdev, ucbt_ioc);
>>>       case BLKCBTSTOP:
>>> -        return cbt_ioc_stop(bdev);
>>> +        return cbt_ioc_stop(bdev, ucbt_ioc);
>>>       case BLKCBTSET:
>>>           return cbt_ioc_set(bdev, ucbt_ioc, 1);
>>>       case BLKCBTCLR:
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 6abad29dd501..e3aa923f399e 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -428,6 +428,8 @@ struct request_queue *blk_alloc_queue(int node_id)
>>>       init_waitqueue_head(&q->mq_freeze_wq);
>>>       mutex_init(&q->mq_freeze_lock);
>>> +    INIT_LIST_HEAD(&q->cbt_list);
>>> +
>>>       blkg_init_queue(q);
>>>       /*
>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>> index fcbc83909b91..3009964707e6 100644
>>> --- a/include/linux/blkdev.h
>>> +++ b/include/linux/blkdev.h
>>> @@ -520,7 +520,7 @@ struct request_queue {
>>>       struct throtl_data *td;
>>>   #endif
>>>   #ifdef CONFIG_BLK_DEV_CBT
>>> -    struct cbt_info *cbt;
>>> +    struct list_head cbt_list;
>>>   #endif
>>>       struct rcu_head        rcu_head;
>>>       wait_queue_head_t    mq_freeze_wq;
>>> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
>>> index 47efa7ef05a0..667157fe864a 100644
>>> --- a/include/uapi/linux/fs.h
>>> +++ b/include/uapi/linux/fs.h
>>> @@ -170,7 +170,7 @@ struct blk_user_cbt_snap_create {
>>>   };
>>>   #define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
>>> -#define BLKCBTSTOP _IO(0x12,201)
>>> +#define BLKCBTSTOP _IOR(0x12, 201, struct blk_user_cbt_info)
>>>   #define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
>>>   #define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
>>>   #define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
>>

-- 
Best regards, Tikhomirov Pavel
Senior Software Developer, Virtuozzo.


