[Devel] [PATCH VZ9 v2 2/5] block/blk-cbt: allow multiple cbts in a single queue
Andrey Zhadchenko
andrey.zhadchenko at virtuozzo.com
Tue Jan 28 03:39:47 MSK 2025
Store cbts in a list instead of a single pointer and allow
up to 16 entries.
Update all APIs to work with a user-specified bitmap.
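For example, user space now names the bitmap it wants to stop.
A hypothetical caller might look like this (a sketch only; the
device path and helper name are illustrative, error handling is
minimal):

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    /* Stop (destroy) the named bitmap on a block device. */
    static int cbt_stop(const char *dev, const char *name)
    {
            struct blk_user_cbt_info ci;
            int fd, ret;

            fd = open(dev, O_RDWR);
            if (fd < 0)
                    return -1;

            memset(&ci, 0, sizeof(ci));
            strncpy((char *)ci.ci_name, name, CBT_NAME_LENGTH - 1);

            /* cbt_ioc_stop() only examines ci_name. */
            ret = ioctl(fd, BLKCBTSTOP, &ci);
            close(fd);
            return ret;
    }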
We need to exercise extra care when accessing queue->cbt_list
to be sure that the accessed element still exists. The majority
of accesses, mostly user-space ioctls, are protected by
cbt_mutex. On the hot path we use rcu_read_lock() and
list_for_each_entry_rcu(), paired with list_replace_rcu() and
list_del_rcu() on the modification side.
cbt_page_alloc() can be called in both contexts, so here is a
small scheme of the possible call paths with lock analysis
(a condensed sketch of the RCU pairing follows the tree):
+-< cbt_page_alloc
+-< __blk_cbt_set
| +-< blk_cbt_add
| | +-< blk_cbt_bio_queue # under rcu_read_lock
| +-< __cbt_flush_cpu_cache
+-< cbt_flush_cache
+-< blk_cbt_snap_create # under cbt_mutex
+-< blk_cbt_update_cbt_size
| +-< blk_cbt_update_size # under cbt_mutex
+-< cbt_ioc_get # under cbt_mutex
+-< cbt_ioc_set # under cbt_mutex
| +-< cbt_ioc_get # under cbt_mutex
| +-< cbt_ioc_set # under cbt_mutex
+-< blk_cbt_snap_merge_back # under cbt_mutex
+-< cbt_flush_cache # ok, see above
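The pairing itself, condensed from the patch (not a literal
excerpt; locals abbreviated):

    /* Hot path (blk_cbt_bio_queue): traverse under RCU only. */
    rcu_read_lock();
    list_for_each_entry_rcu(cbt, &q->cbt_list, list)
            blk_cbt_add(cbt, bio->bi_iter.bi_sector << 9,
                        bio->bi_iter.bi_size);
    rcu_read_unlock();

    /* Removal (blk_cbt_del, callers hold cbt_mutex): unlink
     * first, free only after every reader has left its RCU
     * read-side section. */
    spin_lock_irqsave(&cbt->lock, flags);
    set_bit(CBT_DEAD, &cbt->flags);
    list_del_rcu(&cbt->list);
    in_use = cbt->count;
    spin_unlock_irqrestore(&cbt->lock, flags);
    if (!in_use)
            call_rcu(&cbt->rcu, &cbt_release_callback);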
https://virtuozzo.atlassian.net/browse/VSTOR-96269
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
v2:
- check that new CBT name does not collide with existing
- remove unnecessary INIT_LIST_HEAD
- fix return code in cbt_ioc_stop()
- release cbt_mutex on error path in cbt_ioc_stop()
- update commit message with locking explanation
- add cbt limit
- fix arguments in list_add_tail_rcu(&q->cbt_list, &cbt->list):
should be list_add_tail_rcu(&cbt->list, &q->cbt_list);
block/blk-cbt.c | 175 ++++++++++++++++++++++------------------
block/blk-core.c | 2 +
include/linux/blkdev.h | 2 +-
include/uapi/linux/fs.h | 4 +-
4 files changed, 102 insertions(+), 81 deletions(-)
diff --git a/block/blk-cbt.c b/block/blk-cbt.c
index 8ae993143098..bfc0fdaa4033 100644
--- a/block/blk-cbt.c
+++ b/block/blk-cbt.c
@@ -53,6 +53,7 @@ struct cbt_info {
blkcnt_t snp_block_max;
spinlock_t lock;
+ struct list_head list;
};
@@ -112,7 +113,11 @@ static int cbt_page_alloc(struct cbt_info **cbt_pp, unsigned long idx,
if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
cbt->count++;
} else {
- struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
+ struct cbt_info *i, *new = NULL;
+
+ list_for_each_entry_rcu(i, &cbt->queue->cbt_list, list)
+ if (!memcmp(i->name, cbt->name, sizeof(cbt->name)))
+ new = i;
spin_unlock_irq(&cbt->lock);
/* was cbt updated ? */
@@ -216,33 +221,26 @@ static int __blk_cbt_set(struct cbt_info *cbt, blkcnt_t block,
return (pages_missed && *pages_missed) ? -EAGAIN : 0;
}
-static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
+static void blk_cbt_add(struct cbt_info *cbt, blkcnt_t start, blkcnt_t len)
{
- struct cbt_info *cbt;
struct cbt_extent *ex;
struct cbt_extent old;
blkcnt_t end;
- /* Check per-cpu cache */
-
- rcu_read_lock();
- cbt = rcu_dereference(q->cbt);
- if (unlikely(!cbt))
- goto out_rcu;
if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
- goto out_rcu;
+ return;
end = DIV_ROUND_UP(start + len, 1 << cbt->block_bits);
start >>= cbt->block_bits;
len = end - start;
if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
__blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
- goto out_rcu;
+ return;
}
ex = get_cpu_ptr(cbt->cache);
if (ex->start + ex->len == start) {
ex->len += len;
put_cpu_ptr(cbt->cache);
- goto out_rcu;
+ return;
}
old = *ex;
ex->start = start;
@@ -251,18 +249,23 @@ static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
if (likely(old.len))
__blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
+}
+
+void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+ struct cbt_info *cbt;
+
+ rcu_read_lock();
+ if (list_empty(&q->cbt_list) || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
+ goto out_rcu;
+
+ list_for_each_entry_rcu(cbt, &q->cbt_list, list)
+ blk_cbt_add(cbt, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
+
out_rcu:
rcu_read_unlock();
}
-inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
-{
- if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
- return;
-
- blk_cbt_add(q, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
-}
-
static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
loff_t size, loff_t blocksize)
{
@@ -365,6 +368,17 @@ static int copy_cbt_to_user(struct page **map, unsigned long size,
return 0;
}
+static struct cbt_info *blk_cbt_find(struct request_queue *q, __u8 *name)
+{
+ struct cbt_info *cbt;
+
+ list_for_each_entry(cbt, &q->cbt_list, list)
+ if (!memcmp(name, cbt->name, CBT_NAME_LENGTH))
+ return cbt;
+
+ return NULL;
+}
+
static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
struct blk_user_cbt_snap_create __user *arg)
{
@@ -382,8 +396,7 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
return -EFAULT;
mutex_lock(&cbt_mutex);
- cbt = q->cbt;
-
+ cbt = blk_cbt_find(q, name);
if (!cbt) {
mutex_unlock(&cbt_mutex);
return -ENOENT;
@@ -392,11 +405,6 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
BUG_ON(!cbt->map);
BUG_ON(!cbt->block_max);
- if (!name || memcmp(name, cbt->name, sizeof(cbt->name))) {
- mutex_unlock(&cbt_mutex);
- return -EINVAL;
- }
-
if (cbt->snp_map) {
mutex_unlock(&cbt_mutex);
return -EBUSY;
@@ -461,7 +469,7 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
int ret;
mutex_lock(&cbt_mutex);
- cbt = q->cbt;
+ cbt = blk_cbt_find(q, name);
ret = -ENOENT;
if (!cbt)
@@ -470,10 +478,6 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
BUG_ON(!cbt->map);
BUG_ON(!cbt->block_max);
- ret = -EINVAL;
- if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
- goto out;
-
ret = -ENODEV;
map = cbt->snp_map;
if (!map)
@@ -511,7 +515,7 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
int ret;
mutex_lock(&cbt_mutex);
- cbt = q->cbt;
+ cbt = blk_cbt_find(q, name);
ret = -ENOENT;
if (!cbt)
@@ -520,10 +524,6 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
BUG_ON(!cbt->map);
BUG_ON(!cbt->block_max);
- ret = -EINVAL;
- if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
- goto out;
-
map = cbt->snp_map;
block_max = cbt->snp_block_max;
ret = -ENODEV;
@@ -568,33 +568,21 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
return ret;
}
-void blk_cbt_update_size(struct block_device *bdev)
+static void blk_cbt_update_cbt_size(struct cbt_info *cbt, loff_t new_sz)
{
- struct request_queue *q;
- struct cbt_info *new, *cbt;
+ struct cbt_info *new;
unsigned long to_cpy, idx;
unsigned bsz;
- loff_t new_sz = i_size_read(bdev->bd_inode);
int in_use = 0;
- if (!bdev->bd_disk || !bdev_get_queue(bdev))
- return;
-
- q = bdev_get_queue(bdev);
- mutex_lock(&cbt_mutex);
- cbt = q->cbt;
- if (!cbt) {
- mutex_unlock(&cbt_mutex);
- return;
- }
bsz = 1 << cbt->block_bits;
if (DIV_ROUND_UP(new_sz, bsz) <= cbt->block_max)
- goto err_mtx;
+ return;
- new = do_cbt_alloc(q, cbt->name, new_sz, bsz);
+ new = do_cbt_alloc(cbt->queue, cbt->name, new_sz, bsz);
if (IS_ERR(new)) {
set_bit(CBT_ERROR, &cbt->flags);
- goto err_mtx;
+ return;
}
to_cpy = NR_PAGES(cbt->block_max);
set_bit(CBT_NOCACHE, &cbt->flags);
@@ -606,15 +594,27 @@ void blk_cbt_update_size(struct block_device *bdev)
if (CBT_PAGE(new, idx))
get_page(CBT_PAGE(new, idx));
}
- rcu_assign_pointer(q->cbt, new);
+ list_replace_rcu(&cbt->list, &new->list);
in_use = cbt->count;
spin_unlock_irq(&cbt->lock);
if (!in_use)
call_rcu(&cbt->rcu, &cbt_release_callback);
-err_mtx:
+}
+
+void blk_cbt_update_size(struct block_device *bdev)
+{
+ struct request_queue *q;
+ struct cbt_info *cbt;
+
+ if (!bdev->bd_disk || !bdev_get_queue(bdev))
+ return;
+
+ q = bdev_get_queue(bdev);
+ mutex_lock(&cbt_mutex);
+ list_for_each_entry(cbt, &q->cbt_list, list)
+ blk_cbt_update_cbt_size(cbt, i_size_read(bdev->bd_inode));
+
mutex_unlock(&cbt_mutex);
-
-
}
static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
@@ -632,16 +632,24 @@ static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __us
q = bdev_get_queue(bdev);
mutex_lock(&cbt_mutex);
- if (q->cbt) {
- ret = -EBUSY;
- goto err_mtx;
- }
+
+	if (blk_cbt_find(q, ci.ci_name)) {
+		mutex_unlock(&cbt_mutex);
+		return -EEXIST;
+	}
+
+	if (list_count_nodes(&q->cbt_list) == CBT_MAX_ENTRIES) {
+		mutex_unlock(&cbt_mutex);
+		return -E2BIG;
+	}
+
+
cbt = do_cbt_alloc(q, ci.ci_name, i_size_read(bdev->bd_inode), ci.ci_blksize);
if (IS_ERR(cbt))
ret = PTR_ERR(cbt);
else
- rcu_assign_pointer(q->cbt, cbt);
-err_mtx:
+ list_add_tail_rcu(&cbt->list, &q->cbt_list);
+
mutex_unlock(&cbt_mutex);
return ret;
}
@@ -667,35 +670,48 @@ static void cbt_release_callback(struct rcu_head *head)
kfree(cbt);
}
-void blk_cbt_release(struct request_queue *q)
+static void blk_cbt_del(struct cbt_info *cbt)
{
- struct cbt_info *cbt;
- int in_use = 0;
unsigned long flags;
+ int in_use;
- cbt = q->cbt;
- if (!cbt)
- return;
spin_lock_irqsave(&cbt->lock, flags);
set_bit(CBT_DEAD, &cbt->flags);
- rcu_assign_pointer(q->cbt, NULL);
+ list_del_rcu(&cbt->list);
in_use = cbt->count;
spin_unlock_irqrestore(&cbt->lock, flags);
if (!in_use)
call_rcu(&cbt->rcu, &cbt_release_callback);
}
-static int cbt_ioc_stop(struct block_device *bdev)
+void blk_cbt_release(struct request_queue *q)
{
+ struct cbt_info *cbt, *tmp;
+
+ list_for_each_entry_safe(cbt, tmp, &q->cbt_list, list)
+ blk_cbt_del(cbt);
+}
+
+static int cbt_ioc_stop(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+ struct blk_user_cbt_info ci;
struct request_queue *q;
+ struct cbt_info *cbt;
+
+ if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+ return -EFAULT;
mutex_lock(&cbt_mutex);
q = bdev_get_queue(bdev);
- if(!q->cbt) {
+
+ cbt = blk_cbt_find(q, ci.ci_name);
+ if (!cbt) {
mutex_unlock(&cbt_mutex);
return -EINVAL;
}
- blk_cbt_release(q);
+
+ blk_cbt_del(cbt);
+
mutex_unlock(&cbt_mutex);
return 0;
}
@@ -839,7 +855,8 @@ static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __use
ret = -EINVAL;
q = bdev_get_queue(bdev);
mutex_lock(&cbt_mutex);
- cbt = q->cbt;
+
+ cbt = blk_cbt_find(q, ci.ci_name);
if (!cbt ||
(ci.ci_start >> cbt->block_bits) > cbt->block_max)
goto ioc_get_failed;
@@ -925,7 +942,7 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
ret = -EINVAL;
mutex_lock(&cbt_mutex);
- cbt = q->cbt;
+ cbt = blk_cbt_find(q, ci.ci_name);
if (!cbt)
goto ioc_set_failed;
@@ -948,8 +965,8 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
ex.start = cur_ex->ce_physical >> cbt->block_bits;
ex.len = DIV_ROUND_UP(cur_ex->ce_length, 1 << cbt->block_bits);
- if (ex.start > q->cbt->block_max ||
- ex.start + ex.len > q->cbt->block_max ||
+ if (ex.start > cbt->block_max ||
+ ex.start + ex.len > cbt->block_max ||
ex.len == 0) {
ret = -EINVAL;
break;
@@ -1007,7 +1024,7 @@ int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
case BLKCBTSTART:
return cbt_ioc_init(bdev, ucbt_ioc);
case BLKCBTSTOP:
- return cbt_ioc_stop(bdev);
+ return cbt_ioc_stop(bdev, ucbt_ioc);
case BLKCBTSET:
return cbt_ioc_set(bdev, ucbt_ioc, 1);
case BLKCBTCLR:
diff --git a/block/blk-core.c b/block/blk-core.c
index 6abad29dd501..e3aa923f399e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -428,6 +428,8 @@ struct request_queue *blk_alloc_queue(int node_id)
init_waitqueue_head(&q->mq_freeze_wq);
mutex_init(&q->mq_freeze_lock);
+ INIT_LIST_HEAD(&q->cbt_list);
+
blkg_init_queue(q);
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fcbc83909b91..3009964707e6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -520,7 +520,7 @@ struct request_queue {
struct throtl_data *td;
#endif
#ifdef CONFIG_BLK_DEV_CBT
- struct cbt_info *cbt;
+ struct list_head cbt_list;
#endif
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 4820d5923b89..5d7014be0139 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -129,6 +129,8 @@ struct blk_user_cbt_extent {
};
#define CBT_NAME_LENGTH 128
+#define CBT_MAX_ENTRIES 16
+
struct blk_user_cbt_info {
__u8 ci_name[CBT_NAME_LENGTH]; /* CBT name */
__u64 ci_start; /* start phisical range of mapping which
@@ -169,7 +171,7 @@ struct blk_user_cbt_snap_create {
};
#define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
-#define BLKCBTSTOP _IO(0x12,201)
+#define BLKCBTSTOP _IOR(0x12, 201, struct blk_user_cbt_info)
#define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
#define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
#define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
--
2.39.3