[Devel] [PATCH VZ9 v2 2/5] block/blk-cbt: allow multiple cbts in a single queue

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Tue Jan 28 03:39:47 MSK 2025


Store CBTs in a list instead of a single pointer and allow up to 16
entries per queue. Update all APIs to take the name of the target
bitmap from user space.
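
For illustration, a minimal user-space sketch of the resulting API
(device path, bitmap name and block size are made-up example values,
error handling elided):

  #include <fcntl.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>

  int main(void)
  {
          int fd = open("/dev/vda", O_RDONLY);    /* example device */
          struct blk_user_cbt_info ci = {};

          strncpy((char *)ci.ci_name, "backup0", CBT_NAME_LENGTH - 1);
          ci.ci_blksize = 65536;          /* bitmap granularity, bytes */

          ioctl(fd, BLKCBTSTART, &ci);    /* create bitmap "backup0" */
          ioctl(fd, BLKCBTSTOP, &ci);     /* stop now takes a name too */
          return 0;
  }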

We need to exercise extra care when accessing queue->cbt_list to be
sure that the accessed element still exists. The majority of accesses,
mostly user-space ioctls, are protected by cbt_mutex. On the hot path
we use rcu_read_lock() and list_for_each_entry_rcu(), paired with
list_replace_rcu() and list_del_rcu() on the modification side.
cbt_page_alloc() can be called in both contexts, so here is a small
scheme of the possible call paths with lock analysis:
  +-< cbt_page_alloc
    +-< __blk_cbt_set
    | +-< blk_cbt_add
    | | +-< blk_cbt_bio_queue # under rcu_read_lock
    | +-< __cbt_flush_cpu_cache
    | | +-< cbt_flush_cache
    | |   +-< blk_cbt_snap_create # under cbt_mutex
    | |   +-< blk_cbt_update_cbt_size
    | |   | +-< blk_cbt_update_size # under cbt_mutex
    | |   +-< cbt_ioc_get # under cbt_mutex
    | |   +-< cbt_ioc_set # under cbt_mutex
    | +-< cbt_ioc_get # under cbt_mutex
    | +-< cbt_ioc_set # under cbt_mutex
    +-< blk_cbt_snap_merge_back # under cbt_mutex
    +-< cbt_flush_cache # ok, see above
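
Condensed, the two access patterns look like this (a simplified sketch
of the code below; the real paths additionally take cbt->lock, set
CBT_DEAD, and defer freeing via call_rcu() only when the entry is not
in use):

  /* hot path (blk_cbt_bio_queue): readers walk the list under RCU */
  rcu_read_lock();
  list_for_each_entry_rcu(cbt, &q->cbt_list, list)
          blk_cbt_add(cbt, start, len);
  rcu_read_unlock();

  /* ioctl path (e.g. cbt_ioc_stop): writers update under cbt_mutex */
  mutex_lock(&cbt_mutex);
  list_del_rcu(&cbt->list);
  mutex_unlock(&cbt_mutex);
  call_rcu(&cbt->rcu, &cbt_release_callback); /* free after grace period */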

https://virtuozzo.atlassian.net/browse/VSTOR-96269
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>

---
v2:
 - check that the new CBT name does not collide with an existing one
 - remove unnecessary INIT_LIST_HEAD
 - fix return code in cbt_ioc_stop()
 - release cbt_mutex on error path in cbt_ioc_stop()
 - update commit message with locking explanation
 - add cbt limit
 - fix arguments in list_add_tail_rcu(&q->cbt_list, &cbt->list):
   should be list_add_tail_rcu(&cbt->list, &q->cbt_list)

 block/blk-cbt.c         | 179 ++++++++++++++++++++++------------
 block/blk-core.c        |   2 +
 include/linux/blkdev.h  |   2 +-
 include/uapi/linux/fs.h |   4 +-
 4 files changed, 106 insertions(+), 81 deletions(-)

diff --git a/block/blk-cbt.c b/block/blk-cbt.c
index 8ae993143098..bfc0fdaa4033 100644
--- a/block/blk-cbt.c
+++ b/block/blk-cbt.c
@@ -53,6 +53,7 @@ struct cbt_info {
 	blkcnt_t snp_block_max;
 
 	spinlock_t lock;
+	struct list_head list;
 };
 
 
@@ -112,7 +113,11 @@ static int cbt_page_alloc(struct cbt_info  **cbt_pp, unsigned long idx,
 	if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
 		cbt->count++;
 	} else {
-		struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
+		struct cbt_info *i, *new = NULL;
+
+		list_for_each_entry_rcu(i, &cbt->queue->cbt_list, list)
+			if (!memcmp(i->name, cbt->name, sizeof(cbt->name)))
+				new = i;
 
 		spin_unlock_irq(&cbt->lock);
 		/* was cbt updated ? */
@@ -216,33 +221,26 @@ static int __blk_cbt_set(struct cbt_info  *cbt, blkcnt_t block,
 	return (pages_missed && *pages_missed) ? -EAGAIN : 0;
 }
 
-static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
+static void blk_cbt_add(struct cbt_info *cbt, blkcnt_t start, blkcnt_t len)
 {
-	struct cbt_info *cbt;
 	struct cbt_extent *ex;
 	struct cbt_extent old;
 	blkcnt_t end;
-	/* Check per-cpu cache */
-
-	rcu_read_lock();
-	cbt = rcu_dereference(q->cbt);
-	if (unlikely(!cbt))
-		goto out_rcu;
 
 	if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
-		goto out_rcu;
+		return;
 	end = DIV_ROUND_UP(start + len, 1 << cbt->block_bits);
 	start >>= cbt->block_bits;
 	len = end - start;
 	if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
 		__blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
-		goto out_rcu;
+		return;
 	}
 	ex = get_cpu_ptr(cbt->cache);
 	if (ex->start + ex->len == start) {
 		ex->len += len;
 		put_cpu_ptr(cbt->cache);
-		goto out_rcu;
+		return;
 	}
 	old = *ex;
 	ex->start = start;
@@ -251,18 +249,23 @@ static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
 
 	if (likely(old.len))
 		__blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
+}
+
+void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	struct cbt_info *cbt;
+
+	rcu_read_lock();
+	if (list_empty(&q->cbt_list) || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
+		goto out_rcu;
+
+	list_for_each_entry_rcu(cbt, &q->cbt_list, list)
+		blk_cbt_add(cbt, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
+
 out_rcu:
 	rcu_read_unlock();
 }
 
-inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
-{
-	if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
-		return;
-
-	blk_cbt_add(q, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
-}
-
 static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
 				     loff_t size, loff_t blocksize)
 {
@@ -365,6 +368,17 @@ static int copy_cbt_to_user(struct page **map, unsigned long size,
         return 0;
 }
 
+static struct cbt_info *blk_cbt_find(struct request_queue *q, __u8 *name)
+{
+	struct cbt_info *cbt;
+
+	list_for_each_entry(cbt, &q->cbt_list, list)
+		if (!memcmp(name, cbt->name, CBT_NAME_LENGTH))
+			return cbt;
+
+	return NULL;
+}
+
 static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
 			       struct blk_user_cbt_snap_create __user *arg)
 {
@@ -382,8 +396,7 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
 		return -EFAULT;
 
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
-
+	cbt = blk_cbt_find(q, name);
 	if (!cbt) {
 		mutex_unlock(&cbt_mutex);
 		return -ENOENT;
@@ -392,11 +405,6 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
-	if (!name || memcmp(name, cbt->name, sizeof(cbt->name))) {
-		mutex_unlock(&cbt_mutex);
-		return -EINVAL;
-	}
-
 	if (cbt->snp_map) {
 		mutex_unlock(&cbt_mutex);
 		return -EBUSY;
@@ -461,7 +469,7 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
 	int ret;
 
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+	cbt = blk_cbt_find(q, name);
 
 	ret = -ENOENT;
 	if (!cbt)
@@ -470,10 +478,6 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
-	ret = -EINVAL;
-	if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
-		goto out;
-
 	ret = -ENODEV;
 	map = cbt->snp_map;
 	if (!map)
@@ -511,7 +515,7 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
 	int ret;
 
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+	cbt = blk_cbt_find(q, name);
 
 	ret = -ENOENT;
 	if (!cbt)
@@ -520,10 +524,6 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
-	ret = -EINVAL;
-	if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
-		goto out;
-
 	map = cbt->snp_map;
 	block_max = cbt->snp_block_max;
 	ret = -ENODEV;
@@ -568,33 +568,21 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
 	return ret;
 }
 
-void blk_cbt_update_size(struct block_device *bdev)
+static void blk_cbt_update_cbt_size(struct cbt_info *cbt, loff_t new_sz)
 {
-	struct request_queue *q;
-	struct cbt_info *new, *cbt;
+	struct cbt_info *new;
 	unsigned long to_cpy, idx;
 	unsigned bsz;
-	loff_t new_sz = i_size_read(bdev->bd_inode);
 	int in_use = 0;
 
-	if (!bdev->bd_disk || !bdev_get_queue(bdev))
-		return;
-
-	q = bdev_get_queue(bdev);
-	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
-	if (!cbt) {
-		mutex_unlock(&cbt_mutex);
-		return;
-	}
 	bsz = 1 << cbt->block_bits;
 	if (DIV_ROUND_UP(new_sz, bsz) <= cbt->block_max)
-		goto err_mtx;
+		return;
 
-	new = do_cbt_alloc(q, cbt->name, new_sz, bsz);
+	new = do_cbt_alloc(cbt->queue, cbt->name, new_sz, bsz);
 	if (IS_ERR(new)) {
 		set_bit(CBT_ERROR, &cbt->flags);
-		goto err_mtx;
+		return;
 	}
 	to_cpy = NR_PAGES(cbt->block_max);
 	set_bit(CBT_NOCACHE, &cbt->flags);
@@ -606,15 +594,27 @@ void blk_cbt_update_size(struct block_device *bdev)
 		if (CBT_PAGE(new, idx))
 			get_page(CBT_PAGE(new, idx));
 	}
-	rcu_assign_pointer(q->cbt, new);
+	list_replace_rcu(&cbt->list, &new->list);
 	in_use = cbt->count;
 	spin_unlock_irq(&cbt->lock);
 	if (!in_use)
 		call_rcu(&cbt->rcu, &cbt_release_callback);
-err_mtx:
+}
+
+void blk_cbt_update_size(struct block_device *bdev)
+{
+	struct request_queue *q;
+	struct cbt_info *cbt;
+
+	if (!bdev->bd_disk || !bdev_get_queue(bdev))
+		return;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	list_for_each_entry(cbt, &q->cbt_list, list)
+		blk_cbt_update_cbt_size(cbt, i_size_read(bdev->bd_inode));
+
 	mutex_unlock(&cbt_mutex);
-
-
 }
 
 static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
@@ -632,16 +632,23 @@ static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __us
 
 	q = bdev_get_queue(bdev);
 	mutex_lock(&cbt_mutex);
-	if (q->cbt) {
-		ret = -EBUSY;
-		goto err_mtx;
-	}
+
+	if (blk_cbt_find(q, ci.ci_name)) {
+		mutex_unlock(&cbt_mutex);
+		return -EEXIST;
+	}
+
+	if (list_count_nodes(&q->cbt_list) == CBT_MAX_ENTRIES) {
+		mutex_unlock(&cbt_mutex);
+		return -E2BIG;
+	}
+
 	cbt = do_cbt_alloc(q, ci.ci_name, i_size_read(bdev->bd_inode), ci.ci_blksize);
 	if (IS_ERR(cbt))
 		ret = PTR_ERR(cbt);
 	else
-		rcu_assign_pointer(q->cbt, cbt);
-err_mtx:
+		list_add_tail_rcu(&cbt->list, &q->cbt_list);
+
 	mutex_unlock(&cbt_mutex);
 	return ret;
 }
@@ -667,35 +670,48 @@ static void cbt_release_callback(struct rcu_head *head)
 	kfree(cbt);
 }
 
-void blk_cbt_release(struct request_queue *q)
+static void blk_cbt_del(struct cbt_info *cbt)
 {
-	struct cbt_info *cbt;
-	int in_use = 0;
 	unsigned long flags;
+	int in_use;
 
-	cbt = q->cbt;
-	if (!cbt)
-		return;
 	spin_lock_irqsave(&cbt->lock, flags);
 	set_bit(CBT_DEAD, &cbt->flags);
-	rcu_assign_pointer(q->cbt, NULL);
+	list_del_rcu(&cbt->list);
 	in_use = cbt->count;
 	spin_unlock_irqrestore(&cbt->lock, flags);
 	if (!in_use)
 		call_rcu(&cbt->rcu, &cbt_release_callback);
 }
 
-static int cbt_ioc_stop(struct block_device *bdev)
+void blk_cbt_release(struct request_queue *q)
 {
+	struct cbt_info *cbt, *tmp;
+
+	list_for_each_entry_safe(cbt, tmp, &q->cbt_list, list)
+		blk_cbt_del(cbt);
+}
+
+static int cbt_ioc_stop(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct blk_user_cbt_info ci;
 	struct request_queue *q;
+	struct cbt_info *cbt;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
 
 	mutex_lock(&cbt_mutex);
 	q = bdev_get_queue(bdev);
-	if(!q->cbt) {
+
+	cbt = blk_cbt_find(q, ci.ci_name);
+	if (!cbt) {
 		mutex_unlock(&cbt_mutex);
 		return -EINVAL;
 	}
-	blk_cbt_release(q);
+
+	blk_cbt_del(cbt);
+
 	mutex_unlock(&cbt_mutex);
 	return 0;
 }
@@ -839,7 +855,8 @@ static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __use
 	ret = -EINVAL;
 	q = bdev_get_queue(bdev);
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+
+	cbt = blk_cbt_find(q, ci.ci_name);
 	if (!cbt ||
 	    (ci.ci_start >> cbt->block_bits) > cbt->block_max)
 		goto ioc_get_failed;
@@ -925,7 +942,7 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
 
 	ret = -EINVAL;
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+	cbt = blk_cbt_find(q, ci.ci_name);
 	if (!cbt)
 		goto ioc_set_failed;
 
@@ -948,8 +965,8 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
 
 		ex.start  = cur_ex->ce_physical >> cbt->block_bits;
 		ex.len  = DIV_ROUND_UP(cur_ex->ce_length, 1 << cbt->block_bits);
-		if (ex.start > q->cbt->block_max ||
-		    ex.start + ex.len > q->cbt->block_max ||
+		if (ex.start > cbt->block_max ||
+		    ex.start + ex.len > cbt->block_max ||
 		    ex.len == 0) {
 			ret = -EINVAL;
 			break;
@@ -1007,7 +1024,7 @@ int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	case BLKCBTSTART:
 		return cbt_ioc_init(bdev, ucbt_ioc);
 	case BLKCBTSTOP:
-		return cbt_ioc_stop(bdev);
+		return cbt_ioc_stop(bdev, ucbt_ioc);
 	case BLKCBTSET:
 		return cbt_ioc_set(bdev, ucbt_ioc, 1);
 	case BLKCBTCLR:
diff --git a/block/blk-core.c b/block/blk-core.c
index 6abad29dd501..e3aa923f399e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -428,6 +428,8 @@ struct request_queue *blk_alloc_queue(int node_id)
 	init_waitqueue_head(&q->mq_freeze_wq);
 	mutex_init(&q->mq_freeze_lock);
 
+	INIT_LIST_HEAD(&q->cbt_list);
+
 	blkg_init_queue(q);
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fcbc83909b91..3009964707e6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -520,7 +520,7 @@ struct request_queue {
 	struct throtl_data *td;
 #endif
 #ifdef CONFIG_BLK_DEV_CBT
-	struct cbt_info *cbt;
+	struct list_head cbt_list;
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 4820d5923b89..5d7014be0139 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -129,6 +129,8 @@ struct blk_user_cbt_extent {
 };
 
 #define CBT_NAME_LENGTH 128
+#define CBT_MAX_ENTRIES 16
+
 struct blk_user_cbt_info {
 	__u8  ci_name[CBT_NAME_LENGTH];	/* CBT name */
 	__u64 ci_start;		/* start phisical range of mapping which
@@ -169,7 +171,7 @@ struct blk_user_cbt_snap_create {
 };
 
 #define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
-#define BLKCBTSTOP _IO(0x12,201)
+#define BLKCBTSTOP _IOR(0x12, 201, struct blk_user_cbt_info)
 #define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
 #define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
 #define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
-- 
2.39.3
