[Devel] [PATCH RHEL9 COMMIT] block/blk-cbt: allow multiple cbts in a single queue

Konstantin Khorenko khorenko at virtuozzo.com
Tue Jan 28 17:39:57 MSK 2025


The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git
after rh9-5.14.0-427.44.1.vz9.80.7
------>
commit 0d3dc1c86af908554c169a1b03b1a585279a48f0
Author: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
Date:   Tue Jan 28 11:15:06 2025 +0300

    block/blk-cbt: allow multiple cbts in a single queue
    
    Store cbts in a list instead of a single pointer and allow
    up to 16 entries.
    Update all APIs to operate on a user-specified bitmap.
    
    We need to take extra care when accessing queue->cbt_list to be
    sure that the accessed element still exists. The majority of
    accesses, mostly user-space ioctls, are protected by cbt_mutex.
    On the hot path we use rcu_read_lock() and
    list_for_each_entry_rcu(), paired with list_replace_rcu() and
    list_del_rcu() for list modification.
    cbt_page_alloc() can be called from both contexts, so here is a
    small scheme of the possible paths with lock analysis, followed
    by a short sketch of the locking pattern:
      +-< cbt_page_alloc
        +-< __blk_cbt_set
        | +-< blk_cbt_add
        | | +-< blk_cbt_bio_queue # under rcu_read_lock
        | +-< __cbt_flush_cpu_cache
        | | +-< cbt_flush_cache
        | |   +-< blk_cbt_snap_create # under cbt_mutex
        | |   +-< blk_cbt_update_cbt_size
        | |   | +-< blk_cbt_update_size # under cbt_mutex
        | |   +-< cbt_ioc_get # under cbt_mutex
        | |   +-< cbt_ioc_set # under cbt_mutex
        | +-< cbt_ioc_get # under cbt_mutex
        | +-< cbt_ioc_set # under cbt_mutex
        +-< blk_cbt_snap_merge_back # under cbt_mutex
        +-< cbt_flush_cache # ok, see above
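    
    A minimal sketch of the reader/updater pairing above (illustration
    only, reusing the names from this patch; the real updaters also
    take cbt->lock and defer call_rcu() until cbt->count drops to
    zero):
    
      /* hot-path reader, e.g. blk_cbt_bio_queue() */
      rcu_read_lock();
      list_for_each_entry_rcu(cbt, &q->cbt_list, list)
              blk_cbt_add(cbt, start, len);
      rcu_read_unlock();
    
      /* updater (ioctl/resize paths), serialized by cbt_mutex */
      mutex_lock(&cbt_mutex);
      list_del_rcu(&cbt->list); /* or list_replace_rcu(&old->list, &new->list) */
      mutex_unlock(&cbt_mutex);
      call_rcu(&cbt->rcu, cbt_release_callback); /* freed once readers are done */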
    
    https://virtuozzo.atlassian.net/browse/VSTOR-96269
    Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
    Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    ======
    Patchset description:
    blk-cbt: improvements
    
    Currently blk-cbt uses a UUID as the bitmap name, which is a big
    limitation. We are going to expose bitmap names to libvirt, so we
    should not expect users to follow this convention. Moreover, even
    VHI backup utilities sometimes use non-UUID names. QEMU
    technically allows bitmap names of up to 1024 bytes, but 128
    should be enough.
    Multiple simultaneous bitmaps are needed for backup plans,
    so allow them.
    Add a new API to list bitmap names and adjust most of the
    existing API to operate on a bitmap with a user-provided name.
    
    Andrey Zhadchenko (5):
      block/blk-cbt: rework uuid field
      block/blk-cbt: allow multiple cbts in a single queue
      blk-cbt: introduce ABI versioning
      block/blk-cbt: add BLKCBTLIST ioctl
      block/blk-cbt: add BLKCBTRENAME instead of BLKCBTSET flag
    
    Feature: cbt: changed block tracking (for backup)
---
 block/blk-cbt.c         | 169 ++++++++++++++++++++++++++----------------------
 block/blk-core.c        |   2 +
 include/linux/blkdev.h  |   2 +-
 include/uapi/linux/fs.h |   4 +-
 4 files changed, 99 insertions(+), 78 deletions(-)

diff --git a/block/blk-cbt.c b/block/blk-cbt.c
index 8ae993143098..a12d16a1ed39 100644
--- a/block/blk-cbt.c
+++ b/block/blk-cbt.c
@@ -53,6 +53,7 @@ struct cbt_info {
 	blkcnt_t snp_block_max;
 
 	spinlock_t lock;
+	struct list_head list;
 };
 
 
@@ -112,7 +113,11 @@ static int cbt_page_alloc(struct cbt_info  **cbt_pp, unsigned long idx,
 	if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
 		cbt->count++;
 	} else {
-		struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
+		struct cbt_info *i, *new = NULL;
+
+		list_for_each_entry_rcu(i, &cbt->queue->cbt_list, list)
+			if (!memcmp(i->name, cbt->name, sizeof(cbt->name)))
+				new = i;
 
 		spin_unlock_irq(&cbt->lock);
 		/* was cbt updated ? */
@@ -216,33 +221,26 @@ static int __blk_cbt_set(struct cbt_info  *cbt, blkcnt_t block,
 	return (pages_missed && *pages_missed) ? -EAGAIN : 0;
 }
 
-static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
+static void blk_cbt_add(struct cbt_info *cbt, blkcnt_t start, blkcnt_t len)
 {
-	struct cbt_info *cbt;
 	struct cbt_extent *ex;
 	struct cbt_extent old;
 	blkcnt_t end;
-	/* Check per-cpu cache */
-
-	rcu_read_lock();
-	cbt = rcu_dereference(q->cbt);
-	if (unlikely(!cbt))
-		goto out_rcu;
 
 	if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
-		goto out_rcu;
+		return;
 	end = DIV_ROUND_UP(start + len, 1 << cbt->block_bits);
 	start >>= cbt->block_bits;
 	len = end - start;
 	if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
 		__blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
-		goto out_rcu;
+		return;
 	}
 	ex = get_cpu_ptr(cbt->cache);
 	if (ex->start + ex->len == start) {
 		ex->len += len;
 		put_cpu_ptr(cbt->cache);
-		goto out_rcu;
+		return;
 	}
 	old = *ex;
 	ex->start = start;
@@ -251,16 +249,21 @@ static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
 
 	if (likely(old.len))
 		__blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
-out_rcu:
-	rcu_read_unlock();
 }
 
-inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
 {
-	if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
-		return;
+	struct cbt_info *cbt;
+
+	rcu_read_lock();
+	if (list_empty(&q->cbt_list) || bio_data_dir(bio) == READ || !bio->bi_iter.bi_size)
+		goto out_rcu;
 
-	blk_cbt_add(q, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
+	list_for_each_entry_rcu(cbt, &q->cbt_list, list)
+		blk_cbt_add(cbt, bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
+
+out_rcu:
+	rcu_read_unlock();
 }
 
 static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *name,
@@ -365,6 +368,17 @@ static int copy_cbt_to_user(struct page **map, unsigned long size,
         return 0;
 }
 
+static struct cbt_info *blk_cbt_find(struct request_queue *q, __u8 *name)
+{
+	struct cbt_info *cbt;
+
+	list_for_each_entry(cbt, &q->cbt_list, list)
+		if (!memcmp(name, cbt->name, CBT_NAME_LENGTH))
+			return cbt;
+
+	return NULL;
+}
+
 static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
 			       struct blk_user_cbt_snap_create __user *arg)
 {
@@ -382,8 +396,7 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
 		return -EFAULT;
 
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
-
+	cbt = blk_cbt_find(q, name);
 	if (!cbt) {
 		mutex_unlock(&cbt_mutex);
 		return -ENOENT;
@@ -392,11 +405,6 @@ static int blk_cbt_snap_create(struct request_queue *q, __u8 *name,
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
-	if (!name || memcmp(name, cbt->name, sizeof(cbt->name))) {
-		mutex_unlock(&cbt_mutex);
-		return -EINVAL;
-	}
-
 	if (cbt->snp_map) {
 		mutex_unlock(&cbt_mutex);
 		return -EBUSY;
@@ -461,7 +469,7 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
 	int ret;
 
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+	cbt = blk_cbt_find(q, name);
 
 	ret = -ENOENT;
 	if (!cbt)
@@ -470,10 +478,6 @@ static int blk_cbt_snap_drop(struct request_queue *q, __u8 *name)
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
-	ret = -EINVAL;
-	if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
-		goto out;
-
 	ret = -ENODEV;
 	map = cbt->snp_map;
 	if (!map)
@@ -511,7 +515,7 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
 	int ret;
 
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+	cbt = blk_cbt_find(q, name);
 
 	ret = -ENOENT;
 	if (!cbt)
@@ -520,10 +524,6 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
 	BUG_ON(!cbt->map);
 	BUG_ON(!cbt->block_max);
 
-	ret = -EINVAL;
-	if (!name || memcmp(name, cbt->name, sizeof(cbt->name)))
-		goto out;
-
 	map = cbt->snp_map;
 	block_max = cbt->snp_block_max;
 	ret = -ENODEV;
@@ -568,33 +568,21 @@ static int blk_cbt_snap_merge_back(struct request_queue *q, __u8 *name)
 	return ret;
 }
 
-void blk_cbt_update_size(struct block_device *bdev)
+static void blk_cbt_update_cbt_size(struct cbt_info *cbt, loff_t new_sz)
 {
-	struct request_queue *q;
-	struct cbt_info *new, *cbt;
+	struct cbt_info *new;
 	unsigned long to_cpy, idx;
 	unsigned bsz;
-	loff_t new_sz = i_size_read(bdev->bd_inode);
 	int in_use = 0;
 
-	if (!bdev->bd_disk || !bdev_get_queue(bdev))
-		return;
-
-	q = bdev_get_queue(bdev);
-	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
-	if (!cbt) {
-		mutex_unlock(&cbt_mutex);
-		return;
-	}
 	bsz = 1 << cbt->block_bits;
 	if (DIV_ROUND_UP(new_sz, bsz) <= cbt->block_max)
-		goto err_mtx;
+		return;
 
-	new = do_cbt_alloc(q, cbt->name, new_sz, bsz);
+	new = do_cbt_alloc(cbt->queue, cbt->name, new_sz, bsz);
 	if (IS_ERR(new)) {
 		set_bit(CBT_ERROR, &cbt->flags);
-		goto err_mtx;
+		return;
 	}
 	to_cpy = NR_PAGES(cbt->block_max);
 	set_bit(CBT_NOCACHE, &cbt->flags);
@@ -606,15 +594,27 @@ void blk_cbt_update_size(struct block_device *bdev)
 		if (CBT_PAGE(new, idx))
 			get_page(CBT_PAGE(new, idx));
 	}
-	rcu_assign_pointer(q->cbt, new);
+	list_replace_rcu(&cbt->list, &new->list);
 	in_use = cbt->count;
 	spin_unlock_irq(&cbt->lock);
 	if (!in_use)
 		call_rcu(&cbt->rcu, &cbt_release_callback);
-err_mtx:
-	mutex_unlock(&cbt_mutex);
+}
+
+void blk_cbt_update_size(struct block_device *bdev)
+{
+	struct request_queue *q;
+	struct cbt_info *cbt;
 
+	if (!bdev->bd_disk || !bdev_get_queue(bdev))
+		return;
 
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	list_for_each_entry(cbt, &q->cbt_list, list)
+		blk_cbt_update_cbt_size(cbt, i_size_read(bdev->bd_inode));
+
+	mutex_unlock(&cbt_mutex);
 }
 
 static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
@@ -632,16 +632,19 @@ static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __us
 
 	q = bdev_get_queue(bdev);
 	mutex_lock(&cbt_mutex);
-	if (q->cbt) {
-		ret = -EBUSY;
-		goto err_mtx;
-	}
+
+	if (blk_cbt_find(q, ci.ci_name))
+		return -EEXIST;
+
+	if (list_count_nodes(&q->cbt_list) == CBT_MAX_ENTRIES)
+		return -E2BIG;
+
 	cbt = do_cbt_alloc(q, ci.ci_name, i_size_read(bdev->bd_inode), ci.ci_blksize);
 	if (IS_ERR(cbt))
 		ret = PTR_ERR(cbt);
 	else
-		rcu_assign_pointer(q->cbt, cbt);
-err_mtx:
+		list_add_tail_rcu(&cbt->list, &q->cbt_list);
+
 	mutex_unlock(&cbt_mutex);
 	return ret;
 }
@@ -667,35 +670,48 @@ static void cbt_release_callback(struct rcu_head *head)
 	kfree(cbt);
 }
 
-void blk_cbt_release(struct request_queue *q)
+static void blk_cbt_del(struct cbt_info *cbt)
 {
-	struct cbt_info *cbt;
-	int in_use = 0;
 	unsigned long flags;
+	int in_use;
 
-	cbt = q->cbt;
-	if (!cbt)
-		return;
 	spin_lock_irqsave(&cbt->lock, flags);
 	set_bit(CBT_DEAD, &cbt->flags);
-	rcu_assign_pointer(q->cbt, NULL);
+	list_del_rcu(&cbt->list);
 	in_use = cbt->count;
 	spin_unlock_irqrestore(&cbt->lock, flags);
 	if (!in_use)
 		call_rcu(&cbt->rcu, &cbt_release_callback);
 }
 
-static int cbt_ioc_stop(struct block_device *bdev)
+void blk_cbt_release(struct request_queue *q)
+{
+	struct cbt_info *cbt, *tmp;
+
+	list_for_each_entry_safe(cbt, tmp, &q->cbt_list, list)
+		blk_cbt_del(cbt);
+}
+
+static int cbt_ioc_stop(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
 {
+	struct blk_user_cbt_info ci;
 	struct request_queue *q;
+	struct cbt_info *cbt;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
 
 	mutex_lock(&cbt_mutex);
 	q = bdev_get_queue(bdev);
-	if(!q->cbt) {
+
+	cbt = blk_cbt_find(q, ci.ci_name);
+	if (!cbt) {
 		mutex_unlock(&cbt_mutex);
 		return -EINVAL;
 	}
-	blk_cbt_release(q);
+
+	blk_cbt_del(cbt);
+
 	mutex_unlock(&cbt_mutex);
 	return 0;
 }
@@ -839,7 +855,8 @@ static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __use
 	ret = -EINVAL;
 	q = bdev_get_queue(bdev);
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+
+	cbt = blk_cbt_find(q, ci.ci_name);
 	if (!cbt ||
 	    (ci.ci_start >> cbt->block_bits) > cbt->block_max)
 		goto ioc_get_failed;
@@ -925,7 +942,7 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
 
 	ret = -EINVAL;
 	mutex_lock(&cbt_mutex);
-	cbt = q->cbt;
+	cbt = blk_cbt_find(q, ci.ci_name);
 	if (!cbt)
 		goto ioc_set_failed;
 
@@ -948,8 +965,8 @@ static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __use
 
 		ex.start  = cur_ex->ce_physical >> cbt->block_bits;
 		ex.len  = DIV_ROUND_UP(cur_ex->ce_length, 1 << cbt->block_bits);
-		if (ex.start > q->cbt->block_max ||
-		    ex.start + ex.len > q->cbt->block_max ||
+		if (ex.start > cbt->block_max ||
+		    ex.start + ex.len > cbt->block_max ||
 		    ex.len == 0) {
 			ret = -EINVAL;
 			break;
@@ -1007,7 +1024,7 @@ int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	case BLKCBTSTART:
 		return cbt_ioc_init(bdev, ucbt_ioc);
 	case BLKCBTSTOP:
-		return cbt_ioc_stop(bdev);
+		return cbt_ioc_stop(bdev, ucbt_ioc);
 	case BLKCBTSET:
 		return cbt_ioc_set(bdev, ucbt_ioc, 1);
 	case BLKCBTCLR:
diff --git a/block/blk-core.c b/block/blk-core.c
index 12a0e48d0112..f6493853c2f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -428,6 +428,8 @@ struct request_queue *blk_alloc_queue(int node_id)
 	init_waitqueue_head(&q->mq_freeze_wq);
 	mutex_init(&q->mq_freeze_lock);
 
+	INIT_LIST_HEAD(&q->cbt_list);
+
 	blkg_init_queue(q);
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fcbc83909b91..3009964707e6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -520,7 +520,7 @@ struct request_queue {
 	struct throtl_data *td;
 #endif
 #ifdef CONFIG_BLK_DEV_CBT
-	struct cbt_info *cbt;
+	struct list_head cbt_list;
 #endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 4820d5923b89..5d7014be0139 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -129,6 +129,8 @@ struct blk_user_cbt_extent {
 };
 
 #define CBT_NAME_LENGTH 128
+#define CBT_MAX_ENTRIES 16
+
 struct blk_user_cbt_info {
 	__u8  ci_name[CBT_NAME_LENGTH];	/* CBT name */
 	__u64 ci_start;		/* start phisical range of mapping which
@@ -169,7 +171,7 @@ struct blk_user_cbt_snap_create {
 };
 
 #define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
-#define BLKCBTSTOP _IO(0x12,201)
+#define BLKCBTSTOP _IOR(0x12, 201, struct blk_user_cbt_info)
 #define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
 #define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
 #define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
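
Not part of the patch: a short userspace sketch of the reworked ioctl flow.
BLKCBTSTART registers a bitmap under a caller-chosen name, and BLKCBTSTOP now
identifies the bitmap to drop by that name. The device path, bitmap name and
block size below are arbitrary example values; <linux/fs.h> here refers to the
vzkernel uapi header patched above.

	#include <errno.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	static int cbt_start_stop_example(const char *bdev_path)
	{
		struct blk_user_cbt_info ci;
		int fd, ret;

		fd = open(bdev_path, O_RDWR);
		if (fd < 0)
			return -errno;

		memset(&ci, 0, sizeof(ci));
		/* any name up to CBT_NAME_LENGTH bytes, no longer required to be a UUID */
		strncpy((char *)ci.ci_name, "backup-plan-1", CBT_NAME_LENGTH - 1);
		ci.ci_blksize = 64 * 1024;	/* example bitmap block size */

		/* fails with EEXIST if the name is taken, E2BIG once CBT_MAX_ENTRIES exist */
		ret = ioctl(fd, BLKCBTSTART, &ci);
		if (!ret)
			ret = ioctl(fd, BLKCBTSTOP, &ci); /* stop now takes the name too */

		close(fd);
		return ret;
	}

Since up to CBT_MAX_ENTRIES (16) bitmaps may now coexist per queue, the same
sequence can be repeated with different ci_name values on one device.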

