[Devel] [PATCH RHEL7 COMMIT] cbt: introduce changed block tracking

Konstantin Khorenko khorenko at virtuozzo.com
Fri Oct 2 00:54:27 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-229.7.2.vz7.8.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-229.7.2.vz7.8.2
------>
commit 85f9d6ce47d7975b0b4902835470391aed597bca
Author: Dmitry Monakhov <dmonakhov at openvz.org>
Date:   Fri Oct 2 11:54:26 2015 +0400

    cbt: introduce changed block tracking
    
    Combined patch list:
    diff-cbt-add-changed-block-trace-infrastructure
    diff-cbt-fixup-use-after-free-inside-__blk_cbt_set
    diff-cbt-use-propper-mem-allocation-context
    diff-cbt-support-blockdevice-size-update-v2
    diff-cbt-blk_cbt_update_size-add-block_dev-sanity-check
    diff-cbt-ignore-device-shrink
    diff-block-cbt-fix-mistype-statement
    diff-cbt-add-get_once-feature
    diff-cbt-fix-bytes-to-block-conversion-bug
    diff-cbt-add-missed-mutex_unlock
    
    Only minor context fixes were applied on top of the original patches.
    
    https://jira.sw.ru/browse/PSBM-34156
    
    Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
 block/Kconfig           |   8 +
 block/Makefile          |   1 +
 block/blk-cbt.c         | 605 ++++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-core.c        |   1 +
 block/blk-sysfs.c       |   1 +
 block/ioctl.c           |   9 +-
 drivers/md/dm.c         |   2 +-
 fs/block_dev.c          |   9 +-
 include/linux/blkdev.h  |  14 ++
 include/linux/fs.h      |   1 +
 include/uapi/linux/fs.h |  34 +++
 11 files changed, 682 insertions(+), 3 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index a7e40a7..3d11f0c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -105,6 +105,14 @@ source "block/partitions/Kconfig"
 
 endmenu
 
+config BLK_DEV_CBT
+	bool "Block layer changed block tracking support"
+	---help---
+	  Block layer changed block tracking support. It can be used to
+	  optimize device backup and copy.
+
+	  If unsure, say N.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
diff --git a/block/Makefile b/block/Makefile
index 21f4618..44f9426 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
+obj-$(CONFIG_BLK_DEV_CBT)	+= blk-cbt.o
diff --git a/block/blk-cbt.c b/block/blk-cbt.c
new file mode 100644
index 0000000..99d4a76
--- /dev/null
+++ b/block/blk-cbt.c
@@ -0,0 +1,605 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define CBT_MAX_EXTENTS	(UINT_MAX / sizeof(struct blk_user_cbt_extent))
+#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8))
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+
+static __cacheline_aligned_in_smp DEFINE_MUTEX(cbt_mutex);
+
+struct cbt_extent{
+	blkcnt_t start;
+	blkcnt_t len;
+};
+
+struct cbt_info {
+	__u8 	 uuid[16];
+	struct request_queue *queue;
+	blkcnt_t block_max;
+	blkcnt_t block_bits;
+	unsigned long flags;
+
+	struct rcu_head rcu;
+	unsigned int count;
+	struct cbt_extent __percpu *cache;
+	struct page **map;
+	spinlock_t lock;
+};
+
+
+enum CBT_FLAGS
+{
+	CBT_ERROR = 0,
+	CBT_DEAD  = 1,
+	CBT_NOCACHE  = 2,
+};
+static void cbt_release_callback(struct rcu_head *head);
+static void cbt_flush_cache(struct cbt_info *cbt);
+
+static inline void spin_lock_page(struct page *page)
+{
+	while(!trylock_page(page))
+		cpu_relax();
+}
+
+static void set_bits(void *bm, int cur, int len, bool is_set)
+{
+	__u32 *addr;
+	__u32 pattern = is_set? 0xffffffff : 0;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: set whole word at once */
+			addr = bm + (cur >> 3);
+
+			*addr = pattern;
+			cur += 32;
+			continue;
+		}
+		if (is_set)
+			set_bit(cur, bm);
+		else
+			clear_bit(cur, bm);
+		cur++;
+	}
+}
+
+static int __blk_cbt_set(struct cbt_info  *cbt, blkcnt_t block,
+			  blkcnt_t count, bool in_rcu, bool set)
+{
+	struct page *page;
+
+	if (unlikely(block + count > cbt->block_max)) {
+		printk("WARN: %s eof access block:%lld, len: %lld, max:%lld\n",
+		       __FUNCTION__, (unsigned long long) block,
+		       (unsigned long long)count,
+		       (unsigned long long)cbt->block_max);
+		set_bit(CBT_ERROR, &cbt->flags);
+		return -EINVAL;
+	}
+
+	while(count) {
+		unsigned long idx = block >> (PAGE_SHIFT + 3);
+		unsigned long off = block & (BITS_PER_PAGE -1);
+		/* Cap len at both the remaining count and the bits left
+		 * in this page so that the loop always advances. */
+		unsigned long len = min_t(unsigned long, count,
+					  BITS_PER_PAGE - off);
+		page = rcu_dereference(cbt->map[idx]);
+		if (page) {
+			spin_lock_page(page);
+			set_bits(page_address(page), off, len, set);
+			unlock_page(page);
+			count -= len;
+			block += len;
+			continue;
+		} else {
+			if (!set) {
+				/* No page: bits already clear, skip range */
+				count -= len;
+				block += len;
+				continue;
+			}
+		}
+		/* Page not allocated yet. Synchronization required */
+		spin_lock_irq(&cbt->lock);
+		if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
+			cbt->count++;
+		} else {
+			struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
+
+			spin_unlock_irq(&cbt->lock);
+			/* was cbt updated ? */
+			if (new != cbt) {
+				cbt = new;
+				continue;
+			} else {
+				break;
+			}
+		}
+		spin_unlock_irq(&cbt->lock);
+		if (in_rcu)
+			rcu_read_unlock();
+		page = alloc_page(GFP_NOIO|__GFP_ZERO);
+		if (in_rcu)
+			rcu_read_lock();
+		spin_lock_irq(&cbt->lock);
+		if (unlikely(!--cbt->count && test_bit(CBT_DEAD, &cbt->flags))) {
+			spin_unlock_irq(&cbt->lock);
+			call_rcu(&cbt->rcu, &cbt_release_callback);
+			if (page)
+				__free_page(page);
+			break;
+		}
+		if (unlikely(!page)) {
+			set_bit(CBT_ERROR, &cbt->flags);
+			spin_unlock_irq(&cbt->lock);
+			return -ENOMEM;
+		}
+		cbt->map[idx] = page;
+		page = NULL;
+		spin_unlock_irq(&cbt->lock);
+	}
+	return 0;
+}
+
+static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
+{
+	struct cbt_info *cbt;
+	struct cbt_extent *ex;
+	struct cbt_extent old;
+	blkcnt_t end;
+	/* Check per-cpu cache */
+
+	rcu_read_lock();
+	cbt = rcu_dereference(q->cbt);
+	if (unlikely(!cbt))
+		goto out_rcu;
+
+	if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
+		goto out_rcu;
+	end = (start + len + (1 << cbt->block_bits) -1) >> cbt->block_bits;
+	start >>= cbt->block_bits;
+	len = end - start;
+	if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
+		__blk_cbt_set(cbt, start, len, 1, 1);
+		goto out_rcu;
+	}
+	ex = this_cpu_ptr(cbt->cache);
+	if (ex->start + ex->len == start) {
+		ex->len += len;
+		goto out_rcu;
+	}
+	old = *ex;
+	ex->start = start;
+	ex->len = len;
+
+	if (likely(old.len))
+		__blk_cbt_set(cbt, old.start, old.len, 1, 1);
+out_rcu:
+	rcu_read_unlock();
+}
+
+inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_size)
+		return;
+
+	blk_cbt_add(q, bio->bi_sector << 9, bio->bi_size);
+}
+
+static struct cbt_info* do_cbt_alloc(struct request_queue *q, __u8 *uuid,
+				     loff_t size, loff_t blocksize)
+{
+	struct cbt_info *cbt;
+	struct cbt_extent *ex;
+	int i;
+
+
+	cbt = kzalloc(sizeof(*cbt), GFP_KERNEL);
+	if (!cbt)
+		return ERR_PTR(-ENOMEM);
+
+	cbt->block_bits = ilog2(blocksize);
+	cbt->block_max  = (size + blocksize) >> cbt->block_bits;
+	spin_lock_init(&cbt->lock);
+	memcpy(cbt->uuid, uuid, sizeof(cbt->uuid));
+	cbt->cache = alloc_percpu(struct cbt_extent);
+	if (!cbt->cache)
+		goto err_cbt;
+
+	for_each_possible_cpu(i) {
+		ex = per_cpu_ptr(cbt->cache, i);
+		memset(ex, 0, sizeof (*ex));
+	}
+
+	cbt->map = vmalloc(NR_PAGES(cbt->block_max) * sizeof(void*));
+	if (!cbt->map)
+		goto err_pcpu;
+
+	memset(cbt->map, 0, NR_PAGES(cbt->block_max) * sizeof(void*));
+	cbt->queue = q;
+	return cbt;
+err_pcpu:
+	free_percpu(cbt->cache);
+err_cbt:
+	kfree(cbt);
+	return ERR_PTR(-ENOMEM);
+}
+
+
+void blk_cbt_update_size(struct block_device *bdev)
+{
+	struct request_queue *q;
+	struct cbt_info *new, *cbt;
+	unsigned long to_cpy, idx;
+	unsigned bsz;
+	loff_t new_sz = i_size_read(bdev->bd_inode);
+	int in_use = 0;
+
+	if (!bdev->bd_disk || !bdev_get_queue(bdev))
+		return;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return;
+	}
+	bsz = 1 << cbt->block_bits;
+	if ((new_sz + bsz) >> cbt->block_bits <= cbt->block_max)
+		goto err_mtx;
+
+	new = do_cbt_alloc(q, cbt->uuid, new_sz, bsz);
+	if (IS_ERR(new)) {
+		set_bit(CBT_ERROR, &cbt->flags);
+		goto err_mtx;
+	}
+	to_cpy = NR_PAGES(new->block_max);
+	set_bit(CBT_NOCACHE, &cbt->flags);
+	cbt_flush_cache(cbt);
+	spin_lock_irq(&cbt->lock);
+	set_bit(CBT_DEAD, &cbt->flags);
+	for (idx = 0; idx < to_cpy; idx++){
+		new->map[idx] = cbt->map[idx];
+		if (new->map[idx])
+			get_page(new->map[idx]);
+	}
+	rcu_assign_pointer(q->cbt, new);
+	in_use = cbt->count;
+	spin_unlock_irq(&cbt->lock);
+	if (!in_use)
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+err_mtx:
+	mutex_unlock(&cbt_mutex);
+
+
+}
+
+static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct request_queue *q;
+	struct blk_user_cbt_info ci;
+	struct cbt_info *cbt;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+
+	if (!ci.ci_blksize || ((ci.ci_blksize - 1) & ci.ci_blksize))
+		return -EINVAL;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	if (q->cbt) {
+		ret = -EBUSY;
+		goto err_mtx;
+	}
+	cbt = do_cbt_alloc(q, ci.ci_uuid, i_size_read(bdev->bd_inode), ci.ci_blksize);
+	if (IS_ERR(cbt))
+		ret = PTR_ERR(cbt);
+	else
+		rcu_assign_pointer(q->cbt, cbt);
+err_mtx:
+	mutex_unlock(&cbt_mutex);
+	return ret;
+}
+
+static void cbt_release_callback(struct rcu_head *head)
+{
+	struct cbt_info *cbt;
+	int nr_pages, i;
+
+	cbt = container_of(head, struct cbt_info, rcu);
+	nr_pages = NR_PAGES(cbt->block_max);
+	for (i = 0; i < nr_pages; i++)
+		if (cbt->map[i])
+			__free_page(cbt->map[i]);
+
+	vfree(cbt->map);
+	free_percpu(cbt->cache);
+	kfree(cbt);
+}
+
+void blk_cbt_release(struct request_queue *q)
+{
+	struct cbt_info *cbt;
+	int in_use = 0;
+
+	cbt = q->cbt;
+	if (!cbt)
+		return;
+	spin_lock(&cbt->lock);
+	set_bit(CBT_DEAD, &cbt->flags);
+	rcu_assign_pointer(q->cbt, NULL);
+	in_use = cbt->count;
+	spin_unlock(&cbt->lock);
+	if (!in_use)
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+}
+
+static int cbt_ioc_stop(struct block_device *bdev)
+{
+	struct request_queue *q;
+
+	mutex_lock(&cbt_mutex);
+	q = bdev_get_queue(bdev);
+	if(!q->cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	blk_cbt_release(q);
+	mutex_unlock(&cbt_mutex);
+	return 0;
+}
+
+static inline void __cbt_flush_cpu_cache(void *ptr)
+{
+	struct cbt_info *cbt = (struct cbt_info *) ptr;
+	struct cbt_extent *ex = this_cpu_ptr(cbt->cache);
+
+	if (ex->len) {
+		__blk_cbt_set(cbt, ex->start, ex->len, 0, 1);
+		ex->start += ex->len;
+		ex->len = 0;
+	}
+}
+
+static void cbt_flush_cache(struct cbt_info *cbt)
+{
+	on_each_cpu(__cbt_flush_cpu_cache, cbt, 1);
+}
+
+static void cbt_find_next_extent(struct cbt_info *cbt, blkcnt_t block, struct cbt_extent *ex)
+{
+	unsigned long off, off2, idx;
+	struct page *page;
+	bool found = 0;
+
+	ex->start = cbt->block_max;
+	ex->len = 0;
+
+	idx = block >> (PAGE_SHIFT + 3);
+	while (block < cbt->block_max) {
+		off = block & (BITS_PER_PAGE -1);
+		page = rcu_dereference(cbt->map[idx]);
+		if (!page) {
+			if (found)
+				break;
+			goto next;
+		}
+		spin_lock_page(page);
+		/* Find extent start */
+		if (!found) {
+			ex->start = find_next_bit(page_address(page), BITS_PER_PAGE, off);
+			if (ex->start != BITS_PER_PAGE) {
+				off = ex->start;
+				ex->start += idx << (PAGE_SHIFT + 3);
+				found = 1;
+			} else {
+				unlock_page(page);
+				goto next;
+			}
+		}
+		if (found) {
+			off2 = find_next_zero_bit(page_address(page), BITS_PER_PAGE, off);
+			ex->len += off2 - off;
+			if (off2 != BITS_PER_PAGE) {
+				unlock_page(page);
+				break;
+			}
+		}
+		unlock_page(page);
+	next:
+		idx++;
+		block = idx << (PAGE_SHIFT + 3);
+		continue;
+	}
+}
+
+static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct request_queue *q;
+	struct blk_user_cbt_info ci;
+	struct blk_user_cbt_extent __user *cur_u_ex;
+	struct blk_user_cbt_extent u_ex;
+	struct cbt_info *cbt;
+	struct cbt_extent ex;
+	blkcnt_t block, end;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+	if (ci.ci_flags &  ~CI_FLAG_ONCE)
+		return -EINVAL;
+	if (ci.ci_extent_count > CBT_MAX_EXTENTS)
+		return -EINVAL;
+
+	cur_u_ex = (struct blk_user_cbt_extent __user*)
+		((char *)ucbt_ioc + sizeof(struct blk_user_cbt_info));
+
+	if (ci.ci_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, cur_u_ex,
+		       ci.ci_extent_count * sizeof(struct blk_user_cbt_extent))){
+		return -EFAULT;
+	}
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if ((ci.ci_start >> cbt->block_bits) > cbt->block_max) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if (test_bit(CBT_ERROR, &cbt->flags)) {
+		mutex_unlock(&cbt_mutex);
+		return -EIO;
+	}
+	cbt_flush_cache(cbt);
+
+	memcpy(&ci.ci_uuid, cbt->uuid, sizeof(cbt->uuid));
+	ci.ci_blksize = 1UL << cbt->block_bits;
+	block = ci.ci_start >> cbt->block_bits;
+	end = (ci.ci_start + ci.ci_length) >> cbt->block_bits;
+	if (end > cbt->block_max)
+		end = cbt->block_max;
+
+	while (ci.ci_mapped_extents < ci.ci_extent_count) {
+		cbt_find_next_extent(cbt, block, &ex);
+		if (!ex.len || ex.start > end) {
+			ret = 0;
+			break;
+		}
+		u_ex.ce_physical = ex.start << cbt->block_bits;
+		u_ex.ce_length = ex.len << cbt->block_bits;
+		if (copy_to_user(cur_u_ex, &u_ex, sizeof(u_ex))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (ci.ci_flags & CI_FLAG_ONCE)
+			__blk_cbt_set(cbt, ex.start, ex.len, 0, 0);
+		cur_u_ex++;
+		ci.ci_mapped_extents++;
+		block = ex.start + ex.len;
+	}
+	mutex_unlock(&cbt_mutex);
+	if (!ret && copy_to_user(ucbt_ioc, &ci, sizeof(ci)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc, bool set)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct cbt_info *cbt;
+	struct blk_user_cbt_info ci;
+	struct blk_user_cbt_extent __user u_ex, *cur_u_ex, *end;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+	if (ci.ci_extent_count > CBT_MAX_EXTENTS)
+		return -EINVAL;
+	if (ci.ci_extent_count < ci.ci_mapped_extents)
+		return -EINVAL;
+
+	cur_u_ex = (struct blk_user_cbt_extent __user*)
+		((char *)ucbt_ioc + sizeof(struct blk_user_cbt_info));
+	end = cur_u_ex + ci.ci_mapped_extents;
+	if (!access_ok(VERIFY_READ, cur_u_ex,
+		       ci.ci_mapped_extents * sizeof(struct blk_user_cbt_extent)))
+		return -EFAULT;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if (ci.ci_flags & CI_FLAG_NEW_UUID)
+		memcpy(cbt->uuid, &ci.ci_uuid, sizeof(ci.ci_uuid));
+	else if (memcmp(cbt->uuid, &ci.ci_uuid, sizeof(ci.ci_uuid))) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if (test_bit(CBT_ERROR, &cbt->flags)) {
+		mutex_unlock(&cbt_mutex);
+		return -EIO;
+	}
+
+	/* Do not care about pcpu caches on set, only in case of clear */
+	if (!set)
+		cbt_flush_cache(cbt);
+
+	while (cur_u_ex < end) {
+		struct cbt_extent ex;
+
+		if (copy_from_user(&u_ex, cur_u_ex, sizeof(u_ex))) {
+			ret = -EFAULT;
+			break;
+		}
+		ex.start  = u_ex.ce_physical >> cbt->block_bits;
+		ex.len  = (u_ex.ce_length + (1 << cbt->block_bits) -1) >> cbt->block_bits;
+		if (ex.start > q->cbt->block_max ||
+		    ex.start + ex.len > q->cbt->block_max ||
+		    ex.len == 0) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = __blk_cbt_set(cbt, ex.start, ex.len, 0, set);
+		if (ret)
+			break;
+		cur_u_ex++;
+	}
+	mutex_unlock(&cbt_mutex);
+	return ret;
+}
+
+int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	struct blk_user_cbt_info __user *ucbt_ioc = (struct blk_user_cbt_info __user *) arg;
+
+	switch(cmd) {
+	case BLKCBTSTART:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return cbt_ioc_init(bdev, ucbt_ioc);
+	case BLKCBTSTOP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_stop(bdev);
+	case BLKCBTGET:
+		return cbt_ioc_get(bdev, ucbt_ioc);
+	case BLKCBTSET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_set(bdev, ucbt_ioc, 1);
+	case BLKCBTCLR:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_set(bdev, ucbt_ioc, 0);
+	default:
+		BUG();
+	}
+	return -ENOTTY;
+}
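
For scale, a back-of-the-envelope calculation (illustrative, assuming
4 KiB pages): each bitmap page covers BITS_PER_PAGE = PAGE_SIZE * 8 =
32768 blocks. A 1 TiB device tracked with a 64 KiB CBT block size has
2^40 / 2^16 = 2^24 blocks, so the map needs NR_PAGES(2^24) =
2^24 / 2^15 = 512 bitmap pages (at most 2 MiB of bitmap), behind a
vmalloc'ed array of 512 page pointers (4 KiB on a 64-bit kernel).
Bitmap pages are allocated lazily in __blk_cbt_set, so regions of the
device that are never written cost only a NULL pointer slot.
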
diff --git a/block/blk-core.c b/block/blk-core.c
index 66f7be3..08ac4d3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1845,6 +1845,7 @@ generic_make_request_checks(struct bio *bio)
 		return false;	/* throttled, will be resubmitted later */
 
 	trace_block_bio_queue(q, bio);
+	blk_cbt_bio_queue(q, bio);
 	return true;
 
 end_io:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 10d2058..21bc0c7 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -551,6 +551,7 @@ static void blk_release_queue(struct kobject *kobj)
 	kfree(q->flush_rq);
 
 	blk_trace_shutdown(q);
+	blk_cbt_release(q);
 
 	bdi_destroy(&q->backing_dev_info);
 
diff --git a/block/ioctl.c b/block/ioctl.c
index 93a9fdc..c56168b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -139,7 +139,7 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			}
 			disk_part_iter_exit(&piter);
 			part_nr_sects_write(part, (sector_t)length);
-			i_size_write(bdevp->bd_inode, p.length);
+			bd_write_size(bdevp, p.length);
 			mutex_unlock(&bdevp->bd_mutex);
 			mutex_unlock(&bdev->bd_mutex);
 			bdput(bdevp);
@@ -437,6 +437,13 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACETEARDOWN:
 		ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 		break;
+	case BLKCBTSTART:
+	case BLKCBTSTOP:
+	case BLKCBTGET:
+	case BLKCBTSET:
+	case BLKCBTCLR:
+		ret = blk_cbt_ioctl(bdev, cmd, (char __user *)arg);
+		break;
 	default:
 		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 	}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6bf6815..aca38b5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2338,7 +2338,7 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+	bd_write_size(md->bdev, (loff_t)size << SECTOR_SHIFT);
 }
 
 /*
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e6a2837..a22f439 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1037,12 +1037,19 @@ int check_disk_change(struct block_device *bdev)
 
 EXPORT_SYMBOL(check_disk_change);
 
+void bd_write_size(struct block_device *bdev, loff_t size)
+{
+	i_size_write(bdev->bd_inode, size);
+	blk_cbt_update_size(bdev);
+}
+EXPORT_SYMBOL(bd_write_size);
+
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
 	unsigned bsize = bdev_logical_block_size(bdev);
 
 	mutex_lock(&bdev->bd_inode->i_mutex);
-	i_size_write(bdev->bd_inode, size);
+	bd_write_size(bdev, size);
 	mutex_unlock(&bdev->bd_inode->i_mutex);
 	while (bsize < PAGE_CACHE_SIZE) {
 		if (size & bsize)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bea378b..2a05818 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -501,6 +501,9 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+#ifdef CONFIG_BLK_DEV_CBT
+	struct cbt_info	*cbt;
+#endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
 	struct percpu_counter	mq_usage_counter;
@@ -1637,6 +1640,17 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g)
 
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+#if defined (CONFIG_BLK_DEV_CBT)
+extern void blk_cbt_update_size(struct block_device *bdev);
+extern void blk_cbt_release(struct request_queue *q);
+extern void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio);
+extern int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg);
+#else /* CONFIG_BLK_DEV_CBT */
+#define blk_cbt_update_size(b) do {} while (0)
+#define blk_cbt_release(q) do {} while (0)
+#define blk_cbt_bio_queue(q,bio) do {} while (0)
+#define blk_cbt_ioctl(b,c,a) (-ENOTTY)
+#endif /* CONFIG_BLK_DEV_CBT */
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 553bca3..7e7bd3f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2370,6 +2370,7 @@ extern int register_blkdev(unsigned int, const char *);
 extern void unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
+extern void bd_write_size(struct block_device *, loff_t size);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 9b964a5..359bf02 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -153,6 +153,40 @@ struct inodes_stat_t {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 
+/* Hole from 128..199 */
+struct blk_user_cbt_extent {
+	__u64 ce_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 ce_length;   /* length in bytes for this extent */
+	__u64 ce_reserved64[1];
+};
+
+struct blk_user_cbt_info {
+	__u8  ci_uuid[16];      /* Bitmap UUID */
+	__u64 ci_start;		/* start of the physical range userspace
+				 * wants mapped (in) */
+	__u64 ci_length;	/* length of the physical range userspace
+				 * wants mapped (in) */
+	__u32 ci_blksize;	/* cbt logical block size */
+	__u32 ci_flags;		/* CI_FLAG_* flags for request (in/out) */
+	__u32 ci_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 ci_extent_count;  /* size of ci_extents array (in) */
+	__u32 ci_reserved;
+	struct blk_user_cbt_extent ci_extents[0]; /* array of mapped extents (out) */
+};
+
+enum CI_FLAGS
+{
+	CI_FLAG_ONCE = 1, /* BLKCBTGET will clear bits */
+	CI_FLAG_NEW_UUID = 2 /* BLKCBTSET updates the uuid */
+};
+
+#define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
+#define BLKCBTSTOP _IO(0x12,201)
+#define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
+#define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
+#define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
+
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
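
For reference, a minimal userspace sketch of how the new ioctls fit
together (illustrative only, not part of the patch: it assumes the
patched kernel's uapi headers are installed, a hypothetical device
path, a made-up UUID and extent count, and CAP_SYS_ADMIN for
BLKCBTSTART/BLKCBTSTOP):

/* cbt_demo.c - hypothetical example of exercising the CBT ioctls.
 * Error handling is kept minimal for brevity.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/fs.h>		/* struct blk_user_cbt_info, BLKCBT* */

#define NR_EX 128		/* extents requested per BLKCBTGET call */

int main(int argc, char **argv)
{
	struct blk_user_cbt_info *ci;
	unsigned int i;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block-device>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Header plus room for NR_EX returned extents */
	ci = calloc(1, sizeof(*ci) + NR_EX * sizeof(struct blk_user_cbt_extent));
	if (!ci)
		return 1;

	/* Start tracking: the UUID names this tracking epoch, the CBT
	 * block size must be a power of two (64 KiB here). */
	memcpy(ci->ci_uuid, "0123456789abcdef", sizeof(ci->ci_uuid));
	ci->ci_blksize = 64 * 1024;
	if (ioctl(fd, BLKCBTSTART, ci)) {
		perror("BLKCBTSTART");
		return 1;
	}

	/* ... writes to the device happen here ... */

	/* Fetch changed extents for the whole device. CI_FLAG_ONCE makes
	 * the kernel clear every returned extent, so the next call
	 * reports only changes made after this one. */
	ci->ci_start = 0;
	ci->ci_length = ~0ULL;
	ci->ci_extent_count = NR_EX;
	ci->ci_mapped_extents = 0;
	ci->ci_flags = CI_FLAG_ONCE;
	if (ioctl(fd, BLKCBTGET, ci)) {
		perror("BLKCBTGET");
		return 1;
	}

	for (i = 0; i < ci->ci_mapped_extents; i++)
		printf("changed: offset=%llu len=%llu\n",
		       (unsigned long long)ci->ci_extents[i].ce_physical,
		       (unsigned long long)ci->ci_extents[i].ce_length);

	ioctl(fd, BLKCBTSTOP, 0);	/* stop tracking, drop the bitmap */
	free(ci);
	close(fd);
	return 0;
}

A tool that needs more than NR_EX extents can either advance ci_start
past the last returned extent or simply call BLKCBTGET again, since
CI_FLAG_ONCE already cleared the bits it returned; repeating the call
after each backup pass yields incremental change sets, which is the
backup/copy optimization the Kconfig text describes.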


