[Devel] [PATCH RH8] dm-qcow2: Introduce driver to create block devices over QCOW2 files
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Jul 19 15:52:25 MSK 2021
https://jira.sw.ru/browse/PSBM-123244
--
Best regards,
Konstantin Khorenko,
Virtuozzo Linux Kernel Team
On 07/16/2021 07:19 PM, Kirill Tkhai wrote:
> Driver for attaching QCOW2 files as block devices. It handles
> performance-critical actions like actual IO and snapshot COW
> and merge, while complex (but fast) metadata-related service
> actions (e.g., snapshot creation and resize) are delegated
> to userspace.
>
> Suspend/resume is a barrier between the driver and userspace.
> On suspend the driver brings QCOW2 images into a consistent state,
> while on resume it re-reads metadata from the QCOW2 images.
> Userspace performs metadata-related service actions while the
> device is suspended, and never does so on a running device.
> This demarcation keeps the driver code small.
>
> Full support for the QCOW2 format is implemented: compressed clusters,
> internal snapshots, backing files and extended L2 tables.
> Current limitations: the lowest backing file must be QCOW2
> (RAW backing files are not supported), and cluster and subcluster
> sizes of all backing files in the chain must be equal (I'm going
> to change this in the future).
>
> Backward merge (from the top image into the lower one) is also implemented.
>
> The driver is request-based, since this allows using blk-mq
> merging of requests. The driver splits requests itself, and after
> splitting every request (i.e., qio) fits a single cluster.
> (In some cases it is worth creating bigger splits; this is
> a subject for further optimization.)
>
> An example of usage is shown in scripts/qcow2-dm.sh. That script allows
> creating/removing a device from a QCOW2 file or a chain of backing files.
>
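
For context, a minimal sketch of the suspend/resume barrier described above,
driven with the standard dmsetup tool (the device name "qcow2_dev" is
illustrative; device creation itself is done by scripts/qcow2-dm.sh below):

    # Suspend: the driver brings the QCOW2 image into a consistent state
    dmsetup suspend qcow2_dev
    # Userspace may now perform metadata-only service actions on the
    # image file (e.g. snapshot creation or resize)
    # Resume: the driver re-reads metadata from the QCOW2 image
    dmsetup resume qcow2_dev
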
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> drivers/md/Kconfig | 17
> drivers/md/Makefile | 2
> drivers/md/dm-qcow2-cmd.c | 337 +++
> drivers/md/dm-qcow2-map.c | 4068 ++++++++++++++++++++++++++++++++++++++++++
> drivers/md/dm-qcow2-target.c | 935 ++++++++++
> drivers/md/dm-qcow2.h | 360 ++++
> scripts/qcow2-dm.sh | 103 +
> 7 files changed, 5822 insertions(+)
> create mode 100644 drivers/md/dm-qcow2-cmd.c
> create mode 100644 drivers/md/dm-qcow2-map.c
> create mode 100644 drivers/md/dm-qcow2-target.c
> create mode 100644 drivers/md/dm-qcow2.h
> create mode 100755 scripts/qcow2-dm.sh
>
> diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
> index fc58d19da43f..af1018c5e272 100644
> --- a/drivers/md/Kconfig
> +++ b/drivers/md/Kconfig
> @@ -573,4 +573,21 @@ config DM_ZONED
>
> If unsure, say N.
>
> +config DM_QCOW2
> + tristate "QCOW2 target support"
> + depends on BLK_DEV_DM
> + depends on ZLIB_INFLATE
> + help
> + Driver for attaching QCOW2 files as block devices. It handles
> + performance-critical actions like actual IO and snapshot COW
> + and merge, while complex (but fast) metadata-related service
> + actions (e.g., snapshot creation and resize) are delegated
> + to userspace.
> +
> + Suspend/resume is a barrier between the driver and userspace.
> + On suspend the driver brings QCOW2 images into a consistent
> + state, while on resume it re-reads metadata from the QCOW2
> + images. Thus, userspace performs metadata-related service
> + actions only while the device is suspended.
> +
> endif # MD
> diff --git a/drivers/md/Makefile b/drivers/md/Makefile
> index 7a9b6e85285b..0685b83b7883 100644
> --- a/drivers/md/Makefile
> +++ b/drivers/md/Makefile
> @@ -21,6 +21,7 @@ dm-era-y += dm-era-target.o
> ploop-y += dm-ploop-target.o dm-ploop-map.o dm-ploop-cmd.o \
> dm-ploop-bat.o
> push-backup-y += dm-push-backup.o
> +dm-qcow2-y += dm-qcow2-target.o dm-qcow2-map.o dm-qcow2-cmd.o
> dm-verity-y += dm-verity-target.o
> md-mod-y += md.o md-bitmap.o
> raid456-y += raid5.o raid5-cache.o raid5-ppl.o
> @@ -69,6 +70,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
> obj-$(CONFIG_DM_ERA) += dm-era.o
> obj-$(CONFIG_DM_PLOOP) += ploop.o
> obj-$(CONFIG_DM_PUSH_BACKUP) += push-backup.o
> +obj-$(CONFIG_DM_QCOW2) += dm-qcow2.o
> obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
> obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
> obj-$(CONFIG_DM_ZONED) += dm-zoned.o
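
With CONFIG_DM_QCOW2=m the objects above are linked into a dm-qcow2.ko
module. A quick sanity check after building might look like this (the
target name is presumably "qcow2", as registered in dm-qcow2-target.c):

    modprobe dm-qcow2
    dmsetup targets | grep qcow2
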
> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c
> new file mode 100644
> index 000000000000..ee4f4a43ad80
> --- /dev/null
> +++ b/drivers/md/dm-qcow2-cmd.c
> @@ -0,0 +1,337 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2021 Virtuozzo International GmbH. All rights reserved.
> + */
> +#include <linux/device-mapper.h>
> +#include <linux/sched/signal.h>
> +#include <linux/file.h>
> +#include "dm-qcow2.h"
> +
> +#define MERGE_QIOS_MAX 64
> +
> +static int qcow2_get_errors(struct qcow2_target *tgt, char *result,
> + unsigned int maxlen)
> +{
> + bool wants_check = qcow2_wants_check(tgt);
> + unsigned int sz = 0;
> + int ret;
> +
> + ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n",
> + wants_check, tgt->md_writeback_error, tgt->truncate_error);
> +
> + return ret ? 1 : 0;
> +}
> +
> +int qcow2_inflight_ref_switch(struct qcow2_target *tgt)
> +{
> + struct completion *comp = &tgt->inflight_ref_comp;
> + u8 ref_index = tgt->inflight_ref_index;
> +
> + tgt->inflight_ref_index = !ref_index;
> +
> + percpu_ref_kill(&tgt->inflight_ref[ref_index]);
> + wait_for_completion(comp);
> +
> + percpu_ref_reinit(&tgt->inflight_ref[ref_index]);
> + reinit_completion(comp);
> + return 0;
> +}
> +
> +static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio,
> + void *data, blk_status_t status)
> +{
> + blk_status_t *status_ptr = data;
> + unsigned long flags;
> +
> + if (unlikely(status)) {
> + spin_lock_irqsave(&tgt->err_status_lock, flags);
> + *status_ptr = status;
> + spin_unlock_irqrestore(&tgt->err_status_lock, flags);
> + }
> +
> + atomic_dec(&tgt->service_qios);
> + wake_up(&tgt->service_wq);
> +}
> +
> +static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2,
> + loff_t end, loff_t step, u8 qio_flags)
> +{
> + static blk_status_t service_status;
> + struct bio_vec bvec = {0};
> + struct qio *qio;
> + int ret = 0;
> + loff_t pos;
> +
> + WRITE_ONCE(service_status, BLK_STS_OK);
> +
> + for (pos = 0; pos < end; pos += step) {
> + if (fatal_signal_pending(current)) {
> + ret = -EINTR;
> + break;
> + }
> +
> + qio = alloc_qio(tgt->qio_pool, true);
> + if (!qio) {
> + ret = -ENOMEM;
> + break;
> + }
> +
> + /* See fake_merge_qio() and fake_l1cow_qio() */
> + init_qio(qio, REQ_OP_WRITE, qcow2);
> + qio->flags |= qio_flags|QIO_FREE_ON_ENDIO_FL;
> + qio->bi_io_vec = &bvec;
> + qio->bi_iter.bi_sector = to_sector(pos);
> + qio->bi_iter.bi_size = 0;
> + qio->bi_iter.bi_idx = 0;
> + qio->bi_iter.bi_bvec_done = 0;
> + qio->endio_cb = service_qio_endio;
> + qio->endio_cb_data = &service_status;
> +
> + dispatch_qios(qcow2, qio, NULL);
> + if (atomic_inc_return(&tgt->service_qios) == MERGE_QIOS_MAX) {
> + wait_event(tgt->service_wq,
> + atomic_read(&tgt->service_qios) < MERGE_QIOS_MAX);
> + }
> +
> + if (unlikely(READ_ONCE(service_status)))
> + break;
> + }
> +
> + wait_event(tgt->service_wq, !atomic_read(&tgt->service_qios));
> + if (!ret) {
> + spin_lock_irq(&tgt->err_status_lock);
> + ret = blk_status_to_errno(service_status);
> + spin_unlock_irq(&tgt->err_status_lock);
> + }
> +
> + return ret;
> +}
> +
> +static int qcow2_merge_common(struct qcow2_target *tgt)
> +{
> + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
> + u32 clu_size = qcow2->clu_size;
> + loff_t end = lower->hdr.size;
> +
> + return qcow2_service_iter(tgt, qcow2, end, clu_size, QIO_IS_MERGE_FL);
> +}
> +
> +/*
> + * Forward merge is a simple COW simulation in every clu.
> + * After that, all mapped clus from the lower delta become
> + * mapped in the top delta. Then, userspace may remove the
> + * lower delta from the delta stack (and it also has to update
> + * the backing file name in the top delta's metadata).
> + */
> +static int qcow2_merge_forward(struct qcow2_target *tgt)
> +{
> + return -ENOTTY; /* TODO */
> +}
> +
> +static int qcow2_break_l1cow(struct qcow2_target *tgt)
> +{
> + struct qcow2 *qcow2 = tgt->top;
> + loff_t end = qcow2->hdr.size;
> + loff_t step = (u64)qcow2->l2_entries * qcow2->clu_size;
> +
> + return qcow2_service_iter(tgt, qcow2, end, step, QIO_IS_L1COW_FL);
> +}
> +
> +static void set_backward_merge_in_process(struct qcow2_target *tgt,
> + struct qcow2 *qcow2, bool set)
> +{
> + LIST_HEAD(list);
> +
> + /*
> + * To avoid a race between allocations and COWs,
> + * we completely stop queueing qios and wait
> + * for pending qios. The lock is for visibility.
> + */
> + spin_lock_irq(&qcow2->deferred_lock);
> + qcow2->pause_submitting_qios = true;
> + spin_unlock_irq(&qcow2->deferred_lock);
> + qcow2_inflight_ref_switch(tgt);
> +
> + /* queue is stopped */
> + spin_lock_irq(&qcow2->deferred_lock);
> + WARN_ON_ONCE(qcow2->backward_merge_in_process == set);
> + qcow2->backward_merge_in_process = set;
> + qcow2->pause_submitting_qios = false;
> + list_splice_init(&qcow2->paused_qios, &list);
> + spin_unlock_irq(&qcow2->deferred_lock);
> +
> + submit_embedded_qios(tgt, &list);
> +}
> +
> +static int qcow2_merge_backward(struct qcow2_target *tgt)
> +{
> + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
> + int ret, ret2;
> +
> + ret = -ENOENT;
> + if (!lower)
> + goto out;
> + ret = -EACCES;
> + if (!(lower->file->f_mode & FMODE_WRITE))
> + goto out;
> + ret = -EOPNOTSUPP;
> + if (qcow2->clu_size != lower->clu_size)
> + goto out;
> + ret = -EBADSLT;
> + if (lower->hdr.size < qcow2->hdr.size)
> + goto out;
> + /*
> + * Break all COW clus at the L1 level. Otherwise, there
> + * would be problems with unusing them later:
> + * we'd have to freeze IO going to all data clusters
> + * under every L1 entry related to several snapshots.
> + */
> + ret = qcow2_break_l1cow(tgt);
> + if (ret) {
> + pr_err("dm-qcow2: Can't break L1 COW\n");
> + goto out;
> + }
> +
> + ret = qcow2_set_image_file_features(lower, true);
> + if (ret) {
> + pr_err("dm-qcow2: Can't set dirty bit\n");
> + goto out;
> + }
> + set_backward_merge_in_process(tgt, qcow2, true);
> +
> + /* Start merge */
> + ret = qcow2_merge_common(tgt);
> + if (ret) {
> + set_backward_merge_in_process(tgt, qcow2, false);
> + ret2 = qcow2_set_image_file_features(lower, false);
> + if (ret2 < 0)
> + pr_err("dm-qcow2: Can't unuse lower (%d)\n", ret2);
> + goto out;
> + }
> + tgt->nr_images--;
> + tgt->top = lower;
> + smp_wmb(); /* Pairs with qcow2_ref_inc() */
> + qcow2_inflight_ref_switch(tgt); /* Pending qios */
> + flush_deferred_activity(tgt, qcow2); /* Delayed md pages */
> + qcow2->lower = NULL;
> +
> + ret2 = qcow2_set_image_file_features(qcow2, false);
> + if (ret2 < 0)
> + pr_err("dm-qcow2: Can't unuse merged img (%d)\n", ret2);
> + qcow2_destroy(qcow2);
> +out:
> + return ret;
> +}
> +
> +static int qcow2_get_fd(struct qcow2_target *tgt, u32 img_id,
> + char *result, unsigned int maxlen)
> +{
> + struct qcow2 *qcow2 = tgt->top;
> + unsigned int sz = 0;
> + struct file *file;
> + int skip, fd;
> +
> + lockdep_assert_held(&tgt->ctl_mutex); /* tgt->top */
> +
> + skip = tgt->nr_images - 1 - img_id;
> + while (qcow2 && skip > 0) {
> + qcow2 = qcow2->lower;
> + skip--;
> + }
> +
> + if (!qcow2 || skip)
> + return -ENOENT;
> +
> + fd = get_unused_fd_flags(0);
> + if (fd < 0)
> + return fd;
> +
> + if (DMEMIT("%d\n", fd) == 0) {
> + /* Not enough space in @result */
> + put_unused_fd(fd);
> + return 0;
> + }
> +
> + file = qcow2->file;
> + fd_install(fd, get_file(file));
> + return 1;
> +}
> +
> +static int qcow2_get_img_name(struct qcow2_target *tgt, u32 img_id,
> + char *result, unsigned int maxlen)
> +{
> + struct qcow2 *qcow2 = tgt->top;
> + int skip, ret;
> + char *p;
> +
> + lockdep_assert_held(&tgt->ctl_mutex); /* tgt->top */
> +
> + skip = tgt->nr_images - 1 - img_id;
> + while (qcow2 && skip > 0) {
> + qcow2 = qcow2->lower;
> + skip--;
> + }
> +
> + if (!qcow2 || skip)
> + return -ENOENT;
> +
> + p = file_path(qcow2->file, result, maxlen - 1);
> + if (IS_ERR(p)) {
> + if (PTR_ERR(p) == -ENAMETOOLONG)
> + return 0; /* dm should pass bigger buffer */
> + return PTR_ERR(p);
> + }
> +
> + ret = strlen(p);
> + memmove(result, p, ret);
> + result[ret] = 0;
> + return 1;
> +}
> +
> +int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv,
> + char *result, unsigned int maxlen)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> + int ret = -EPERM;
> + u32 val;
> +
> + if (!capable(CAP_SYS_ADMIN))
> + goto out;
> +
> + ret = -EINVAL;
> + if (argc < 1)
> + goto out;
> +
> + ret = mutex_lock_killable(&tgt->ctl_mutex);
> + if (ret)
> + goto out;
> +
> + if (!strcmp(argv[0], "get_errors")) {
> + ret = qcow2_get_errors(tgt, result, maxlen);
> + } else if (!strcmp(argv[0], "get_img_fd")) {
> + if (argc != 2 || kstrtou32(argv[1], 10, &val)) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> + ret = qcow2_get_fd(tgt, val, result, maxlen);
> + } else if (!strcmp(argv[0], "get_img_name")) {
> + if (argc != 2 || kstrtou32(argv[1], 10, &val)) {
> + ret = -EINVAL;
> + goto unlock;
> + }
> + ret = qcow2_get_img_name(tgt, val, result, maxlen);
> + } else if (!tgt->service_operations_allowed) {
> + ret = -EBUSY; /* Suspended */
> + /* Service operations go below: */
> + } else if (!strcmp(argv[0], "merge_forward")) {
> + ret = qcow2_merge_forward(tgt);
> + } else if (!strcmp(argv[0], "merge_backward")) {
> + ret = qcow2_merge_backward(tgt);
> + } else {
> + ret = -ENOTTY;
> + }
> +unlock:
> + mutex_unlock(&tgt->ctl_mutex);
> +out:
> + return ret;
> +}
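
A short sketch of how the message interface above can be exercised with
the standard dmsetup tool (the device name "qcow2_dev" is illustrative;
get_img_fd is omitted since the fd is installed into the calling process,
which mainly makes sense for a long-running management daemon):

    # dump wants_check/md_writeback_error/truncate_error
    dmsetup message qcow2_dev 0 get_errors
    # print the path of the image file with index 0
    dmsetup message qcow2_dev 0 get_img_name 0
    # merge the top delta into the lower one; rejected with -EBUSY
    # while service operations are not allowed (device suspended)
    dmsetup message qcow2_dev 0 merge_backward
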
> diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c
> new file mode 100644
> index 000000000000..133d74bf0b33
> --- /dev/null
> +++ b/drivers/md/dm-qcow2-map.c
> @@ -0,0 +1,4068 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2021 Virtuozzo International GmbH. All rights reserved.
> + */
> +#include <linux/spinlock.h>
> +#include <linux/uio.h>
> +#include <linux/fs.h>
> +#include <uapi/linux/falloc.h>
> +#include <linux/blk-mq.h>
> +#include <linux/zlib.h>
> +
> +#include "dm.h"
> +#include "dm-rq.h"
> +#include "dm-qcow2.h"
> +
> +/* "Exactly one bit" has the same number in L1 and L2 */
> +#define LX_REFCOUNT_EXACTLY_ONE (1ULL << 63)
> +#define L1_RESERVED_ZERO_MASK 0x7F000000000001FFULL
> +#define L2_READS_ALL_ZEROES (1ULL << 0)
> +#define L2_COMPRESSED_CLUSTER (1ULL << 62)
> +#define L2_RESERVED_ZERO_MASK 0x3F000000000001FEULL
> +#define R1_RESERVED_ZERO_MASK 0x1FFULL
> +
> +#define qcow2_for_each_bvec(iter, bv, start_iter, from_bv) \
> + for (iter = start_iter; \
> + iter.bi_size && ((bv = mp_bvec_iter_bvec(from_bv, iter)), 1); \
> + bvec_iter_advance(from_bv, &iter, bv.bv_len))
> +
> +struct qcow2_map_item {
> + /*
> + * Index in clu and index in page.
> + * For L1, L2 and R1 it is measured in u64 units (even for extended L2).
> + * For R2 it is measured in R2 entry size (may refer to the middle of a byte).
> + */
> + u32 index;
> + u32 index_in_page;
> + u64 page_id;
> + struct md_page *md;
> +};
> +
> +struct qcow2_map {
> + struct qcow2_map_item l1, l2;
> + struct qcow2_map_item r1, r2;
> +#define L1_LEVEL (1 << 0)
> +#define L2_LEVEL (1 << 1)
> + u8 level; /* Cached levels */
> + /* L2 entry has "reads all zeroes", and refers to prealloced block */
> + bool prealloced:1;
> + bool compressed:1;
> + bool clu_is_cow:1;
> + bool all_zeroes:1;
> + /*
> + * DATA clu is allocated (at the same time "reads all zeroes"
> + * or "subclu is not allocated" may be set).
> + */
> + bool data_clu_alloced:1;
> + bool backing_file_cow:1;
> +
> + u32 compressed_sectors;
> + u32 subclus_mask;
> + /*
> + * Cluster-aligned COW source: clusters containing
> + * compressed sectors or an internal snapshot cluster.
> + * Their usage count will be decremented after COW.
> + */
> + loff_t cow_clu_pos;
> + loff_t cow_clu_end;
> +
> + u64 ext_l2;
> + u64 data_clu_pos;
> +
> + struct qcow2 *qcow2;
> +};
> +
> +struct qcow2_bvec {
> + unsigned int nr_pages;
> + struct bio_vec bvec[0];
> +};
> +
> +static int handle_r1r2_maps(struct qcow2 *qcow2, loff_t pos, struct qio **qio,
> + struct qcow2_map_item *r1, struct qcow2_map_item *r2, bool compressed);
> +static int punch_hole(struct file *file, loff_t pos, loff_t len);
> +static void handle_cleanup_mask(struct qio *qio);
> +static void process_read_qio(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map);
> +
> +static loff_t bytes_off_in_cluster(struct qcow2 *qcow2, struct qio *qio)
> +{
> + return CLU_OFF(qcow2, to_bytes(qio->bi_iter.bi_sector));
> +}
> +
> +static loff_t bio_sector_to_file_pos(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + WARN_ON_ONCE(!map->data_clu_pos);
> +
> + return map->data_clu_pos + bytes_off_in_cluster(qcow2, qio);
> +}
> +
> +static loff_t compressed_clu_end_pos(loff_t start, sector_t compressed_sectors)
> +{
> + if (start % SECTOR_SIZE == 0)
> + compressed_sectors++;
> +
> + return start + to_bytes(compressed_sectors);
> +}
> +
> +static u8 qio_subclu_indexes(struct qcow2 *qcow2, struct qio *qio, u8 *end_bit)
> +{
> + u64 off = bytes_off_in_cluster(qcow2, qio);
> +
> + WARN_ON_ONCE(!qcow2->ext_l2 || !qio->bi_iter.bi_size);
> + *end_bit = (off + qio->bi_iter.bi_size - 1) / qcow2->subclu_size;
> +
> + return off / qcow2->subclu_size;
> +}
> +
> +static u32 qio_subclus_mask(struct qcow2 *qcow2, struct qio *qio)
> +{
> + u8 start_bit, end_bit;
> + u32 mask = ~0U;
> +
> + WARN_ON_ONCE(!qcow2->ext_l2 || !qio->bi_iter.bi_size);
> +
> + start_bit = qio_subclu_indexes(qcow2, qio, &end_bit);
> + mask = (mask >> start_bit) << start_bit;
> + mask = (mask << (31 - end_bit)) >> (31 - end_bit);
> +
> + return mask;
> +}
> +
> +static u32 next_bit(u32 mask, u32 from)
> +{
> + mask >>= from;
> + if (!mask)
> + return 32;
> + return __ffs(mask) + from;
> +}
> +
> +static u32 next_zero_bit(u32 mask, u32 from)
> +{
> + return next_bit(~mask, from);
> +}
> +
> +static u8 find_bits_range_from(u32 mask, u8 from, u8 *nr)
> +{
> + u8 left, right;
> +
> + if (from == 32)
> + return 32;
> + left = next_bit(mask, from);
> + if (left == 32)
> + return 32;
> + right = next_zero_bit(mask, left);
> + *nr = right - left;
> + return left;
> +}
> +
> +static u32 get_bits_range_from(u32 mask, u8 from)
> +{
> + u8 nr;
> +
> + from = find_bits_range_from(mask, from, &nr);
> + if (from == 32)
> + return 0;
> + return (~(u32)0 >> (32 - nr)) << from;
> +}
> +
> +static u32 get_bits_range_up_to(u32 mask, u8 to)
> +{
> + int i, next = 0;
> +
> + if (to == 0)
> + return (1 << 0) & mask;
> + if (!(mask << (31 - to) >> (31 - to)))
> + return 0;
> +
> + while (next < to) {
> + i = next;
> + next = next_zero_bit(mask, i + 1);
> + }
> +
> + mask = (mask >> i) << i; /* i is last prev zero bit */
> + mask = (mask << (31 - to)) >> (31 - to);
> + return mask;
> +}
> +
> +static u64 get_u64_from_page(struct page *page, int index_in_page)
> +{
> + u64 *indexes, val;
> +
> + indexes = kmap_atomic(page);
> + val = indexes[index_in_page];
> + kunmap_atomic(indexes);
> +
> + return val;
> +}
> +
> +static u64 get_u64_from_be_page(struct page *page, int index_in_page)
> +{
> + return be64_to_cpu(get_u64_from_page(page, index_in_page));
> +}
> +
> +static void set_u64_to_page(struct page *page, int index_in_page, u64 val)
> +{
> + u64 *indexes;
> +
> + indexes = kmap_atomic(page);
> + indexes[index_in_page] = val;
> + kunmap_atomic(indexes);
> +}
> +
> +static void set_u64_to_be_page(struct page *page, int index_in_page, u64 val)
> +{
> + return set_u64_to_page(page, index_in_page, cpu_to_be64(val));
> +}
> +
> +struct qcow2 *qcow2_ref_inc(struct qcow2_target *tgt, u8 *ref_index)
> +{
> + struct percpu_ref *ref;
> + struct qcow2 *qcow2;
> +
> + rcu_read_lock();
> + do {
> + *ref_index = tgt->inflight_ref_index;
> + smp_rmb(); /* Pairs with qcow2_merge_backward() */
> + qcow2 = tgt->top;
> + ref = &tgt->inflight_ref[*ref_index];
> + } while (unlikely(!percpu_ref_tryget(ref)));
> + rcu_read_unlock();
> +
> + return qcow2;
> +}
> +
> +void qcow2_ref_dec(struct qcow2_target *tgt, u8 ref_index)
> +{
> + struct percpu_ref *ref = &tgt->inflight_ref[ref_index];
> +
> + percpu_ref_put(ref);
> +}
> +
> +/* Zero @count bytes of @bi_io_vec pointed to by @bi_iter, starting from byte @from */
> +static void zero_fill_iter_bvec(struct bvec_iter *bi_iter, struct bio_vec *bi_io_vec,
> + u32 from, u32 count)
> +{
> + struct bvec_iter iter;
> + struct bio_vec bv;
> + u32 bytes;
> +
> + qcow2_for_each_bvec(iter, bv, *bi_iter, bi_io_vec) {
> + void *data;
> +
> + if (!count)
> + break;
> + if (from >= bv.bv_len) {
> + from -= bv.bv_len;
> + continue;
> + }
> +
> + bytes = bv.bv_len - from;
> + if (bytes > count)
> + bytes = count;
> +
> + data = kmap(bv.bv_page);
> + memset(data + bv.bv_offset + from, 0, bytes);
> + flush_dcache_page(bv.bv_page);
> + kunmap(bv.bv_page);
> + from = 0;
> + count -= bytes;
> + }
> +}
> +
> +/* Zero @count bytes of @qio->bi_io_vec, starting from byte @from */
> +static void zero_fill_qio(struct qio *qio, u32 from, u32 count)
> +{
> + zero_fill_iter_bvec(&qio->bi_iter, qio->bi_io_vec, from, count);
> +}
> +
> +static bool maybe_mapped_in_lower_delta(struct qcow2 *qcow2, struct qio *qio)
> +{
> + if (!qcow2->lower)
> + return false;
> + return (to_bytes(qio->bi_iter.bi_sector) < qcow2->lower->hdr.size);
> +}
> +
> +/* Shorten the tail beyond the qcow2 max possible size */
> +static void shorten_and_zero_qio_tail(struct qcow2 *qcow2, struct qio *qio)
> +{
> + loff_t start = to_bytes(qio->bi_iter.bi_sector);
> + loff_t end = start + qio->bi_iter.bi_size;
> + loff_t size = qcow2->hdr.size;
> +
> + if (likely(size >= end))
> + return;
> + if (WARN_ON_ONCE(start >= size))
> + return;
> + zero_fill_qio(qio, size - start, end - size);
> + qio->bi_iter.bi_size -= end - size;
> +}
> +
> +static unsigned int qio_nr_segs(struct qio *qio)
> +{
> + unsigned int nr_segs = 0;
> + struct bvec_iter iter;
> + struct bio_vec bv;
> +
> + qcow2_for_each_bvec(iter, bv, qio->bi_iter, qio->bi_io_vec)
> + nr_segs++;
> +
> + return nr_segs;
> +}
> +
> +struct qio *alloc_qio(mempool_t *pool, bool zero)
> +{
> + struct qio *qio;
> +
> + qio = mempool_alloc(pool, GFP_NOIO);
> + if (qio) {
> + if (zero)
> + memset(qio, 0, sizeof(*qio));
> + }
> + return qio;
> +}
> +
> +void init_qio(struct qio *qio, unsigned int bi_op, struct qcow2 *qcow2)
> +{
> + qio->qcow2 = qcow2;
> + qio->queue_list_id = QLIST_DEFERRED;
> + qio->ext = NULL;
> + qio->data = NULL;
> + qio->bi_op = bi_op;
> + qio->bi_io_vec = NULL;
> + qio->flags = 0;
> + qio->ref_index = REF_INDEX_INVALID;
> + atomic_set(&qio->remaining, 1);
> +
> + /*
> + * Initially set to BLK_STS_OK; aio completion,
> + * md write completion, etc. rewrite bi_status on error.
> + */
> + qio->bi_status = BLK_STS_OK;
> +}
> +
> +static int alloc_qio_ext(struct qio *qio)
> +{
> + if (WARN_ON_ONCE(qio->ext))
> + return -EIO;
> + qio->ext = kzalloc(sizeof(*(qio->ext)), GFP_NOIO);
> + if (!qio->ext)
> + return -ENOMEM;
> + return 0;
> +}
> +
> +static void finalize_qio_ext(struct qio *qio)
> +{
> + if (qio->ext) {
> + handle_cleanup_mask(qio);
> + kfree(qio->ext);
> + qio->ext = NULL;
> + }
> +}
> +
> +static void free_qio(struct qio *qio, mempool_t *pool)
> +{
> + mempool_free(qio, pool);
> +}
> +
> +static void do_qio_endio(struct qio *qio)
> +{
> + struct qcow2 *qcow2 = qio->qcow2;
> + struct qcow2_target *tgt = qcow2->tgt;
> + qcow2_endio_t endio_cb = qio->endio_cb;
> + void *endio_cb_data = qio->endio_cb_data;
> + unsigned int flags = qio->flags;
> + u8 ref_index = qio->ref_index;
> +
> + if (!atomic_dec_and_test(&qio->remaining))
> + return;
> +
> + qio->ref_index = REF_INDEX_INVALID;
> + /* Note that this may free qio or its container memory */
> + endio_cb(tgt, qio, endio_cb_data, qio->bi_status);
> +
> + if (ref_index < REF_INDEX_INVALID)
> + qcow2_ref_dec(tgt, ref_index);
> + if (flags & QIO_FREE_ON_ENDIO_FL)
> + free_qio(qio, tgt->qio_pool);
> +}
> +
> +static void qio_endio(struct qio *qio)
> +{
> + finalize_qio_ext(qio);
> +
> + do_qio_endio(qio);
> +}
> +
> +static void dispatch_qio(struct qcow2 *qcow2, struct qio *qio)
> +{
> + WARN_ON_ONCE(qcow2 != qio->qcow2 ||
> + qio->queue_list_id >= QLIST_INVALID);
> + lockdep_assert_held(&qcow2->deferred_lock);
> +
> + list_add_tail(&qio->link, &qcow2->qios[qio->queue_list_id]);
> +}
> +
> +void dispatch_qios(struct qcow2 *qcow2, struct qio *qio,
> + struct list_head *qio_list)
> +{
> + unsigned long flags;
> +
> + if (!qio && (!qio_list || list_empty(qio_list)))
> + return;
> +
> + spin_lock_irqsave(&qcow2->deferred_lock, flags);
> + if (qio)
> + dispatch_qio(qcow2, qio);
> + if (qio_list) {
> + while ((qio = qio_list_pop(qio_list)) != NULL)
> + dispatch_qio(qcow2, qio);
> + }
> +
> + spin_unlock_irqrestore(&qcow2->deferred_lock, flags);
> +
> + queue_work(qcow2->tgt->wq, &qcow2->worker);
> +}
> +
> +static void end_qios(struct list_head *qio_list, blk_status_t status)
> +{
> + struct qio *qio;
> +
> + while ((qio = qio_list_pop(qio_list)) != NULL) {
> + if (status != BLK_STS_OK)
> + qio->bi_status = status;
> + qio_endio(qio);
> + }
> +}
> +
> +static void qio_chain_endio(struct qcow2_target *tgt, struct qio *qio,
> + void *parent_ptr, blk_status_t bi_status)
> +{
> + struct qio *parent = parent_ptr;
> +
> + if (unlikely(bi_status))
> + parent->bi_status = bi_status;
> +
> + do_qio_endio(parent);
> +}
> +
> +static void qio_chain(struct qio *qio, struct qio *parent)
> +{
> + WARN_ON(qio->endio_cb_data || qio->endio_cb);
> +
> + qio->endio_cb_data = parent;
> + qio->endio_cb = qio_chain_endio;
> + atomic_inc(&parent->remaining);
> +}
> +
> +/* Clone of bio_advance_iter() */
> +static void qio_advance(struct qio *qio, unsigned int bytes)
> +{
> + struct bvec_iter *iter = &qio->bi_iter;
> +
> + iter->bi_sector += bytes >> 9;
> +
> + if (op_is_discard(qio->bi_op))
> + iter->bi_size -= bytes;
> + else
> + bvec_iter_advance(qio->bi_io_vec, iter, bytes);
> +}
> +
> +static struct qio *split_and_chain_qio(struct qcow2 *qcow2,
> + struct qio *qio, u32 len)
> +{
> + struct qio *split;
> +
> + split = alloc_qio(qcow2->tgt->qio_pool, true);
> + if (!split)
> + return NULL;
> +
> + init_qio(split, qio->bi_op, qcow2);
> + split->queue_list_id = qio->queue_list_id;
> + split->flags |= QIO_FREE_ON_ENDIO_FL;
> + split->flags |= (qio->flags & QIO_SPLIT_INHERITED_FLAGS);
> + split->bi_io_vec = qio->bi_io_vec;
> + split->bi_iter = qio->bi_iter;
> + split->bi_iter.bi_size = len;
> + split->endio_cb = NULL;
> + split->endio_cb_data = NULL;
> + qio_chain(split, qio);
> + if (len)
> + qio_advance(qio, len);
> + return split;
> +}
> +
> +static int split_qio_to_list(struct qcow2 *qcow2, struct qio *qio,
> + struct list_head *ret_list)
> +{
> + u32 clu_size = qcow2->clu_size;
> + struct qio *split;
> + LIST_HEAD(list);
> +
> + while (1) {
> + loff_t start = to_bytes(qio->bi_iter.bi_sector);
> + loff_t end = start + qio->bi_iter.bi_size;
> + unsigned int len;
> +
> + WARN_ON_ONCE(start == end);
> +
> + if (start / clu_size == (end - 1) / clu_size)
> + break;
> + end = round_up(start + 1, clu_size);
> + len = end - start;
> +
> + split = split_and_chain_qio(qcow2, qio, len);
> + if (!split)
> + goto err;
> +
> + list_add_tail(&split->link, &list);
> + }
> +
> + list_splice_tail(&list, ret_list);
> + list_add_tail(&qio->link, ret_list);
> + return 0;
> +err:
> + while ((qio = qio_list_pop(&list)) != NULL) {
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio);
> + }
> + return -ENOMEM;
> +}
> +
> +static void perform_zero_read(struct qio *qio, u32 size)
> +{
> + zero_fill_qio(qio, 0, size);
> +}
> +
> +static void inc_inflight_md(struct qcow2 *qcow2, struct qio *qio)
> +{
> + struct qcow2_target *tgt = qcow2->tgt;
> + struct percpu_ref *ref;
> + u8 ref_index;
> +
> + do {
> + ref_index = tgt->inflight_ref_index;
> + ref = &tgt->inflight_ref[ref_index];
> + } while (unlikely(!percpu_ref_tryget_live(ref)));
> +
> + qio->ref_index = ref_index;
> +}
> +
> +static void dec_inflight_md(struct qcow2 *qcow2, struct qio *qio)
> +{
> + struct qcow2_target *tgt = qcow2->tgt;
> + u8 ref_index = qio->ref_index;
> +
> + if (!(WARN_ON_ONCE(ref_index > 1)))
> + percpu_ref_put(&tgt->inflight_ref[ref_index]);
> +}
> +
> +static void inc_wpc_readers(struct md_page *md)
> +{
> + atomic_inc(&md->wpc_readers);
> +}
> +
> +static void dec_wpc_readers(struct qcow2 *qcow2, struct md_page *md)
> +{
> + LIST_HEAD(wait_list);
> + unsigned long flags;
> + bool last;
> +
> + last = atomic_dec_and_lock_irqsave(&md->wpc_readers,
> + &qcow2->md_pages_lock, flags);
> + if (last) {
> + list_splice_tail_init(&md->wpc_readers_wait_list, &wait_list);
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> + dispatch_qios(qcow2, NULL, &wait_list);
> + }
> +}
> +
> +static bool delay_if_has_wpc_readers(struct qcow2 *qcow2, struct md_page *md,
> + struct qio **qio)
> +{
> + bool ret = false;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + if (atomic_read(&md->wpc_readers)) {
> + list_add_tail(&(*qio)->link, &md->wpc_readers_wait_list);
> + *qio = NULL;
> + ret = true;
> + }
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + return ret;
> +}
> +
> +static u32 calc_cow_mask(struct qcow2 *qcow2, u64 ext_l2,
> + struct qio *qio, bool wants_backing,
> + bool wants_data, bool wants_zeroes)
> +{
> + loff_t start = to_bytes(qio->bi_iter.bi_sector);
> + loff_t end = start + qio->bi_iter.bi_size;
> + u32 mask, subclus_mask, mapped_mask, cow_mask;
> + u8 start_bit, end_bit;
> +
> + subclus_mask = cow_mask = 0;
> + if (!qcow2->ext_l2)
> + goto out;
> +
> + if (fake_merge_qio(qio) || !op_is_write(qio->bi_op)) {
> + WARN_ON_ONCE(wants_backing);
> + goto continue_mask;
> + }
> +
> + WARN_ON_ONCE(start == end);
> + start_bit = qio_subclu_indexes(qcow2, qio, &end_bit);
> + mapped_mask = (u32)ext_l2|(ext_l2 >> 32);
> + subclus_mask = qio_subclus_mask(qcow2, qio);
> +
> + if (SUBCLU_OFF(qcow2, start)) {
> + if ((wants_backing && ((1 << start_bit) & ~mapped_mask)) ||
> + (wants_data && ((1 << start_bit) & (u32)ext_l2)) ||
> + (wants_zeroes && ((1 << start_bit) & (ext_l2 >> 32))))
> + cow_mask |= (1 << start_bit);
> + }
> + if (SUBCLU_OFF(qcow2, end)) {
> + if ((wants_backing && ((1 << end_bit) & ~mapped_mask)) ||
> + (wants_data && ((1 << end_bit) & (u32)ext_l2)) ||
> + (wants_zeroes && ((1 << end_bit) & (ext_l2 >> 32))))
> + cow_mask |= (1 << end_bit);
> + }
> +
> +continue_mask:
> + if (wants_data) {
> + /* Unchanged COW subclus */
> + mask = (u32)ext_l2 & ~subclus_mask;
> + cow_mask |= mask;
> + }
> + if (wants_zeroes) {
> + mask = (ext_l2 >> 32) & ~subclus_mask;
> + cow_mask |= mask;
> + }
> +out:
> + return cow_mask;
> +}
> +
> +#define CB_OR_RET(start, end, d_p, d2_p, d3_p) \
> + do { \
> + int __ret = cb(start, end, d_p, d2_p, d3_p); \
> + if (__ret) \
> + return __ret; \
> + } while (0)
> +
> +static int for_each_cow_interval_ext_l2(struct qio *qio, loff_t start, loff_t end,
> + int (*cb)(loff_t, loff_t, void *, void *, void *),
> + void *d_p, void *d2_p, void *d3_p)
> +{
> + loff_t from, to, i_from[2], i_to[2], pos;
> + struct qcow2 *qcow2 = qio->qcow2;
> + struct qio_ext *ext = qio->ext;
> + u32 subclu_size = qcow2->subclu_size;
> + u32 mask, cow_mask = ext->cow_mask;
> + u8 start_bit, i, j, nr, end_bit;
> +
> + i_from[0] = i_from[1] = OFFSET_MAX;
> +
> + if (fake_merge_qio(qio) || !op_is_write(qio->bi_op))
> + goto iterate;
> +
> + start_bit = qio_subclu_indexes(qcow2, qio, &end_bit);
> + /* Firstly, find two intervals near qio boundaries: */
> + if (SUBCLU_OFF(qcow2, start) && ((1 << start_bit) & cow_mask)) {
> + /* Left boundary */
> + pos = round_down(start, subclu_size);
> + if (start_bit != 0 && (cow_mask & (1 << (start_bit - 1)))) {
> + /* Left mapped neighbours */
> + mask = get_bits_range_up_to(cow_mask, start_bit - 1);
> + cow_mask &= ~mask;
> + pos -= hweight32(mask) * subclu_size;
> + }
> + i_from[0] = pos;
> + i_to[0] = start;
> + }
> + if (SUBCLU_OFF(qcow2, end) && ((1 << end_bit) & cow_mask)) {
> + /* Right boundary */
> + pos = round_up(end, subclu_size);
> + if (end_bit != 31 && (cow_mask & (1 << (end_bit + 1)))) {
> + /* Right mapped neighbours */
> + mask = get_bits_range_from(cow_mask, end_bit + 1);
> + cow_mask &= ~mask;
> + pos += hweight32(mask) * subclu_size;
> + }
> + i_from[1] = end;
> + i_to[1] = pos;
> + }
> + cow_mask &= ~((1 << start_bit) | (1 << end_bit));
> +
> +iterate:
> + /*
> + * Start ordered iteration over unchanged COW subclus
> + * and the two intervals found above:
> + */
> + if (cow_mask) {
> + pos = round_down(start, qcow2->clu_size);
> + for (i = 0;
> + (i = find_bits_range_from(cow_mask, i, &nr)) < 32;
> + i += nr) {
> + from = pos + (loff_t)i * subclu_size;
> + to = pos + (loff_t)(i + nr) * subclu_size;
> +
> + for (j = 0; j < 2; j++) {
> + if (i_from[j] >= from)
> + continue;
> + CB_OR_RET(i_from[j], i_to[j], d_p, d2_p, d3_p);
> + i_from[j] = OFFSET_MAX;
> + }
> + CB_OR_RET(from, to, d_p, d2_p, d3_p);
> + }
> + }
> + /* Iterate boundary intervals, if we haven't done that yet: */
> + for (j = 0; j < 2; j++) {
> + if (i_from[j] != OFFSET_MAX)
> + CB_OR_RET(i_from[j], i_to[j], d_p, d2_p, d3_p);
> + }
> +
> + return 0;
> +}
> +
> +/*
> + * This function calls @cb for each interval of a COW clu
> + * which is not rewritten by @qio. E.g., let bi_iter of a WRITE
> + * @qio refer to [off + clu_size / 4, off + clu_size / 3],
> + * where off is a multiple of clu_size, while ext_l2 is disabled.
> + * Then @cb will be called twice from inside the function:
> + * 1) @cb(off, off + clu_size / 4, ...)
> + * 2) @cb(off + clu_size / 3, off + clu_size, ...).
> + * The ext_l2 case also cares about allocated subclus.
> + *
> + * We use this to allocate a single bio_vec[] array with pages
> + * to accommodate, and to read/write to the new place, only the
> + * data of the COW clu which is not rewritten.
> + */
> +static int for_each_cow_interval(struct qio *qio,
> + int (*cb)(loff_t, loff_t, void *, void *, void *),
> + void *d_p, void *d2_p, void *d3_p)
> +{
> + loff_t start = to_bytes(qio->bi_iter.bi_sector);
> + loff_t end = start + qio->bi_iter.bi_size;
> + struct qcow2 *qcow2 = qio->qcow2;
> + u32 clu_size = qcow2->clu_size;
> +
> + if (!qcow2->ext_l2) {
> + if (fake_merge_qio(qio) || !op_is_write(qio->bi_op)) {
> + return cb(round_down(start, clu_size),
> + round_up(start + 1, clu_size),
> + d_p, d2_p, d3_p);
> + }
> +
> + if (CLU_OFF(qcow2, start) != 0)
> + CB_OR_RET(round_down(start, clu_size), start, d_p, d2_p, d3_p);
> + if (CLU_OFF(qcow2, end) != 0)
> + CB_OR_RET(end, round_up(end, clu_size), d_p, d2_p, d3_p);
> + return 0;
> + }
> +
> + return for_each_cow_interval_ext_l2(qio, start, end, cb, d_p, d2_p, d3_p);
> +}
> +#undef CB_OR_RET
> +
> +static int count_cow_pages(loff_t start, loff_t end, void *nr_pages_p,
> + void *nr_segs_p, void *unused)
> +{
> + u32 *nr_pages = nr_pages_p, *nr_segs = nr_segs_p;
> +
> + start = round_down(start, PAGE_SIZE);
> + end = round_up(end, PAGE_SIZE);
> +
> + *nr_pages += (end - start) / PAGE_SIZE;
> + *nr_segs += 1;
> + return 0;
> +}
> +
> +static struct qcow2_bvec *alloc_qvec_with_data(u32 nr_vecs, void **data, u32 data_sz)
> +{
> + struct qcow2_bvec *qvec = NULL;
> + unsigned int size;
> +
> + size = sizeof(struct qcow2_bvec) + nr_vecs * sizeof(struct bio_vec);
> + qvec = kzalloc(size + data_sz, GFP_NOIO);
> + if (qvec)
> + qvec->nr_pages = nr_vecs;
> + if (data)
> + *data = (void *)qvec + size;
> + return qvec;
> +}
> +
> +static void free_qvec_with_pages(struct qcow2_bvec *qvec)
> +{
> + if (qvec) {
> + while (qvec->nr_pages-- > 0)
> + put_page(qvec->bvec[qvec->nr_pages].bv_page);
> + kfree(qvec);
> + }
> +}
> +
> +static struct qcow2_bvec *alloc_qvec_with_pages(ushort nr_pages)
> +{
> + struct qcow2_bvec *qvec;
> + struct bio_vec *bvec;
> + int i;
> +
> + qvec = alloc_qvec_with_data(nr_pages, NULL, 0);
> + if (!qvec)
> + return NULL;
> +
> + bvec = qvec->bvec;
> + for (i = 0; i < nr_pages; i++) {
> + bvec[i].bv_page = alloc_page(GFP_NOIO);
> + if (!bvec[i].bv_page)
> + goto err;
> + bvec[i].bv_len = PAGE_SIZE;
> + bvec[i].bv_offset = 0;
> + }
> +
> + return qvec;
> +err:
> + qvec->nr_pages = i;
> + free_qvec_with_pages(qvec);
> + return NULL;
> +}
> +
> +static void free_wbd(struct wb_desc *wbd)
> +{
> + if (wbd) {
> + if (wbd->pe_page)
> + put_page(wbd->pe_page);
> + kfree(wbd->changed_indexes);
> + kfree(wbd);
> + }
> +}
> +
> +static struct wb_desc *alloc_wbd(bool needs_prealloced)
> +{
> + struct wb_desc *wbd;
> +
> + wbd = kzalloc(sizeof(*wbd), GFP_NOIO);
> + if (!wbd)
> + return NULL;
> + wbd->changed_indexes = kzalloc(LX_INDEXES_BYTES, GFP_NOIO);
> + if (!wbd->changed_indexes)
> + goto err;
> + if (needs_prealloced) {
> + wbd->pe_page = alloc_page(GFP_NOIO|__GFP_ZERO);
> + if (!wbd->pe_page)
> + goto err;
> + }
> +
> + INIT_LIST_HEAD(&wbd->submitted_list);
> + INIT_LIST_HEAD(&wbd->completed_list);
> + INIT_LIST_HEAD(&wbd->dependent_list);
> + return wbd;
> +err:
> + free_wbd(wbd);
> + return NULL;
> +}
> +
> +void slow_wb_timer_fn(struct timer_list *t)
> +{
> + struct qcow2 *qcow2 = from_timer(qcow2, t, slow_wb_timer);
> + unsigned long flags;
> + bool queue;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + queue = !list_empty(&qcow2->slow_wb_batch_list);
> + list_splice_init(&qcow2->slow_wb_batch_list, &qcow2->wb_batch_list);
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> +
> + if (queue)
> + queue_work(qcow2->tgt->wq, &qcow2->worker);
> +}
> +
> +static bool md_make_dirty(struct qcow2 *qcow2, struct md_page *md, bool is_refs)
> +{
> + struct list_head *head;
> + bool new = false;
> +
> + head = !is_refs ? &qcow2->wb_batch_list : &qcow2->slow_wb_batch_list;
> +
> + /* md->status must be visible for complete handlers */
> + lockdep_assert_held(&qcow2->md_pages_lock);
> +
> + if (!(md->status & MD_DIRTY)) {
> + md->status |= MD_DIRTY;
> + list_add_tail(&md->wb_link, head);
> + new = true;
> +
> + if (is_refs && !timer_pending(&qcow2->slow_wb_timer))
> + mod_timer(&qcow2->slow_wb_timer,
> + jiffies + WB_TIMEOUT_JI);
> + /* Sanity: 1)only L1L2 have wbd, 2)only R1R2 allow redirtying */
> + WARN_ON(md->wbd && ((md->status & MD_WRITEBACK) || is_refs));
> + }
> + return new;
> +}
> +
> +static u64 get_r2_entry(struct qcow2 *qcow2, struct md_page *md,
> + u32 r2_index_in_page)
> +{
> + u32 index, start, tail, bits = qcow2->refblock_bits;
> + u64 entry;
> +
> + /* index of u64 qword containing our refcounter */
> + index = r2_index_in_page * bits / 64;
> + entry = get_u64_from_page(md->page, index);
> +
> + if (bits == 64)
> + return be64_to_cpu(entry);
> + if (bits == 32)
> + return be32_to_cpu(((u32 *)&entry)[r2_index_in_page % 2]);
> + if (bits == 16)
> + return be16_to_cpu(((u16 *)&entry)[r2_index_in_page % 4]);
> + /*
> + * We want to swab the original BE u64 qword on both BE and LE.
> + * For LE it is already done because of get_u64_from_page().
> + * For BE it is done here.
> + */
> + entry = cpu_to_le64(entry);
> + /* Bit start in u64 qword */
> + start = r2_index_in_page * bits % 64;
> + /* Cut tail bits */
> + tail = 64 - bits - start;
> + entry = (entry << tail) >> tail;
> + /* Cut bits before start */
> + entry >>= start;
> + return entry;
> +}
> +
> +static void set_r2_entry(struct qcow2 *qcow2, struct md_page *md,
> + u32 r2_index_in_page, u64 val)
> +{
> + u32 index, start, bits = qcow2->refblock_bits;
> + u64 mask, entry;
> +
> + /* index of u64 qword containing our refcounter */
> + index = r2_index_in_page * bits / 64;
> +
> + if (bits == 64) {
> + entry = cpu_to_be64(val);
> + goto set;
> + }
> +
> + entry = get_u64_from_page(md->page, index);
> +
> + if (bits == 32) {
> + ((u32 *)&entry)[r2_index_in_page % 2] = cpu_to_be32(val);
> + goto set;
> + }
> + if (bits == 16) {
> + ((u16 *)&entry)[r2_index_in_page % 4] = cpu_to_be16(val);
> + goto set;
> + }
> +
> + /* Bit start in u64 qword */
> + start = r2_index_in_page * bits % 64;
> + /* 0b0000...11 mask */
> + mask = (~(u64)0) >> (64 - bits);
> + /* Move to position and swab on BE */
> + mask = cpu_to_le64(mask << start);
> + val = cpu_to_le64(val << start);
> + /* Clear old bits and set new bits */
> + entry &= ~mask;
> + entry |= val;
> +set:
> + /* Store to BE page: swab on LE */
> + set_u64_to_page(md->page, index, entry);
> +}
> +
> +static void calc_page_id_and_index(loff_t pos, u64 *page_id, u32 *index_in_page)
> +{
> + *page_id = pos >> PAGE_SHIFT;
> + *index_in_page = (pos & ~PAGE_MASK) / sizeof(u64);
> +}
> +
> +static int calc_cluster_map(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + loff_t start = to_bytes(qio->bi_iter.bi_sector);
> + loff_t end = start + qio->bi_iter.bi_size;
> + u32 clu_size = qcow2->clu_size;
> + loff_t pos;
> +
> + if (unlikely(start / clu_size != (end - 1) / clu_size &&
> + (start != end || (!fake_merge_qio(qio) &&
> + !fake_l1cow_qio(qio)))))
> + goto eio;
> + if (unlikely(end > qcow2->hdr.size))
> + goto eio;
> +
> + map->l2.index = (start / clu_size) % qcow2->l2_entries;
> + map->l1.index = (start / clu_size) / qcow2->l2_entries;
> +
> + if (qcow2->ext_l2) {
> + /*
> + * Unlike what is proposed in the qcow2 documentation,
> + * we measure the index in sizeof(u64) units.
> + */
> + map->l2.index *= 2;
> + }
> +
> + if (unlikely(map->l1.index >= qcow2->hdr.l1_size))
> + goto eio;
> +
> + pos = qcow2->hdr.l1_table_offset + map->l1.index * sizeof(u64);
> + calc_page_id_and_index(pos, &map->l1.page_id, &map->l1.index_in_page);
> + /* TODO: we can count l2.index_in_page. See calc_refcounters_map() */
> + return 0;
> +eio:
> + WARN_ONCE(1, "qio(%lld,%lld, 0x%x), map(%u, %u)\n", start, end,
> + qio->bi_op, map->l1.index, map->l2.index);
> + return -EIO;
> +}
> +
> +static int calc_refcounters_map(struct qcow2 *qcow2, loff_t pos,
> + struct qcow2_map_item *r1,
> + struct qcow2_map_item *r2)
> +{
> + u32 refblock_entries = qcow2->refblock_entries;
> + u32 clus = qcow2->hdr.refcount_table_clusters;
> + u32 bits = qcow2->refblock_bits;
> + u32 clu_size = qcow2->clu_size;
> +
> + r2->index = (pos / clu_size) % refblock_entries;
> + r1->index = (pos / clu_size) / refblock_entries;
> +
> + if (unlikely((u64)r1->index * sizeof(u64) >= (u64)clus * clu_size))
> + goto eio;
> +
> + pos = qcow2->hdr.refcount_table_offset + r1->index * sizeof(u64);
> + calc_page_id_and_index(pos, &r1->page_id, &r1->index_in_page);
> + /*
> + * Since a cluster is a multiple of PAGE_SIZE, we may compute index_in_page.
> + * Note, this may be a half/quarter of a byte (the same as r2->index).
> + */
> + r2->index_in_page = r2->index % (PAGE_SIZE * 8 / bits);
> + return 0;
> +eio:
> + WARN_ONCE(1, "ref(%u, %u)\n", r1->index, r2->index);
> + return -EIO;
> +}
> +
> +static int calc_r2_page_id(struct qcow2 *qcow2, struct qcow2_map_item *r1,
> + struct qcow2_map_item *r2)
> +{
> + u64 entry = get_u64_from_be_page(r1->md->page, r1->index_in_page);
> + u32 bits = qcow2->refblock_bits;
> + loff_t pos;
> +
> + if (WARN_ON_ONCE((entry & R1_RESERVED_ZERO_MASK) ||
> + (CLU_OFF(qcow2, entry) != 0)))
> + return -EIO;
> +
> + /* The corresponding refcount block has not yet been allocated */
> + if (!entry)
> + return -ENOENT;
> +
> + pos = entry + r2->index * bits / 8;
> + r2->page_id = pos >> PAGE_SHIFT;
> + return 0;
> +}
> +
> +/* Whether L1 or L2 md is under writeback and @index_in_page is being allocated */
> +static bool dirty_or_writeback(struct qcow2 *qcow2, struct md_page *md,
> + u32 index_in_page)
> +{
> + bool ret = false;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + if (md->wbd && (md->status & (MD_DIRTY|MD_WRITEBACK)))
> + ret = test_bit(index_in_page, md->wbd->changed_indexes);
> + return ret;
> +}
> +
> +static bool delay_if_dirty(struct qcow2 *qcow2, struct md_page *md,
> + u32 index_in_page, struct qio **qio)
> +{
> + bool ret = false;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + if (md->status & MD_DIRTY) {
> + ret = test_bit(index_in_page, md->wbd->changed_indexes);
> + if (ret) {
> + list_add_tail(&(*qio)->link, &md->wait_list);
> + *qio = NULL;
> + }
> + }
> + return ret;
> +}
> +
> +/*
> + * This is a helper for parse_metadata().
> + * While writeback is in progress, it's prohibited to:
> + * 1) write to indexes which are under writeback;
> + * 2) add completely new allocations;
> + * 3) reuse preallocations (they force an L2 entry update).
> + * I.e., we may only write to clusters whose indexes are
> + * already written in the image file.
> + */
> +static bool __delay_if_writeback(struct qcow2 *qcow2, struct md_page *md,
> + u32 index_in_page, struct qio **qio,
> + bool wants_allocation)
> +{
> + bool ret = false;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> +
> + if ((md->status & MD_WRITEBACK) &&
> + (wants_allocation ||
> + test_bit(index_in_page, md->wbd->changed_indexes))) {
> + list_add_tail(&(*qio)->link, &md->wait_list);
> + *qio = NULL;
> + ret = true;
> + }
> + return ret;
> +}
> +
> +static bool delay_if_writeback(struct qcow2 *qcow2, struct md_page *md,
> + u32 index_in_page, struct qio **qio,
> + bool wants_allocation)
> +{
> + bool ret;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + ret = __delay_if_writeback(qcow2, md, index_in_page,
> + qio, wants_allocation);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + return ret;
> +}
> +
> +static bool delay_if_wpc_readers_locked(struct qcow2 *qcow2, struct md_page *md,
> + struct qio **qio)
> +{
> + bool ret = false;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + if (md->wpc_noread_count) {
> + list_add_tail(&(*qio)->link, &md->wait_list);
> + *qio = NULL;
> + ret = true;
> + }
> +
> + return ret;
> +}
> +
> +static void md_index_set_locked(struct qcow2 *qcow2, struct md_page *md,
> + u32 index_in_page)
> +{
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + WARN_ON_ONCE(test_bit(index_in_page, md->lockd->indexes));
> + set_bit(index_in_page, md->lockd->indexes);
> + md->lockd->nr++;
> +}
> +
> +static bool delay_if_locked(struct qcow2 *qcow2, struct md_page *md,
> + u32 index_in_page, struct qio **qio)
> +{
> + bool ret = false;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + if (md->lockd && test_bit(index_in_page, md->lockd->indexes)) {
> + list_add_tail(&(*qio)->link, &md->wait_list);
> + *qio = NULL;
> + ret = true;
> + }
> +
> + return ret;
> +}
> +
> +
> +/*
> + * Note that we delay R1 and R2 page writeback. In case of a power failure,
> + * they can easily be restored from L1, L2 and other stable metadata.
> + */
> +static void mark_cluster_used(struct qcow2 *qcow2, struct md_page *r2_md,
> + u32 r2_index_in_page)
> +{
> + WARN_ON_ONCE(READ_ONCE(r2_md->status) & MD_WRITEBACK);
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + WARN_ON_ONCE(get_r2_entry(qcow2, r2_md, r2_index_in_page));
> + set_r2_entry(qcow2, r2_md, r2_index_in_page, 1);
> + WARN_ON_ONCE(get_r2_entry(qcow2, r2_md, r2_index_in_page) != 1);
> +
> + md_make_dirty(qcow2, r2_md, true);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +}
> +
> +static void mark_cluster_unused(struct qcow2 *qcow2, struct md_page *r2_md,
> + u32 r2_index_in_page, loff_t pos)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + WARN_ON_ONCE(get_r2_entry(qcow2, r2_md, r2_index_in_page) != 1);
> + set_r2_entry(qcow2, r2_md, r2_index_in_page, 0);
> + WARN_ON_ONCE(get_r2_entry(qcow2, r2_md, r2_index_in_page) != 0);
> +
> + md_make_dirty(qcow2, r2_md, true);
> + if (qcow2->free_cluster_search_pos > pos)
> + qcow2->free_cluster_search_pos = pos;
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> +}
> +
> +static void dec_cluster_usage(struct qcow2 *qcow2, struct md_page *r2_md,
> + u32 r2_index_in_page, loff_t pos)
> +{
> + unsigned long flags;
> + u64 val;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + val = get_r2_entry(qcow2, r2_md, r2_index_in_page);
> + WARN_ON_ONCE(val < 1);
> + val--;
> + set_r2_entry(qcow2, r2_md, r2_index_in_page, val);
> + WARN_ON_ONCE(get_r2_entry(qcow2, r2_md, r2_index_in_page) != val);
> +
> + md_make_dirty(qcow2, r2_md, true);
> + if (!val && qcow2->free_cluster_search_pos > pos)
> + qcow2->free_cluster_search_pos = pos;
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> +}
> +
> +
> +static void do_md_page_read_complete(int ret, struct qcow2 *qcow2,
> + struct md_page *md)
> +{
> + LIST_HEAD(wait_list);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + if (ret < 0)
> + md_page_erase(qcow2, md);
> + else
> + md->status |= MD_UPTODATE;
> +
> + list_splice_tail_init(&md->wait_list, &wait_list);
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> +
> + if (ret < 0) {
> + free_md_page(md);
> + end_qios(&wait_list, errno_to_blk_status(ret));
> + } else {
> + dispatch_qios(qcow2, NULL, &wait_list);
> + }
> +}
> +
> +/* Be careful with dirty_or_writeback()/etc! Check races. */
> +static void revert_clusters_alloc(struct qcow2 *qcow2, struct wb_desc *wbd)
> +{
> + struct qcow2_map_item r1, r2;
> + struct page *pe_page;
> + u64 pos, old;
> + int i, ret;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + for_each_set_bit(i, wbd->changed_indexes, LX_INDEXES_PER_PAGE) {
> + pos = get_u64_from_be_page(wbd->md->page, i);
> + WARN_ON_ONCE(!(pos & ~LX_REFCOUNT_EXACTLY_ONE) ||
> + !(pos & LX_REFCOUNT_EXACTLY_ONE));
> +
> + /* Here we restore prealloced and compressed clu mappings */
> + pe_page = wbd->pe_page;
> + if (pe_page) { /* Only L2 has this. */
> + old = get_u64_from_be_page(pe_page, i);
> + if (old != 0) {
> + set_u64_to_be_page(wbd->md->page, i, old);
> + continue; /* Avoid mark_cluster_unused() */
> + }
> + }
> +
> + set_u64_to_be_page(wbd->md->page, i, 0);
> + spin_unlock(&qcow2->md_pages_lock);
> + pos &= ~LX_REFCOUNT_EXACTLY_ONE;
> +
> + /*
> + * R1/R2 should be cached, since we were able
> + * to submit the cluster allocation.
> + */
> + ret = handle_r1r2_maps(qcow2, pos, NULL, &r1, &r2, false);
> + if (WARN_ON_ONCE(ret <= 0))
> + continue;
> +
> + mark_cluster_unused(qcow2, r2.md, r2.index_in_page, pos);
> + spin_lock(&qcow2->md_pages_lock);
> + }
> +}
> +
> +static void clear_writeback_status(struct qcow2 *qcow2, struct md_page *md,
> + int ret, struct list_head *wait_list,
> + struct list_head *end_list)
> +{
> + lockdep_assert_held(&qcow2->md_pages_lock);
> +
> + md->status &= ~(MD_WRITEBACK|MD_WRITEBACK_ERROR);
> + list_splice_init(&md->wait_list, wait_list);
> + if (ret && !md->wbd) {
> + /*
> + * L1L2 updates can do safe revert,
> + * so here we care about R1R2 only.
> + */
> + md->status |= MD_WRITEBACK_ERROR;
> + }
> + if (md->wbd) {
> + if (likely(ret == 0))
> + list_splice_init(&md->wbd->dependent_list, wait_list);
> + else
> + list_splice_init(&md->wbd->dependent_list, end_list);
> + md->wbd = NULL;
> + }
> +}
> +
> +static void complete_wbd(struct qcow2 *qcow2, struct wb_desc *wbd)
> +{
> + if (unlikely(wbd->ret < 0)) {
> + LIST_HEAD(wait_list);
> + LIST_HEAD(end_list);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + revert_clusters_alloc(qcow2, wbd);
> + clear_writeback_status(qcow2, wbd->md, wbd->ret,
> + &wait_list, &end_list);
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> +
> + dispatch_qios(qcow2, NULL, &wait_list);
> + end_qios(&end_list, errno_to_blk_status(wbd->ret));
> + }
> + free_wbd(wbd);
> +}
> +
> +static void do_md_page_write_complete(int ret, struct qcow2 *qcow2,
> + struct md_page *md)
> +{
> + struct wb_desc *wbd = NULL;
> + bool finalize_wbd = false;
> + LIST_HEAD(wait_list);
> + LIST_HEAD(end_list);
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + WARN_ON_ONCE(!(md->status & MD_WRITEBACK));
> + wbd = md->wbd;
> + if (wbd) {
> + wbd->completed = true;
> + wbd->ret = ret;
> + list_splice_init(&wbd->completed_list, &end_list);
> + /*
> + * In case this md writeback completed before the
> + * parallel data qios, wbd is finalized by the last
> + * completed data qio.
> + */
> + finalize_wbd = list_empty(&wbd->submitted_list);
> + /*
> + * We can finish wb only in case of success.
> + * Otherwise this is done in finalize_wbd()
> + * after data qios stop using wbd clusters
> + * and cluster allocations are reverted.
> + */
> + if (likely(ret == 0)) {
> + clear_writeback_status(qcow2, md, ret,
> + &wait_list, &end_list);
> + }
> + /* FIXME: we should reread md after write fail */
> + } else {
> + clear_writeback_status(qcow2, md, ret, &wait_list, &end_list);
> + }
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + end_qios(&end_list, errno_to_blk_status(ret));
> + dispatch_qios(qcow2, NULL, &wait_list);
> + if (finalize_wbd)
> + complete_wbd(qcow2, wbd);
> +}
> +
> +static void md_page_read_complete(struct qio *qio)
> +{
> + struct qcow2_bvec *qvec = qio->data;
> + struct md_page *md = qio->ext->md;
> + struct qcow2 *qcow2 = qio->qcow2;
> + int ret = qio->ret;
> + mode_t mode;
> +
> + BUG_ON(qvec->bvec[0].bv_page != md->page);
> +
> + if (unlikely(ret != PAGE_SIZE && ret > 0)) {
> + /* Read near EOF? See qcow2_attach_file() */
> + loff_t pos = (md->id << PAGE_SHIFT) + ret;
> +
> + mode = qcow2->file->f_mode;
> + if (pos == qcow2->file_size && !(mode & FMODE_WRITE)) {
> + zero_fill_page_from(md->page, ret);
> + ret = PAGE_SIZE;
> + }
> + }
> + if (unlikely(ret != PAGE_SIZE))
> + ret = -EIO;
> + else
> + ret = 0;
> +
> + do_md_page_read_complete(ret, qcow2, md);
> + dec_inflight_md(qcow2, qio);
> + kfree(qvec); /* qio and ext are tail bytes after qvec */
> +}
> +
> +static void md_page_write_complete(struct qio *qio)
> +{
> + struct qcow2 *qcow2 = qio->qcow2;
> + unsigned long flags;
> +
> + qio->queue_list_id = QLIST_COMPLETED_WB;
> + spin_lock_irqsave(&qcow2->deferred_lock, flags);
> + list_add_tail(&qio->link, &qcow2->qios[QLIST_COMPLETED_WB]);
> + spin_unlock_irqrestore(&qcow2->deferred_lock, flags);
> + queue_work(qcow2->tgt->wq, &qcow2->fsync_worker);
> +}
> +
> +static void submit_rw_md_page(unsigned int rw, struct qcow2 *qcow2,
> + struct md_page *md)
> +{
> + loff_t pos = md->id << PAGE_SHIFT;
> + struct qcow2_bvec *qvec = NULL;
> + struct bio_vec *bvec;
> + struct iov_iter iter;
> + int size, err = 0;
> + struct qio *qio;
> +
> + if (pos > qcow2->file_size) {
> + pr_err_once("qcow2: rw=%x pos=%lld behind EOF %lld\n",
> + rw, pos, qcow2->file_size);
> + err = -EIO;
> + } else {
> + /*
> + * Note, this is a fake qio, and qio_endio()
> + * can't be called on it!
> + */
> + size = sizeof(struct qio) + sizeof(struct qio_ext);
> + qvec = alloc_qvec_with_data(1, (void *)&qio, size);
> + if (!qvec)
> + err = -ENOMEM;
> + }
> + if (err) {
> + if (rw == READ)
> + do_md_page_read_complete(err, qcow2, md);
> + else
> + do_md_page_write_complete(err, qcow2, md);
> + return;
> + }
> +
> + init_qio(qio, rw == READ ? REQ_OP_READ : REQ_OP_WRITE, qcow2);
> + qio->ext = (void *)qio + sizeof(*qio);
> + qio->ext->md = md;
> + qio->data = qvec;
> + if (rw == READ)
> + qio->complete = md_page_read_complete;
> + else
> + qio->complete = md_page_write_complete;
> +
> + bvec = &qvec->bvec[0];
> + bvec->bv_page = md->page;
> + bvec->bv_len = PAGE_SIZE;
> + bvec->bv_offset = 0;
> +
> + iov_iter_bvec(&iter, rw, bvec, 1, PAGE_SIZE);
> +
> + inc_inflight_md(qcow2, qio);
> + call_rw_iter(qcow2->file, pos, rw, &iter, qio);
> +}
> +
> +static int submit_read_md_page(struct qcow2 *qcow2, struct qio **qio,
> + u64 page_id)
> +{
> + struct md_page *md;
> + int ret;
> +
> + ret = alloc_and_insert_md_page(qcow2, page_id, &md);
> + if (ret < 0) {
> + pr_err("Can't alloc: ret=%d, page_id=%llu\n", ret, page_id);
> + return -EIO;
> + }
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + list_add_tail(&(*qio)->link, &md->wait_list);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + *qio = NULL;
> +
> + submit_rw_md_page(READ, qcow2, md);
> + return 0;
> +}
> +
> +/*
> + * This may be called with @qio == NULL, in case we are
> + * only interested in searching for md already cached in memory.
> + */
> +static int handle_md_page(struct qcow2 *qcow2, u64 page_id,
> + struct qio **qio, struct md_page **ret_md)
> +{
> + struct md_page *md;
> +
> + md = md_page_find_or_postpone(qcow2, page_id, qio);
> + if (!md) {
> + if (qio && *qio)
> + return submit_read_md_page(qcow2, qio, page_id);
> + return 0;
> + }
> +
> + *ret_md = md;
> + return 1;
> +}
> +
> +static u32 qio_subclus_covered_start_size(struct qcow2 *qcow2,
> + struct qio *qio,
> + u32 subclus_mask)
> +{
> + u8 start_bit, end_bit, bit;
> +
> + start_bit = qio_subclu_indexes(qcow2, qio, &end_bit);
> +
> + bit = next_zero_bit(subclus_mask, start_bit);
> + if (bit == start_bit)
> + return 0;
> + if (bit > end_bit)
> + return qio->bi_iter.bi_size;
> + return bit * qcow2->subclu_size - bytes_off_in_cluster(qcow2, qio);
> +}
> +
> +static u32 qio_unmapped_size(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + u32 mapped_mask = (map->ext_l2 >> 32) | (u32)map->ext_l2;
> +
> + if (!qcow2->ext_l2) {
> + if (!map->data_clu_alloced && !map->all_zeroes)
> + return qio->bi_iter.bi_size;
> + return 0;
> + }
> +
> + return qio_subclus_covered_start_size(qcow2, qio, ~mapped_mask);
> +}
> +
> +static u32 qio_mapped_not_zeroes_size(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + if (!qcow2->ext_l2) {
> + if (map->data_clu_alloced && !map->all_zeroes)
> + return qio->bi_iter.bi_size;
> + return 0;
> + }
> +
> + return qio_subclus_covered_start_size(qcow2, qio, (u32)map->ext_l2);
> +}
> +
> +static u32 qio_all_zeroes_size(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + if (!qcow2->ext_l2) {
> + if (map->all_zeroes)
> + return qio->bi_iter.bi_size;
> + return 0;
> + }
> +
> + return qio_subclus_covered_start_size(qcow2, qio, map->ext_l2 >> 32);
> +}
> +
> +static bool qio_border_is_inside_unmapped_unit(struct qcow2 *qcow2,
> + struct qio *qio,
> + struct qcow2_map *map)
> +{
> + u64 start_off, end_off;
> + u8 start_bit, end_bit;
> + u32 mapped_mask;
> + bool ret;
> +
> + if (WARN_ON_ONCE(!(map->level & L2_LEVEL)))
> + return false;
> +
> + if (qio->bi_iter.bi_size == qcow2->clu_size)
> + return false;
> +
> + if (!qcow2->ext_l2)
> + return !map->data_clu_alloced && !map->all_zeroes;
> +
> + start_bit = qio_subclu_indexes(qcow2, qio, &end_bit);
> + start_off = bytes_off_in_cluster(qcow2, qio);
> + end_off = start_off + qio->bi_iter.bi_size;
> + mapped_mask = (u32)map->ext_l2 | (map->ext_l2 >> 32);
> +
> + ret = SUBCLU_OFF(qcow2, start_off) != 0 && ((1 << start_bit) & ~mapped_mask);
> + ret |= SUBCLU_OFF(qcow2, end_off) != 0 && ((1 << end_bit) & ~mapped_mask);
> + return ret;
> +}
> +
> +static bool qio_is_fully_alloced(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + u32 subclus_mask, alloced_mask;
> +
> + if (!(map->level & L2_LEVEL))
> + return false;
> +
> + if (!qcow2->ext_l2)
> + return map->data_clu_alloced && !map->all_zeroes;
> +
> + subclus_mask = qio_subclus_mask(qcow2, qio);
> + alloced_mask = (u32)map->ext_l2;
> +
> + return !(subclus_mask & ~alloced_mask);
> +}
> +
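> +/*
> + * Returns the file offset of the L2 table cluster referenced by the L1
> + * entry; 0 if it is not allocated yet, @qio was delayed, or this is a
> + * write to a cluster requiring COW; negative on error.
> + */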
> +static loff_t parse_l1(struct qcow2 *qcow2, struct qcow2_map *map,
> + struct qio **qio, bool write)
> +{
> + struct qcow2_map_item *l1 = &map->l1;
> + bool wants_alloc, exactly_one;
> + u64 pos, entry;
> + loff_t ret;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + entry = get_u64_from_be_page(l1->md->page, l1->index_in_page);
> + exactly_one = entry & LX_REFCOUNT_EXACTLY_ONE;
> + pos = entry & ~LX_REFCOUNT_EXACTLY_ONE;
> +
> + ret = -EIO;
> + if (WARN_ON_ONCE(entry & L1_RESERVED_ZERO_MASK))
> + goto out;
> + if (WARN_ON_ONCE(CLU_OFF(qcow2, pos) != 0))
> + goto out;
> + if (WARN_ON_ONCE(pos && !qcow2->hdr.nb_snapshots && !exactly_one))
> + goto out;
> +
> + if (pos && !exactly_one) {
> + map->clu_is_cow = true;
> + map->cow_clu_pos = pos;
> + map->cow_clu_end = pos + qcow2->clu_size;
> + }
> +
> + ret = 0;
> + if (delay_if_locked(qcow2, l1->md, l1->index_in_page, qio))
> + goto out;
> + wants_alloc = write && (pos == 0 || map->clu_is_cow);
> + if (__delay_if_writeback(qcow2, l1->md, l1->index_in_page, qio, wants_alloc))
> + goto out;
> + if (delay_if_dirty(qcow2, l1->md, l1->index_in_page, qio))
> + goto out;
> + if (write && map->clu_is_cow)
> + goto out; /* Avoid returning pos */
> +
> + ret = pos;
> +out:
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + return ret;
> +}
> +
> +static int parse_compressed_l2(struct qcow2 *qcow2, struct qcow2_map *map,
> + struct qio **qio, bool write, u64 entry)
> +{
> + u8 offset_bits = 62 - (qcow2->hdr.cluster_bits - 8);
> + struct qcow2_map_item *l2 = &map->l2;
> + u64 pos, end;
> +
> + /* Even for write: the compressed clu is read first */
> + if (delay_if_wpc_readers_locked(qcow2, l2->md, qio))
> + return 0;
> + if (WARN_ON_ONCE(dirty_or_writeback(qcow2, l2->md, l2->index_in_page)))
> + return -EIO;
> +
> + map->compressed = true;
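> + /*
> + * Split the compressed cluster descriptor: the low @offset_bits bits
> + * hold the host file offset of the compressed data, the remaining
> + * bits hold its sector count (see compressed_clu_end_pos()).
> + */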
> + pos = entry << (64 - offset_bits) >> (64 - offset_bits);
> + map->compressed_sectors = entry >> offset_bits;
> + end = compressed_clu_end_pos(pos, map->compressed_sectors);
> +
> + map->clu_is_cow = true;
> + /* @pos may point to middle of cluster, so this may take 2 clusters */
> + map->cow_clu_pos = round_down(pos, qcow2->clu_size);
> + map->cow_clu_end = round_up(end, qcow2->clu_size);
> + map->ext_l2 = ~(u32)0;
> +
> + if (WARN_ON_ONCE((pos >> 56) != 0 || !entry ||
> + /* This would be very strange compression */
> + end - pos > qcow2->clu_size))
> + return -EIO;
> + map->data_clu_alloced = true;
> + return pos;
> +}
> +
> +static loff_t parse_l2(struct qcow2 *qcow2, struct qcow2_map *map,
> + struct qio **qio, bool write)
> +{
> + bool wants_alloc, exactly_one, all_zeroes;
> + struct qcow2_map_item *l2 = &map->l2;
> + u64 entry, pos, ext_l2;
> + loff_t ret;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + entry = get_u64_from_be_page(l2->md->page, l2->index_in_page);
> + exactly_one = entry & LX_REFCOUNT_EXACTLY_ONE;
> + entry &= ~LX_REFCOUNT_EXACTLY_ONE;
> +
> + /*
> + * COW -- note that the original cluster here may even be compressed.
> + * READ: cluster data may disappear and become reused after a failed
> + * compressed COW.
> + * WRITE: currently we don't handle sending of accompanying qios.
> + */
> + ret = 0;
> + if (delay_if_locked(qcow2, l2->md, l2->index_in_page, qio))
> + goto out;
> +
> + ret = -EIO;
> + if (entry & L2_COMPRESSED_CLUSTER) {
> + entry &= ~L2_COMPRESSED_CLUSTER;
> + if (WARN_ON_ONCE(exactly_one || !entry))
> + goto out;
> + ret = parse_compressed_l2(qcow2, map, qio, write, entry);
> + goto out;
> + }
> +
> + all_zeroes = map->all_zeroes = entry & L2_READS_ALL_ZEROES;
> + entry &= ~L2_READS_ALL_ZEROES;
> + pos = entry;
> +
> + if (pos && !exactly_one) {
> + map->clu_is_cow = true;
> + map->cow_clu_pos = pos;
> + map->cow_clu_end = pos + qcow2->clu_size;
> + }
> +
> + if (WARN_ON_ONCE(entry & L2_RESERVED_ZERO_MASK))
> + goto out;
> + if (WARN_ON_ONCE(pos && !qcow2->hdr.nb_snapshots && !exactly_one))
> + goto out;
> + if (WARN_ON_ONCE(CLU_OFF(qcow2, entry) != 0))
> + goto out;
> +
> + if (!qcow2->ext_l2) {
> + if (all_zeroes && pos)
> + map->prealloced = true;
> +
> + if (WARN_ON_ONCE(map->prealloced && !exactly_one))
> + goto out;
> + if (WARN_ON_ONCE(map->clu_is_cow && all_zeroes))
> + goto out;
> +
> + ret = 0;
> +
> + wants_alloc = (pos == 0 || map->prealloced || map->clu_is_cow);
> + if (write && __delay_if_writeback(qcow2, l2->md, l2->index_in_page,
> + qio, wants_alloc))
> + goto out;
> + /*
> + * When a cluster is under allocation, READ should see zeroes.
> + * On writeback we could delay READ the way it is done for WRITE,
> + * but fast zeroing may be a useful optimization on big
> + * clusters (say, 1Mb).
> + * In case md is dirty, WRITE is not delayed. It becomes
> + * linked to md->wbd in perform_rw_mapped(), and it runs
> + * in parallel with md writeback (accompanying qio).
> + */
> + if (!write && dirty_or_writeback(qcow2, l2->md, l2->index_in_page)) {
> + perform_zero_read(*qio, (*qio)->bi_iter.bi_size);
> + goto out;
> + }
> + } else {
> + ext_l2 = get_u64_from_be_page(l2->md->page,
> + l2->index_in_page + 1);
> + map->ext_l2 = ext_l2;
> + map->subclus_mask = 0;
> + if (!fake_merge_qio(*qio) && !fake_l1cow_qio(*qio))
> + map->subclus_mask = qio_subclus_mask(qcow2, *qio);
> +
> + if (WARN_ON_ONCE(all_zeroes || (ext_l2 & (ext_l2 >> 32))))
> + goto out;
> +
> + /*
> + * Note that if "l2->index_in_page" is changed, then
> + * "l2->index_in_page + 1" is changed too, so here we
> + * check only the latter.
> + */
> + ret = 0;
> + if (!write &&
> + delay_if_dirty(qcow2, l2->md, l2->index_in_page + 1, qio))
> + goto out;
> + if (__delay_if_writeback(qcow2, l2->md, l2->index_in_page + 1,
> + qio, true))
> + goto out;
> + }
> +
> + if (pos)
> + map->data_clu_alloced = true;
> +
> + /* See comment in submit_read_whole_cow_clu() */
> + if (!write && pos && !all_zeroes && !exactly_one &&
> + delay_if_wpc_readers_locked(qcow2, l2->md, qio))
> + goto out;
> +
> + ret = pos;
> +out:
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + return ret;
> +}
> +
> +/*
> + * This may be called with @qio == NULL, in case we are sure
> + * that R1/R2 are already cached and up to date.
> + */
> +static int __handle_r1r2_maps(struct qcow2 *qcow2, loff_t pos, struct qio **qio,
> + struct qcow2_map_item *r1, struct qcow2_map_item *r2)
> +{
> + int ret;
> +
> + if (calc_refcounters_map(qcow2, pos, r1, r2) < 0)
> + return -EIO;
> +
> + /* Check R1 table */
> + ret = handle_md_page(qcow2, r1->page_id, qio, &r1->md);
> + if (ret <= 0)
> + return ret;
> +
> + ret = calc_r2_page_id(qcow2, r1, r2);
> + if (ret < 0)
> + return ret;
> +
> + ret = handle_md_page(qcow2, r2->page_id, qio, &r2->md);
> + if (ret <= 0)
> + return ret;
> +
> + /*
> + * XXX: we don't care whether R1 or R2 may be under writeback,
> + * since the most recent version of them is cached in memory.
> + */
> + return 1;
> +}
> +
> +/*
> + * This is aimed to be called to resolve the R1 and R2 md pages
> + * related to an already allocated cluster at @pos.
> + * Return value: 1 if the pages are found and cached; 0 in case
> + * a md page read was submitted; negative in case of error.
> + * The difference from raw __handle_r1r2_maps() is the sanity
> + * checks that the R2 cluster exists and the refblock entry is sane.
> + * The sanity check is disabled on clusters containing compressed
> + * clusters (their refcount is equal to the number of compressed users).
> + */
> +static int handle_r1r2_maps(struct qcow2 *qcow2, loff_t pos, struct qio **qio,
> + struct qcow2_map_item *r1, struct qcow2_map_item *r2, bool compressed)
> +{
> + u64 entry;
> + int ret;
> +
> + ret = __handle_r1r2_maps(qcow2, pos, qio, r1, r2);
> + /* Cluster mapped, but refcount table doesn't know? */
> + WARN_ON_ONCE(ret == -ENOENT);
> +
> + if (ret == 1 && !qcow2->hdr.nb_snapshots && !compressed) {
> + entry = get_r2_entry(qcow2, r2->md, r2->index_in_page);
> + /* Sanity check */
> + if (unlikely(entry > 1)) {
> + pr_err("refblock=%llu, while no snapshots\n", entry);
> + return -EIO;
> + }
> + }
> +
> + return ret;
> +}
> +
> +/*
> + * This caches the pages of md levels allocated on disk, which are
> + * required for submission of @qio, and checks that they are stable.
> + * The result of parsing the L1/L2 entries is stored in @map.
> + * Returns: negative in case of error, or 0 on success.
> + * Special case: returning 0 with a zeroed @qio means @qio was deferred
> + * till some event: reading of a md page, end of writeback, etc.
> + */
> +static int parse_metadata(struct qcow2 *qcow2, struct qio **qio,
> + struct qcow2_map *map, bool write)
> +{
> + struct md_page *md;
> + u64 pos;
> + int ret;
> +
> + WARN_ON_ONCE(map->data_clu_pos != 0);
> + if (calc_cluster_map(qcow2, *qio, map) < 0)
> + return -EIO;
> +
> + /* Check L1 page */
> + ret = handle_md_page(qcow2, map->l1.page_id, qio, &md);
> + if (ret <= 0)
> + return ret;
> + map->l1.md = md;
> + map->level = L1_LEVEL;
> +
> + /* Find L2 cluster (from L1 page) */
> + pos = parse_l1(qcow2, map, qio, write);
> + if (pos <= 0) /* Err, delayed, L2 is not allocated, or zero read */
> + return pos;
> +
> + /* pos is start of cluster */
> + pos += map->l2.index * sizeof(u64);
> + calc_page_id_and_index(pos, &map->l2.page_id, &map->l2.index_in_page);
> +
> + /* Check L2 page */
> + ret = handle_md_page(qcow2, map->l2.page_id, qio, &md);
> + if (ret <= 0)
> + return ret;
> + map->l2.md = md;
> + map->level |= L2_LEVEL;
> +
> + /* Find DATA cluster (from L2 page) */
> + pos = parse_l2(qcow2, map, qio, write);
> + if (pos <= 0) /* Err, delayed, DATA is not allocated, or zero read */
> + return pos;
> +
> + map->data_clu_pos = pos;
> + if (!write)
> + return 0;
> +
> + /* Now refcounters table/block */
> + if (!qcow2->hdr.nb_snapshots && !map->compressed)
> + return 0;
> + ret = handle_r1r2_maps(qcow2, pos, qio, &map->r1,
> + &map->r2, map->compressed);
> + return ret < 0 ? ret : 0;
> +}
> +
> +/*
> + * This occupies cluster at @r2_pos for R2 cluster,
> + * and connects it to R1 table entry.
> + */
> +static int place_r2(struct qcow2 *qcow2, struct qcow2_map_item *r1,
> + struct qcow2_map_item *r2, loff_t r2_pos, struct qio **qio)
> +{
> + u64 page_id = r2_pos >> PAGE_SHIFT;
> + int ret;
> +
> + if (delay_if_writeback(qcow2, r1->md, r1->index_in_page, qio, true))
> + return 0;
> +
> + ret = punch_hole(qcow2->file, r2_pos, qcow2->clu_size);
> + if (ret) {
> + pr_err("qcow2: punch hole: %d\n", ret);
> + return ret;
> + }
> +
> + ret = alloc_and_insert_md_page(qcow2, page_id, &r2->md);
> + if (ret < 0) {
> + pr_err("Can't alloc: ret=%d, page_id=%llu\n", ret, page_id);
> + return -EIO;
> + }
> +
> + zero_fill_page_from(r2->md->page, 0);
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + set_u64_to_be_page(r1->md->page, r1->index_in_page, r2_pos);
> + md_make_dirty(qcow2, r1->md, true);
> + r2->md->status |= MD_UPTODATE;
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + mark_cluster_used(qcow2, r2->md, r2->index_in_page);
> + return 1;
> +}
> +
> +static s32 find_unused_block_entry(struct qcow2 *qcow2, struct md_page *md,
> + u32 from)
> +{
> + u32 indexes_per_page = PAGE_SIZE * 8 / qcow2->refblock_bits;
> + long i, ret = -ENOENT;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
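> + /* Scan the refblock page, starting at @from, for an entry with refcount 0 */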
> + for (i = from; i < indexes_per_page; i++) {
> + if (get_r2_entry(qcow2, md, i) == 0) {
> + ret = i;
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
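> +/*
> + * Returns the file position of an unused cluster and advances @r1/@r2
> + * to point at its refblock entry; 0 if @qio was delayed; -ENOENT if
> + * the current reftable can't cover more file space; other negatives
> + * on error.
> + */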
> +static loff_t find_unused_cluster(struct qcow2 *qcow2, struct qio **qio,
> + struct qcow2_map_item *r1,
> + struct qcow2_map_item *r2)
> +{
> + u32 clu_size = qcow2->clu_size;
> + s32 index, ret;
> + loff_t pos;
> +again:
> + pos = READ_ONCE(qcow2->free_cluster_search_pos);
> + if (pos >= qcow2->reftable_max_file_size)
> + return -ENOENT;
> +
> + ret = __handle_r1r2_maps(qcow2, pos, qio, r1, r2);
> + if (ret <= 0) {
> + if (ret != -ENOENT)
> + return ret;
> + /*
> + * Since pos is not covered by R2, the whole cluster
> + * must be unused. Use it to store the R2 cluster.
> + * Both indexes must be 0 here, because we allocate
> + * clusters from small to big.
> + */
> + WARN_ON_ONCE(r2->index_in_page != 0 || r2->index != 0);
> + ret = place_r2(qcow2, r1, r2, pos, qio);
> + if (ret <= 0)
> + return ret;
> + goto again;
> + }
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + /*
> + * This is usually rare, and very rare during intensive write,
> + * since R1 and R2 writeback are delayed: we use up all the
> + * blocks of the page faster than writeback starts.
> + */
> + if (__delay_if_writeback(qcow2, r2->md, r2->index_in_page, qio, true)) {
> + pos = 0;
> + goto unlock;
> + }
> +
> + if (unlikely(pos != qcow2->free_cluster_search_pos)) {
> + /* Parallel mark_cluster_unused() changed it */
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + goto again;
> + }
> +
> + index = find_unused_block_entry(qcow2, r2->md, r2->index_in_page);
> + if (index < 0) {
> + /* No unused entries in this page */
> + pos = round_up(pos + 1, qcow2->r2_page_covered_file_size);
> + qcow2->free_cluster_search_pos = pos;
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + goto again;
> + }
> +
> + /* Advance pos and R2 indexes to point to the block entry */
> + pos += (u64)(index - r2->index_in_page) * clu_size;
> + r2->index += index - r2->index_in_page;
> + r2->index_in_page = index;
> +
> + /* In case the caller fails, we have this value cached */
> + qcow2->free_cluster_search_pos = pos;
> +unlock:
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + return pos;
> +}
> +
> +int qcow2_truncate_safe(struct file *file, loff_t new_len)
> +{
> + int ret;
> +
> + ret = vfs_truncate(&file->f_path, new_len);
> + if (ret)
> + return ret;
> +
> + return vfs_fsync(file, 0);
> +}
> +
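> +/*
> + * Grow the file to at least @len bytes. While below
> + * reftable_max_file_size, preallocate up to the next PREALLOC_SIZE
> + * boundary; everything beyond @len is accounted as preallocated.
> + */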
> +static int truncate_prealloc_safe(struct qcow2 *qcow2, loff_t len, const char *func)
> +{
> + loff_t prealloc_len, max_prealloc_len = qcow2->reftable_max_file_size;
> + struct file *file = qcow2->file;
> + loff_t new_len = len;
> + int ret;
> +
> + if (new_len <= qcow2->file_size)
> + return 0;
> + if (new_len < qcow2->reftable_max_file_size) {
> + prealloc_len = ALIGN(new_len, PREALLOC_SIZE);
> + new_len = min_t(loff_t, prealloc_len, max_prealloc_len);
> + }
> +
> + ret = qcow2_truncate_safe(file, new_len);
> + if (ret) {
> + pr_err("qcow2: %s->truncate: %d\n", func, ret);
> + return ret;
> + }
> +
> + qcow2->file_size = new_len;
> + qcow2->file_preallocated_area_start = len;
> + return 0;
> +}
> +
> +static int punch_hole(struct file *file, loff_t pos, loff_t len)
> +{
> + return vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
> + pos, len);
> +}
> +
> +static void set_reftable_in_raw_hdr(struct page *page0, loff_t pos, loff_t clus)
> +{
> + struct QCowHeader *raw_hdr;
> +
> + raw_hdr = kmap(page0);
> + raw_hdr->refcount_table_offset = cpu_to_be64(pos);
> + raw_hdr->refcount_table_clusters = cpu_to_be32(clus);
> + kunmap(page0);
> +}
> +
> +/*
> + * After all file space covered by the current reftable (R1) has become
> + * used, this relocates the reftable (R1) to a new place and extends its
> + * size. The situation is rather rare, and since the function is already
> + * complicated, we should not demonstrate excessive creativity
> + * and optimize it at the expense of readability.
> + *
> + * We act so as to provide safe rollback throughout the whole function.
> + * Firstly, we cache every related md we're going to use during relocation.
> + * The new reftable (R1) is placed next to the max cluster covered by old R1.
> + * Since the new reftable (R1) clusters must be marked as used after
> + * relocation, we also allocate md for new refblocks (R2) (they must
> + * cover both the new R1 and the new R2 -- themselves). Note that the new
> + * R1 and R2 clusters are in the part of the file not covered by old R1.
> + *
> + * Then we try to write the new hdr on disk. In case of failure, we
> + * restore the old hdr in memory and do a safe rollback. In case of success,
> + * there should be no more reasons to fail (only this driver's bugs).
> + * We renumber the cached old reftable (R1) pages to point to the new
> + * reftable (R1) place. Then we mark the old reftable (R1) clusters as
> + * unused, and the new reftable (R1) clusters as used.
> + *
> + * Later, the updated R1 and R2 pages will be written to disk on
> + * writeback like during any other R1/R2 update. Even in case of a
> + * power down, when refcounts become lost, a check utility on the next
> + * mount can easily restore them from L1 and L2, which are stable.
> + */
> +static int relocate_refcount_table(struct qcow2 *qcow2, struct qio **qio)
> +{
> + loff_t i, old_pos, old_end, pos, end, r2_end, delta;
> + u32 old_clus, clus, clu_size = qcow2->clu_size;
> + u32 r2_clus, bits = qcow2->refblock_bits;
> + unsigned long nr_pages, index;
> + struct qcow2_map_item r1, r2;
> + struct md_page *md0, *md;
> + int ret;
> +
> + /* FIXME: check there are no in-flight operations */
> + old_clus = qcow2->hdr.refcount_table_clusters;
> + clus = min_t(u32, old_clus + 1, REFCOUNT_TABLE_MAX_SIZE / clu_size);
> + if (clus <= old_clus) {
> + pr_debug_ratelimited("qcow2: maximal refcount table size\n");
> + return -ENFILE;
> + }
> +
> + /* Boundaries of old reftable (R1) */
> + old_pos = qcow2->hdr.refcount_table_offset;
> + old_end = old_pos + (u64)old_clus * clu_size;
> + nr_pages = (old_end - old_pos) / PAGE_SIZE;
> +
> + /* Cache old reftable (R1) pages and image header */
> + index = old_pos / PAGE_SIZE;
> + for (i = 0; i <= nr_pages; i++, index++) {
> + if (i == nr_pages)
> + index = 0; /* hdr */
> + ret = handle_md_page(qcow2, index, qio, &md);
> + if (ret <= 0)
> + return ret;
> + /*
> + * Writeback mustn't require cluster allocation,
> + * otherwise it may result in deadlock here.
> + */
> + if (delay_if_writeback(qcow2, md, -1, qio, true))
> + return 0;
> + }
> + md0 = md;
> +
> + /* Cache R1/R2 pages covering clusters of old reftable (R1) */
> + for (i = old_pos; i < old_end; i += PAGE_SIZE) {
> + ret = handle_r1r2_maps(qcow2, i, qio, &r1, &r2, false);
> + if (ret <= 0)
> + return ret;
> + if (delay_if_writeback(qcow2, r1.md, -1, qio, true))
> + return 0;
> + }
> +
> + /*
> + * We need R2 clusters to mark as used both the new reftable (R1) clusters
> + * and these refblock (R2) clusters themselves. This number comes from:
> + * r2_clus >= (clus + r2_clus) * bits / (8 * clu_size)
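> + * which, solved for r2_clus, gives
> + * r2_clus >= clus * bits / (8 * clu_size - bits), rounded up.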
> + */
> + r2_clus = DIV_ROUND_UP((u64)clus * bits, 8 * clu_size - bits);
> +
> + /* Choose position next to max cluster covered by old R1/R2 */
> + pos = qcow2->reftable_max_file_size;
> + end = pos + (u64)clus * clu_size;
> + r2_end = end + (u64)r2_clus * clu_size;
> + ret = truncate_prealloc_safe(qcow2, r2_end, __func__);
> + if (ret)
> + return ret;
> +
> + /* Alloc R1/R2 pages covering clusters of new R1 and new R2 */
> + for (i = pos + (u64)old_clus * clu_size; i < r2_end; i += PAGE_SIZE) {
> + ret = alloc_and_insert_md_page(qcow2, i >> PAGE_SHIFT, &md);
> + if (ret < 0)
> + goto err_free_r2_pages;
> + spin_lock_irq(&qcow2->md_pages_lock);
> + zero_fill_page_from(md->page, 0);
> + md->status |= MD_UPTODATE;
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + }
> +
> + set_reftable_in_raw_hdr(md0->page, pos, clus);
> + /* Write new hdr: last potential failing operation */
> + ret = rw_page_sync(WRITE, qcow2, 0, md0->page);
> + if (ret) {
> + /* Restore old R1 */
> + set_reftable_in_raw_hdr(md0->page, old_pos, old_clus);
> + goto err_free_r2_pages;
> + }
> +
> + /* Update cached values */
> + qcow2->hdr.refcount_table_offset = pos;
> + qcow2->hdr.refcount_table_clusters = clus;
> + calc_cached_parameters(qcow2, &qcow2->hdr);
> +
> + /* Now renumber cached R1 pages to point to the new place and mark them dirty */
> + index = old_pos / PAGE_SIZE;
> + delta = (pos - old_pos) / PAGE_SIZE;
> + for (i = 0; i < nr_pages; i++, index++) {
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md = md_page_renumber(qcow2, index, index + delta);
> + if (!WARN_ON_ONCE(!md))
> + md_make_dirty(qcow2, md, true);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + if (!md)
> + goto err_free_r2_pages;
> + }
> +
> + /* Connect new R2 to new R1 */
> + for (i = end; i < r2_end; i += clu_size) {
> + if (calc_refcounters_map(qcow2, i, &r1, &r2) < 0) {
> + WARN_ON_ONCE(1);
> + goto err_free_r2_pages;
> + }
> + md = md_page_find_or_postpone(qcow2, r1.page_id, NULL);
> + if (WARN_ON_ONCE(!md))
> + goto err_free_r2_pages;
> + spin_lock_irq(&qcow2->md_pages_lock);
> + set_u64_to_be_page(md->page, r1.index_in_page, i);
> + md_make_dirty(qcow2, md, true);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + }
> +
> + /* Mark used new R1 and R2 clusters */
> + for (i = pos; i < r2_end; i += clu_size) {
> + ret = handle_r1r2_maps(qcow2, i, NULL, &r1, &r2, false);
> + if (WARN_ON_ONCE(ret <= 0))
> + goto err_free_r2_pages;
> + mark_cluster_used(qcow2, r2.md, r2.index_in_page);
> + }
> +
> + /* Mark unused old reftable (R1) clusters */
> + for (i = old_pos; i < old_end; i += clu_size) {
> + ret = handle_r1r2_maps(qcow2, i, NULL, &r1, &r2, false);
> + if (WARN_ON_ONCE(ret <= 0))
> + goto err_free_r2_pages;
> + mark_cluster_unused(qcow2, r2.md, r2.index_in_page, i);
> + }
> +
> + return 1;
> +
> +err_free_r2_pages:
> + for (i = end; i < r2_end; i += clu_size) {
> + md = md_page_find_or_postpone(qcow2, i >> PAGE_SHIFT, NULL);
> + if (!md)
> + break;
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md_page_erase(qcow2, md);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + free_md_page(md);
> + }
> + /* TODO: switch to RO */
> + return -EIO;
> +}
> +
> +/*
> + * This function is aimed to be called only from the main work.
> + * If you wish to use it from more places, you have to make
> + * sure nobody can reuse a cluster obtained from
> + * find_unused_cluster() before mark_cluster_used() is done.
> + */
> +static loff_t allocate_cluster(struct qcow2 *qcow2, struct qio *qio,
> + struct md_page **r2_md, u32 *r2_index_in_page)
> +{
> + u32 clu_size = qcow2->clu_size;
> + struct file *file = qcow2->file;
> + loff_t pos, off, end, old_size;
> + struct qcow2_map_item r1, r2;
> + int ret;
> +again:
> + pos = find_unused_cluster(qcow2, &qio, &r1, &r2);
> + if (unlikely(pos == -ENOENT)) {
> + ret = relocate_refcount_table(qcow2, &qio);
> + if (ret <= 0)
> + return ret;
> + goto again;
> + }
> + if (pos <= 0)
> + return pos;
> +
> + end = pos + clu_size;
> + old_size = qcow2->file_size;
> +
> + if (pos < qcow2->file_preallocated_area_start) {
> + /* Clu at @pos may contain dirty data */
> + off = min_t(loff_t, old_size, end);
> + ret = punch_hole(file, pos, off - pos);
> + if (ret) {
> + pr_err("qcow2: punch hole: %d\n", ret);
> + return ret;
> + }
> + }
> +
> + if (end > old_size) {
> + ret = truncate_prealloc_safe(qcow2, end, __func__);
> + if (ret)
> + return ret;
> + } else if (pos < qcow2->file_preallocated_area_start) {
> + /*
> + * Flush punch_hole() modifications.
> + * TODO: track recently unused blocks
> + * and punch holes in background.
> + */
> + ret = vfs_fsync(file, 0);
> + if (ret)
> + return ret;
> + }
> +
> + if (end > qcow2->file_preallocated_area_start)
> + qcow2->file_preallocated_area_start = end;
> +
> + mark_cluster_used(qcow2, r2.md, r2.index_in_page);
> + if (r2_md)
> + *r2_md = r2.md;
> + if (r2_index_in_page)
> + *r2_index_in_page = r2.index_in_page;
> + return pos;
> +}
> +
> +#define LU_SET_ONE_MASK (1 << 0)
> +#define LU_WANTS_PE_PAGE (1 << 1)
> +#define LU_WANTS_ALLOC (1 << 2)
> +#define LU_IGN_CHANGED_IND (1 << 3)
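> +/*
> + * LU_SET_ONE_MASK: set LX_REFCOUNT_EXACTLY_ONE in the new entry.
> + * LU_WANTS_PE_PAGE: keep a copy of the old entry in wbd->pe_page.
> + * LU_WANTS_ALLOC: allocate a new data cluster and return it in *pval.
> + * LU_IGN_CHANGED_IND: the index may already be marked as changed.
> + */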
> +static int prepare_l_entry_update(struct qcow2 *qcow2, struct qio *qio,
> + struct md_page *md, u32 index_in_page,
> + u64 *pval, u32 arg_mask)
> +{
> + bool wants_pe_page = (arg_mask & LU_WANTS_PE_PAGE);
> + struct wb_desc *new_wbd = NULL;
> + struct page *pe_page = NULL;
> + u64 old_val, val = *pval;
> +
> + /* parse_metadata()->delay_if_writeback() delays them */
> + if (WARN_ON_ONCE(READ_ONCE(md->status) & MD_WRITEBACK))
> + return -EIO;
> + /*
> + * L1/L2 pages become set and unset dirty from main
> + * work only, so lock is not required for visibility.
> + */
> + if (!(md->status & MD_DIRTY)) {
> + /* We're the first to change an entry in this md page. */
> + new_wbd = alloc_wbd(wants_pe_page);
> + if (!new_wbd)
> + return -ENOMEM;
> + new_wbd->md = md;
> + } else if (wants_pe_page && !md->wbd->pe_page) {
> + pe_page = alloc_page(GFP_NOIO|__GFP_ZERO);
> + if (!pe_page)
> + return -ENOMEM;
> + }
> +
> + if (arg_mask & LU_WANTS_ALLOC) {
> + /* Allocate new zeroed data cluster: no failing actions after it */
> + loff_t pos = allocate_cluster(qcow2, qio, NULL, NULL);
> +
> + if (pos <= 0) {
> + free_wbd(new_wbd);
> + if (pe_page)
> + put_page(pe_page);
> + return (int)pos;
> + }
> + val = *pval = pos;
> + }
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + if (md_make_dirty(qcow2, md, false))
> + md->wbd = new_wbd;
> + else
> + WARN_ON_ONCE(new_wbd);
> + if (!(arg_mask & LU_IGN_CHANGED_IND))
> + WARN_ON_ONCE(test_bit(index_in_page, md->wbd->changed_indexes));
> + set_bit(index_in_page, md->wbd->changed_indexes);
> +
> + if (wants_pe_page && !md->wbd->pe_page)
> + md->wbd->pe_page = pe_page;
> + else
> + WARN_ON_ONCE(pe_page);
> + if (wants_pe_page) {
> + old_val = get_u64_from_be_page(md->page, index_in_page);
> + set_u64_to_be_page(md->wbd->pe_page, index_in_page, old_val);
> + }
> +
> + /* Set new mapping */
> + if (arg_mask & LU_SET_ONE_MASK)
> + val |= LX_REFCOUNT_EXACTLY_ONE;
> + set_u64_to_be_page(md->page, index_in_page, val);
> +
> + /* Keep in mind, we link qio to md in perform_rw_mapped() */
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + return 1;
> +}
> +
> +static int prepare_l1l2_allocation(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + u32 arg_mask, subclus_mask;
> + u64 val;
> + int ret;
> +
> + if (WARN_ON_ONCE(!(map->level & L1_LEVEL)))
> + return -EIO; /* Sanity check: L1 must be cached */
> +
> + if (!(map->level & L2_LEVEL)) {
> + WARN_ON_ONCE(map->prealloced || map->compressed);
> + /* Allocate cluster for L2 entries, and prepare L1 update */
> + ret = prepare_l_entry_update(qcow2, qio, map->l1.md,
> + map->l1.index_in_page, &val,
> + LU_SET_ONE_MASK|LU_WANTS_ALLOC);
> + if (ret <= 0)
> + return ret;
> +
> + /*
> + * 1) For now we don't do parallel L1 and L2 updates.
> + * 2) For COW from a backing file this is a must.
> + */
> + spin_lock_irq(&qcow2->md_pages_lock);
> + list_add_tail(&qio->link, &map->l1.md->wait_list);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + return 0;
> + }
> +
> + if (!map->data_clu_alloced || map->all_zeroes) {
> + WARN_ON_ONCE(!map->prealloced != !map->data_clu_pos ||
> + map->compressed || map->clu_is_cow);
> + /* Allocate cluster for DATA, and prepare L2 update */
> + arg_mask = LU_SET_ONE_MASK;
> + if (map->prealloced || qcow2->ext_l2)
> + arg_mask |= LU_WANTS_PE_PAGE;
> + if (!map->prealloced)
> + arg_mask |= LU_WANTS_ALLOC;
> +
> + ret = prepare_l_entry_update(qcow2, qio, map->l2.md,
> + map->l2.index_in_page,
> + &map->data_clu_pos, arg_mask);
> + if (ret <= 0)
> + return ret;
> +
> + if (!qcow2->ext_l2)
> + return 1;
> +
> + /*
> + * pe_page is allocated => ext_l2 update won't fail =>
> + * revert of prepare_l_entry_update() won't be needed.
> + */
> + WARN_ON_ONCE(!map->l2.md->wbd->pe_page);
> + }
> +
> + subclus_mask = qio_subclus_mask(qcow2, qio);
> + val = map->ext_l2 | subclus_mask;
> + val &= ~((u64)subclus_mask << 32);
> + arg_mask = LU_WANTS_PE_PAGE|LU_IGN_CHANGED_IND;
> +
> + return prepare_l_entry_update(qcow2, qio, map->l2.md,
> + map->l2.index_in_page + 1,
> + &val, arg_mask);
> +}
> +
> +/*
> + * Set some wb index to block WRITEs to this cluster.
> + * READs must also be blocked, since they may get data
> + * from the cluster after a WRITE marked it unused. Also,
> + * we have to wait for all previous READs. We do that around
> + * index wb. See md->wpc_noread_count update details.
> + */
> +static int prepare_l_entry_cow(struct qcow2 *qcow2, struct qcow2_map *map,
> + struct qio *qio, struct md_page *md,
> + u32 index_in_page, loff_t cow_clu_pos,
> + loff_t cow_clu_end, u8 cow_level)
> +{
> + struct lock_desc *lockd = NULL;
> + struct qio_ext *ext;
> +
> + if (alloc_qio_ext(qio))
> + return -ENOMEM;
> +
> + ext = qio->ext;
> + ext->cow_clu_pos = cow_clu_pos;
> + ext->cow_clu_end = cow_clu_end;
> + ext->cow_level = cow_level;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + if (!md->lockd) {
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + lockd = kzalloc(sizeof(*lockd), GFP_NOIO);
> + if (!lockd)
> + return -ENOMEM;
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md->lockd = lockd;
> + }
> +
> + md_index_set_locked(qcow2, md, index_in_page);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + /* Setup ext, so qio_endio() on error will make all cleanup */
> + ext->cleanup_mask = MD_INDEX_SET_UNLOCKED;
> + ext->lx_index_in_page = index_in_page;
> + return 1;
> +}
> +
> +static int prepare_l1l2_cow(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + if (WARN_ON_ONCE(!(map->level & L1_LEVEL)))
> + return -EIO; /* Sanity check: L1 must be cached */
> +
> + if (!(map->level & L2_LEVEL)) {
> + return prepare_l_entry_cow(qcow2, map, qio, map->l1.md,
> + map->l1.index_in_page,
> + map->cow_clu_pos,
> + map->cow_clu_end, L1_LEVEL);
> + }
> +
> + return prepare_l_entry_cow(qcow2, map, qio, map->l2.md,
> + map->l2.index_in_page,
> + map->cow_clu_pos,
> + map->cow_clu_end, L2_LEVEL);
> +}
> +
> +static struct qio *alloc_clu_read_qio(struct qcow2 *qcow2, u32 nr_pages,
> + struct qcow2_bvec **qvec)
> +{
> + struct qcow2_target *tgt = qcow2->tgt;
> + struct qio *qio;
> +
> + qio = alloc_qio(tgt->qio_pool, true);
> + if (!qio)
> + return NULL;
> +
> + *qvec = alloc_qvec_with_pages(nr_pages);
> + if (!*qvec) {
> + free_qio(qio, tgt->qio_pool);
> + return NULL;
> + }
> +
> + init_qio(qio, REQ_OP_READ, qcow2);
> + qio->bi_io_vec = (*qvec)->bvec;
> + qio->bi_iter.bi_size = nr_pages << PAGE_SHIFT;
> + qio->bi_iter.bi_idx = 0;
> + qio->bi_iter.bi_bvec_done = 0;
> + return qio;
> +}
> +
> +static void backward_merge_write_complete(struct qcow2_target *tgt, struct qio *unused,
> + void *qio_ptr, blk_status_t bi_status)
> +{
> + struct qio *qio = qio_ptr;
> + struct qcow2 *qcow2 = qio->qcow2;
> +
> + if (unlikely(bi_status)) {
> + qio->bi_status = bi_status;
> + qio_endio(qio);
> + return;
> + }
> +
> + WARN_ON_ONCE(qio->flags & QIO_IS_DISCARD_FL);
> + qio->flags |= QIO_IS_DISCARD_FL;
> +
> + qio->queue_list_id = QLIST_COW_INDEXES;
> + dispatch_qios(qcow2, qio, NULL);
> +}
> +
> +static void backward_merge_read_complete(struct qcow2_target *tgt, struct qio *unused,
> + void *qio_ptr, blk_status_t bi_status)
> +{
> + struct qio *qio = qio_ptr;
> + struct qcow2 *qcow2 = qio->qcow2;
> +
> + if (unlikely(bi_status)) {
> + qio->bi_status = bi_status;
> + qio_endio(qio);
> + return;
> + }
> +
> + qio->queue_list_id = QLIST_BMERGE_WRITE;
> + dispatch_qios(qcow2, qio, NULL);
> +}
> +
> +static void requeue_if_ok(struct qcow2_target *tgt, struct qio *unused,
> + void *qio_ptr, blk_status_t bi_status)
> +{
> + struct qio *qio = qio_ptr;
> +
> + if (bi_status) {
> + qio->bi_status = bi_status;
> + qio_endio(qio);
> + return;
> + }
> +
> + dispatch_qios(qio->qcow2, qio, NULL);
> +}
> +
> +static int prepare_backward_merge(struct qcow2 *qcow2, struct qio **qio,
> + struct qcow2_map *map, bool write)
> +{
> + struct qio *aux_qio;
> + int ret;
> +
> + if (!map->data_clu_alloced) {
> + WARN_ON_ONCE(map->clu_is_cow); /* Strange COW at L1 */
> + if (fake_merge_qio(*qio)) {
> + /* Nothing is to merge */
> + goto endio;
> + }
> + WARN_ON_ONCE(!maybe_mapped_in_lower_delta(qcow2, *qio));
> + WARN_ON_ONCE((*qio)->queue_list_id != QLIST_DEFERRED);
> + (*qio)->qcow2 = qcow2->lower;
> + dispatch_qios((*qio)->qcow2, *qio, NULL);
> + return 0;
> + }
> +
> + if (!op_is_write((*qio)->bi_op)) {
> + /*
> + * Data for a READ qio may be contained in several deltas.
> + * We can't read the lower delta after prepare_l1l2_cow()
> + * prepares us.
> + */
> + aux_qio = alloc_qio(qcow2->tgt->qio_pool, true);
> + if (!aux_qio) {
> + (*qio)->bi_status = BLK_STS_RESOURCE;
> + goto endio;
> + }
> +
> + init_qio(aux_qio, REQ_OP_WRITE, qcow2);
> + aux_qio->flags = QIO_IS_MERGE_FL|QIO_FREE_ON_ENDIO_FL;
> + aux_qio->bi_io_vec = (*qio)->bi_io_vec;
> + aux_qio->bi_iter = (*qio)->bi_iter;
> + aux_qio->bi_iter.bi_size = 0;
> + aux_qio->endio_cb = requeue_if_ok;
> + aux_qio->endio_cb_data = *qio;
> + WARN_ON_ONCE(!fake_merge_qio(aux_qio));
> + *qio = aux_qio;
> + }
> +
> + /*
> + * Mark as COW, as this completely defers any parallel qios.
> + * @qio is the COW status holder.
> + */
> + ret = prepare_l1l2_cow(qcow2, *qio, map);
> + if (ret < 0) {
> + (*qio)->bi_status = errno_to_blk_status(ret);
> + goto endio;
> + }
> +
> + if (!map->clu_is_cow) {
> + /* Force-set these so the cluster is marked unused after discard */
> + (*qio)->ext->cow_clu_pos = map->data_clu_pos;
> + (*qio)->ext->cow_clu_end = map->data_clu_pos + qcow2->clu_size;
> + }
> +
> + return 1;
> +endio:
> + qio_endio(*qio); /* Breaks COW set in prepare_l1l2_cow() */
> + return 0;
> +}
> +
> +static void data_rw_complete(struct qio *qio)
> +{
> + bool finalize_wbd = false, call_endio = true;
> + bool write = op_is_write(qio->bi_op);
> + blk_status_t bi_status = BLK_STS_OK;
> + struct qcow2 *qcow2 = qio->qcow2;
> + struct wb_desc *wbd;
> + unsigned long flags;
> +
> + /* FIXME: short read/write */
> + if (qio->ret != qio->bi_iter.bi_size)
> + bi_status = BLK_STS_IOERR;
> +
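> + /*
> + * A data WRITE linked to an L1/L2 writeback (wbd) can't complete
> + * until that writeback is done: if the writeback already completed,
> + * take its wbd->ret; otherwise park the successfully written qio on
> + * wbd->completed_list and let complete_wbd() finish it later.
> + */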
> + wbd = qio->data;
> + if (write && wbd) {
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + if (!list_empty(&qio->link)) {
> + list_del_init(&qio->link);
> + if (wbd->completed) {
> + if (wbd->ret != 0)
> + bi_status = errno_to_blk_status(wbd->ret);
> + /* Last user of wbd? */
> + finalize_wbd = list_empty(&wbd->submitted_list);
> + } else if (bi_status == BLK_STS_OK) {
> + call_endio = false;
> + list_add_tail(&qio->link, &wbd->completed_list);
> + }
> + }
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> + }
> +
> + if (call_endio) {
> + if (bi_status != BLK_STS_OK)
> + qio->bi_status = bi_status;
> + qio_endio(qio);
> + }
> + if (finalize_wbd)
> + complete_wbd(qcow2, wbd);
> +}
> +
> +static void submit_rw_mapped(struct qcow2 *qcow2, loff_t clu_pos, struct qio *qio)
> +{
> + unsigned int rw, nr_segs;
> + struct bio_vec *bvec;
> + struct iov_iter iter;
> + loff_t pos;
> +
> + rw = (op_is_write(qio->bi_op) ? WRITE : READ);
> + nr_segs = qio_nr_segs(qio);
> + bvec = __bvec_iter_bvec(qio->bi_io_vec, qio->bi_iter);
> +
> + iov_iter_bvec(&iter, rw, bvec, nr_segs, qio->bi_iter.bi_size);
> + iter.iov_offset = qio->bi_iter.bi_bvec_done;
> +
> + pos = clu_pos + bytes_off_in_cluster(qcow2, qio);
> + call_rw_iter(qcow2->file, pos, rw, &iter, qio);
> +}
> +
> +static void perform_rw_mapped(struct qcow2_map *map, struct qio *qio)
> +{
> + struct qcow2 *qcow2 = map->qcow2;
> + struct md_page *md = map->l2.md;
> + unsigned long flags;
> + u32 index_in_page;
> + unsigned int rw;
> +
> + rw = (op_is_write(qio->bi_op) ? WRITE : READ);
> + qio->complete = data_rw_complete;
> + qio->data = NULL;
> + INIT_LIST_HEAD(&qio->link);
> +
> + /*
> + * The idea is to submit L2 update and qio data write in parallel
> + * for better performance. But since qio_endio() can't be called
> + * till both of them are written, we link qio to md to track that.
> + * In case qio is not related to the changed indexes, it shouldn't
> + * wait for md writeback completion.
> + *
> + * L1/L2 pages become set and unset dirty from main work only,
> + * so lock is not needed for MD_DIRTY/changed_indexes visibility.
> + */
> + index_in_page = map->l2.index_in_page + !!(qcow2->ext_l2);
> + if (rw == WRITE && (md->status & MD_DIRTY) &&
> + test_bit(index_in_page, md->wbd->changed_indexes)) {
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + list_add(&qio->link, &md->wbd->submitted_list);
> + qio->data = md->wbd;
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> + }
> +
> + submit_rw_mapped(qcow2, map->data_clu_pos, qio);
> +}
> +
> +static void cow_read_complete(struct qio *qio)
> +{
> + struct md_page *md = qio->ext->lx_md;
> + struct qcow2 *qcow2 = qio->qcow2;
> + int ret = qio->ret;
> +
> + dec_wpc_readers(qcow2, md); /* We are done using the shared clu on disk */
> +
> + if (qio->ret != qcow2->clu_size) {
> + qio->bi_status = errno_to_blk_status(ret < 0 ? ret : -EIO);
> + qio_endio(qio);
> + return;
> + }
> +
> + qio->queue_list_id = QLIST_COW_DATA;
> + dispatch_qios(qio->qcow2, qio, NULL);
> +}
> +
> +static void submit_read_whole_cow_clu(struct qcow2_map *map, struct qio *qio)
> +{
> + struct qcow2 *qcow2 = map->qcow2;
> + struct md_page *md = map->l1.md;
> + u32 clu_size = qcow2->clu_size;
> + loff_t pos = map->cow_clu_pos;
> + struct qcow2_bvec *qvec;
> + struct iov_iter iter;
> + u32 nr_pages;
> +
> + WARN_ON_ONCE(map->level & L2_LEVEL);
> +
> + nr_pages = clu_size >> PAGE_SHIFT;
> + qvec = alloc_qvec_with_pages(nr_pages);
> + if (!qvec) {
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio); /* Frees ext */
> + return;
> + }
> +
> + qio->complete = cow_read_complete;
> + qio->data = qvec;
> + qio->ext->cleanup_mask |= FREE_QIO_DATA_QVEC;
> + qio->ext->lx_md = md;
> + /*
> + * This is not obligatory, since a cluster under COW can't disappear
> + * even after we decrement its counter (another snapshot refers to it).
> + * We do that for uniformity with compressed COW and better testing.
> + */
> + inc_wpc_readers(md);
> +
> + iov_iter_bvec(&iter, READ, qvec->bvec, nr_pages, clu_size);
> + call_rw_iter(qcow2->file, pos, READ, &iter, qio);
> +}
> +
> +static int decompress_zlib_clu(struct qcow2 *qcow2, struct qcow2_bvec *qvec,
> + u16 page0_off, int count, void *buf, void *ws)
> +{
> + unsigned int off = page0_off;
> + struct z_stream_s strm;
> + void *from;
> + int i, ret;
> +
> + memset(&strm, 0, sizeof(strm));
> + strm.workspace = ws;
> + strm.next_out = buf;
> + strm.avail_out = qcow2->clu_size;
> + strm.total_out = 0;
> +
> + ret = zlib_inflateInit2(&strm, -MAX_WBITS); /* negative windowBits: raw deflate, no zlib/gzip header */
> + if (ret != Z_OK)
> + return -ENOMEM;
> +
> + count -= off;
> + for (i = 0; i < qvec->nr_pages && count > 0; i++, off = 0) {
> + from = kmap(qvec->bvec[i].bv_page);
> + strm.next_in = from + off;
> + strm.avail_in = min_t(int, qvec->bvec[i].bv_len - off, count);
> + strm.total_in = 0;
> + count -= strm.avail_in;
> + ret = zlib_inflate(&strm, Z_NO_FLUSH);
> + kunmap(qvec->bvec[i].bv_page);
> + if (ret == Z_STREAM_END) {
> + ret = Z_OK;
> + break;
> + }
> + if (ret != Z_OK)
> + break;
> + }
> +
> + zlib_inflateEnd(&strm);
> + if (ret == Z_OK && strm.total_out == qcow2->clu_size)
> + return strm.total_out;
> + return -EIO;
> +}
> +
> +static int extract_one_compressed(struct qcow2 *qcow2, void *buf,
> + struct qcow2_bvec *qvec,
> + u16 page0_off, u32 qvec_len)
> +{
> + void *ws = buf + qcow2->clu_size;
> +
> + return decompress_zlib_clu(qcow2, qvec, page0_off, qvec_len, buf, ws);
> +}
> +
> +static int copy_buf_to_bvec_iter(const struct bio_vec *bvec,
> + const struct bvec_iter *biter,
> + const void *buf, u32 max)
> +{
> + struct bvec_iter iter;
> + struct bio_vec bv;
> + void *to, *addr;
> + int ret = 0;
> +
> + /* This is the equivalent of bio_for_each_bvec() */
> + qcow2_for_each_bvec(iter, bv, *biter, bvec) {
> + if (WARN_ON_ONCE(bv.bv_len > max)) {
> + ret = -EIO;
> + break;
> + }
> + addr = kmap(bv.bv_page);
> + to = addr + bv.bv_offset;
> + memcpy(to, buf, bv.bv_len);
> + kunmap(bv.bv_page);
> + max -= bv.bv_len;
> + buf += bv.bv_len;
> + }
> +
> + return ret;
> +}
> +
> +static int copy_clu_part_to_qio(struct qcow2 *qcow2, const void *buf, struct qio *qio)
> +{
> + u32 max, seek, clu_size = qcow2->clu_size;
> +
> + seek = bytes_off_in_cluster(qcow2, qio);
> + if (WARN_ON_ONCE(seek >= clu_size))
> + return -EINVAL;
> +
> + buf += seek;
> + max = clu_size - seek;
> +
> + return copy_buf_to_bvec_iter(qio->bi_io_vec, &qio->bi_iter, buf, max);
> +}
> +
> +static int copy_zcow_slice(loff_t start, loff_t end, void *qio_p,
> + void *buf, void *consumed_p)
> +{
> + struct qio *qio = qio_p;
> + struct qcow2 *qcow2 = qio->qcow2;
> + u32 clu_size = qcow2->clu_size;
> + loff_t *consumed = consumed_p;
> + struct qcow2_bvec *qvec = qio->data;
> + struct bio_vec *bvec = qvec->bvec;
> + struct bvec_iter iter;
> + u32 off = CLU_OFF(qcow2, start);
> +
> + if (WARN_ON_ONCE(start >= end))
> + return -EINVAL;
> +
> + iter.bi_size = end - start;
> + iter.bi_idx = *consumed / PAGE_SIZE;
> + iter.bi_bvec_done = off & ~PAGE_MASK;
> +
> + *consumed += round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE);
> +
> + return copy_buf_to_bvec_iter(bvec, &iter, buf + off, clu_size - off);
> +}
> +
> +static int prepare_zcow_slices(struct qcow2 *qcow2, void *buf, struct qio *qio)
> +{
> + loff_t consumed = 0;
> + /* Place the required slices in these pages the way further COW processing expects */
> + for_each_cow_interval(qio, copy_zcow_slice, qio, buf, &consumed);
> + return 0;
> +}
> +
> +static void compressed_read_complete(struct qio *qio)
> +{
> + struct md_page *md = qio->ext->lx_md;
> + struct qcow2 *qcow2 = qio->qcow2;
> + int ret = qio->ret;
> +
> + dec_wpc_readers(qcow2, md); /* We are done using the compressed clu on disk */
> + /*
> + * We don't treat a positive ret smaller than submitted as an error.
> + * Decompression will fail if we did not read enough.
> + */
> + if (ret < 0) {
> + qio->bi_status = errno_to_blk_status(ret ? : -EIO);
> + qio_endio(qio);
> + return;
> + }
> +
> + qio->queue_list_id = QLIST_ZREAD;
> + dispatch_qios(qcow2, qio, NULL);
> +}
> +
> +static void submit_read_compressed(struct qcow2_map *map, struct qio *qio,
> + bool for_cow)
> +{
> + u32 off, nr_pages, nr_alloc, nr_segs;
> + struct md_page *l2_md = map->l2.md;
> + struct qcow2 *qcow2 = map->qcow2;
> + u32 clu_size = qcow2->clu_size;
> + struct qcow2_bvec *qvec;
> + struct iov_iter iter;
> + loff_t pos, end;
> +
> + WARN_ON_ONCE(!map->data_clu_pos);
> + pos = round_down(map->data_clu_pos, PAGE_SIZE);
> + end = compressed_clu_end_pos(map->data_clu_pos, map->compressed_sectors);
> + end = round_up(end, PAGE_SIZE);
> + nr_pages = (end - pos) / PAGE_SIZE;
> +
> + nr_alloc = nr_pages;
> + if (for_cow) {
> + qio->ext->cow_mask = calc_cow_mask(qcow2, map->ext_l2, qio,
> + true, map->clu_is_cow, false);
> +
> + /* COW reuses this qvec to write the rest of the cluster */
> + nr_alloc = nr_segs = 0;
> + for_each_cow_interval(qio, count_cow_pages,
> + &nr_alloc, &nr_segs, NULL);
> + if (unlikely(nr_alloc < nr_pages))
> + nr_alloc = nr_pages;
> + qio->ext->cow_segs = nr_segs;
> + }
> +
> + qvec = alloc_qvec_with_pages(nr_alloc);
> + /* COW may have already allocated qio->ext */
> + if (!qvec || (!qio->ext && alloc_qio_ext(qio) < 0)) {
> + free_qvec_with_pages(qvec);
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio); /* Frees ext */
> + return;
> + }
> + qio->ext->zdata_off = off = map->data_clu_pos - pos;
> + WARN_ON_ONCE(off > ~(u16)0);
> +
> + qio->complete = compressed_read_complete;
> + qio->data = qvec;
> + qio->ext->cleanup_mask |= FREE_QIO_DATA_QVEC;
> + qio->ext->lx_md = l2_md;
> + if (for_cow && qcow2->ext_l2)
> + qio->ext->new_ext_l2 = 0x00000000FFFFFFFF;
> + inc_wpc_readers(l2_md);
> +
> + if (qio->bi_iter.bi_size == clu_size && for_cow) {
> + /*
> + * Optimization: do not read the clu from disk
> + * in case of a complete clu rewrite.
> + * See the way process_cow_data_write()
> + * updates qvec.
> + */
> + qio->ret = clu_size;
> + cow_read_complete(qio); /* Also skip extract part */
> + return;
> + }
> +
> + iov_iter_bvec(&iter, READ, qvec->bvec, nr_pages, end - pos);
> + call_rw_iter(qcow2->file, pos, READ, &iter, qio);
> +}
> +
> +static void sliced_cow_read_complete(struct qcow2_target *tgt, struct qio *read_qio,
> + void *qio_ptr, blk_status_t bi_status)
> +{
> + struct qio *qio = qio_ptr;
> + struct qcow2 *qcow2 = qio->qcow2;
> +
> + if (unlikely(bi_status)) {
> + qio->bi_status = bi_status;
> + qio_endio(qio);
> + } else {
> + qio->queue_list_id = QLIST_COW_DATA;
> + dispatch_qios(qcow2, qio, NULL);
> + }
> +}
> +
> +/*
> + * This creates a chain of qios going to the discontinuous
> + * slices of a COW cluster. The main qio of the chain calls
> + * endio_cb only after all child qios are completed.
> + */
> +static int split_sliced_cow_qio(loff_t start, loff_t end,
> + void *qio_p, void *list_p,
> + void *nr_segs_remaining_p)
> +{
> + u32 *nr_segs = nr_segs_remaining_p;
> + struct qio *split, *qio = qio_p;
> + struct qcow2 *qcow2 = qio->qcow2;
> + struct list_head *list = list_p;
> + u32 size = end - start;
> +
> + if (WARN_ON_ONCE(start >= end))
> + return -EINVAL;
> +
> + qio->bi_iter.bi_size = UINT_MAX; /* Silence qio_advance() */
> +
> + if (start & ~PAGE_MASK) {
> + /* Skip our alignment. This only advances qio->bi_io_vec */
> + qio_advance(qio, start & ~PAGE_MASK);
> + }
> +
> + if (--*nr_segs > 0) {
> + split = split_and_chain_qio(qcow2, qio, size);
> + if (!split)
> + return -ENOMEM;
> + if (end & ~PAGE_MASK) {
> + /* Skip our alignment: next does not want it */
> + qio_advance(qio, PAGE_SIZE - (end & ~PAGE_MASK));
> + }
> + list_add_tail(&split->link, list);
> + qio = split;
> + }
> +
> + qio->bi_iter.bi_sector = to_sector(start);
> + qio->bi_iter.bi_size = size;
> + return 0;
> +}
> +
> +static void submit_read_sliced_clu(struct qcow2_map *map, struct qio *qio,
> + qcow2_endio_t endio_cb)
> +{
> + struct qcow2 *qcow2 = map->qcow2;
> + struct qcow2_bvec *qvec;
> + u32 nr_pages, nr_segs;
> + struct qio *read_qio;
> + LIST_HEAD(list);
> + int ret;
> +
> + nr_pages = nr_segs = 0;
> + for_each_cow_interval(qio, count_cow_pages,
> + &nr_pages, &nr_segs, NULL);
> +
> + qio->ext->lx_md = map->l2.md;
> + qio->ext->cow_segs = nr_segs;
> +
> + if (!nr_segs) { /* Full overwrite */
> + qio->data = NULL; /* qvec */
> + endio_cb(qcow2->tgt, NULL, qio, BLK_STS_OK);
> + goto out;
> + }
> +
> + read_qio = alloc_clu_read_qio(qcow2, nr_pages, &qvec);
> + if (!read_qio)
> + goto err_alloc;
> + read_qio->flags |= QIO_FREE_ON_ENDIO_FL;
> + read_qio->endio_cb = endio_cb;
> + read_qio->endio_cb_data = qio;
> +
> + qio->data = qvec;
> + qio->ext->cleanup_mask |= FREE_QIO_DATA_QVEC;
> +
> + ret = for_each_cow_interval(qio, split_sliced_cow_qio,
> + read_qio, &list, &nr_segs);
> + list_add_tail(&read_qio->link, &list);
> + if (ret)
> + goto err_split;
> +
> + while ((read_qio = qio_list_pop(&list)) != NULL)
> + process_read_qio(qcow2, read_qio, map);
> +out:
> + return;
> +err_split:
> + end_qios(&list, BLK_STS_RESOURCE);
> + goto out;
> +err_alloc:
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio);
> + goto out;
> +}
> +
> +static void submit_read_sliced_cow_clu(struct qcow2_map *map, struct qio *qio)
> +{
> + struct qcow2 *qcow2 = map->qcow2;
> + u64 mask = 0;
> +
> + WARN_ON_ONCE(!(map->level & L2_LEVEL));
> +
> + if (qcow2->ext_l2) {
> + mask = map->ext_l2 & ~((u64)map->subclus_mask << 32);
> + qio->ext->new_ext_l2 = mask | map->subclus_mask;
> + if (map->data_clu_alloced && !map->clu_is_cow) {
> + qio->ext->only_set_ext_l2 = true;
> + qio->ext->allocated_clu_pos = map->data_clu_pos;
> + }
> + qio->ext->cow_mask = calc_cow_mask(qcow2, map->ext_l2, qio,
> + true, map->clu_is_cow, false);
> + }
> +
> + submit_read_sliced_clu(map, qio, sliced_cow_read_complete);
> +}
> +
> +static void submit_top_delta_read(struct qcow2_map *map, struct qio *qio)
> +{
> + struct qcow2 *qcow2 = map->qcow2;
> +
> + if (qcow2->ext_l2) {
> + qio->ext->cow_mask = calc_cow_mask(qcow2, map->ext_l2, qio,
> + false, true, true);
> + qio->ext->new_ext_l2 = 0; /* For discard */
> + }
> + submit_read_sliced_clu(map, qio, backward_merge_read_complete);
> +}
> +
> +static void issue_discard(struct qcow2_map *map, struct qio *qio)
> +{
> + struct qcow2 *qcow2 = map->qcow2;
> + loff_t pos;
> + int ret;
> +
> + WARN_ON_ONCE(!(map->level & L2_LEVEL));
> + pos = bio_sector_to_file_pos(qcow2, qio, map);
> + ret = punch_hole(qcow2->file, pos, qio->bi_iter.bi_size);
> +
> + if (ret)
> + qio->bi_status = errno_to_blk_status(ret);
> + qio_endio(qio);
> +}
> +
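> +/*
> + * Returns 1 if @qio should be submitted using the filled @map;
> + * 0 if @qio was consumed (completed, deferred or requeued).
> + * Errors are converted into qio_endio() and 0 as well.
> + */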
> +static int handle_metadata(struct qcow2 *qcow2, struct qio **qio,
> + struct qcow2_map *map)
> +{
> + bool write = op_is_write((*qio)->bi_op);
> + int ret;
> +
> + ret = parse_metadata(qcow2, qio, map, write);
> + if (ret < 0 || !*qio) /* Error or postponed */
> + goto check_err;
> +
> + ret = 1;
> + if (unlikely(qcow2->backward_merge_in_process)) {
> + /* Keep in mind the below may replace *qio */
> + ret = prepare_backward_merge(qcow2, qio, map, write);
> + } else if (unlikely(fake_l1cow_qio(*qio)) &&
> + (!map->clu_is_cow || (map->level & L2_LEVEL))) {
> + /* Nothing to COW or L1 is mapped exactly once */
> + qio_endio(*qio);
> + ret = 0;
> + } else if (write &&
> + (!qio_is_fully_alloced(qcow2, *qio, map) || map->clu_is_cow)) {
> + if (map->clu_is_cow) {
> + /* COW to compressed or shared with snapshot cluster */
> + ret = prepare_l1l2_cow(qcow2, *qio, map);
> + } else if ((map->level & L2_LEVEL) &&
> + qio_border_is_inside_unmapped_unit(qcow2, *qio, map) &&
> + maybe_mapped_in_lower_delta(qcow2, *qio)) {
> + /*
> + * A backing file only ever requires data COW, never
> + * metadata COW (unlike internal snapshots). This is
> + * data COW at L2_LEVEL.
> + */
> + map->backing_file_cow = true;
> + ret = prepare_l1l2_cow(qcow2, *qio, map);
> + } else if (unlikely(op_is_discard((*qio)->bi_op) &&
> + (map->level & L2_LEVEL))) {
> + if (!map->data_clu_alloced) {
> + qio_endio(*qio);
> + ret = 0;
> + }
> + /* Otherwise issue_discard(). TODO: update L2 */
> + } else {
> + /* Wants L1 or L2 entry allocation */
> + ret = prepare_l1l2_allocation(qcow2, *qio, map);
> + }
> + }
> +
> +check_err:
> + if (ret < 0) {
> + (*qio)->bi_status = errno_to_blk_status(ret);
> + qio_endio(*qio);
> + ret = 0;
> + }
> +
> + return ret;
> +}
> +
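> +/*
> + * Carve @qio into maximal runs which are uniformly all-zeroes, unmapped
> + * or mapped: zero-fill the first two kinds (unmapped runs may go to the
> + * lower delta instead), and submit mapped runs to the data cluster.
> + */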
> +static void process_read_qio(struct qcow2 *qcow2, struct qio *qio,
> + struct qcow2_map *map)
> +{
> + bool unmapped, zeroes, try_lower;
> + struct qio *split;
> + u32 size;
> +
> + do {
> + unmapped = try_lower = false;
> + split = NULL;
> +
> + zeroes = (size = qio_all_zeroes_size(qcow2, qio, map));
> + if (!size)
> + unmapped = (size = qio_unmapped_size(qcow2, qio, map));
> + if (!size)
> + size = qio_mapped_not_zeroes_size(qcow2, qio, map);
> +
> + if (unmapped)
> + try_lower = maybe_mapped_in_lower_delta(qcow2, qio);
> +
> + if (zeroes || (unmapped && !try_lower)) {
> + /* All zeroes or clu is not allocated */
> + perform_zero_read(qio, size);
> + if (size == qio->bi_iter.bi_size) {
> + qio_endio(qio);
> + break;
> + }
> + qio_advance(qio, size);
> + continue;
> + }
> +
> + if (size < qio->bi_iter.bi_size) {
> + split = split_and_chain_qio(qcow2, qio, size);
> + if (!split)
> + goto err;
> + swap(qio, split);
> + }
> +
> + if (unmapped && try_lower) {
> + /* Try to read from lower delta */
> + shorten_and_zero_qio_tail(qcow2->lower, qio);
> + qio->qcow2 = qcow2->lower;
> + WARN_ON_ONCE(qio->queue_list_id != QLIST_DEFERRED);
> + dispatch_qios(qio->qcow2, qio, NULL);
> + } else {
> + /* Mapped */
> + perform_rw_mapped(map, qio);
> + }
> +
> + qio = split;
> + } while (qio);
> +
> + return;
> +err:
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio);
> +}
> +
> +static void process_one_qio(struct qcow2 *qcow2, struct qio *qio)
> +{
> + struct qcow2_map map = { .qcow2 = qcow2, };
> + bool write;
> +
> + if (!handle_metadata(qcow2, &qio, &map))
> + return;
> +
> + if (unlikely(qcow2->backward_merge_in_process)) {
> + submit_top_delta_read(&map, qio);
> + return;
> + }
> +
> + write = op_is_write(qio->bi_op);
> +
> + if (unlikely(map.compressed)) {
> + /* Compressed qio never uses sub-clus */
> + submit_read_compressed(&map, qio, write);
> + return;
> + }
> +
> + if (!write) {
> + process_read_qio(qcow2, qio, &map);
> + } else { /* write */
> + if (unlikely(map.clu_is_cow && !(map.level & L2_LEVEL)))
> + submit_read_whole_cow_clu(&map, qio);
> + else if (unlikely(map.clu_is_cow || map.backing_file_cow))
> + submit_read_sliced_cow_clu(&map, qio);
> + else if (unlikely(op_is_discard(qio->bi_op)))
> + issue_discard(&map, qio);
> + else
> + perform_rw_mapped(&map, qio);
> + }
> +}
> +
> +static struct bio_vec *create_bvec_from_rq(struct request *rq)
> +{
> + struct bio_vec bv, *bvec, *tmp;
> + struct req_iterator rq_iter;
> + unsigned int nr_bvec = 0;
> +
> + rq_for_each_bvec(bv, rq, rq_iter)
> + nr_bvec++;
> +
> + bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
> + GFP_NOIO);
> + if (!bvec)
> + goto out;
> +
> + tmp = bvec;
> + rq_for_each_bvec(bv, rq, rq_iter) {
> + *tmp = bv;
> + tmp++;
> + }
> +out:
> + return bvec;
> +}
> +
> +static void prepare_one_embedded_qio(struct qcow2 *qcow2, struct qio *qio,
> + struct list_head *deferred_qios)
> +{
> + struct qcow2_rq *qrq = embedded_qio_to_qrq(qio);
> + struct request *rq = qrq->rq;
> + struct bio_vec *bvec = NULL;
> + LIST_HEAD(list);
> + int ret;
> +
> + if (rq->bio != rq->biotail) {
> + if (req_op(rq) == REQ_OP_DISCARD)
> + goto skip_bvec;
> + /*
> + * Transform a set of bvec arrays related to bios
> + * into a single bvec array (which we can iterate).
> + */
> + bvec = create_bvec_from_rq(rq);
> + if (unlikely(!bvec))
> + goto err;
> + qrq->bvec = bvec;
> +skip_bvec:
> + qio->bi_iter.bi_sector = blk_rq_pos(rq);
> + qio->bi_iter.bi_size = blk_rq_bytes(rq);
> + qio->bi_iter.bi_idx = 0;
> + qio->bi_iter.bi_bvec_done = 0;
> + } else {
> + /* Single bio already provides bvec array */
> + bvec = rq->bio->bi_io_vec;
> +
> + qio->bi_iter = rq->bio->bi_iter;
> + }
> +
> + qio->bi_io_vec = bvec;
> + qio->queue_list_id = QLIST_DEFERRED;
> +
> + ret = split_qio_to_list(qcow2, qio, deferred_qios);
> + if (unlikely(ret < 0))
> + goto err;
> +
> + return;
> +err:
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio);
> +}
> +
> +static void process_embedded_qios(struct qcow2 *qcow2, struct list_head *qios,
> + struct list_head *deferred_qios)
> +{
> + struct qio *qio;
> +
> + while ((qio = qio_list_pop(qios)))
> + prepare_one_embedded_qio(qcow2, qio, deferred_qios);
> +}
> +
> +static void process_deferred_qios(struct qcow2 *qcow2, struct list_head *qios)
> +{
> + struct qio *qio;
> +
> + while ((qio = qio_list_pop(qios))) {
> + /* Sanity: at this stage we do not expect ext */
> + if (WARN_ON_ONCE(qio->ext != NULL)) {
> + qio->bi_status = BLK_STS_IOERR;
> + qio_endio(qio);
> + continue;
> + }
> +
> + process_one_qio(qcow2, qio);
> + }
> +}
> +
> +static void submit_metadata_writeback(struct qcow2 *qcow2)
> +{
> + struct md_page *md;
> +
> + while (1) {
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md = list_first_entry_or_null(&qcow2->wb_batch_list,
> + struct md_page, wb_link);
> + if (!md) {
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + break;
> + }
> + list_del_init(&md->wb_link);
> + /* L1/L2 mustn't be redirtied while wb is in-flight! */
> + WARN_ON_ONCE(!(md->status & MD_DIRTY) ||
> + (md->wbd && (md->status & MD_WRITEBACK)));
> + md->status |= MD_WRITEBACK;
> + md->status &= ~MD_DIRTY;
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + submit_rw_md_page(WRITE, qcow2, md);
> + }
> +}
> +
> +static int complete_metadata_writeback(struct qcow2 *qcow2)
> +{
> + struct qcow2_bvec *qvec;
> + struct md_page *md;
> + int fsync_ret, ret;
> + LIST_HEAD(wb_list);
> + struct qio *qio;
> +
> + spin_lock_irq(&qcow2->deferred_lock);
> + list_splice_init(&qcow2->qios[QLIST_COMPLETED_WB], &wb_list);
> + spin_unlock_irq(&qcow2->deferred_lock);
> + if (unlikely(list_empty(&wb_list)))
> + return -EAGAIN;
> +
> + fsync_ret = vfs_fsync(qcow2->file, 0);
> + /* FIXME: We should reread md page on error */
> + if (unlikely(fsync_ret))
> + pr_err_ratelimited("qcow2: can't sync md: %d\n", fsync_ret);
> +
> + while (!list_empty(&wb_list)) {
> + qio = list_first_entry(&wb_list, struct qio, link);
> + list_del(&qio->link);
> + md = qio->ext->md;
> + qvec = qio->data;
> + ret = qio->ret;
> + if (unlikely(ret != PAGE_SIZE))
> + ret = -EIO;
> + else
> + ret = fsync_ret;
> +
> + do_md_page_write_complete(ret, qcow2, md);
> + dec_inflight_md(qcow2, qio);
> + kfree(qvec); /* qio and ext are tail bytes after qvec */
> + }
> +
> + return fsync_ret;
> +}
> +
> +/* Process completed compressed READs */
> +static void process_compressed_read(struct qcow2 *qcow2, struct list_head *read_list,
> + struct list_head *cow_list)
> +{
> + struct qcow2_bvec *qvec;
> + struct qio_ext *ext;
> + blk_status_t ret;
> + void *buf = NULL;
> + struct qio *qio;
> + bool for_cow;
> +
> + if (list_empty(read_list))
> + return;
> +
> + buf = kmalloc(qcow2->clu_size + zlib_inflate_workspacesize(), GFP_NOIO);
> + if (!buf) {
> + end_qios(read_list, BLK_STS_RESOURCE);
> + return;
> + }
> +
> + while ((qio = qio_list_pop(read_list)) != NULL) {
> + qvec = qio->data;
> + ext = qio->ext;
> +
> + ret = extract_one_compressed(qcow2, buf, qvec,
> + ext->zdata_off, qio->ret);
> + if (ret)
> + goto err;
> +
> + for_cow = op_is_write(qio->bi_op);
> + if (!for_cow)
> + ret = copy_clu_part_to_qio(qcow2, buf, qio);
> + else
> + ret = prepare_zcow_slices(qcow2, buf, qio);
> +
> + if (!for_cow || ret) {
> +err:
> + if (ret)
> + qio->bi_status = errno_to_blk_status(ret);
> + qio_endio(qio);
> + continue;
> + }
> +
> + /* Further COW processing */
> + qio->queue_list_id = QLIST_COW_DATA;
> + list_add_tail(&qio->link, cow_list);
> + }
> +
> + kfree(buf);
> +}
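
For reviewers who have not looked at the kernel zlib API: QCOW2 compressed clusters
are raw deflate streams, and the single kmalloc() above is sized to hold both the
decompressed cluster and the zlib workspace. extract_one_compressed() is in another
part of the file, so the following is only a sketch of how such a helper can drive
zlib_inflate() with a caller-provided workspace; the function name and the exact
window-bits value are assumptions, not taken from the patch.

	#include <linux/zlib.h>

	/* Illustrative only: inflate @src_len bytes of raw deflate data into the
	 * cluster-sized buffer @buf; the workspace lives right after the cluster. */
	static int inflate_cluster_sketch(struct qcow2 *qcow2, void *buf,
					  const void *src, int src_len)
	{
		struct z_stream_s strm = {
			.workspace = buf + qcow2->clu_size,
			.next_in   = src,
			.avail_in  = src_len,
			.next_out  = buf,
			.avail_out = qcow2->clu_size,
		};
		int ret;

		if (zlib_inflateInit2(&strm, -12) != Z_OK) /* raw deflate, as QEMU writes it */
			return -EIO;
		ret = zlib_inflate(&strm, Z_FINISH);
		zlib_inflateEnd(&strm);

		return (ret == Z_STREAM_END) ? 0 : -EIO;
	}
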
> +
> +static int prepare_sliced_data_write(struct qcow2 *qcow2, struct qio *qio,
> + struct list_head *list, qcow2_endio_t endio)
> +{
> + struct qcow2_target *tgt = qcow2->tgt;
> + struct qcow2_bvec *qvec = qio->data;
> + u32 nr_segs = qio->ext->cow_segs;
> + struct qio *write_qio, *aux_qio;
> + int ret;
> +
> + WARN_ON_ONCE(qio->bi_op == REQ_OP_READ && nr_segs == 0);
> +
> + write_qio = alloc_qio(tgt->qio_pool, true);
> + if (!write_qio)
> + goto err_qio;
> + init_qio(write_qio, REQ_OP_WRITE, qcow2);
> + write_qio->flags |= QIO_FREE_ON_ENDIO_FL;
> + write_qio->endio_cb = endio;
> + write_qio->endio_cb_data = qio;
> +
> + if (qio->bi_op != REQ_OP_READ && !fake_merge_qio(qio)) {
> +		/* Create an aux qio and chain the write of @qio's bytes to it */
> + aux_qio = write_qio;
> + if (nr_segs) {
> + aux_qio = split_and_chain_qio(qcow2, write_qio, 0);
> + if (!aux_qio) {
> + free_qio(write_qio, tgt->qio_pool);
> + goto err_qio;
> + }
> + list_add(&aux_qio->link, list);
> + }
> + aux_qio->bi_op = qio->bi_op;
> + aux_qio->bi_io_vec = qio->bi_io_vec;
> + aux_qio->bi_iter = qio->bi_iter;
> +
> + if (!nr_segs) { /* Full overwrite */
> + list_add(&aux_qio->link, list);
> + goto out;
> + }
> + }
> +
> + write_qio->bi_io_vec = qvec->bvec;
> + write_qio->bi_iter.bi_idx = 0;
> + write_qio->bi_iter.bi_bvec_done = 0;
> +
> + ret = for_each_cow_interval(qio, split_sliced_cow_qio,
> + write_qio, list, &nr_segs);
> + list_add_tail(&write_qio->link, list);
> + if (ret)
> + goto err_split;
> +out:
> + return 0;
> +err_split:
> + end_qios(list, BLK_STS_RESOURCE);
> + goto out;
> +err_qio:
> + qio->bi_status = BLK_STS_RESOURCE;
> + qio_endio(qio);
> + goto out;
> +}
> +
> +static void process_backward_merge_write(struct qcow2 *qcow2, struct list_head *qio_list)
> +{
> + qcow2_endio_t endio = backward_merge_write_complete;
> + struct qio *qio;
> + LIST_HEAD(list);
> +
> + while (1) {
> + qio = qio_list_pop(qio_list);
> + if (!qio)
> + break;
> +
> + if (prepare_sliced_data_write(qcow2->lower, qio,
> + &list, endio) < 0)
> + continue;
> +
> + dispatch_qios(qcow2->lower, NULL, &list);
> + }
> +}
> +
> +static void cow_data_write_complete(struct qio *qio)
> +{
> + struct qcow2 *qcow2 = qio->qcow2;
> + int ret = qio->ret;
> +
> + BUG_ON(!qio->ext);
> +
> + if (ret > 0 && ret != qcow2->clu_size)
> + ret = -EIO;
> + if (ret < 0) {
> + qio->bi_status = errno_to_blk_status(ret);
> + qio_endio(qio);
> + } else {
> + qio->queue_list_id = QLIST_COW_INDEXES;
> + dispatch_qios(qcow2, qio, NULL);
> + }
> +}
> +
> +static void submit_cow_data_write(struct qcow2 *qcow2, struct qio *qio, loff_t pos)
> +{
> + u32 nr_segs, clu_size = qcow2->clu_size;
> + struct qcow2_bvec *qvec = qio->data;
> + struct iov_iter iter;
> +
> + nr_segs = clu_size >> PAGE_SHIFT;
> + WARN_ON_ONCE(qvec->nr_pages < nr_segs);
> +
> + iov_iter_bvec(&iter, WRITE, qvec->bvec, nr_segs, clu_size);
> + qio->complete = cow_data_write_complete;
> +
> + call_rw_iter(qcow2->file, pos, WRITE, &iter, qio);
> +}
> +
> +static void sliced_cow_data_write_complete(struct qcow2_target *tgt, struct qio *unused,
> + void *qio_ptr, blk_status_t bi_status)
> +{
> + struct qio *qio = qio_ptr;
> + struct qcow2 *qcow2 = qio->qcow2;
> +
> + BUG_ON(!qio->ext);
> +
> + if (bi_status) {
> + qio->bi_status = bi_status;
> + qio_endio(qio);
> + } else {
> + qio->queue_list_id = QLIST_COW_INDEXES;
> + dispatch_qios(qcow2, qio, NULL);
> + }
> +}
> +
> +static void submit_sliced_cow_data_write(struct qcow2 *qcow2, struct qio *qio, loff_t clu_pos)
> +{
> + qcow2_endio_t endio = sliced_cow_data_write_complete;
> + struct qio *write_qio;
> + LIST_HEAD(list);
> +
> + if (prepare_sliced_data_write(qcow2, qio, &list, endio) < 0)
> + return;
> +
> + while ((write_qio = qio_list_pop(&list)) != NULL) {
> + write_qio->complete = data_rw_complete;
> + write_qio->data = NULL;
> + submit_rw_mapped(qcow2, clu_pos, write_qio);
> + }
> +}
> +
> +static void process_cow_data_write(struct qcow2 *qcow2, struct list_head *cow_list)
> +{
> + struct qio_ext *ext;
> + struct qio *qio;
> + loff_t pos;
> +
> + while (1) {
> + qio = qio_list_pop(cow_list);
> + if (!qio)
> + break;
> + ext = qio->ext;
> +
> + if (ext->only_set_ext_l2) {
> + WARN_ON_ONCE(ext->cow_level != L2_LEVEL);
> + pos = ext->allocated_clu_pos;
> + goto submit;
> + }
> +
> + WARN_ON_ONCE(qio->queue_list_id != QLIST_COW_DATA);
> + pos = allocate_cluster(qcow2, qio, &ext->r2_md,
> + &ext->r2_index_in_page);
> + if (pos < 0) {
> + qio->bi_status = errno_to_blk_status(pos);
> + qio_endio(qio);
> + }
> +
> + if (pos <= 0)
> + continue;
> +
> + ext->allocated_clu_pos = pos;
> + ext->cleanup_mask |= FREE_ALLOCATED_CLU;
> +submit:
> + if (ext->cow_level == L2_LEVEL)
> + submit_sliced_cow_data_write(qcow2, qio, pos);
> + else
> + submit_cow_data_write(qcow2, qio, pos);
> + }
> +}
> +
> +static void process_cow_indexes_write(struct qcow2 *qcow2,
> + struct list_head *qio_list)
> +{
> + struct qcow2_bvec *qvec;
> + struct md_page *lx_md;
> + struct qio_ext *ext;
> + struct qio *qio;
> + bool discard;
> + u32 arg_mask;
> + int ret;
> +
> + while (1) {
> + qio = qio_list_pop(qio_list);
> + if (!qio)
> + break;
> + ext = qio->ext;
> + qvec = qio->data;
> + lx_md = ext->lx_md;
> +
> +		/* Return to the same stage if postponed due to md writeback */
> + qio->queue_list_id = QLIST_COW_INDEXES;
> + if (delay_if_writeback(qcow2, lx_md, -1, &qio, true))
> + continue;
> +
> + discard = (qio->flags & QIO_IS_DISCARD_FL) ? true : false;
> + WARN_ON_ONCE(discard && ext->allocated_clu_pos);
> +
> + arg_mask = (discard ? 0 : LU_SET_ONE_MASK) | LU_WANTS_PE_PAGE;
> + if (ext->only_set_ext_l2) {
> + WARN_ON_ONCE(ext->cow_level != L2_LEVEL);
> + goto set_ext_l2;
> + }
> +
> + /* XXX: check prealloced_pos ==> revert */
> + ret = prepare_l_entry_update(qcow2, qio, lx_md,
> + ext->lx_index_in_page,
> + &ext->allocated_clu_pos,
> + arg_mask);
> + if (ret < 0) {
> + qio->bi_status = errno_to_blk_status(ret);
> + qio_endio(qio);
> + continue;
> + }
> +set_ext_l2:
> + if (qcow2->ext_l2 && ext->cow_level == L2_LEVEL) {
> + arg_mask &= ~LU_SET_ONE_MASK;
> + ret = prepare_l_entry_update(qcow2, qio, lx_md,
> + ext->lx_index_in_page + 1,
> + &ext->new_ext_l2, arg_mask);
> + WARN_ON_ONCE(ret < 0);
> + }
> +
> + /* Next stage */
> + qio->queue_list_id = QLIST_COW_END;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + /*
> +		 * Prohibit starting new reads from WP clusters.
> +		 * Otherwise, "wpc_readers == 0" would never be reached.
> + */
> + WARN_ON_ONCE(lx_md->wpc_noread_count++ < 0);
> + ext->cleanup_mask |= DEC_WPC_NOREAD_COUNT;
> +
> + /* Wait md page writeback */
> + list_add_tail(&qio->link, &lx_md->wbd->dependent_list);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + }
> +}
> +
> +/* Finalize successful COW */
> +static void process_cow_end(struct qcow2 *qcow2, struct list_head *qio_list)
> +{
> + u32 mask, clu_size = qcow2->clu_size;
> + struct qcow2_map_item r1, r2;
> + struct qio_ext *ext;
> + struct qio *qio;
> + loff_t pos;
> + int ret;
> +
> + while (1) {
> +next: qio = qio_list_pop(qio_list);
> + if (!qio)
> + break;
> + ext = qio->ext;
> + /* L2 index was written, cluster became used */
> + if (ext->cleanup_mask & FREE_ALLOCATED_CLU)
> + ext->cleanup_mask &= ~FREE_ALLOCATED_CLU;
> +
> + /* Should be already set... */
> + qio->queue_list_id = QLIST_COW_END;
> + /*
> +		 * Wait for the last user before we (possibly) mark clusters
> +		 * unused. In practice only compressed COW requires this.
> + */
> + if (delay_if_has_wpc_readers(qcow2, ext->lx_md, &qio))
> + goto next;
> +
> + pos = ext->cow_clu_pos;
> + for (; pos < ext->cow_clu_end; pos += clu_size) {
> + ret = __handle_r1r2_maps(qcow2, pos, &qio, &r1, &r2);
> + if (ret == 0) /* We never shrink md pages, impossible */
> + goto next;
> + if (WARN_ON_ONCE(ret < 0))
> + pr_err("qcow2: clu at %lld leaked\n", pos);
> + else
> + dec_cluster_usage(qcow2, r2.md, r2.index_in_page, pos);
> + ext->cow_clu_pos += clu_size;
> + }
> +
> + mask = MD_INDEX_SET_UNLOCKED|DEC_WPC_NOREAD_COUNT;
> + if (qio->data)
> + mask |= FREE_QIO_DATA_QVEC;
> + WARN_ON_ONCE(ext->cleanup_mask != mask); /* Sanity check */
> +
> + if (ext->cow_level == L1_LEVEL) {
> + finalize_qio_ext(qio);
> + /* COW on L1 completed, it's time for COW on L2 */
> + qio->queue_list_id = QLIST_DEFERRED;
> + dispatch_qios(qcow2, qio, NULL);
> + } else {
> + /*
> + * This qio was already written together with clu.
> + * Nothing to do. See process_cow_data_write().
> + */
> + qio_endio(qio); /* Makes all cleanup */
> + }
> + }
> +}
> +
> +void do_qcow2_work(struct work_struct *ws)
> +{
> + struct qcow2 *qcow2 = container_of(ws, struct qcow2, worker);
> + LIST_HEAD(embedded_qios);
> + LIST_HEAD(deferred_qios);
> + LIST_HEAD(zread_qios);
> + LIST_HEAD(bwrite_qios);
> + LIST_HEAD(cow_data_qios);
> + LIST_HEAD(cow_indexes_qios);
> + LIST_HEAD(cow_end_qios);
> + unsigned int pflags = current->flags;
> +
> + current->flags |= PF_LESS_THROTTLE|PF_MEMALLOC_NOIO;
> + spin_lock_irq(&qcow2->deferred_lock);
> + list_splice_init(&qcow2->qios[QLIST_EMBEDDED], &embedded_qios);
> + list_splice_init(&qcow2->qios[QLIST_DEFERRED], &deferred_qios);
> + list_splice_init(&qcow2->qios[QLIST_ZREAD], &zread_qios);
> + list_splice_init(&qcow2->qios[QLIST_BMERGE_WRITE], &bwrite_qios);
> + list_splice_init(&qcow2->qios[QLIST_COW_DATA], &cow_data_qios);
> + list_splice_init(&qcow2->qios[QLIST_COW_INDEXES], &cow_indexes_qios);
> + list_splice_init(&qcow2->qios[QLIST_COW_END], &cow_end_qios);
> + spin_unlock_irq(&qcow2->deferred_lock);
> +
> + process_embedded_qios(qcow2, &embedded_qios, &deferred_qios);
> + process_deferred_qios(qcow2, &deferred_qios);
> + process_compressed_read(qcow2, &zread_qios, &cow_data_qios);
> + process_backward_merge_write(qcow2, &bwrite_qios);
> + process_cow_data_write(qcow2, &cow_data_qios);
> + process_cow_indexes_write(qcow2, &cow_indexes_qios);
> + process_cow_end(qcow2, &cow_end_qios);
> +
> +	/* This actually submits the batch of md writeback initiated above */
> + submit_metadata_writeback(qcow2);
> +
> + current_restore_flags(pflags, PF_LESS_THROTTLE|PF_MEMALLOC_NOIO);
> +}
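
Not an objection, just a reading aid that may be worth keeping somewhere as a
comment: the order in which do_qcow2_work() drains its queues above is also the
path a single COW write walks through the pipeline, one stage per pass of the
worker (QLIST_FLUSH and QLIST_COMPLETED_WB belong to the fsync worker below).
Something like:

	static const char *const qcow2_stage_doc[] = {
		[QLIST_EMBEDDED]     = "split incoming requests into per-cluster qios",
		[QLIST_DEFERRED]     = "map qios; misses start COW or compressed reads",
		[QLIST_ZREAD]        = "handle completed compressed-cluster reads",
		[QLIST_BMERGE_WRITE] = "backward-merge writes into the lower delta",
		[QLIST_COW_DATA]     = "allocate a new cluster and write the COW data",
		[QLIST_COW_INDEXES]  = "update L1/L2 (and extended L2) entries",
		[QLIST_COW_END]      = "release old clusters and complete the qio",
	};
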
> +
> +void do_qcow2_fsync_work(struct work_struct *ws)
> +{
> + struct qcow2 *qcow2 = container_of(ws, struct qcow2, fsync_worker);
> + unsigned int pflags = current->flags;
> + LIST_HEAD(flush_qios);
> + int fsync_ret;
> +
> + current->flags |= PF_LESS_THROTTLE|PF_MEMALLOC_NOIO;
> + spin_lock_irq(&qcow2->deferred_lock);
> + list_splice_tail_init(&qcow2->qios[QLIST_FLUSH], &flush_qios);
> + spin_unlock_irq(&qcow2->deferred_lock);
> +
> + fsync_ret = complete_metadata_writeback(qcow2);
> + /*
> +	 * Metadata writeback and flush bios are independent of
> +	 * each other, but we want to avoid an excess fsync() call
> +	 * if it has already been done.
> + */
> + if (fsync_ret == -EAGAIN)
> + fsync_ret = vfs_fsync(qcow2->file, 0);
> +
> + end_qios(&flush_qios, errno_to_blk_status(fsync_ret));
> +
> + current_restore_flags(pflags, PF_LESS_THROTTLE|PF_MEMALLOC_NOIO);
> +}
> +
> +static void qrq_endio(struct qcow2_target *tgt, struct qio *unused,
> + void *qrq_ptr, blk_status_t bi_status)
> +{
> + struct qcow2_rq *qrq = qrq_ptr;
> + struct request *rq = qrq->rq;
> +
> + if (qrq->bvec)
> + kfree(qrq->bvec);
> + dm_complete_request(rq, bi_status);
> +}
> +
> +static void init_qrq(struct qcow2_rq *qrq, struct request *rq)
> +{
> + qrq->rq = rq;
> + qrq->bvec = NULL;
> +}
> +
> +void submit_embedded_qio(struct qcow2_target *tgt, struct qio *qio)
> +{
> + struct qcow2_rq *qrq = embedded_qio_to_qrq(qio);
> + struct request *rq = qrq->rq;
> + u8 queue_list_id, ref_index;
> + struct work_struct *worker;
> + struct qcow2 *qcow2;
> + unsigned long flags;
> + bool queue = true;
> +
> + qcow2 = qcow2_ref_inc(tgt, &ref_index);
> +
> + if (blk_rq_bytes(rq)) {
> + queue_list_id = QLIST_EMBEDDED;
> + worker = &qcow2->worker;
> + } else {
> + WARN_ON_ONCE(qio->bi_op != REQ_OP_FLUSH);
> + queue_list_id = QLIST_FLUSH;
> + worker = &qcow2->fsync_worker;
> + }
> +
> + spin_lock_irqsave(&qcow2->deferred_lock, flags);
> + if (unlikely(qcow2->pause_submitting_qios)) {
> + qcow2_ref_dec(tgt, ref_index);
> + list_add_tail(&qio->link, &qcow2->paused_qios);
> + queue = false;
> + } else {
> + qio->qcow2 = qcow2;
> + qio->queue_list_id = queue_list_id;
> + qio->ref_index = ref_index;
> + list_add_tail(&qio->link, &qcow2->qios[qio->queue_list_id]);
> + }
> + spin_unlock_irqrestore(&qcow2->deferred_lock, flags);
> +
> + if (queue)
> + queue_work(tgt->wq, worker);
> +}
> +
> +void submit_embedded_qios(struct qcow2_target *tgt, struct list_head *list)
> +{
> + struct qio *qio;
> +
> + while ((qio = qio_list_pop(list)) != NULL)
> + submit_embedded_qio(tgt, qio);
> +}
> +
> +int qcow2_clone_and_map(struct dm_target *ti, struct request *rq,
> + union map_info *info, struct request **clone)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> + struct qcow2_rq *qrq;
> + struct qio *qio;
> +
> + qrq = map_info_to_embedded_qrq(info);
> + init_qrq(qrq, rq);
> +
> + qio = map_info_to_embedded_qio(info);
> + init_qio(qio, req_op(rq), NULL);
> + qio->endio_cb = qrq_endio;
> + qio->endio_cb_data = qrq;
> + /*
> + * Note, this qcow2_clone_and_map() may be called from atomic
> +	 * context, so here we just delegate qio splitting to the kworker.
> + */
> + submit_embedded_qio(tgt, qio);
> + return DM_MAPIO_SUBMITTED;
> +}
> +
> +static void handle_cleanup_mask(struct qio *qio)
> +{
> + struct qcow2 *qcow2 = qio->qcow2;
> + struct qio_ext *ext = qio->ext;
> + struct lock_desc *lockd = NULL;
> + LIST_HEAD(qio_list);
> + unsigned long flags;
> + bool last;
> +
> + if (ext->cleanup_mask & MD_INDEX_SET_UNLOCKED) {
> + struct md_page *md = ext->lx_md;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + clear_bit(ext->lx_index_in_page, md->lockd->indexes);
> + WARN_ON_ONCE(--md->lockd->nr < 0);
> + if (!md->lockd->nr)
> + swap(md->lockd, lockd);
> + list_splice_init(&md->wait_list, &qio_list);
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> + dispatch_qios(qcow2, NULL, &qio_list);
> + kfree(lockd);
> + ext->cleanup_mask &= ~MD_INDEX_SET_UNLOCKED;
> + }
> +
> + if (ext->cleanup_mask & DEC_WPC_NOREAD_COUNT) {
> + struct md_page *md = ext->lx_md;
> +
> + spin_lock_irqsave(&qcow2->md_pages_lock, flags);
> + last = !(--md->wpc_noread_count);
> + if (last)
> + list_splice_init(&md->wait_list, &qio_list);
> + spin_unlock_irqrestore(&qcow2->md_pages_lock, flags);
> + if (last)
> + dispatch_qios(qcow2, NULL, &qio_list);
> + ext->cleanup_mask &= ~DEC_WPC_NOREAD_COUNT;
> + }
> +
> + if (ext->cleanup_mask & FREE_QIO_DATA_QVEC) {
> + struct qcow2_bvec *qvec = qio->data;
> +
> + free_qvec_with_pages(qvec);
> + qio->data = NULL;
> + ext->cleanup_mask &= ~FREE_QIO_DATA_QVEC;
> + }
> +
> + if (ext->cleanup_mask & FREE_ALLOCATED_CLU) {
> + u32 index_in_page = ext->r2_index_in_page;
> + loff_t pos = ext->allocated_clu_pos;
> + struct md_page *md = ext->r2_md;
> +
> + mark_cluster_unused(qcow2, md, index_in_page, pos);
> + ext->cleanup_mask &= ~FREE_ALLOCATED_CLU;
> + }
> +}
> diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c
> new file mode 100644
> index 000000000000..895b69f0a767
> --- /dev/null
> +++ b/drivers/md/dm-qcow2-target.c
> @@ -0,0 +1,935 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2021 Virtuozzo International GmbH. All rights reserved.
> + */
> +#include <linux/uio.h>
> +
> +#include "dm.h"
> +#include "dm-qcow2.h"
> +
> +static bool kernel_sets_dirty_bit; /* false */
> +module_param(kernel_sets_dirty_bit, bool, 0444);
> +MODULE_PARM_DESC(kernel_sets_dirty_bit,
> + "Dirty bit is set by kernel, not by userspace");
> +
> +static void qcow2_set_service_operations(struct dm_target *ti, bool allowed)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> +
> + mutex_lock(&tgt->ctl_mutex);
> + tgt->service_operations_allowed = allowed;
> + mutex_unlock(&tgt->ctl_mutex);
> +}
> +
> +static int rw_pages_sync(unsigned int rw, struct qcow2 *qcow2,
> + u64 index, struct page *pages[], int nr)
> +{
> + struct bio_vec *bvec, bvec_on_stack;
> + ssize_t size = nr * PAGE_SIZE, ret;
> + struct iov_iter iter;
> + loff_t from, pos;
> + int i;
> +
> + if (rw != READ && rw != WRITE)
> + return -EINVAL;
> +
> + bvec = &bvec_on_stack;
> + if (nr != 1)
> + bvec = kmalloc(nr * sizeof(*bvec), GFP_NOIO);
> + if (!bvec)
> + return -ENOMEM;
> +
> + for (i = 0; i < nr; i++) {
> + bvec[i].bv_page = pages[i];
> + bvec[i].bv_len = PAGE_SIZE;
> + bvec[i].bv_offset = 0;
> + }
> +
> + iov_iter_bvec(&iter, rw, bvec, nr, size);
> + pos = from = index << PAGE_SHIFT;
> +
> + if (rw == READ)
> + ret = vfs_iter_read(qcow2->file, &iter, &pos, 0);
> + else
> + ret = vfs_iter_write(qcow2->file, &iter, &pos, 0);
> +
> + if (ret == size) {
> + ret = 0;
> + } else if (ret > 0 && pos == qcow2->file_size &&
> + from + size - qcow2->file_size < PAGE_SIZE) {
> + /* Read near EOF? */
> + zero_fill_page_from(pages[nr-1], ret % PAGE_SIZE);
> + ret = 0;
> + } else if (ret >= 0) {
> + ret = -ENODATA;
> + }
> +
> + if (bvec != &bvec_on_stack)
> + kfree(bvec);
> + return ret;
> +}
> +
> +int rw_page_sync(unsigned int rw, struct qcow2 *qcow2,
> + u64 index, struct page *page)
> +{
> + struct page *pages[] = {page};
> +
> + return rw_pages_sync(rw, qcow2, index, pages, 1);
> +}
> +
> +static void qcow2_aio_do_completion(struct qio *qio)
> +{
> + if (!atomic_dec_and_test(&qio->aio_ref))
> + return;
> + qio->complete(qio);
> +}
> +
> +static void qcow2_aio_complete(struct kiocb *iocb, long ret, long ret2)
> +{
> + struct qio *qio = container_of(iocb, struct qio, iocb);
> +
> + WARN_ON_ONCE(ret > INT_MAX);
> + qio->ret = (int)ret;
> + qcow2_aio_do_completion(qio);
> +}
> +
> +void call_rw_iter(struct file *file, loff_t pos, unsigned int rw,
> + struct iov_iter *iter, struct qio *qio)
> +{
> + struct kiocb *iocb = &qio->iocb;
> + int ret;
> +
> + iocb->ki_pos = pos;
> + iocb->ki_filp = file;
> + iocb->ki_complete = qcow2_aio_complete;
> + iocb->ki_flags = IOCB_DIRECT;
> + iocb->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
> +
> + atomic_set(&qio->aio_ref, 2);
> +
> + if (rw == WRITE)
> + ret = call_write_iter(file, iocb, iter);
> + else
> + ret = call_read_iter(file, iocb, iter);
> +
> + qcow2_aio_do_completion(qio);
> +
> + if (ret != -EIOCBQUEUED)
> + iocb->ki_complete(iocb, ret, 0);
> +}
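
The aio_ref = 2 above is the usual idiom for supporting both synchronous and
asynchronous completion of ->read_iter()/->write_iter(): one reference belongs to
the submitter, one to ->ki_complete(), and whoever drops the last one runs
qio->complete() exactly once. For callers, the contract is roughly the sketch
below (names invented); note that the qio must not be touched after call_rw_iter()
returns, since completion may already have run.

	static void cluster_read_complete_sketch(struct qio *qio)
	{
		/* qio->ret holds the byte count or -errno reported by the AIO */
	}

	static void read_cluster_async_sketch(struct qcow2 *qcow2, struct qio *qio,
					      struct bio_vec *bvec, u32 nr_pages,
					      loff_t pos)
	{
		struct iov_iter iter;

		/* @bvec is assumed to describe @nr_pages full pages */
		iov_iter_bvec(&iter, READ, bvec, nr_pages,
			      (size_t)nr_pages << PAGE_SHIFT);
		qio->complete = cluster_read_complete_sketch;
		call_rw_iter(qcow2->file, pos, READ, &iter, qio);
		/* do not dereference @qio here */
	}
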
> +
> +void free_md_page(struct md_page *md)
> +{
> + WARN_ON_ONCE(md->wbd || md->lockd);
> + put_page(md->page);
> + kfree(md);
> +}
> +
> +static void free_md_pages_tree(struct rb_root *root)
> +{
> + struct rb_node *node;
> + struct md_page *md;
> +
> + while ((node = root->rb_node) != NULL) {
> + md = rb_entry(node, struct md_page, node);
> + rb_erase(node, root);
> + free_md_page(md);
> + }
> +}
> +
> +/* This flushes activity remaining after qio endio (delayed md page writeback) */
> +void flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2)
> +{
> + struct rb_node *node;
> + struct md_page *md;
> + int i;
> +
> + /*
> +	 * We need a second iteration, since revert_clusters_alloc()
> +	 * may start the timer again after a failed writeback.
> + */
> + for (i = 0; i < 2; i++) {
> + del_timer_sync(&qcow2->slow_wb_timer);
> + slow_wb_timer_fn(&qcow2->slow_wb_timer);
> + /* Start md writeback */
> + flush_workqueue(tgt->wq);
> + /* Wait AIO of md wb */
> + qcow2_inflight_ref_switch(tgt);
> + }
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + for (node = rb_first(&qcow2->md_pages);
> + node; node = rb_next(node)) {
> + md = rb_entry(node, struct md_page, node);
> + /* FIXME: call md_make_dirty() and try once again? */
> + if (md->status & MD_WRITEBACK_ERROR) {
> + pr_err("qcow2: Failed to write dirty pages\n");
> + tgt->md_writeback_error = true;
> + break;
> + }
> + }
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +}
> +
> +static void flush_deferred_activity_all(struct qcow2_target *tgt)
> +{
> + struct qcow2 *qcow2 = tgt->top;
> +
> + while (qcow2) {
> + flush_deferred_activity(tgt, qcow2);
> + qcow2 = qcow2->lower;
> + }
> +}
> +static void free_md_pages_all(struct qcow2_target *tgt)
> +{
> + struct qcow2 *qcow2 = tgt->top;
> +
> + while (qcow2) {
> + free_md_pages_tree(&qcow2->md_pages);
> + qcow2 = qcow2->lower;
> + }
> +}
> +
> +void qcow2_destroy(struct qcow2 *qcow2)
> +{
> + int i;
> +
> + for (i = 0; i < QLIST_COUNT; i++)
> + WARN(!list_empty(&qcow2->qios[i]),
> + "qcow2: list %d is not empty", i);
> +
> + WARN_ON(!list_empty(&qcow2->paused_qios) ||
> + !list_empty(&qcow2->wb_batch_list) ||
> + !list_empty(&qcow2->slow_wb_batch_list) ||
> + timer_pending(&qcow2->slow_wb_timer));
> +
> + free_md_pages_tree(&qcow2->md_pages);
> + if (qcow2->file)
> + fput(qcow2->file);
> +
> + kfree(qcow2);
> +}
> +
> +static void qcow2_tgt_destroy(struct qcow2_target *tgt)
> +{
> + struct qcow2 *lower, *qcow2 = tgt->top;
> + unsigned int i;
> +
> + if (tgt->wq) {
> + /*
> +		 * All activity from DM bios is already done,
> +		 * since DM waits for them. Complete our deferred work:
> + */
> + flush_deferred_activity_all(tgt);
> + /* Now kill the queue */
> + destroy_workqueue(tgt->wq);
> + mempool_destroy(tgt->qio_pool);
> + }
> +
> + for (i = 0; i < 2; i++)
> + percpu_ref_exit(&tgt->inflight_ref[i]);
> +
> + while (qcow2) {
> + lower = qcow2->lower;
> + qcow2_destroy(qcow2);
> + qcow2 = lower;
> + }
> +
> + kfree(tgt);
> +}
> +
> +static struct md_page *__md_page_find(struct qcow2 *qcow2, unsigned int id)
> +{
> + struct rb_node *node = qcow2->md_pages.rb_node;
> + struct md_page *md;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> +
> + while (node) {
> + md = rb_entry(node, struct md_page, node);
> + if (id < md->id)
> + node = node->rb_left;
> + else if (id > md->id)
> + node = node->rb_right;
> + else
> + return md;
> + }
> +
> + return NULL;
> +}
> +
> +static struct md_page *md_page_find(struct qcow2 *qcow2, unsigned int id)
> +{
> + struct md_page *md;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md = __md_page_find(qcow2, id);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + return md;
> +}
> +
> +/*
> + * This returns md if it's found and up to date, or NULL.
> + * @qio is zeroed if it's postponed.
> + */
> +struct md_page *md_page_find_or_postpone(struct qcow2 *qcow2, unsigned int id,
> + struct qio **qio)
> +{
> + struct md_page *md;
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md = __md_page_find(qcow2, id);
> + if (md && !(md->status & MD_UPTODATE)) {
> + if (qio) {
> + list_add_tail(&(*qio)->link, &md->wait_list);
> + *qio = NULL;
> + }
> + md = NULL;
> + }
> + spin_unlock_irq(&qcow2->md_pages_lock);
> +
> + return md;
> +}
> +
> +static void md_page_insert(struct qcow2 *qcow2, struct md_page *new_md)
> +{
> + struct rb_root *root = &qcow2->md_pages;
> + unsigned int new_id = new_md->id;
> + struct rb_node *parent, **node;
> + struct md_page *md;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + node = &root->rb_node;
> + parent = NULL;
> +
> + while (*node) {
> + parent = *node;
> + md = rb_entry(*node, struct md_page, node);
> + if (new_id < md->id)
> + node = &parent->rb_left;
> + else if (new_id > md->id)
> + node = &parent->rb_right;
> + else
> + BUG();
> + }
> +
> + rb_link_node(&new_md->node, parent, node);
> + rb_insert_color(&new_md->node, root);
> +}
> +
> +void md_page_erase(struct qcow2 *qcow2, struct md_page *md)
> +{
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + rb_erase(&md->node, &qcow2->md_pages);
> +}
> +
> +struct md_page *md_page_renumber(struct qcow2 *qcow2, unsigned int id,
> + unsigned int new_id)
> +{
> + struct md_page *md;
> +
> + lockdep_assert_held(&qcow2->md_pages_lock);
> + md = __md_page_find(qcow2, id);
> + if (md) {
> + WARN_ON_ONCE(!list_empty(&md->wait_list));
> + md_page_erase(qcow2, md);
> + md->id = new_id;
> + md_page_insert(qcow2, md);
> + }
> + return md;
> +}
> +
> +void zero_fill_page_from(struct page *page, unsigned int from)
> +{
> + void *addr = kmap_atomic(page);
> +
> + memset(addr + from, 0, PAGE_SIZE - from);
> + kunmap_atomic(addr);
> +}
> +
> +int alloc_and_insert_md_page(struct qcow2 *qcow2, u64 index, struct md_page **md)
> +{
> + int ret = -ENOMEM;
> +
> + *md = kmalloc(sizeof(**md), GFP_KERNEL);
> + if (!*md)
> + return -ENOMEM;
> + (*md)->page = alloc_page(GFP_KERNEL);
> + if (!(*md)->page)
> + goto err_kfree;
> +
> + (*md)->id = index;
> + (*md)->status = 0;
> + (*md)->wbd = NULL;
> + (*md)->lockd = NULL;
> + atomic_set(&(*md)->wpc_readers, 0);
> + (*md)->wpc_noread_count = 0;
> + INIT_LIST_HEAD(&(*md)->wait_list);
> + INIT_LIST_HEAD(&(*md)->wpc_readers_wait_list);
> + INIT_LIST_HEAD(&(*md)->wb_link);
> +
> + spin_lock_irq(&qcow2->md_pages_lock);
> + md_page_insert(qcow2, *md);
> + spin_unlock_irq(&qcow2->md_pages_lock);
> + return 0;
> +
> +err_kfree:
> + kfree(*md);
> + return ret;
> +}
> +
> +static void inflight_ref_exit0(struct percpu_ref *ref)
> +{
> + struct qcow2_target *tgt = container_of(ref, struct qcow2_target,
> + inflight_ref[0]);
> + complete(&tgt->inflight_ref_comp);
> +}
> +
> +static void inflight_ref_exit1(struct percpu_ref *ref)
> +{
> + struct qcow2_target *tgt = container_of(ref, struct qcow2_target,
> + inflight_ref[1]);
> + complete(&tgt->inflight_ref_comp);
> +}
> +
> +static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti)
> +{
> + percpu_ref_func_t *release;
> + struct qcow2_target *tgt;
> + unsigned int i, flags;
> +
> + tgt = kzalloc(sizeof(*tgt), GFP_KERNEL);
> + if (!tgt)
> + return NULL;
> + tgt->qio_pool = mempool_create_kmalloc_pool(MIN_QIOS,
> + sizeof(struct qio));
> + if (!tgt->qio_pool) {
> + ti->error = "Can't create mempool";
> + goto out_target;
> + }
> +
> + flags = WQ_MEM_RECLAIM|WQ_HIGHPRI|WQ_UNBOUND;
> + tgt->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, flags, 0);
> + if (!tgt->wq) {
> + ti->error = "Can't create workqueue";
> + goto out_pool;
> + }
> +
> + for (i = 0; i < 2; i++) {
> + release = i ? inflight_ref_exit1 : inflight_ref_exit0;
> + if (percpu_ref_init(&tgt->inflight_ref[i], release,
> + PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
> + if (i)
> + percpu_ref_exit(&tgt->inflight_ref[0]);
> + ti->error = "could not alloc percpu_ref";
> + goto out_wq;
> + }
> + }
> +
> + init_completion(&tgt->inflight_ref_comp);
> + spin_lock_init(&tgt->err_status_lock);
> + mutex_init(&tgt->ctl_mutex);
> + init_waitqueue_head(&tgt->service_wq);
> + ti->private = tgt;
> + tgt->ti = ti;
> + qcow2_set_service_operations(ti, false);
> +
> + return tgt;
> +out_wq:
> + destroy_workqueue(tgt->wq);
> +out_pool:
> + mempool_destroy(tgt->qio_pool);
> +out_target:
> + kfree(tgt);
> + return NULL;
> +}
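
Since qcow2_ref_inc()/qcow2_ref_dec() and qcow2_inflight_ref_switch() are in
dm-qcow2-map.c and not visible in this hunk, a hint for other reviewers: the two
percpu_refs plus the completion initialized here implement the usual "wait for a
grace period of in-flight qios" scheme. Roughly like the sketch below, which
deliberately ignores the synchronization against concurrent submitters that the
real helper has to provide:

	static void inflight_ref_switch_sketch(struct qcow2_target *tgt)
	{
		unsigned int old = tgt->inflight_ref_index;

		tgt->inflight_ref_index = !old;           /* new qios take the other ref */
		percpu_ref_kill(&tgt->inflight_ref[old]); /* no new users of the old one */
		wait_for_completion(&tgt->inflight_ref_comp); /* release cb fires at zero */
		reinit_completion(&tgt->inflight_ref_comp);
		percpu_ref_reinit(&tgt->inflight_ref[old]); /* PERCPU_REF_ALLOW_REINIT */
	}
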
> +
> +static int qcow2_check_convert_hdr(struct QCowHeader *raw_hdr,
> + struct QCowHeader *hdr,
> + u64 min_len, u64 max_len)
> +{
> + u32 clu_size;
> + bool ext_l2;
> +
> +	hdr->magic = be32_to_cpu(raw_hdr->magic);
> + hdr->version = be32_to_cpu(raw_hdr->version);
> + hdr->cluster_bits = be32_to_cpu(raw_hdr->cluster_bits);
> + hdr->size = be64_to_cpu(raw_hdr->size);
> + /*
> +	 * In this driver we never check that userspace passed the correct
> +	 * backing file fd, since it's impossible: the header may contain
> +	 * the name of a symlink.
> + */
> + hdr->backing_file_offset = be64_to_cpu(raw_hdr->backing_file_offset);
> + hdr->backing_file_size = be32_to_cpu(raw_hdr->backing_file_size);
> + hdr->crypt_method = be32_to_cpu(raw_hdr->crypt_method);
> + hdr->l1_size = be32_to_cpu(raw_hdr->l1_size);
> + hdr->l1_table_offset = be64_to_cpu(raw_hdr->l1_table_offset);
> + hdr->refcount_table_offset = be64_to_cpu(raw_hdr->refcount_table_offset);
> + hdr->refcount_table_clusters = be32_to_cpu(raw_hdr->refcount_table_clusters);
> + hdr->nb_snapshots = be32_to_cpu(raw_hdr->nb_snapshots);
> + hdr->snapshots_offset = be64_to_cpu(raw_hdr->snapshots_offset);
> +
> + clu_size = 1 << hdr->cluster_bits;
> + if (hdr->size < min_len || hdr->size > max_len ||
> + /* Note, we do not extend L1 table: */
> + (u64)hdr->l1_size * clu_size / sizeof(u64) * clu_size < min_len)
> + return -EBADSLT;
> +
> + if (hdr->magic != QCOW_MAGIC || hdr->version < 2 || hdr->version > 3 ||
> + (hdr->l1_table_offset & (clu_size - 1)) ||
> + hdr->cluster_bits < 9 || hdr->cluster_bits > 21 ||
> + (hdr->refcount_table_offset & (clu_size - 1)))
> + return -EINVAL;
> +
> + if (hdr->crypt_method != 0)
> + return -EOPNOTSUPP;
> +
> + hdr->refcount_order = 4;
> +
> + if (hdr->version == 2)
> + return 0;
> +
> + hdr->incompatible_features = be64_to_cpu(raw_hdr->incompatible_features);
> + hdr->autoclear_features = be64_to_cpu(raw_hdr->autoclear_features);
> + hdr->refcount_order = be32_to_cpu(raw_hdr->refcount_order);
> + hdr->header_length = be32_to_cpu(raw_hdr->header_length);
> +
> + if (kernel_sets_dirty_bit !=
> + !(hdr->incompatible_features & INCOMPATIBLE_FEATURES_DIRTY_BIT))
> + return kernel_sets_dirty_bit ? -EUCLEAN : -ENOLCK;
> + if (hdr->incompatible_features & ~INCOMPATIBLE_FEATURES_EXTL2_BIT)
> + return -EOPNOTSUPP;
> + ext_l2 = hdr->incompatible_features & INCOMPATIBLE_FEATURES_EXTL2_BIT;
> +
> + if (hdr->refcount_order > 6 || (ext_l2 && hdr->cluster_bits < 14))
> + return -EINVAL;
> +
> + if (hdr->header_length < offsetof(struct QCowHeader, compression_type))
> + return -EINVAL;
> +
> + if (hdr->header_length < offsetof(struct QCowHeader, padding))
> + return 0;
> +
> + hdr->compression_type = (u8)raw_hdr->compression_type;
> + if (hdr->compression_type != (u8)0)
> + return -EOPNOTSUPP;
> +
> + return 0;
> +}
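
A worked example for the '(u64)hdr->l1_size * clu_size / sizeof(u64) * clu_size'
check above, for the plain (non-extended-L2) layout; the helper name is made up:

	/* Bytes addressable by an L1 table of @l1_size entries: one L1 entry
	 * points to one L2 table of clu_size / 8 entries, each mapping a cluster. */
	static u64 l1_covered_bytes(u32 l1_size, u32 cluster_bits)
	{
		u64 clu_size = 1ULL << cluster_bits;

		return (u64)l1_size * (clu_size / sizeof(u64)) * clu_size;
	}
	/* e.g. l1_covered_bytes(1, 16) == 512 MiB, so a 10 GiB device with 64 KiB
	 * clusters needs l1_size >= 20; smaller L1 tables are rejected here. */
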
> +
> +void calc_cached_parameters(struct qcow2 *qcow2, struct QCowHeader *hdr)
> +{
> + s64 clu_size, reftable_clus = hdr->refcount_table_clusters;
> + loff_t pos, tmp, max;
> +
> + qcow2->clu_size = clu_size = 1 << hdr->cluster_bits;
> + qcow2->ext_l2 = hdr->incompatible_features & INCOMPATIBLE_FEATURES_EXTL2_BIT;
> + if (qcow2->ext_l2)
> + qcow2->subclu_size = clu_size / 32;
> + qcow2->l2_entries = clu_size / (sizeof(u64) * (1 + qcow2->ext_l2));
> + qcow2->refblock_bits = 1 << hdr->refcount_order;
> + qcow2->refblock_entries = clu_size * 8 / qcow2->refblock_bits;
> + pos = div64_s64(PAGE_SIZE * 8ULL, qcow2->refblock_bits) * clu_size;
> + qcow2->r2_page_covered_file_size = pos;
> + max = round_down(LLONG_MAX, clu_size);
> + tmp = div64_s64(reftable_clus * qcow2->refblock_entries, sizeof(u64));
> + if (div64_s64(max, (u64)clu_size * clu_size) >= tmp) {
> + tmp = div64_s64(reftable_clus * clu_size, sizeof(u64));
> + pos = tmp * qcow2->refblock_entries * clu_size;
> + } else {
> + pos = max;
> + }
> + qcow2->reftable_max_file_size = pos;
> +}
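
To make the refcount arithmetic above concrete, here is a standalone (userspace)
illustration of what the cached values come out to for a typical image: 64 KiB
clusters, refcount_order = 4 (16-bit refcounts), one refcount-table cluster, 4 KiB
pages. The numbers are only an example, not taken from the patch:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t page_size = 4096, clu_size = 1ULL << 16;
		uint64_t refblock_bits = 1ULL << 4;
		uint64_t refblock_entries = clu_size * 8 / refblock_bits;   /* 32768 */
		uint64_t r2_page_covered = page_size * 8 / refblock_bits * clu_size;
		uint64_t reftable_clus = 1;
		uint64_t reftable_max = reftable_clus * clu_size / 8 *      /* 8192 refblocks */
					refblock_entries * clu_size;

		printf("refblock_entries          = %llu\n",
		       (unsigned long long)refblock_entries);
		printf("r2_page_covered_file_size = %llu MiB\n",
		       (unsigned long long)(r2_page_covered >> 20));        /* 128 MiB */
		printf("reftable_max_file_size    = %llu TiB\n",
		       (unsigned long long)(reftable_max >> 40));           /* 16 TiB */
		return 0;
	}
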
> +
> +int qcow2_set_image_file_features(struct qcow2 *qcow2, bool dirty)
> +{
> + u64 dirty_mask = cpu_to_be64(INCOMPATIBLE_FEATURES_DIRTY_BIT);
> + struct QCowHeader *raw_hdr;
> + struct md_page *md;
> +
> + if (qcow2->hdr.version == 2)
> + return 0;
> +
> + md = md_page_find(qcow2, 0);
> + if (WARN_ON_ONCE(!md || !(md->status & MD_UPTODATE)))
> + return -EIO;
> +
> + raw_hdr = kmap(md->page);
> + qcow2->hdr.autoclear_features = raw_hdr->autoclear_features = 0;
> + if (kernel_sets_dirty_bit) {
> + if (dirty)
> + raw_hdr->incompatible_features |= dirty_mask;
> + else
> + raw_hdr->incompatible_features &= ~dirty_mask;
> + }
> + kunmap(md->page);
> +
> + return rw_page_sync(WRITE, qcow2, md->id, md->page);
> +}
> +
> +static struct qcow2 *qcow2_alloc_delta(struct qcow2_target *tgt, struct qcow2 *upper)
> +{
> + struct qcow2 *qcow2;
> + int i;
> +
> + qcow2 = kzalloc(sizeof(*qcow2), GFP_KERNEL);
> + if (!qcow2)
> + return ERR_PTR(-ENOMEM);
> + qcow2->tgt = tgt;
> +
> + for (i = 0; i < QLIST_COUNT; i++)
> + INIT_LIST_HEAD(&qcow2->qios[i]);
> + INIT_LIST_HEAD(&qcow2->paused_qios);
> + INIT_LIST_HEAD(&qcow2->wb_batch_list);
> + INIT_LIST_HEAD(&qcow2->slow_wb_batch_list);
> + spin_lock_init(&qcow2->deferred_lock);
> + spin_lock_init(&qcow2->md_pages_lock);
> + timer_setup(&qcow2->slow_wb_timer, slow_wb_timer_fn, 0);
> + INIT_WORK(&qcow2->worker, do_qcow2_work);
> + INIT_WORK(&qcow2->fsync_worker, do_qcow2_fsync_work);
> +
> + if (upper)
> + upper->lower = qcow2;
> + else /* Top delta */
> + tgt->top = qcow2;
> +
> + return qcow2;
> +}
> +
> +static int qcow2_attach_file(struct dm_target *ti, struct qcow2_target *tgt,
> + struct qcow2 *qcow2, int fd)
> +{
> + struct file *file;
> + fmode_t mode;
> +
> + file = qcow2->file = fget(fd);
> +	if (!file) /* On further errors, cleanup is done by the caller */
> + return -ENOENT;
> +
> + if (!S_ISREG(file_inode(file)->i_mode))
> + return -EINVAL;
> +
> + mode = tgt->top != qcow2 ? FMODE_READ : dm_table_get_mode(ti->table);
> + mode &= (FMODE_READ|FMODE_WRITE);
> + if (mode & ~(file->f_mode & (FMODE_READ|FMODE_WRITE)))
> + return -EACCES;
> +
> + return 0;
> +}
> +
> +static int qcow2_parse_header(struct dm_target *ti, struct qcow2 *qcow2,
> + struct qcow2 *upper, bool is_bottom)
> +{
> + struct QCowHeader *raw_hdr, *hdr = &qcow2->hdr;
> + loff_t min_len, max_len, new_size;
> + struct file *file = qcow2->file;
> + struct md_page *md;
> + int ret;
> +
> + qcow2->file_size = i_size_read(file_inode(file));
> + if ((file->f_mode & FMODE_WRITE) && (qcow2->file_size & ~PAGE_MASK)) {
> + new_size = PAGE_ALIGN(qcow2->file_size);
> + ret = qcow2_truncate_safe(file, new_size);
> + if (ret) {
> + pr_err("qcow2: Can't truncate file\n");
> + return ret;
> + } /* See md_page_read_complete() */
> + qcow2->file_size = new_size;
> + }
> + qcow2->file_preallocated_area_start = qcow2->file_size;
> +
> + ret = alloc_and_insert_md_page(qcow2, 0, &md);
> + if (ret)
> + return ret;
> + ret = rw_page_sync(READ, qcow2, md->id, md->page);
> + if (ret)
> + return ret;
> + md->status |= MD_UPTODATE;
> +
> + raw_hdr = kmap(md->page);
> + min_len = to_bytes(ti->len);
> + max_len = LLONG_MAX;
> + if (upper) {
> + min_len = PAGE_SIZE;
> + max_len = upper->hdr.size;
> + }
> + ret = qcow2_check_convert_hdr(raw_hdr, hdr, min_len, max_len);
> + kunmap(md->page);
> + if (ret < 0)
> + goto out;
> +
> + calc_cached_parameters(qcow2, hdr);
> + ret = -EOPNOTSUPP;
> + if (qcow2->clu_size < PAGE_SIZE ||
> + (qcow2->ext_l2 && qcow2->clu_size < PAGE_SIZE * 32))
> + goto out;
> + ret = -EXDEV;
> + if (upper && (upper->clu_size != qcow2->clu_size ||
> + upper->ext_l2 != qcow2->ext_l2))
> + goto out; /* This is not supported yet */
> + ret = -ENOENT;
> + if (is_bottom && qcow2->hdr.backing_file_offset)
> + goto out;
> + qcow2->free_cluster_search_pos = qcow2->clu_size * 1;
> +
> + ret = -EFBIG;
> + if (qcow2->reftable_max_file_size < qcow2->file_size)
> + goto out;
> + ret = 0;
> +out:
> + return ret;
> +}
> +
> +static int qcow2_parse_metadata(struct dm_target *ti, struct qcow2_target *tgt)
> +{
> + unsigned int i, nr_images = tgt->nr_images;
> + struct qcow2 *qcow2, *upper = NULL;
> + int ret;
> +
> + qcow2 = top_qcow2_protected(ti);
> + for (i = 0; i < nr_images; i++) {
> + ret = -ENOENT;
> + if (!qcow2)
> + goto out;
> +
> + ret = qcow2_parse_header(ti, qcow2, upper, i == nr_images - 1);
> + if (ret)
> + goto out;
> +
> + upper = qcow2;
> + qcow2 = qcow2->lower;
> + }
> +
> + ret = 0;
> +out:
> + if (ret)
> + pr_err("dm-qcow2: Can't parse metadata\n");
> + return ret;
> +}
> +
> +static int qcow2_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> +{
> + struct qcow2 *qcow2, *upper = NULL;
> + struct qcow2_target *tgt;
> + int i, fd, ret;
> +
> + if (argc < 1 || ti->begin != 0)
> + return -EINVAL;
> +
> + tgt = alloc_qcow2_target(ti);
> + if (!tgt)
> + return -ENOMEM;
> +
> + /*
> + * Userspace passes deltas in bottom, ..., top order,
> +	 * but we attach them vice versa: from top to bottom.
> + */
> + for (i = argc - 1; i >= 0; i--) {
> + ret = -EINVAL;
> + if (kstrtos32(argv[i], 10, &fd) < 0) {
> + ti->error = "Wrong fd";
> + goto err;
> + }
> +
> + qcow2 = qcow2_alloc_delta(tgt, upper);
> + if (IS_ERR(qcow2)) {
> + ret = PTR_ERR(qcow2);
> + goto err;
> + }
> +
> + ret = qcow2_attach_file(ti, tgt, qcow2, fd);
> + if (ret) {
> + ti->error = "Error attaching file";
> + goto err;
> + }
> +
> + upper = qcow2;
> + }
> +
> + tgt->nr_images = argc;
> +
> + ret = qcow2_parse_metadata(ti, tgt);
> + if (ret)
> + goto err;
> +
> + ti->flush_supported = true;
> + ti->num_flush_bios = 1;
> + ti->discards_supported = true;
> + ti->num_discard_bios = 1;
> + ti->per_io_data_size = qcow2_per_io_data_size();
> + return 0;
> +err:
> + qcow2_tgt_destroy(tgt);
> + return ret;
> +}
> +
> +static void qcow2_dtr(struct dm_target *ti)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> +
> + qcow2_tgt_destroy(tgt);
> +}
> +
> +static void qcow2_truncate_preallocations(struct dm_target *ti)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> + struct qcow2 *qcow2 = top_qcow2_protected(ti);
> + loff_t end = qcow2->file_preallocated_area_start;
> + int ret;
> +
> + if (!(dm_table_get_mode(ti->table) & FMODE_WRITE))
> + return;
> + if (end == qcow2->file_size)
> + return;
> +
> + ret = qcow2_truncate_safe(qcow2->file, end);
> + if (ret) {
> + pr_err("dm-qcow2: Can't truncate preallocations\n");
> + tgt->truncate_error = true;
> + return;
> + }
> +
> + qcow2->file_preallocated_area_start = end;
> + qcow2->file_size = end;
> +}
> +
> +static void qcow2_io_hints(struct dm_target *ti, struct queue_limits *limits)
> +{
> + struct qcow2 *qcow2 = top_qcow2_protected(ti);
> + unsigned int block_size = 512;
> + struct super_block *sb;
> +
> + sb = file_inode(qcow2->file)->i_sb;
> + if (sb->s_bdev)
> + block_size = bdev_logical_block_size(sb->s_bdev);
> + /*
> +	 * Even if this is less than the bdev's discard_granularity,
> +	 * we can still free a block on the filesystem.
> + */
> + limits->discard_granularity = sb->s_blocksize;
> + limits->max_discard_sectors = to_sector(qcow2->clu_size);
> +
> + limits->logical_block_size = block_size;
> + limits->physical_block_size = block_size;
> +
> + blk_limits_io_min(limits, block_size);
> + blk_limits_io_opt(limits, qcow2->clu_size);
> +}
> +
> +static void qcow2_status(struct dm_target *ti, status_type_t type,
> + unsigned int status_flags, char *result,
> + unsigned int maxlen)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> + unsigned int sz = 0;
> +
> + switch (type) {
> + case STATUSTYPE_INFO:
> + result[0] = '\0';
> + break;
> + case STATUSTYPE_TABLE:
> + DMEMIT("%u", tgt->nr_images);
> + break;
> + }
> +}
> +
> +static void qcow2_presuspend(struct dm_target *ti)
> +{
> + qcow2_set_service_operations(ti, false);
> +}
> +static void qcow2_presuspend_undo(struct dm_target *ti)
> +{
> + qcow2_set_service_operations(ti, true);
> +}
> +static void qcow2_postsuspend(struct dm_target *ti)
> +{
> + struct qcow2 *qcow2 = top_qcow2_protected(ti);
> + int ret;
> +
> + flush_deferred_activity_all(to_qcow2_target(ti));
> + qcow2_truncate_preallocations(ti);
> +
> + if (dm_table_get_mode(ti->table) & FMODE_WRITE) {
> + ret = qcow2_set_image_file_features(qcow2, false);
> + if (ret)
> + pr_err("qcow2: Can't set features\n");
> + }
> +}
> +static int qcow2_preresume(struct dm_target *ti)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> + int ret = 0;
> +
> + if (qcow2_wants_check(tgt)) {
> + pr_err("qcow2: image check and target reload are required\n");
> + return -EIO;
> + }
> +
> + free_md_pages_all(tgt);
> + /*
> + * Reading metadata here allows userspace to modify images
> +	 * of a suspended device without reloading the target. We also
> + * want to do this in .ctr to break device creation early
> + * if images are not valid.
> + */
> + ret = qcow2_parse_metadata(ti, tgt);
> + if (ret)
> + return ret;
> + /*
> +	 * Although .preresume has no undo, our target is a singleton,
> +	 * so we can set features unconditionally here.
> + */
> + if (dm_table_get_mode(ti->table) & FMODE_WRITE) {
> + ret = qcow2_set_image_file_features(tgt->top, true);
> + if (ret)
> + pr_err("qcow2: Can't set features\n");
> + }
> +
> + return ret;
> +}
> +static void qcow2_resume(struct dm_target *ti)
> +{
> + qcow2_set_service_operations(ti, true);
> +}
> +
> +static struct target_type qcow2_target = {
> + .name = "qcow2",
> + .version = {1, 0, 0},
> + .features = DM_TARGET_SINGLETON|DM_TARGET_IMMUTABLE,
> + .module = THIS_MODULE,
> + .ctr = qcow2_ctr,
> + .dtr = qcow2_dtr,
> + .io_hints = qcow2_io_hints,
> + .status = qcow2_status,
> + .presuspend = qcow2_presuspend,
> + .presuspend_undo = qcow2_presuspend_undo,
> + .postsuspend = qcow2_postsuspend,
> + .preresume = qcow2_preresume,
> + .resume = qcow2_resume,
> + .clone_and_map_rq = qcow2_clone_and_map,
> + .message = qcow2_message,
> +};
> +
> +static int __init dm_qcow2_init(void)
> +{
> + int ret;
> +
> + ret = dm_register_target(&qcow2_target);
> + if (ret)
> + DMERR("qcow2 target registration failed: %d", ret);
> +
> + return ret;
> +}
> +
> +static void __exit dm_qcow2_exit(void)
> +{
> + dm_unregister_target(&qcow2_target);
> +}
> +
> +module_init(dm_qcow2_init);
> +module_exit(dm_qcow2_exit);
> +
> +MODULE_DESCRIPTION("QCOW2 block device driver");
> +MODULE_AUTHOR("Kirill Tkhai <ktkhai at virtuozzo.com>");
> +MODULE_LICENSE("GPL");
> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h
> new file mode 100644
> index 000000000000..9d4a9f8a1453
> --- /dev/null
> +++ b/drivers/md/dm-qcow2.h
> @@ -0,0 +1,360 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __DM_QCOW2_H
> +#define __DM_QCOW2_H
> +
> +#include <linux/percpu-refcount.h>
> +#include <linux/device-mapper.h>
> +#include <linux/fs.h>
> +
> +#define DM_MSG_PREFIX "qcow2"
> +
> +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
> +/*
> + * QEMU has this limit, so we should follow it to keep our images
> + * mountable in VMs. Note that it is possible to create a disk in QEMU
> + * with parameters whose size can't be covered by the refcount table.
> + */
> +#define REFCOUNT_TABLE_MAX_SIZE (8 * 1024 * 1024)
> +
> +#define MIN_QIOS 512
> +#define WB_TIMEOUT_JI (60 * HZ)
> +#define PREALLOC_SIZE (128ULL * 1024 * 1024)
> +
> +struct QCowHeader {
> + uint32_t magic;
> + uint32_t version;
> + uint64_t backing_file_offset;
> + uint32_t backing_file_size;
> + uint32_t cluster_bits;
> + uint64_t size; /* in bytes */
> + uint32_t crypt_method;
> + uint32_t l1_size; /* number of entries in the active L1 table (not clusters) */
> + uint64_t l1_table_offset;
> + uint64_t refcount_table_offset;
> + uint32_t refcount_table_clusters;
> + uint32_t nb_snapshots;
> + uint64_t snapshots_offset;
> +
> + /* The following fields are only valid for version >= 3 */
> +#define INCOMPATIBLE_FEATURES_DIRTY_BIT (1 << 0)
> +#define INCOMPATIBLE_FEATURES_EXTL2_BIT (1 << 4)
> + uint64_t incompatible_features;
> + uint64_t compatible_features;
> + uint64_t autoclear_features;
> +
> + uint32_t refcount_order;
> + uint32_t header_length;
> +
> + /* Additional fields */
> + uint8_t compression_type;
> +
> + /* header must be a multiple of 8 */
> + uint8_t padding[7];
> +} __packed;
> +
> +struct wb_desc {
> + struct md_page *md;
> +#define LX_INDEXES_PER_PAGE (PAGE_SIZE / sizeof(u64))
> +#define LX_INDEXES_BYTES (BITS_TO_LONGS(LX_INDEXES_PER_PAGE) * sizeof(unsigned long))
> + unsigned long *changed_indexes;
> + /*
> + * Contains old stable values of preallocated/cow entries
> +	 * to restore them in case md writeback fails.
> + */
> + struct page *pe_page;
> + struct list_head submitted_list;
> + struct list_head completed_list;
> + /*
> +	 * These bios are dispatched on writeback success,
> +	 * or completed via bio_endio() on error.
> +	 * XXX: Possibly we need the same for plain struct md_page.
> + */
> + struct list_head dependent_list;
> + bool completed;
> + int ret;
> +};
> +
> +struct lock_desc {
> + int nr; /* Number of set bits */
> + unsigned long indexes[LX_INDEXES_BYTES/sizeof(unsigned long)];
> +};
> +
> +struct md_page {
> + struct rb_node node;
> + u64 id; /* Number of this page starting from hdr */
> +#define MD_UPTODATE (1U << 0) /* Page was read from disk */
> +#define MD_DIRTY (1U << 1) /* Page contains changes and wants writeback */
> +#define MD_WRITEBACK (1U << 2) /* Writeback was submitted */
> +#define MD_WRITEBACK_ERROR (1U << 3) /* Last writeback failed with error */
> + unsigned int status;
> + struct page *page;
> + struct list_head wait_list;
> + /* To link in qcow2::{,slow}wb_batch_list and qcow2::QLIST_COMPLETED_WB */
> + struct list_head wb_link;
> + struct wb_desc *wbd; /* For L1 and L2 update */
> + struct lock_desc *lockd; /* Locked clus map */
> + /*
> +	 * Readers of clusters for which a WRITE results in COW.
> + * These are compressed clusters, snapshot clusters, etc.
> + */
> + atomic_t wpc_readers;
> + int wpc_noread_count; /* Read is prohibited, if positive */
> + struct list_head wpc_readers_wait_list;
> +};
> +
> +struct qcow2_target {
> + struct dm_target *ti;
> + mempool_t *qio_pool;
> + /*
> +	 * start_processing_qrq() is the only place during IO handling
> + * where it's allowed to dereference @top. See backward merge.
> + */
> + struct qcow2 *top;
> + struct workqueue_struct *wq;
> +
> + struct completion inflight_ref_comp;
> + struct percpu_ref inflight_ref[2];
> + unsigned int inflight_ref_index:1;
> +
> + bool service_operations_allowed;
> + bool md_writeback_error;
> + bool truncate_error;
> +
> + unsigned int nr_images;
> +
> + atomic_t service_qios;
> + struct wait_queue_head service_wq;
> +
> + spinlock_t err_status_lock;
> + struct mutex ctl_mutex;
> +};
> +
> +enum {
> + QLIST_EMBEDDED = 0, /*
> +			     * List for the initial setup of embedded qios
> +			     * related to a qrq (in process context).
> +			     * This is used only for the top qcow2 image.
> + */
> + QLIST_DEFERRED,
> + QLIST_FLUSH,
> + QLIST_COMPLETED_WB,
> + QLIST_ZREAD,
> + QLIST_BMERGE_WRITE,
> + QLIST_COW_DATA,
> + QLIST_COW_INDEXES,
> + QLIST_COW_END,
> +
> + QLIST_COUNT,
> + QLIST_INVALID = QLIST_COUNT,
> +};
> +
> +struct qcow2 {
> + struct qcow2_target *tgt;
> + struct file *file;
> + loff_t file_size;
> + loff_t file_preallocated_area_start;
> + /* Maximum file size covered by refcount table */
> + loff_t reftable_max_file_size;
> + /* Position to search next unused cluster */
> + loff_t free_cluster_search_pos;
> +
> + struct qcow2 *lower; /* Lower delta (backing file) */
> +
> + struct rb_root md_pages; /* Metadata pages */
> + struct QCowHeader hdr;
> + u32 clu_size;
> + u32 subclu_size;
> + u32 l2_entries;
> + u32 refblock_bits;
> + u32 refblock_entries;
> + bool ext_l2;
> +
> + bool pause_submitting_qios; /* This is used only on top qcow2 image */
> + bool backward_merge_in_process;
> + /* File size covered by single page of block entries */
> + loff_t r2_page_covered_file_size;
> +
> + spinlock_t deferred_lock ____cacheline_aligned;
> + spinlock_t md_pages_lock;
> +
> + struct list_head qios[QLIST_COUNT];
> + struct list_head paused_qios; /* For pause_submitting_qios */
> +
> + /* For batching md update: */
> + struct list_head wb_batch_list;
> + struct list_head slow_wb_batch_list;
> + struct timer_list slow_wb_timer;
> +
> + struct work_struct worker;
> + struct work_struct fsync_worker;
> +};
> +
> +/*
> + * struct qio is embedded in every incoming request, so we keep it
> + * as small as possible. It is sized to hold only what the most
> + * likely actions need. To process COW, compressed
> + * clusters and other rare actions we need more auxiliary bytes,
> + * so we introduce this struct qio_ext in addition to struct qio.
> + */
> +struct qio_ext {
> + struct md_page *lx_md, *r2_md, *md;
> + u32 lx_index_in_page, r2_index_in_page;
> + u64 allocated_clu_pos;
> +
> + loff_t cow_clu_pos;
> + loff_t cow_clu_end;
> + u64 new_ext_l2;
> + u32 cow_mask;
> + bool only_set_ext_l2:1;
> +
> + u8 cow_level;
> +
> +#define MD_INDEX_SET_UNLOCKED (1ULL << 0)
> +#define DEC_WPC_NOREAD_COUNT (1ULL << 1)
> +#define FREE_QIO_DATA_QVEC (1ULL << 2)
> +#define FREE_ALLOCATED_CLU (1ULL << 3)
> + u8 cleanup_mask;
> +	u16 zdata_off; /* Offset in the first page */
> + u16 cow_segs;
> +};
> +
> +struct qcow2_rq {
> + struct request *rq;
> + struct bio_vec *bvec;
> +};
> +
> +struct qio;
> +typedef void (*qcow2_endio_t)(struct qcow2_target *, struct qio *,
> + void *, blk_status_t);
> +
> +struct qio {
> + struct bvec_iter bi_iter;
> + struct bio_vec *bi_io_vec;
> + unsigned int bi_op;
> + blk_status_t bi_status;
> +#define QIO_FREE_ON_ENDIO_FL (1 << 0) /* Free this qio memory from qio_endio() */
> +#define QIO_IS_MERGE_FL (1 << 3) /* This is service merge qio */
> +#define QIO_IS_DISCARD_FL (1 << 4) /* This zeroes index on backward merge */
> +#define QIO_IS_L1COW_FL (1 << 5) /* This qio only wants COW at L1 */
> +#define QIO_SPLIT_INHERITED_FLAGS (QIO_IS_DISCARD_FL)
> + u8 flags;
> +#define REF_INDEX_INVALID 2
> + u8 ref_index:2;
> + /*
> + * Some operations (say, COW) have more than one stage.
> +	 * In case a stage has to delay the qio (say, it may want
> +	 * to wait for an md page to be read from disk, or for some
> +	 * counter to become zero), this queue_list_id shows the place
> +	 * where qio processing should resume.
> + */
> + u8 queue_list_id:4;
> +
> + atomic_t remaining;
> +
> + struct kiocb iocb;
> + atomic_t aio_ref;
> + int ret; /* iocb result */
> + void (*complete)(struct qio *me);
> + void *data;
> + /* Some operations (COW) require special destruction or requeue */
> + struct qio_ext *ext;
> + struct list_head link;
> + struct qcow2 *qcow2;
> + qcow2_endio_t endio_cb;
> + void *endio_cb_data;
> +};
> +
> +#define CLU_OFF(qcow2, pos) (pos & (qcow2->clu_size - 1))
> +#define SUBCLU_OFF(qcow2, pos) (pos & (qcow2->subclu_size - 1))
> +
> +void qcow2_destroy(struct qcow2 *qcow2);
> +int qcow2_set_image_file_features(struct qcow2 *qcow2, bool dirty);
> +int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv,
> + char *result, unsigned int maxlen);
> +int qcow2_clone_and_map(struct dm_target *ti, struct request *rq,
> + union map_info *info, struct request **clone);
> +
> +void do_qcow2_work(struct work_struct *ws);
> +void do_qcow2_fsync_work(struct work_struct *ws);
> +int alloc_and_insert_md_page(struct qcow2 *qcow2, u64 index, struct md_page **md);
> +struct md_page *md_page_find_or_postpone(struct qcow2 *qcow2, unsigned int id, struct qio **qio);
> +struct md_page *md_page_renumber(struct qcow2 *qcow2, unsigned int id, unsigned int new_id);
> +void md_page_erase(struct qcow2 *qcow2, struct md_page *md);
> +void free_md_page(struct md_page *md);
> +void zero_fill_page_from(struct page *page, unsigned int from);
> +int rw_page_sync(unsigned int rw, struct qcow2 *qcow2, u64 index, struct page *page);
> +void call_rw_iter(struct file *file, loff_t pos, unsigned int rw,
> + struct iov_iter *iter, struct qio *qio);
> +void calc_cached_parameters(struct qcow2 *qcow2, struct QCowHeader *hdr);
> +void slow_wb_timer_fn(struct timer_list *t);
> +struct qio *alloc_qio(mempool_t *pool, bool zero);
> +void init_qio(struct qio *qio, unsigned int bi_op, struct qcow2 *qcow2);
> +void dispatch_qios(struct qcow2 *qcow2, struct qio *qio,
> + struct list_head *qio_list);
> +void submit_embedded_qios(struct qcow2_target *tgt, struct list_head *list);
> +struct qcow2 *qcow2_ref_inc(struct qcow2_target *tgt, u8 *ref_index);
> +void qcow2_ref_dec(struct qcow2_target *tgt, u8 ref_index);
> +int qcow2_inflight_ref_switch(struct qcow2_target *tgt);
> +void flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2);
> +int qcow2_truncate_safe(struct file *file, loff_t new_len);
> +
> +static inline ssize_t qcow2_per_io_data_size(void)
> +{
> + return sizeof(struct qcow2_rq) + sizeof(struct qio);
> +}
> +static inline struct qcow2_rq *map_info_to_embedded_qrq(union map_info *info)
> +{
> + return (void *)info->ptr;
> +}
> +static inline struct qio *map_info_to_embedded_qio(union map_info *info)
> +{
> + return (void *)info->ptr + sizeof(struct qcow2_rq);
> +}
> +static inline struct qcow2_rq *embedded_qio_to_qrq(struct qio *qio)
> +{
> + return (void *)qio - sizeof(struct qcow2_rq);
> +}
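
The three helpers above encode the per-request layout: DM hands us
ti->per_io_data_size bytes per request (info->ptr), the qcow2_rq sits at the start
and the embedded qio immediately after it. An illustrative sanity check of that
layout (not something the patch needs):

	static inline void check_embedded_layout(union map_info *info)
	{
		struct qcow2_rq *qrq = map_info_to_embedded_qrq(info);
		struct qio *qio = map_info_to_embedded_qio(info);

		WARN_ON_ONCE((void *)qio != (void *)(qrq + 1));
		WARN_ON_ONCE(embedded_qio_to_qrq(qio) != qrq);
	}
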
> +
> +static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti)
> +{
> + return ti->private;
> +}
> +
> +static inline struct qcow2 *top_qcow2_protected(struct dm_target *ti)
> +{
> + struct qcow2_target *tgt = to_qcow2_target(ti);
> +
> + return tgt->top;
> +}
> +
> +static inline struct qio *qio_list_pop(struct list_head *qio_list)
> +{
> + struct qio *qio;
> +
> + qio = list_first_entry_or_null(qio_list, struct qio, link);
> + if (qio)
> + list_del_init(&qio->link);
> + return qio;
> +}
> +
> +static inline bool fake_merge_qio(struct qio *qio)
> +{
> + return (qio->bi_op == REQ_OP_WRITE &&
> + qio->bi_iter.bi_size == 0 &&
> + (qio->flags & QIO_IS_MERGE_FL));
> +}
> +
> +static inline bool fake_l1cow_qio(struct qio *qio)
> +{
> + return (qio->bi_op == REQ_OP_WRITE &&
> + qio->bi_iter.bi_size == 0 &&
> + (qio->flags & QIO_IS_L1COW_FL));
> +}
> +
> +static inline bool qcow2_wants_check(struct qcow2_target *tgt)
> +{
> +	return !!(tgt->md_writeback_error | tgt->truncate_error);
> +}
> +
> +#endif
> diff --git a/scripts/qcow2-dm.sh b/scripts/qcow2-dm.sh
> new file mode 100755
> index 000000000000..e29205e3f973
> --- /dev/null
> +++ b/scripts/qcow2-dm.sh
> @@ -0,0 +1,103 @@
> +#!/bin/bash
> +#
> +# This requires the module parameter dm_qcow2.kernel_sets_dirty_bit=y
> +
> +usage () {
> + cat <<EOF
> +Usage:
> + $prog_name create <file.qcow2> <dev_name>
> + $prog_name remove <file.qcow2>
> +EOF
> +}
> +
> +create () {
> + if [ "$#" -ne 2 ]; then
> + echo >&2 "Wrong number of arguments."; usage; exit 1;
> + fi
> +
> + file=$1
> + dev=$2
> + files=()
> + fds=""
> +
> + disk_sz=`qemu-img info -f qcow2 $file | grep "virtual size" | sed 's/.*(\(.*\) bytes)/\1/'`
> + if [ -z "$disk_sz" ]; then
> + echo "Can't get disk size."; exit 1;
> + fi
> +
> + while :; do
> + if [ ! -f "$file" ]; then
> + echo "$file does not exist."; exit 1;
> + fi
> +
> + files+=("$file")
> +
> + exec {fd}<>$file || exit 1
> + flock -x $fd || exit 1
> + fds="$fd $fds"
> +
> + file=`qemu-img info $file | grep "backing file:" | sed "s/backing file: //"`
> + if [ -z "$file" ]; then
> + break
> + fi
> + done
> +
> + echo "Create device [$dev] of size $disk_sz from [${files[*]}]."
> + dmsetup create $dev --table "0 $((disk_sz / 512)) qcow2 ${fds}"
> +}
> +
> +remove () {
> + if [ "$#" -ne 1 ]; then
> + echo >&2 "Wrong number of arguments."; usage; exit 1;
> + fi
> + user_path=$1
> + path=`realpath $user_path`
> +
> + while read line; do
> + dev=`echo $line | sed "s/:.*//"`
> +		nr_imgs=`echo $line | sed "s/.* //"`
> + top_img_id=$((nr_imgs - 1))
> +
> + top_img_path=`dmsetup message $dev 0 get_img_name $top_img_id`
> + if [ -z "$top_img_path" ]; then
> + echo "Can't get image path."; exit 1;
> + fi
> +
> + if [ "$path" != "$top_img_path" ]; then
> + continue
> + fi
> +
> + echo "Removing device [$dev]."
> + dmsetup remove $dev
> + ret=$?
> +
> +		if [ $ret -eq 0 ]; then
> + #Sanity check
> + echo "Checking [$top_img_path]."
> + qemu-img check $top_img_path
> + fi
> + exit $ret
> +
> + done < <(LANG=C dmsetup table --target=qcow2 | grep -v "No devices found")
> +
> + echo "Can't find device with [$user_path] top image."
> + exit 1
> +}
> +
> +prog_name=$(basename $0)
> +
> +case $1 in
> + "create")
> + shift
> + create "$@"
> +		exit $?
> + ;;
> + "remove")
> + shift
> + remove "$@"
> + ;;
> + *)
> + usage
> + exit 1
> + ;;
> +esac
>
>
> .
>