[Devel] [PATCH RH9 v2 5/8] dm/dm-qcow2: add llseek_hole

Alexander Atanasov alexander.atanasov at virtuozzo.com
Mon Aug 21 20:20:52 MSK 2023


On 16.08.23 12:32, Andrey Zhadchenko wrote:
> Implement llseek_hole() for the dm-qcow2 target.
> Iterate over ranges with cluster granularity until a hole or data is found.
> To reduce code duplication, reuse the already existing parse_metadata():
> we can pretend that the seek request is a read request for metadata purposes
> and then interpret the parsing result in our favor.
> Since parse_metadata() supports request postponing (for example when the
> requested L2 cluster is absent in RAM), we create a separate qio list
> for our queries.
> 
> Feature: dm: implement SEEK_HOLE for dm-qcow2 and dm-ploop
> https://jira.vzint.dev/browse/PSBM-145746
> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
> 
> ---
> v2 major rework:
>   - move some code to new functions
>   - alloc seek qios dynamically
>   - handle chaining images with more care
>   - do not skip L2 if backing image exists
> 
>   drivers/md/dm-qcow2-map.c    | 257 +++++++++++++++++++++++++++++++++++
>   drivers/md/dm-qcow2-target.c |   1 +
>   drivers/md/dm-qcow2.h        |   2 +
>   3 files changed, 260 insertions(+)
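
The approach of reusing parse_metadata() by treating the seek as a read looks
reasonable to me. For reference, a minimal userspace sketch that should hit the
new path, assuming the rest of the series wires lseek(SEEK_HOLE/SEEK_DATA) on
the dm node through to this method (the device path below is made up, and
SEEK_HOLE/SEEK_DATA need _GNU_SOURCE on glibc):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical dm-qcow2 device node, adjust to your setup. */
	int fd = open("/dev/mapper/qcow2-test", O_RDONLY);
	off_t data, hole;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* With the full series applied, these should end up in
	 * qcow2_llseek_hole(); SEEK_DATA fails with ENXIO when there is
	 * no allocated data past the offset. */
	data = lseek(fd, 0, SEEK_DATA);
	hole = lseek(fd, 0, SEEK_HOLE);

	printf("first data at %lld, first hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}
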
> 
> diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c
> index 8c1620c11137..b15caaff8c98 100644
> --- a/drivers/md/dm-qcow2-map.c
> +++ b/drivers/md/dm-qcow2-map.c
> @@ -4000,6 +4000,14 @@ static void process_resubmit_qios(struct qcow2 *qcow2, struct list_head *qios)
>   	}
>   }
>   
> +static void process_seek_qios(struct qcow2 *qcow, struct list_head *qios)
> +{
> +	struct qio *qio;
> +
> +	while ((qio = qio_list_pop(qios)) != NULL)
> +		complete(qio->data);
> +}
> +
>   void do_qcow2_work(struct work_struct *ws)
>   {
>   	struct qcow2 *qcow2 = container_of(ws, struct qcow2, worker);
> @@ -4011,6 +4019,7 @@ void do_qcow2_work(struct work_struct *ws)
>   	LIST_HEAD(cow_indexes_qios);
>   	LIST_HEAD(cow_end_qios);
>   	LIST_HEAD(resubmit_qios);
> +	LIST_HEAD(seek_qios);
>   	unsigned int pflags = current->flags;
>   
>   	current->flags |= PF_LOCAL_THROTTLE|PF_MEMALLOC_NOIO;
> @@ -4023,6 +4032,7 @@ void do_qcow2_work(struct work_struct *ws)
>   	list_splice_init(&qcow2->qios[QLIST_COW_INDEXES], &cow_indexes_qios);
>   	list_splice_init(&qcow2->qios[QLIST_COW_END], &cow_end_qios);
>   	list_splice_init(&qcow2->resubmit_qios, &resubmit_qios);
> +	list_splice_init(&qcow2->qios[QLIST_SEEK], &seek_qios);
>   	spin_unlock_irq(&qcow2->deferred_lock);
>   
>   	process_embedded_qios(qcow2, &embedded_qios, &deferred_qios);
> @@ -4033,6 +4043,7 @@ void do_qcow2_work(struct work_struct *ws)
>   	process_cow_indexes_write(qcow2, &cow_indexes_qios);
>   	process_cow_end(qcow2, &cow_end_qios);
>   	process_resubmit_qios(qcow2, &resubmit_qios);
> +	process_seek_qios(qcow2, &seek_qios);
>   
>   	/* This actually submits batch of md writeback, initiated above */
>   	submit_metadata_writeback(qcow2);
> @@ -4255,3 +4266,249 @@ static void handle_cleanup_mask(struct qio *qio)
>   		ext->cleanup_mask &= ~FREE_ALLOCATED_CLU;
>   	}
>   }
> +
> +struct qio_data_llseek_hole {
> +	struct completion compl;
> +	struct qio *higher;
> +	loff_t lim;
> +};
> +
> +#define SEEK_QIO_DATA(qio) ((struct qio_data_llseek_hole *)qio->data)
> +
> +struct qio *alloc_seek_qio(struct qcow2 *qcow2, struct qio *parent, loff_t new_lim)
> +{
> +	struct qio_data_llseek_hole *data;
> +	struct qio *qio;
> +
> +	qio = qcow2_alloc_qio(qcow2->tgt->qio_pool, true);
> +	if (!qio)
> +		return NULL;
> +
> +	qcow2_init_qio(qio, REQ_OP_READ, qcow2);
> +	qio->queue_list_id = QLIST_SEEK;
> +
> +	data = kzalloc(sizeof(struct qio_data_llseek_hole), GFP_KERNEL);
> +	if (!data) {
> +		qcow2_free_qio(qio, qcow2->tgt->qio_pool);
> +		return NULL;
> +	}
> +
> +	qio->data = data;
> +	init_completion(&data->compl);
> +	data->lim = new_lim;
> +
> +	if (parent) {
> +		data->higher = parent;
> +		qio->bi_iter.bi_sector = parent->bi_iter.bi_sector;
> +
> +		if (to_bytes(qio->bi_iter.bi_sector) + parent->bi_iter.bi_size > new_lim)
> +			qio->bi_iter.bi_size = new_lim - to_bytes(qio->bi_iter.bi_sector);
> +		else
> +			qio->bi_iter.bi_size = parent->bi_iter.bi_size;
> +	}
> +
> +	return qio;
> +}
> +
> +struct qio *free_seek_qio_ret_higher(struct qio *qio)
> +{
> +	struct qio *ret = SEEK_QIO_DATA(qio)->higher;
> +
> +	kfree(qio->data);
> +	qcow2_free_qio(qio, qio->qcow2->tgt->qio_pool);
> +
> +	return ret;
> +}
> +
> +static inline sector_t get_next_l2(struct qio *qio)
> +{
> +	struct qcow2 *qcow2 = qio->qcow2;
> +	loff_t start, add;
> +
> +	start = to_bytes(qio->bi_iter.bi_sector);
> +	add = qcow2->l2_entries - (start / qcow2->clu_size) % qcow2->l2_entries;
> +
> +	return qio->bi_iter.bi_sector + (qcow2->clu_size / to_bytes(1)) * add;
> +}
> +
> +static inline sector_t get_next_clu(struct qio *qio)
> +{
> +	struct qcow2 *qcow2 = qio->qcow2;
> +	loff_t offset;
> +
> +	offset = to_bytes(qio->bi_iter.bi_sector);
> +	offset = (offset + qcow2->clu_size) / qcow2->clu_size;
> +	offset *= qcow2->clu_size;
> +
> +	return to_sector(offset);
> +}
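
The boundary arithmetic above looks right to me; here is a standalone userspace
re-run of the same math with assumed defaults (64 KiB clusters, 8192 L2 entries,
512-byte sectors; these values are illustrative only, not read from a real
image). Note that get_next_clu() rounds an already aligned offset up to the
following cluster, so the loop always advances:

#include <stdio.h>

/* Standalone re-implementation of get_next_clu()/get_next_l2() above,
 * using assumed defaults: 64 KiB clusters, 8192 L2 entries, 512-byte
 * sectors. Illustration only. */
#define CLU_SIZE	65536ULL
#define L2_ENTRIES	8192ULL
#define SECTOR		512ULL

static unsigned long long next_clu(unsigned long long sector)
{
	unsigned long long offset = sector * SECTOR;

	offset = (offset + CLU_SIZE) / CLU_SIZE * CLU_SIZE;
	return offset / SECTOR;
}

static unsigned long long next_l2(unsigned long long sector)
{
	unsigned long long start = sector * SECTOR;
	unsigned long long add = L2_ENTRIES - (start / CLU_SIZE) % L2_ENTRIES;

	return sector + (CLU_SIZE / SECTOR) * add;
}

int main(void)
{
	unsigned long long sector = CLU_SIZE / SECTOR; /* cluster #1, byte 65536 */

	/* Prints 131072 (the next cluster, 128 KiB) and 536870912 (the first
	 * cluster covered by the second L2 table, 512 MiB). */
	printf("next clu: %llu, next l2: %llu\n",
	       next_clu(sector) * SECTOR, next_l2(sector) * SECTOR);
	return 0;
}
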
> +
> +static inline void seek_qio_next_clu(struct qio *qio, struct qcow2_map *map)
> +{
> +	/*
> +	 * The whole L2 table is unmapped: skip to the next L2 table,
> +	 * but only if there is no backing image.
> +	 */
> +	if (map && !(map->level & L2_LEVEL) && !qio->qcow2->lower)
> +		qio->bi_iter.bi_sector = get_next_l2(qio);
> +	else
> +		qio->bi_iter.bi_sector = get_next_clu(qio);
> +
> +	qio->bi_iter.bi_size = qio->qcow2->clu_size;
> +}
> +
> +static struct qio *advance_and_spawn_lower_seek_qio(struct qio *old_qio, u32 size)
> +{
> +	struct qio *new_qio;
> +	loff_t start, old_end;
> +
> +	start = to_bytes(old_qio->bi_iter.bi_sector);
> +	old_end = start + old_qio->bi_iter.bi_size;
> +
> +	if (old_end > old_qio->qcow2->lower->hdr.size)
> +		size = old_qio->qcow2->lower->hdr.size - start;
> +
> +	new_qio = alloc_seek_qio(old_qio->qcow2->lower, old_qio, start + size);
> +	if (!new_qio)
> +		return NULL;
> +
> +	if (old_qio->bi_iter.bi_size == size) {
> +		seek_qio_next_clu(old_qio, NULL);
> +	} else {
> +		old_qio->bi_iter.bi_sector += to_sector(size);
> +		old_qio->bi_iter.bi_size -= size;
> +	}
> +
> +	return new_qio;
> +}
> +
> +int qcow2_llseek_hole_qio(struct qio *qio, int whence, loff_t *result)
> +{
> +	struct calc_front_bytes_ret arg;
> +	struct qcow2_map map;
> +	struct qio *qptr;
> +	int ret;
> +	u32 size;
> +
> +	while (1) {
> +		if (to_bytes(qio->bi_iter.bi_sector) >= SEEK_QIO_DATA(qio)->lim) {
> +			if (SEEK_QIO_DATA(qio)->higher) {
> +				qio = free_seek_qio_ret_higher(qio);
> +				continue;
> +			}
> +			ret = 0;
> +			*result = (loff_t)-1;
> +			break;
> +		}
> +
> +		memset(&map, 0, sizeof(map));
> +		map.qcow2 = qio->qcow2;
> +		qptr = qio;
> +
> +		ret = parse_metadata(qio->qcow2, &qptr, &map);
> +		if (ret < 0)
> +			break;
> +		if (qptr == NULL) {
> +			/* one of the metadata pages is not loaded and the qio is postponed */
> +			wait_for_completion(&SEEK_QIO_DATA(qio)->compl);
> +			reinit_completion(&SEEK_QIO_DATA(qio)->compl);
> +			continue;
> +		}
> +
> +calc_subclu:
> +		size = calc_front_qio_bytes(qio->qcow2, qio, &map, &arg);
> +
> +		if (arg.unmapped && arg.try_lower) {
> +			/*
> +			 * Check if the backing image is big enough, then advance current qio
> +			 * and spawn a new one for lower image
> +			 */
> +			if (to_bytes(qio->bi_iter.bi_sector) < qio->qcow2->lower->hdr.size) {
> +				struct qio *new_qio;
> +
> +				new_qio = advance_and_spawn_lower_seek_qio(qio, size);
> +				if (!new_qio) {
> +					ret = -ENOMEM;
> +					break;
> +				}
> +
> +				qio = new_qio;
> +				continue;
> +			}
> +		}
> +
> +		if (whence & SEEK_HOLE) {
> +			if (arg.zeroes || arg.unmapped) {
> +				*result = to_bytes(qio->bi_iter.bi_sector);
> +				ret = 0;
> +				break;
> +			} else if (size != qio->bi_iter.bi_size) {
> +				/*
> +				 * The range starts with data subclusters and after that
> +				 * some subclusters are zero or unmapped.
> +				 */
> +				*result = to_bytes(qio->bi_iter.bi_sector) + size;
> +				ret = 0;
> +				break;
> +			}
> +		}
> +
> +		if (whence & SEEK_DATA) {
> +			if (!arg.zeroes && !arg.unmapped) {
> +				*result = to_bytes(qio->bi_iter.bi_sector);
> +				ret = 0;
> +				break;
> +			} else if (size != qio->bi_iter.bi_size) {
> +				/*
> +				 * The range starts with zero or unmapped subclusters,
> +				 * but what follows can still be unmapped or zero.
> +				 * We do not need to parse the metadata again, but we
> +				 * should skip these subclusters and look at the next ones.
> +				 */
> +				qio->bi_iter.bi_sector += to_sector(size);
> +				qio->bi_iter.bi_size -= size;
> +				goto calc_subclu;
> +			}
> +		}
> +
> +		seek_qio_next_clu(qio, &map);
> +	}
> +
> +	while (qio)
> +		qio = free_seek_qio_ret_higher(qio);
> +
> +	return ret;
> +}
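
As I read it, the postpone path works as follows: when parse_metadata() cannot
resolve the lookup without first reading a metadata page, it queues the qio on
QLIST_SEEK and returns with qptr == NULL; once the worker has the page, the new
process_seek_qios() completes the qio's completion, and the loop above simply
retries parse_metadata() for the same range.
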
> +
> +loff_t qcow2_llseek_hole(struct dm_target *ti, loff_t offset, int whence)
> +{
> +	struct qcow2 *qcow2 = to_qcow2_target(ti)->top;
> +	loff_t result = -EINVAL;
> +	struct qio *qio;
> +	int ret;
> +
> +	qio = alloc_seek_qio(qcow2, NULL, qcow2->hdr.size);
> +	if (!qio)
> +		return -ENOMEM;
> +
> +	qio->bi_iter.bi_sector = to_sector(offset);
> +	qio->bi_iter.bi_size = qcow2->clu_size -
> +			       to_bytes(qio->bi_iter.bi_sector) % qcow2->clu_size;
> +
> +	ret = qcow2_llseek_hole_qio(qio, whence, &result);
> +	/* In case of error, remap ENXIO since it has a special meaning for llseek */
> +	if (ret < 0)
> +		return (ret == -ENXIO) ? -EINVAL : ret;
> +
> +	if (result >= 0 && result < offset)
> +		result = offset;
> +
> +	if (result == (loff_t)-1) {
> +		if (whence & SEEK_HOLE)
> +			result = qcow2->hdr.size;
> +		if (whence & SEEK_DATA)
> +			result = -ENXIO;
> +	}

From the llseek man page:

ENXIO
    For SEEK_DATA, there are no more data regions past the supplied offset.
    For SEEK_HOLE, there are no more holes past the supplied offset.

Shouldn't SEEK_HOLE return -ENXIO instead of hdr.size?
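
Concretely: for a fully allocated image, lseek(fd, 0, SEEK_HOLE) would return
hdr.size with the code above, while a strict reading of the paragraph quoted
above would have it fail with ENXIO instead.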



-- 
Regards,
Alexander Atanasov


