[Devel] [PATCH RH9 4/7] dm/dm-qcow2: add find_hole
Andrey Zhadchenko
andrey.zhadchenko at virtuozzo.com
Mon Jul 24 15:57:16 MSK 2023
On 7/24/23 14:20, Alexander Atanasov wrote:
> On 24.07.23 11:03, Konstantin Khorenko wrote:
>> Implement find_hole() for dm-qcow2 target.
>> Iterate over ranges with cluster granularity until hole or data is found.
>> To reduce code duplication, we should use already existing
>> parse_metadata()
>> We can pretend that a seek request is a read request for metadata purposes
>> and then interpret the parsing result in our favor.
>> Since parse_metadata() supports request postponing (for example when the
>> requested L2 cluster is absent in RAM), we should create separate qio
>> list for our queries.
>>
>> https://jira.vzint.dev/browse/PSBM-145746
>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>> ---
>> drivers/md/dm-qcow2-map.c | 140 +++++++++++++++++++++++++++++++++++
>> drivers/md/dm-qcow2-target.c | 1 +
>> drivers/md/dm-qcow2.h | 2 +
>> 3 files changed, 143 insertions(+)
>>
>> diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c
>> index a779889c6970..f728a52ab5e4 100644
>> --- a/drivers/md/dm-qcow2-map.c
>> +++ b/drivers/md/dm-qcow2-map.c
>> @@ -3980,6 +3980,14 @@ static void process_resubmit_qios(struct qcow2
>> *qcow2, struct list_head *qios)
>> }
>> }
>> +static void process_seek_qios(struct qcow2 *qcow, struct list_head
>> *qios)
>> +{
>> + struct qio *qio;
>> +
>> + while ((qio = qio_list_pop(qios)) != NULL)
>> + complete(qio->data);
>> +}
>> +
>> void do_qcow2_work(struct work_struct *ws)
>> {
>> struct qcow2 *qcow2 = container_of(ws, struct qcow2, worker);
>> @@ -3991,6 +3999,7 @@ void do_qcow2_work(struct work_struct *ws)
>> LIST_HEAD(cow_indexes_qios);
>> LIST_HEAD(cow_end_qios);
>> LIST_HEAD(resubmit_qios);
>> + LIST_HEAD(seek_qios);
>> unsigned int pflags = current->flags;
>> current->flags |= PF_LOCAL_THROTTLE|PF_MEMALLOC_NOIO;
>> @@ -4003,6 +4012,7 @@ void do_qcow2_work(struct work_struct *ws)
>> list_splice_init(&qcow2->qios[QLIST_COW_INDEXES],
>> &cow_indexes_qios);
>> list_splice_init(&qcow2->qios[QLIST_COW_END], &cow_end_qios);
>> list_splice_init(&qcow2->resubmit_qios, &resubmit_qios);
>> + list_splice_init(&qcow2->qios[QLIST_SEEK], &seek_qios);
>> spin_unlock_irq(&qcow2->deferred_lock);
>> process_embedded_qios(qcow2, &embedded_qios, &deferred_qios);
>> @@ -4013,6 +4023,7 @@ void do_qcow2_work(struct work_struct *ws)
>> process_cow_indexes_write(qcow2, &cow_indexes_qios);
>> process_cow_end(qcow2, &cow_end_qios);
>> process_resubmit_qios(qcow2, &resubmit_qios);
>> + process_seek_qios(qcow2, &seek_qios);
>> /* This actually submits batch of md writeback, initiated above */
>> submit_metadata_writeback(qcow2);
>> @@ -4235,3 +4246,132 @@ static void handle_cleanup_mask(struct qio *qio)
>> ext->cleanup_mask &= ~FREE_ALLOCATED_CLU;
>> }
>> }
>> +
>> +static sector_t get_next_l2(struct qio *qio)
>> +{
>> + struct qcow2 *qcow2 = qio->qcow2;
>> + loff_t start, add;
>> +
>> + start = to_bytes(qio->bi_iter.bi_sector);
>> + add = qcow2->l2_entries - (start / qcow2->clu_size) %
>> qcow2->l2_entries;
>> +
>> + return qio->bi_iter.bi_sector + (qcow2->clu_size / to_bytes(1)) *
>> add;
>> +}
>> +
>> +static sector_t get_next_clu(struct qio *qio)
>> +{
>> + struct qcow2 *qcow2 = qio->qcow2;
>> + loff_t offset;
>> +
>> + offset = to_bytes(qio->bi_iter.bi_sector);
>> + offset = offset / qcow2->clu_size;
>> + offset = (offset + 1) * qcow2->clu_size;
>> +
>
> Isn't
>
> offset = (offset + qcow2->clu_size) / qcow2->clu_size;
> offset *= qcow2->clu_size;
>
> more optimal ? If clu_size is known to be a power of two it may be
> possible to rewrite it in another way.
I think it is the same three operations: addition, division and multiplication.
As far as I am aware, the cluster size is not guaranteed to be a power of two.
>
>> + return to_sector(offset);
>> +}
>> +
>> +loff_t qcow2_find_hole(struct dm_target *ti, loff_t offset, int whence)
>> +{
>> + struct qcow2 *qcow2 = to_qcow2_target(ti)->top;
>> + DECLARE_COMPLETION_ONSTACK(compl);
>> + bool unmapped, zeroes, try_lower;
>> + struct qio qio = {0}, *qptr;
>> + loff_t result = -EINVAL;
>> + struct qcow2_map map;
>> + u32 size;
>> + int ret;
>> +
>> + qio.bi_iter.bi_sector = to_sector(offset);
>> + qio.bi_iter.bi_size = qcow2->clu_size - offset % qcow2->clu_size;
>> +
>> + qcow2_init_qio(&qio, REQ_OP_READ, qcow2);
>> + qio.queue_list_id = QLIST_SEEK;
>> + qio.data = &compl;
>> +
>> + while (qio.bi_iter.bi_sector < to_sector(qcow2->hdr.size)) {
>> + qio.qcow2 = qcow2;
>> +retry:
>> + memset(&map, 0, sizeof(map));
>> + map.qcow2 = qio.qcow2;
>> + qptr = &qio;
>> + ret = parse_metadata(qio.qcow2, &qptr, &map);
>> + /* ENXIO has a special meaning for llseek so remap it to
>> EINVAL*/
>> + if (ret < 0)
>> + return (ret == -ENXIO) ? -EINVAL : ret;
>> + if (qptr == NULL) {
>> + wait_for_completion(&compl);
>> + reinit_completion(&compl);
>
> What's the point of this ? compl is local , it is assigned to qio.data,
> who will complete it here after qptr is already null and parse_metadata
> is done? Looks like the places that use completion manage their own
> completion function and data - submit_rw_md_page, perform_rw_mapped,
> submit_read_whole_cow_clu.
>
parse_metadata() -> __handle_md_page() -> qcow2_md_page_find_or_postpone()
Here it is checked whether the L1 or L2 table (which is also a cluster)
is loaded from disk. If it is not, the qio is inserted into
md->wait_list, which is popped after the mapping is loaded; the qio then
goes into the qio.queue_list_id queue. After that, process_seek_qios()
is called, which releases the completion.
So parse_metadata() automatically postpones the request until the
metadata pages are loaded, and we wait on the completion for it. After
that, the completion is rearmed for further requests.
>> + goto retry;
>> + }
>> +
>> +calc_subclu:
>> + zeroes = unmapped = try_lower = false;
>> + zeroes = (size = qio_all_zeroes_size(qio.qcow2, &qio, &map));
>> + if (!size)
>> + unmapped = (size = qio_unmapped_size(qio.qcow2, &qio,
>> &map));
>> + if (!size)
>> + size = qio_mapped_not_zeroes_size(qio.qcow2, &qio, &map);
>> + if (unmapped)
>> + try_lower = maybe_mapped_in_lower_delta(qio.qcow2, &qio);
>> +
>> + if (unmapped && try_lower) {
>> + loff_t end = to_bytes(qio.bi_iter.bi_sector) +
>> qio.bi_iter.bi_size;
>> +
>> + if (end < qio.qcow2->hdr.size) {
>> + qio.qcow2 = qio.qcow2->lower;
>> + goto retry;
>> + }
>> + }
>> +
>> + if (whence & SEEK_HOLE) {
>> + if (zeroes || unmapped) {
>> + result = to_bytes(qio.bi_iter.bi_sector);
>> + break;
>> + } else if (size != qio.bi_iter.bi_size) {
>> + /*
>> + * range starts with data subclusters and after that
>> + * some subclusters are zero or unmapped
>> + */
>> + result = to_bytes(qio.bi_iter.bi_sector) + size;
>> + break;
>> + }
>> + }
>> +
>> + if (whence & SEEK_DATA) {
>> + if (!zeroes && !unmapped) {
>> + result = to_bytes(qio.bi_iter.bi_sector);
>> + break;
>> + } else if (size != qio.bi_iter.bi_size) {
>> + /*
>> + * range starts with zero or unmapped subclusters
>> + * but after that it still can be unmapped or zero
>> + * We do not need to parse metadata again but we should
>> + * skip this sublusters and look onto next ones
>> + */
>> + qio.bi_iter.bi_sector += to_sector(size);
>> + qio.bi_iter.bi_size -= size;
>> + goto calc_subclu;
>> + }
>> + }
>> +
>> + /* whole L2 table is unmapped - skip to next l2 table */
>> + if (!(map.level & L2_LEVEL))
>> + qio.bi_iter.bi_sector = get_next_l2(&qio);
>> + else
>> + qio.bi_iter.bi_sector = get_next_clu(&qio);
>> +
>> + qio.bi_iter.bi_size = qcow2->clu_size;
>> + }
>> +
>
>
> This whole retry logic is very long and it is hard to follow, could it
> be split into functions ?
Probably, but it would look miserable due to a bunch of 2-3 line
functions with 5+ arguments, and it would introduce recursion (which
will add extra pain with different-level qios and their completions),
as for some mappings we need to go deeper into the qcow2 backing
images.
Also, subclusters unfortunately add a lot of hassle.
Inspiration was taken from process_read_qio().
Also, maybe you should pinpoint some hard-to-follow places so I can add
more explaining comments?
Probably the first goto retry after the completion can be replaced by
continue, and retry renamed to try_lower_image, if that will ease your pain :)
>
>> + if (result >= 0 && result < offset)
>> + result = offset;
>> +
>> + if (qio.bi_iter.bi_sector >= to_sector(qcow2->hdr.size)) {
>> + if (whence & SEEK_HOLE)
>> + result = qcow2->hdr.size;
>> + if (whence & SEEK_DATA)
>> + result = -ENXIO;
>> + }
>> +
>> + return result;
>> +}
More information about the Devel
mailing list