[Devel] [PATCH RH9 4/7] dm/dm-qcow2: add find_hole
Alexander Atanasov
alexander.atanasov at virtuozzo.com
Mon Jul 24 16:54:38 MSK 2023
On 24.07.23 15:57, Andrey Zhadchenko wrote:
>
>
> On 7/24/23 14:20, Alexander Atanasov wrote:
>> On 24.07.23 11:03, Konstantin Khorenko wrote:
>>> Implement find_hole() for dm-qcow2 target.
>>> Iterate over ranges with cluster granularity until hole or data is
>>> found.
>>> To reduce code duplication, we should use already existing
>>> parse_metadata()
>>> We can pretend that seek request is read request for metadata purposes
>>> and than interpret parsing result in our favor.
>>> Since parse_metadata() support request postponing (for example when the
>>> requested L2 cluster is absent in RAM), we should create separate qio
>>> list for our queries.
>>>
>>> https://jira.vzint.dev/browse/PSBM-145746
>>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>> ---
>>> drivers/md/dm-qcow2-map.c | 140 +++++++++++++++++++++++++++++++++++
>>> drivers/md/dm-qcow2-target.c | 1 +
>>> drivers/md/dm-qcow2.h | 2 +
>>> 3 files changed, 143 insertions(+)
>>>
>>> diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c
>>> index a779889c6970..f728a52ab5e4 100644
>>> --- a/drivers/md/dm-qcow2-map.c
>>> +++ b/drivers/md/dm-qcow2-map.c
>>> @@ -3980,6 +3980,14 @@ static void process_resubmit_qios(struct qcow2
>>> *qcow2, struct list_head *qios)
>>> }
>>> }
>>> +static void process_seek_qios(struct qcow2 *qcow, struct list_head
>>> *qios)
>>> +{
>>> + struct qio *qio;
>>> +
>>> + while ((qio = qio_list_pop(qios)) != NULL)
>>> + complete(qio->data);
>>> +}
>>> +
>>> void do_qcow2_work(struct work_struct *ws)
>>> {
>>> struct qcow2 *qcow2 = container_of(ws, struct qcow2, worker);
>>> @@ -3991,6 +3999,7 @@ void do_qcow2_work(struct work_struct *ws)
>>> LIST_HEAD(cow_indexes_qios);
>>> LIST_HEAD(cow_end_qios);
>>> LIST_HEAD(resubmit_qios);
>>> + LIST_HEAD(seek_qios);
>>> unsigned int pflags = current->flags;
>>> current->flags |= PF_LOCAL_THROTTLE|PF_MEMALLOC_NOIO;
>>> @@ -4003,6 +4012,7 @@ void do_qcow2_work(struct work_struct *ws)
>>> list_splice_init(&qcow2->qios[QLIST_COW_INDEXES],
>>> &cow_indexes_qios);
>>> list_splice_init(&qcow2->qios[QLIST_COW_END], &cow_end_qios);
>>> list_splice_init(&qcow2->resubmit_qios, &resubmit_qios);
>>> + list_splice_init(&qcow2->qios[QLIST_SEEK], &seek_qios);
>>> spin_unlock_irq(&qcow2->deferred_lock);
>>> process_embedded_qios(qcow2, &embedded_qios, &deferred_qios);
>>> @@ -4013,6 +4023,7 @@ void do_qcow2_work(struct work_struct *ws)
>>> process_cow_indexes_write(qcow2, &cow_indexes_qios);
>>> process_cow_end(qcow2, &cow_end_qios);
>>> process_resubmit_qios(qcow2, &resubmit_qios);
>>> + process_seek_qios(qcow2, &seek_qios);
>>> /* This actually submits batch of md writeback, initiated
>>> above */
>>> submit_metadata_writeback(qcow2);
>>> @@ -4235,3 +4246,132 @@ static void handle_cleanup_mask(struct qio *qio)
>>> ext->cleanup_mask &= ~FREE_ALLOCATED_CLU;
>>> }
>>> }
>>> +
>>> +static sector_t get_next_l2(struct qio *qio)
>>> +{
>>> + struct qcow2 *qcow2 = qio->qcow2;
>>> + loff_t start, add;
>>> +
>>> + start = to_bytes(qio->bi_iter.bi_sector);
>>> + add = qcow2->l2_entries - (start / qcow2->clu_size) %
>>> qcow2->l2_entries;
>>> +
>>> + return qio->bi_iter.bi_sector + (qcow2->clu_size / to_bytes(1))
>>> * add;
>>> +}
>>> +
>>> +static sector_t get_next_clu(struct qio *qio)
>>> +{
>>> + struct qcow2 *qcow2 = qio->qcow2;
>>> + loff_t offset;
>>> +
>>> + offset = to_bytes(qio->bi_iter.bi_sector);
>>> + offset = offset / qcow2->clu_size;
>>> + offset = (offset + 1) * qcow2->clu_size;
>>> +
>>
>> Isn't
>>
>> offset = (offset + qcow2->clu_size) / qcow2->clu_size;
>> offset *= qcow2->clu_size;
>>
>> more optimal ? If clu_size is known to be power of two it may be
>> possible to rewritte it in another way.
>
> I think it is the same three operations of sum, division and multiplication
> As far as I am aware, cluster size is not guaranteed to be a power of two
Academically you are correct but there is still a small difference - one
register load.
>
>>
>>> + return to_sector(offset);
>>> +}
>>> +
>>> +loff_t qcow2_find_hole(struct dm_target *ti, loff_t offset, int whence)
>>> +{
>>> + struct qcow2 *qcow2 = to_qcow2_target(ti)->top;
>>> + DECLARE_COMPLETION_ONSTACK(compl);
>>> + bool unmapped, zeroes, try_lower;
>>> + struct qio qio = {0}, *qptr;
>>> + loff_t result = -EINVAL;
>>> + struct qcow2_map map;
>>> + u32 size;
>>> + int ret;
>>> +
>>> + qio.bi_iter.bi_sector = to_sector(offset);
>>> + qio.bi_iter.bi_size = qcow2->clu_size - offset % qcow2->clu_size;
>>> +
>>> + qcow2_init_qio(&qio, REQ_OP_READ, qcow2);
>>> + qio.queue_list_id = QLIST_SEEK;
>>> + qio.data = &compl;
>>> +
>>> + while (qio.bi_iter.bi_sector < to_sector(qcow2->hdr.size)) {
>>> + qio.qcow2 = qcow2;
>>> +retry:
>>> + memset(&map, 0, sizeof(map));
>>> + map.qcow2 = qio.qcow2;
>>> + qptr = &qio;
>>> + ret = parse_metadata(qio.qcow2, &qptr, &map);
>>> + /* ENXIO has a special meaning for llseek so remap it to
>>> EINVAL*/
>>> + if (ret < 0)
>>> + return (ret == -ENXIO) ? -EINVAL : ret;
>>> + if (qptr == NULL) {
>>> + wait_for_completion(&compl);
>>> + reinit_completion(&compl);
>>
>> What's the point of this ? compl is local , it is assigned to qio.data,
>> who will complete it here after qptr is already null and
>> parse_metadata is done? Looks like the places that use completion
>> manage their own completion function and data - submit_rw_md_page,
>> perform_rw_mapped, submit_read_whole_cow_clu.
>>
>
> parse_metadata() -> __handle_md_page() -> qcow2_md_page_find_or_postpone()
> here it is checked if L1 or L2 table (which is also a cluster) is loaded
> from disk. If it is not, the qio is inserted into md->wait_list, which
> is popped after the mapping is loaded. Qio goes into qio.queue_list_id
> queue. After that process_seek_qios() is called which releases completion.
> So parse_metadata() automatically postpone the request until the
> metadata pages are loaded and we wait on completion for it. After that
> completion is rearmed for further requests.
yes, got it - i've missed that a new qio is allocated and that the
initial one goes into the wait list.
>
>>> + goto retry;
>>> + }
>>> +
>>> +calc_subclu:
>>> + zeroes = unmapped = try_lower = false;
>>> + zeroes = (size = qio_all_zeroes_size(qio.qcow2, &qio, &map));
>>> + if (!size)
>>> + unmapped = (size = qio_unmapped_size(qio.qcow2, &qio,
>>> &map));
>>> + if (!size)
>>> + size = qio_mapped_not_zeroes_size(qio.qcow2, &qio, &map);
Why two if (!size) ?
>>> + if (unmapped)
>>> + try_lower = maybe_mapped_in_lower_delta(qio.qcow2, &qio);
This above needs comments more comments - why it is done that way ?
>>> +
>>> + if (unmapped && try_lower) {
>>> + loff_t end = to_bytes(qio.bi_iter.bi_sector) +
>>> qio.bi_iter.bi_size;
>>> +
>>> + if (end < qio.qcow2->hdr.size) {
>>> + qio.qcow2 = qio.qcow2->lower;
>>> + goto retry;
>>> + }
>>> + }
>>> +
>>> + if (whence & SEEK_HOLE) {
>>> + if (zeroes || unmapped) {
>>> + result = to_bytes(qio.bi_iter.bi_sector);
>>> + break;
>>> + } else if (size != qio.bi_iter.bi_size) {
>>> + /*
>>> + * range starts with data subclusters and after that
>>> + * some subclusters are zero or unmapped
>>> + */
>>> + result = to_bytes(qio.bi_iter.bi_sector) + size;
>>> + break;
>>> + }
>>> + }
>>> +
>>> + if (whence & SEEK_DATA) {
>>> + if (!zeroes && !unmapped) {
>>> + result = to_bytes(qio.bi_iter.bi_sector);
>>> + break;
>>> + } else if (size != qio.bi_iter.bi_size) {
>>> + /*
>>> + * range starts with zero or unmapped subclusters
>>> + * but after that it still can be unmapped or zero
>>> + * We do not need to parse metadata again but we should
>>> + * skip this sublusters and look onto next ones
>>> + */
>>> + qio.bi_iter.bi_sector += to_sector(size);
>>> + qio.bi_iter.bi_size -= size;
>>> + goto calc_subclu;
>>> + }
>>> + }
>>> +
>>> + /* whole L2 table is unmapped - skip to next l2 table */
>>> + if (!(map.level & L2_LEVEL))
>>> + qio.bi_iter.bi_sector = get_next_l2(&qio);
>>> + else
>>> + qio.bi_iter.bi_sector = get_next_clu(&qio);
>>> +
>>> + qio.bi_iter.bi_size = qcow2->clu_size;
>>> + }
>>> +
>>
>>
>> This whole retry logic is very long and it is hard to follow, could it
>> be split into functions ?
>
> Probably, but it will look miserably due to bunch of 2-3 line functions
> with 5+ arguments and introduce recursion (which will add extra pain
a helper struct instead of the 5 arguments can do.
> with different level qios and its completions), as for some mappings we
> need to go deeper into qcow2 backing images.
> Also subclusters add a lot of hassle unfortunately.
> Inspiration was taken from process_read_qio().
>
> Also, maybe you should pinpoint some hard-to-follow places so I can add
> more explaining comments?
> Probably first goto retry after completion can be replaced by continue
> and retry renamed into try_lower_image, if it will ease your pain :)
I am okay with it. But i am worried about you looking at that code after
an year :)
>
>>
>>> + if (result >= 0 && result < offset)
>>> + result = offset;
>>> +
>>> + if (qio.bi_iter.bi_sector >= to_sector(qcow2->hdr.size)) {
>>> + if (whence & SEEK_HOLE)
>>> + result = qcow2->hdr.size;
>>> + if (whence & SEEK_DATA)
>>> + result = -ENXIO;
>>> + }
>>> +
>>> + return result;
>>> +}
Ok, with the explaination of wait_list - LGTM.
--
Regards,
Alexander Atanasov
More information about the Devel
mailing list