[Devel] [PATCH RH9 v2 5/8] dm/dm-qcow2: add llseek_hole
Andrey Zhadchenko
andrey.zhadchenko at virtuozzo.com
Mon Aug 21 20:34:37 MSK 2023
On 8/21/23 19:20, Alexander Atanasov wrote:
> On 16.08.23 12:32, Andrey Zhadchenko wrote:
>> Implement llseek_hole() for the dm-qcow2 target.
>> Iterate over ranges with cluster granularity until a hole or data is found.
>> To reduce code duplication, we should use the already existing
>> parse_metadata(): we can pretend that the seek request is a read request
>> for metadata purposes and then interpret the parsing result in our favor.
>> Since parse_metadata() supports request postponing (for example when the
>> requested L2 cluster is absent in RAM), we should create a separate qio
>> list for our queries.
>>
>> Feature: dm: implement SEEK_HOLE for dm-qcow2 and dm-ploop
>> https://jira.vzint.dev/browse/PSBM-145746
>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>
>> ---
>> v2 major rework:
>> - move some code to new functions
>> - alloc seek qios dynamically
>> - handle chaining images with more care
>> - do not skip L2 if backing image exists
>>
>> drivers/md/dm-qcow2-map.c | 257 +++++++++++++++++++++++++++++++++++
>> drivers/md/dm-qcow2-target.c | 1 +
>> drivers/md/dm-qcow2.h | 2 +
>> 3 files changed, 260 insertions(+)
>>
>> diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c
>> index 8c1620c11137..b15caaff8c98 100644
>> --- a/drivers/md/dm-qcow2-map.c
>> +++ b/drivers/md/dm-qcow2-map.c
>> @@ -4000,6 +4000,14 @@ static void process_resubmit_qios(struct qcow2 *qcow2, struct list_head *qios)
>> }
>> }
>> +static void process_seek_qios(struct qcow2 *qcow, struct list_head *qios)
>> +{
>> + struct qio *qio;
>> +
>> + while ((qio = qio_list_pop(qios)) != NULL)
>> + complete(qio->data);
>> +}
>> +
>> void do_qcow2_work(struct work_struct *ws)
>> {
>> struct qcow2 *qcow2 = container_of(ws, struct qcow2, worker);
>> @@ -4011,6 +4019,7 @@ void do_qcow2_work(struct work_struct *ws)
>> LIST_HEAD(cow_indexes_qios);
>> LIST_HEAD(cow_end_qios);
>> LIST_HEAD(resubmit_qios);
>> + LIST_HEAD(seek_qios);
>> unsigned int pflags = current->flags;
>> current->flags |= PF_LOCAL_THROTTLE|PF_MEMALLOC_NOIO;
>> @@ -4023,6 +4032,7 @@ void do_qcow2_work(struct work_struct *ws)
>> list_splice_init(&qcow2->qios[QLIST_COW_INDEXES], &cow_indexes_qios);
>> list_splice_init(&qcow2->qios[QLIST_COW_END], &cow_end_qios);
>> list_splice_init(&qcow2->resubmit_qios, &resubmit_qios);
>> + list_splice_init(&qcow2->qios[QLIST_SEEK], &seek_qios);
>> spin_unlock_irq(&qcow2->deferred_lock);
>> process_embedded_qios(qcow2, &embedded_qios, &deferred_qios);
>> @@ -4033,6 +4043,7 @@ void do_qcow2_work(struct work_struct *ws)
>> process_cow_indexes_write(qcow2, &cow_indexes_qios);
>> process_cow_end(qcow2, &cow_end_qios);
>> process_resubmit_qios(qcow2, &resubmit_qios);
>> + process_seek_qios(qcow2, &seek_qios);
>> /* This actually submits batch of md writeback, initiated above */
>> submit_metadata_writeback(qcow2);
>> @@ -4255,3 +4266,249 @@ static void handle_cleanup_mask(struct qio *qio)
>> ext->cleanup_mask &= ~FREE_ALLOCATED_CLU;
>> }
>> }
>> +
>> +struct qio_data_llseek_hole {
>> + struct completion compl;
>> + struct qio *higher;
>> + loff_t lim;
>> +};
>> +
>> +#define SEEK_QIO_DATA(qio) ((struct qio_data_llseek_hole *)qio->data)
>> +
>> +struct qio *alloc_seek_qio(struct qcow2 *qcow2, struct qio *parent, loff_t new_lim)
>> +{
>> + struct qio_data_llseek_hole *data;
>> + struct qio *qio;
>> +
>> + qio = qcow2_alloc_qio(qcow2->tgt->qio_pool, true);
>> + if (!qio)
>> + return NULL;
>> +
>> + qcow2_init_qio(qio, REQ_OP_READ, qcow2);
>> + qio->queue_list_id = QLIST_SEEK;
>> +
>> + data = kzalloc(sizeof(struct qio_data_llseek_hole), GFP_KERNEL);
>> + if (!data) {
>> + qcow2_free_qio(qio, qcow2->tgt->qio_pool);
>> + return NULL;
>> + }
>> +
>> + qio->data = data;
>> + init_completion(&data->compl);
>> + data->lim = new_lim;
>> +
>> + if (parent) {
>> + data->higher = parent;
>> + qio->bi_iter.bi_sector = parent->bi_iter.bi_sector;
>> +
>> + if (to_bytes(qio->bi_iter.bi_sector) + parent->bi_iter.bi_size > new_lim)
>> + qio->bi_iter.bi_size = new_lim - to_bytes(qio->bi_iter.bi_sector);
>> + else
>> + qio->bi_iter.bi_size = parent->bi_iter.bi_size;
>> + }
>> +
>> + return qio;
>> +}
>> +
>> +struct qio *free_seek_qio_ret_higher(struct qio *qio)
>> +{
>> + struct qio *ret = SEEK_QIO_DATA(qio)->higher;
>> +
>> + kfree(qio->data);
>> + qcow2_free_qio(qio, qio->qcow2->tgt->qio_pool);
>> +
>> + return ret;
>> +}
>> +
>> +static inline sector_t get_next_l2(struct qio *qio)
>> +{
>> + struct qcow2 *qcow2 = qio->qcow2;
>> + loff_t start, add;
>> +
>> + start = to_bytes(qio->bi_iter.bi_sector);
>> + add = qcow2->l2_entries - (start / qcow2->clu_size) % qcow2->l2_entries;
>> +
>> + return qio->bi_iter.bi_sector + (qcow2->clu_size / to_bytes(1)) * add;
>> +}
>> +
>> +static inline sector_t get_next_clu(struct qio *qio)
>> +{
>> + struct qcow2 *qcow2 = qio->qcow2;
>> + loff_t offset;
>> +
>> + offset = to_bytes(qio->bi_iter.bi_sector);
>> + offset = (offset + qcow2->clu_size) / qcow2->clu_size;
>> + offset *= qcow2->clu_size;
>> +
>> + return to_sector(offset);
>> +}
>> +
>> +static inline void seek_qio_next_clu(struct qio *qio, struct qcow2_map *map)
>> +{
>> + /*
>> + * Whole L2 table is unmapped - skip to next l2 table,
>> + * but only if there is no backing image
>> + */
>> + if (map && !(map->level & L2_LEVEL) && !qio->qcow2->lower)
>> + qio->bi_iter.bi_sector = get_next_l2(qio);
>> + else
>> + qio->bi_iter.bi_sector = get_next_clu(qio);
>> +
>> + qio->bi_iter.bi_size = qio->qcow2->clu_size;
>> +}
>> +
>> +static struct qio *advance_and_spawn_lower_seek_qio(struct qio *old_qio, u32 size)
>> +{
>> + struct qio *new_qio;
>> + loff_t start, old_end;
>> +
>> + start = to_bytes(old_qio->bi_iter.bi_sector);
>> + old_end = start + old_qio->bi_iter.bi_size;
>> +
>> + if (old_end > old_qio->qcow2->lower->hdr.size)
>> + size = old_qio->qcow2->lower->hdr.size - start;
>> +
>> + new_qio = alloc_seek_qio(old_qio->qcow2->lower, old_qio, start + size);
>> + if (!new_qio)
>> + return NULL;
>> +
>> + if (old_qio->bi_iter.bi_size == size) {
>> + seek_qio_next_clu(old_qio, NULL);
>> + } else {
>> + old_qio->bi_iter.bi_sector += to_sector(size);
>> + old_qio->bi_iter.bi_size -= size;
>> + }
>> +
>> + return new_qio;
>> +}
>> +
>> +int qcow2_llseek_hole_qio(struct qio *qio, int whence, loff_t *result)
>> +{
>> + struct calc_front_bytes_ret arg;
>> + struct qcow2_map map;
>> + struct qio *qptr;
>> + int ret;
>> + u32 size;
>> +
>> + while (1) {
>> + if (to_bytes(qio->bi_iter.bi_sector) >= SEEK_QIO_DATA(qio)->lim) {
>> + if (SEEK_QIO_DATA(qio)->higher) {
>> + qio = free_seek_qio_ret_higher(qio);
>> + continue;
>> + }
>> + ret = 0;
>> + *result = (loff_t)-1;
>> + break;
>> + }
>> +
>> + memset(&map, 0, sizeof(map));
>> + map.qcow2 = qio->qcow2;
>> + qptr = qio;
>> +
>> + ret = parse_metadata(qio->qcow2, &qptr, &map);
>> + if (ret < 0)
>> + break;
>> + if (qptr == NULL) {
>> + /* one of the metadata pages is not loaded and the qio is postponed */
>> + wait_for_completion(&SEEK_QIO_DATA(qio)->compl);
>> + reinit_completion(&SEEK_QIO_DATA(qio)->compl);
>> + continue;
>> + }
>> +
>> +calc_subclu:
>> + size = calc_front_qio_bytes(qio->qcow2, qio, &map, &arg);
>> +
>> + if (arg.unmapped && arg.try_lower) {
>> + /*
>> + * Check if the backing image is big enough, then advance current qio
>> + * and spawn a new one for lower image
>> + */
>> + if (to_bytes(qio->bi_iter.bi_sector) < qio->qcow2->lower->hdr.size) {
>> + struct qio *new_qio;
>> +
>> + new_qio = advance_and_spawn_lower_seek_qio(qio, size);
>> + if (!new_qio) {
>> + ret = -ENOMEM;
>> + break;
>> + }
>> +
>> + qio = new_qio;
>> + continue;
>> + }
>> + }
>> +
>> + if (whence & SEEK_HOLE) {
>> + if (arg.zeroes || arg.unmapped) {
>> + *result = to_bytes(qio->bi_iter.bi_sector);
>> + ret = 0;
>> + break;
>> + } else if (size != qio->bi_iter.bi_size) {
>> + /*
>> + * the range starts with data subclusters and after that
>> + * some subclusters are zero or unmapped
>> + */
>> + *result = to_bytes(qio->bi_iter.bi_sector) + size;
>> + ret = 0;
>> + break;
>> + }
>> + }
>> +
>> + if (whence & SEEK_DATA) {
>> + if (!arg.zeroes && !arg.unmapped) {
>> + *result = to_bytes(qio->bi_iter.bi_sector);
>> + ret = 0;
>> + break;
>> + } else if (size != qio->bi_iter.bi_size) {
>> + /*
>> + * the range starts with zero or unmapped subclusters,
>> + * but after that it can still be unmapped or zero.
>> + * We do not need to parse metadata again, but we should
>> + * skip these subclusters and look at the next ones.
>> + */
>> + qio->bi_iter.bi_sector += to_sector(size);
>> + qio->bi_iter.bi_size -= size;
>> + goto calc_subclu;
>> + }
>> + }
>> +
>> + seek_qio_next_clu(qio, &map);
>> + }
>> +
>> + while (qio)
>> + qio = free_seek_qio_ret_higher(qio);
>> +
>> + return ret;
>> +}
>> +
>> +loff_t qcow2_llseek_hole(struct dm_target *ti, loff_t offset, int whence)
>> +{
>> + struct qcow2 *qcow2 = to_qcow2_target(ti)->top;
>> + loff_t result = -EINVAL;
>> + struct qio *qio;
>> + int ret;
>> +
>> + qio = alloc_seek_qio(qcow2, NULL, qcow2->hdr.size);
>> + if (!qio)
>> + return -ENOMEM;
>> +
>> + qio->bi_iter.bi_sector = to_sector(offset);
>> + qio->bi_iter.bi_size = qcow2->clu_size -
>> + to_bytes(qio->bi_iter.bi_sector) % qcow2->clu_size;
>> +
>> + ret = qcow2_llseek_hole_qio(qio, whence, &result);
>> + /* In case of error, remap ENXIO as it has a special meaning for llseek */
>> + if (ret < 0)
>> + return (ret == -ENXIO) ? -EINVAL : ret;
>> +
>> + if (result >= 0 && result < offset)
>> + result = offset;
>> +
>> + if (result == (loff_t)-1) {
>> + if (whence & SEEK_HOLE)
>> + result = qcow2->hdr.size;
>> + if (whence & SEEK_DATA)
>> + result = -ENXIO;
>> + }
>
> from the llseek man page :
> ENXIO
> For SEEK_DATA, there are no more data regions past the supplied offset.
> For SEEK_HOLE, there are no more holes past the supplied offset.
>
> shouldn't SEEK_HOLE return -ENXIO instead of hdr.size ?
End of file is always considered a hole, so it is fine.
However, I missed one corner case: if the starting offset is past the end of
the file, we still need to return ENXIO.
This will be fixed in the next iteration.
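
Roughly, I mean an early check at the start of qcow2_llseek_hole(), before the
seek qio is allocated (untested sketch, the exact placement may differ):

	/*
	 * An offset at or past the end of the image has neither data nor
	 * further holes, so both SEEK_DATA and SEEK_HOLE must return ENXIO,
	 * matching the man page semantics quoted above.
	 */
	if (offset >= qcow2->hdr.size)
		return -ENXIO;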