[Devel] [PATCH VZ9] dm-qcow2: allow specifying depth to merge intermediate images

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Wed Sep 17 14:12:12 MSK 2025



On 9/17/25 10:11, Pavel Tikhomirov wrote:
> 
> 
> On 9/16/25 00:18, Andrey Zhadchenko wrote:
>>
>>
>> On 9/15/25 17:00, Konstantin Khorenko wrote:
>>> On 08.09.2025 19:27, Andrey Zhadchenko wrote:
>>>> dm-qcow2 is almost ready to merge intermediate images, thanks
>>>> to separated metadata tables and qio being per-qcow2.
>>>> So this patch mostly propagates which qcow2 we want to use
>>>> instead of tgt->top.
>>>>
>>>> qcow2_merge_backward_complete is a bit harder due to in-middle
>>>> replacement and needing to recalculate img_id for a part of
>>>> image chain.
>>>>
>>>> https://virtuozzo.atlassian.net/browse/VSTOR-101375
>>>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>>> ---
>>>>   drivers/md/dm-qcow2-cmd.c | 82 +++++++++++++++++++++++++++ 
>>>> +-----------
>>>>   drivers/md/dm-qcow2.h     |  1 +
>>>>   2 files changed, 61 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c
>>>> index e77052ad61e6b..d59fa82dbffbc 100644
>>>> --- a/drivers/md/dm-qcow2-cmd.c
>>>> +++ b/drivers/md/dm-qcow2-cmd.c
>>>> @@ -116,9 +116,9 @@ static int qcow2_service_iter(struct 
>>>> qcow2_target *tgt, struct qcow2 *qcow2,
>>>>   }
>>>>   ALLOW_ERROR_INJECTION(qcow2_service_iter, ERRNO);
>>>> -static int qcow2_merge_common(struct qcow2_target *tgt)
>>>> +static int qcow2_merge_common(struct qcow2_target *tgt, struct 
>>>> qcow2 *qcow2)
>>>>   {
>>>> -    struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
>>>> +    struct qcow2 *lower = qcow2->lower;
>>>>       u32 clu_size = qcow2->clu_size;
>>>>       loff_t end = lower->hdr.size;
>>>> @@ -139,9 +139,8 @@ static int qcow2_merge_forward(struct 
>>>> qcow2_target *tgt)
>>>>   }
>>>>   ALLOW_ERROR_INJECTION(qcow2_merge_forward, ERRNO);
>>>> -static int qcow2_break_l1cow(struct qcow2_target *tgt)
>>>> +static int qcow2_break_l1cow(struct qcow2_target *tgt, struct qcow2 
>>>> *qcow2)
>>>>   {
>>>> -    struct qcow2 *qcow2 = tgt->top;
>>>>       loff_t end = qcow2->hdr.size;
>>>>       loff_t step = (u64)qcow2->l2_entries * qcow2->clu_size;
>>>> @@ -152,25 +151,35 @@ static int qcow2_break_l1cow(struct 
>>>> qcow2_target *tgt)
>>>>   static void set_backward_merge_in_process(struct qcow2_target *tgt,
>>>>                        struct qcow2 *qcow2, bool set)
>>>>   {
>>>> +    struct qcow2 *top = tgt->top;
>>>>       LIST_HEAD(list);
>>>> +    /*
>>>> +     * There are no writes if it is not the top qcow2 image.
>>>> +     * so we do not need to stop and flush requests when setting
>>>> +     */
>>>> +    if (qcow2 != top && set) {
>>>> +        qcow2->backward_merge_in_process = set;
>>>> +        return;
>>>> +    }
>>>> +
>>>>       /*
>>>>        * To avoid race between allocations and COWS
>>>>        * we completely stop queueing qios and wait
>>>>        * for pending qios. Lock is for visability.
>>>>        */
>>>> -    spin_lock_irq(&qcow2->deferred_lock);
>>>> -    qcow2->pause_submitting_qios = true;
>>>> -    spin_unlock_irq(&qcow2->deferred_lock);
>>>> +    spin_lock_irq(&top->deferred_lock);
>>>> +    top->pause_submitting_qios = true;
>>>> +    spin_unlock_irq(&top->deferred_lock);
>>>>       qcow2_inflight_ref_switch(tgt);
>>>>       /* queue is stopped */
>>>> -    spin_lock_irq(&qcow2->deferred_lock);
>>>> +    spin_lock_irq(&top->deferred_lock);
>>>>       WARN_ON_ONCE(qcow2->backward_merge_in_process == set);
>>>>       qcow2->backward_merge_in_process = set;
>>>> -    qcow2->pause_submitting_qios = false;
>>>> -    list_splice_init(&qcow2->paused_qios, &list);
>>>> -    spin_unlock_irq(&qcow2->deferred_lock);
>>>> +    top->pause_submitting_qios = false;
>>>> +    list_splice_init(&top->paused_qios, &list);
>>>> +    spin_unlock_irq(&top->deferred_lock);
>>>>       qcow2_submit_embedded_qios(tgt, &list);
>>>>   }
>>>> @@ -240,11 +249,15 @@ static int 
>>>> qcow2_merge_backward_progress(struct qcow2_target *tgt,
>>>>   static int qcow2_merge_backward_set_eventfd(struct qcow2_target 
>>>> *tgt, int efd);
>>>> -static int qcow2_merge_backward_start(struct qcow2_target *tgt, int 
>>>> efd)
>>>> +static int qcow2_merge_backward_start(struct qcow2_target *tgt, int 
>>>> efd, int depth)
>>>>   {
>>>> -    struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
>>>> +    struct qcow2 *qcow2 = tgt->top, *lower;
>>>>       int ret;
>>>> +    while (depth-- > 0 && qcow2->lower)
>>>> +        qcow2 = qcow2->lower;
>>>> +    lower = qcow2->lower;
>>>> +
>>>>       lockdep_assert_held(&tgt->ctl_mutex);
>>>>       if (!lower)
>>>> @@ -263,6 +276,7 @@ static int qcow2_merge_backward_start(struct 
>>>> qcow2_target *tgt, int efd)
>>>>       if (ret)
>>>>           return ret;
>>>> +    tgt->backward_merge.qcow2 = qcow2;
>>>>       tgt->backward_merge.state = BACKWARD_MERGE_START;
>>>>       __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_START);
>>>>       tgt->backward_merge.error = 0;
>>>> @@ -291,7 +305,7 @@ void qcow2_merge_backward_work(struct 
>>>> work_struct *work)
>>>>       __backward_merge_update_stage(tgt, 
>>>> BACKWARD_MERGE_STAGE_BREAK_L1COW);
>>>>       mutex_unlock(&tgt->ctl_mutex);
>>>> -    qcow2 = tgt->top;
>>>> +    qcow2 = tgt->backward_merge.qcow2;
>>>>       lower = qcow2->lower;
>>>>       /*
>>>> @@ -300,7 +314,7 @@ void qcow2_merge_backward_work(struct 
>>>> work_struct *work)
>>>>        * we'd have to freeze IO going to all data clusters
>>>>        * under every L1 entry related to several snapshots.
>>>>        */
>>>> -    ret = qcow2_break_l1cow(tgt);
>>>> +    ret = qcow2_break_l1cow(tgt, qcow2);
>>>>       if (ret) {
>>>>           QC_ERR(tgt->ti, "Can't break L1 COW");
>>>>           goto out_err;
>>>> @@ -316,7 +330,7 @@ void qcow2_merge_backward_work(struct 
>>>> work_struct *work)
>>>>       /* Start merge */
>>>>       backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING);
>>>> -    ret = qcow2_merge_common(tgt);
>>>> +    ret = qcow2_merge_common(tgt, qcow2);
>>>>       if (ret) {
>>>>           set_backward_merge_in_process(tgt, qcow2, false);
>>>>           ret2 = qcow2_set_image_file_features(lower, false);
>>>> @@ -357,9 +371,29 @@ static int qcow2_merge_backward_complete(struct 
>>>> qcow2_target *tgt)
>>>>       if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION)
>>>>           return -EBUSY;
>>>> +
>>>> +    if (tgt->top == tgt->backward_merge.qcow2) {
>>>> +        tgt->top = lower;
>>>> +    } else {
>>>> +        struct qcow2 *top = tgt->top;
>>>> +
>>>> +        while (top && top->lower != tgt->backward_merge.qcow2)
>>>> +            top = top->lower;
> 
> This algorithm can be simplified, we really only need to search the 
> place in delta list once on "start", save it and use it directly later.
> 
> To do that we need to save a double pointer to where qcow2 pointer is to 
> tgt->backward_merge.pqcow2 (e.g.: struct qcow2 **).
> 
> on start:
> 
> tgt->backward_merge.pqcow2 = &qcow2;
> 
> on complete:
> 
> *tgt->backward_merge.pqcow2 = (*tgt->backward_merge.pqcow2)->lower;
> 
> And we can still get qcow2 and lower like:
> 
> qcow2 = *tgt->backward_merge.pqcow2;
> lower = qcow2->lower;
> 
> There is img_id change loop though, which will be still needed, but at 
> least we don't have two loops.
> 
> Note: By the way we might be able to replace qcow2->img_id on each delta 
> with just one dm_target->depth per target, as we only use img_id from 
> top delta to find the depth AFAICS (or search for (depth-n)-th delta 
> from top).

There are also some delta metadata re-reads during resume and they break 
if imd_id's are not in place

> 
>>>> +        if (!top)
>>>> +            return -EFAULT;
> 
> We should not get to EFAULT as this list is only updated by us 
> ("complete") and qcow2_ctr->qcow2_alloc_delta. This way we can have no 
> error here and move BACKWARD_MERGE_STAGE_COMPLETING stage setting above 
> this code, indicating that we are in the process of completion.
> 
>>>> +
>>>> +        qcow2 = tgt->backward_merge.qcow2;
>>>> +        lower = qcow2->lower;
>>>> +        top->lower = lower;
>>>> +
>>>> +        top = tgt->top;
>>>> +        while (top != lower) {
>>>> +            top->img_id--;
>>>> +            top = top->lower;
>>>> +        }
>>>
>>> May be to check here (in while) if top == NULL and return an error is 
>>> yes?
>>
>> Why? We did some checks early. And also if we want to return error 
>> from here we need to increment id's back and revert top->lower change. 
>> I would rather not do that.
> 
> As we've already checked that we have all needed deltas on desired depth 
> as early as on backwardmerge start, check here is not needed, agreed.
> 
> But we can have BUG_ON here, to be extra careful in case delta list was 
> corrupted or some other unexpected thing happened.
> 
>>
>>>
>>>> +    }
>>>> +
>>>>       __backward_merge_update_stage(tgt, 
>>>> BACKWARD_MERGE_STAGE_COMPLETING);
>>>> -
>>>> -    tgt->top = lower;
>>>>       smp_wmb(); /* Pairs with qcow2_ref_inc() */
>>>>       qcow2_inflight_ref_switch(tgt); /* Pending qios */
>>>>       qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */
>>>> @@ -396,7 +430,7 @@ void qcow2_merge_backward_cancel(struct 
>>>> qcow2_target *tgt)
>>>>       } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) {
>>>>           flush = true;
>>>>       } else if (tgt->backward_merge.state == 
>>>> BACKWARD_MERGE_WAIT_COMPLETION) {
>>>> -        set_backward_merge_in_process(tgt, tgt->top, false);
>>>> +        set_backward_merge_in_process(tgt, tgt- 
>>>> >backward_merge.qcow2, false);
>>>>           tgt->backward_merge.state = BACKWARD_MERGE_STOPPED;
>>>>           __backward_merge_update_stage(tgt, 
>>>> BACKWARD_MERGE_STAGE_NONE);
>>>>       }
>>>> @@ -572,7 +606,7 @@ int qcow2_message(struct dm_target *ti, unsigned 
>>>> int argc, char **argv,
>>>>       struct qcow2_target *tgt = to_qcow2_target(ti);
>>>>       int ret = -EPERM;
>>>>       u32 val, val2;
>>>> -    int efd;
>>>> +    int efd, depth = 0;
>>>>       if (!capable(CAP_SYS_ADMIN))
>>>>           goto out;
>>>> @@ -652,11 +686,15 @@ int qcow2_message(struct dm_target *ti, 
>>>> unsigned int argc, char **argv,
>>>>       } else if (!strcmp(argv[0], "merge_backward")) {
>>>>           /* argc >= 2 */
>>>>           if (!strcmp(argv[1], "start")) {
>>>> -            if (argc != 3 || kstrtoint(argv[2], 10, &efd) || efd < 
>>>> 0) {
>>>> +            if (argc < 3 || argc > 4 || kstrtoint(argv[2], 10, 
>>>> &efd) || efd < 0) {
>>>>                   ret = -EINVAL;
>>>>                   goto out_unlock;
>>>>               }
>>>> -            ret = qcow2_merge_backward_start(tgt, efd);
>>>> +            if (argc == 4 && kstrtou32(argv[3], 10, &depth)) {
>>>> +                ret = -EINVAL;
>>>> +                goto out_unlock;
>>>> +            }
>>>> +            ret = qcow2_merge_backward_start(tgt, efd, depth);
>>>>           } else if (!strcmp(argv[1], "complete")) {
>>>>               if (argc != 2) {
>>>>                   ret = -EINVAL;
>>>> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h
>>>> index 5aa00c6a5ebd5..c2b2193561461 100644
>>>> --- a/drivers/md/dm-qcow2.h
>>>> +++ b/drivers/md/dm-qcow2.h
>>>> @@ -172,6 +172,7 @@ enum qcow2_backward_merge_stage {
>>>>   struct qcow2_backward_merge {
>>>>       struct work_struct work;
>>>> +    struct qcow2 *qcow2;
>>>>       enum qcow2_backward_merge_state state;
>>>>       int error;
>>>>       struct eventfd_ctx *eventfd_ctx;
>>>
>>
> 



More information about the Devel mailing list