[Devel] [PATCH VZ9] dm-qcow2: allow specifying depth to merge intermediate images
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Wed Sep 17 11:11:04 MSK 2025
On 9/16/25 00:18, Andrey Zhadchenko wrote:
>
>
> On 9/15/25 17:00, Konstantin Khorenko wrote:
>> On 08.09.2025 19:27, Andrey Zhadchenko wrote:
>>> dm-qcow2 is almost ready to merge intermediate images, thanks
>>> to separated metadata tables and qio being per-qcow2.
>>> So this patch mostly propagates which qcow2 we want to use
>>> instead of tgt->top.
>>>
>>> qcow2_merge_backward_complete is a bit harder due to in-middle
>>> replacement and needing to recalculate img_id for a part of
>>> image chain.
>>>
>>> https://virtuozzo.atlassian.net/browse/VSTOR-101375
>>> Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
>>> ---
>>> drivers/md/dm-qcow2-cmd.c | 82 ++++++++++++++++++++++++++++-----------
>>> drivers/md/dm-qcow2.h | 1 +
>>> 2 files changed, 61 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c
>>> index e77052ad61e6b..d59fa82dbffbc 100644
>>> --- a/drivers/md/dm-qcow2-cmd.c
>>> +++ b/drivers/md/dm-qcow2-cmd.c
>>> @@ -116,9 +116,9 @@ static int qcow2_service_iter(struct qcow2_target
>>> *tgt, struct qcow2 *qcow2,
>>> }
>>> ALLOW_ERROR_INJECTION(qcow2_service_iter, ERRNO);
>>> -static int qcow2_merge_common(struct qcow2_target *tgt)
>>> +static int qcow2_merge_common(struct qcow2_target *tgt, struct qcow2
>>> *qcow2)
>>> {
>>> - struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
>>> + struct qcow2 *lower = qcow2->lower;
>>> u32 clu_size = qcow2->clu_size;
>>> loff_t end = lower->hdr.size;
>>> @@ -139,9 +139,8 @@ static int qcow2_merge_forward(struct
>>> qcow2_target *tgt)
>>> }
>>> ALLOW_ERROR_INJECTION(qcow2_merge_forward, ERRNO);
>>> -static int qcow2_break_l1cow(struct qcow2_target *tgt)
>>> +static int qcow2_break_l1cow(struct qcow2_target *tgt, struct qcow2
>>> *qcow2)
>>> {
>>> - struct qcow2 *qcow2 = tgt->top;
>>> loff_t end = qcow2->hdr.size;
>>> loff_t step = (u64)qcow2->l2_entries * qcow2->clu_size;
>>> @@ -152,25 +151,35 @@ static int qcow2_break_l1cow(struct
>>> qcow2_target *tgt)
>>> static void set_backward_merge_in_process(struct qcow2_target *tgt,
>>> struct qcow2 *qcow2, bool set)
>>> {
>>> + struct qcow2 *top = tgt->top;
>>> LIST_HEAD(list);
>>> + /*
>>> + * There are no writes if it is not the top qcow2 image.
>>> + * so we do not need to stop and flush requests when setting
>>> + */
>>> + if (qcow2 != top && set) {
>>> + qcow2->backward_merge_in_process = set;
>>> + return;
>>> + }
>>> +
>>> /*
>>> * To avoid race between allocations and COWS
>>> * we completely stop queueing qios and wait
>>> * for pending qios. Lock is for visability.
>>> */
>>> - spin_lock_irq(&qcow2->deferred_lock);
>>> - qcow2->pause_submitting_qios = true;
>>> - spin_unlock_irq(&qcow2->deferred_lock);
>>> + spin_lock_irq(&top->deferred_lock);
>>> + top->pause_submitting_qios = true;
>>> + spin_unlock_irq(&top->deferred_lock);
>>> qcow2_inflight_ref_switch(tgt);
>>> /* queue is stopped */
>>> - spin_lock_irq(&qcow2->deferred_lock);
>>> + spin_lock_irq(&top->deferred_lock);
>>> WARN_ON_ONCE(qcow2->backward_merge_in_process == set);
>>> qcow2->backward_merge_in_process = set;
>>> - qcow2->pause_submitting_qios = false;
>>> - list_splice_init(&qcow2->paused_qios, &list);
>>> - spin_unlock_irq(&qcow2->deferred_lock);
>>> + top->pause_submitting_qios = false;
>>> + list_splice_init(&top->paused_qios, &list);
>>> + spin_unlock_irq(&top->deferred_lock);
>>> qcow2_submit_embedded_qios(tgt, &list);
>>> }
>>> @@ -240,11 +249,15 @@ static int qcow2_merge_backward_progress(struct
>>> qcow2_target *tgt,
>>> static int qcow2_merge_backward_set_eventfd(struct qcow2_target
>>> *tgt, int efd);
>>> -static int qcow2_merge_backward_start(struct qcow2_target *tgt, int
>>> efd)
>>> +static int qcow2_merge_backward_start(struct qcow2_target *tgt, int
>>> efd, int depth)
>>> {
>>> - struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower;
>>> + struct qcow2 *qcow2 = tgt->top, *lower;
>>> int ret;
>>> + while (depth-- > 0 && qcow2->lower)
>>> + qcow2 = qcow2->lower;
>>> + lower = qcow2->lower;
>>> +
>>> lockdep_assert_held(&tgt->ctl_mutex);
>>> if (!lower)
>>> @@ -263,6 +276,7 @@ static int qcow2_merge_backward_start(struct
>>> qcow2_target *tgt, int efd)
>>> if (ret)
>>> return ret;
>>> + tgt->backward_merge.qcow2 = qcow2;
>>> tgt->backward_merge.state = BACKWARD_MERGE_START;
>>> __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_START);
>>> tgt->backward_merge.error = 0;
>>> @@ -291,7 +305,7 @@ void qcow2_merge_backward_work(struct work_struct
>>> *work)
>>> __backward_merge_update_stage(tgt,
>>> BACKWARD_MERGE_STAGE_BREAK_L1COW);
>>> mutex_unlock(&tgt->ctl_mutex);
>>> - qcow2 = tgt->top;
>>> + qcow2 = tgt->backward_merge.qcow2;
>>> lower = qcow2->lower;
>>> /*
>>> @@ -300,7 +314,7 @@ void qcow2_merge_backward_work(struct work_struct
>>> *work)
>>> * we'd have to freeze IO going to all data clusters
>>> * under every L1 entry related to several snapshots.
>>> */
>>> - ret = qcow2_break_l1cow(tgt);
>>> + ret = qcow2_break_l1cow(tgt, qcow2);
>>> if (ret) {
>>> QC_ERR(tgt->ti, "Can't break L1 COW");
>>> goto out_err;
>>> @@ -316,7 +330,7 @@ void qcow2_merge_backward_work(struct work_struct
>>> *work)
>>> /* Start merge */
>>> backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING);
>>> - ret = qcow2_merge_common(tgt);
>>> + ret = qcow2_merge_common(tgt, qcow2);
>>> if (ret) {
>>> set_backward_merge_in_process(tgt, qcow2, false);
>>> ret2 = qcow2_set_image_file_features(lower, false);
>>> @@ -357,9 +371,29 @@ static int qcow2_merge_backward_complete(struct
>>> qcow2_target *tgt)
>>> if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION)
>>> return -EBUSY;
>>> +
>>> + if (tgt->top == tgt->backward_merge.qcow2) {
>>> + tgt->top = lower;
>>> + } else {
>>> + struct qcow2 *top = tgt->top;
>>> +
>>> + while (top && top->lower != tgt->backward_merge.qcow2)
>>> + top = top->lower;
This algorithm can be simplified: we really only need to search for the
place in the delta list once, on "start", then save it and use it directly
later. To do that we need to save a double pointer (struct qcow2 **) to
the location where the qcow2 pointer is stored, in tgt->backward_merge.pqcow2.
on start:
tgt->backward_merge.pqcow2 = &qcow2;
on complete:
*tgt->backward_merge.pqcow2 = (*tgt->backward_merge.pqcow2)->lower;
And we can still get qcow2 and lower like:
qcow2 = *tgt->backward_merge.pqcow2;
lower = qcow2->lower;
There is the img_id update loop though, which will still be needed, but at
least we don't have two loops.
Note: By the way, we might be able to replace qcow2->img_id on each delta
with just one dm_target->depth per target, as we only use img_id from the
top delta to find the depth AFAICS (or search for the (depth-n)-th delta
from the top).
>>> + if (!top)
>>> + return -EFAULT;
We should not get to EFAULT, as this list is only updated by us
("complete") and by qcow2_ctr->qcow2_alloc_delta. This way we can drop the
error here and move the BACKWARD_MERGE_STAGE_COMPLETING stage setting above
this code, indicating that we are in the process of completion.
>>> +
>>> + qcow2 = tgt->backward_merge.qcow2;
>>> + lower = qcow2->lower;
>>> + top->lower = lower;
>>> +
>>> + top = tgt->top;
>>> + while (top != lower) {
>>> + top->img_id--;
>>> + top = top->lower;
>>> + }
>>
>> Maybe check here (in the while loop) if top == NULL and return an error
>> if yes?
>
> Why? We did some checks early. And also if we want to return error from
> here we need to increment id's back and revert top->lower change. I
> would rather not do that.
As we've already checked that we have all needed deltas at the desired depth
as early as on backward merge start, the check here is not needed, agreed.
But we can have a BUG_ON here, to be extra careful in case the delta list
was corrupted or some other unexpected thing happened.
>
>>
>>> + }
>>> +
>>> __backward_merge_update_stage(tgt,
>>> BACKWARD_MERGE_STAGE_COMPLETING);
>>> -
>>> - tgt->top = lower;
>>> smp_wmb(); /* Pairs with qcow2_ref_inc() */
>>> qcow2_inflight_ref_switch(tgt); /* Pending qios */
>>> qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */
>>> @@ -396,7 +430,7 @@ void qcow2_merge_backward_cancel(struct
>>> qcow2_target *tgt)
>>> } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) {
>>> flush = true;
>>> } else if (tgt->backward_merge.state ==
>>> BACKWARD_MERGE_WAIT_COMPLETION) {
>>> - set_backward_merge_in_process(tgt, tgt->top, false);
>>> + set_backward_merge_in_process(tgt, tgt-
>>> >backward_merge.qcow2, false);
>>> tgt->backward_merge.state = BACKWARD_MERGE_STOPPED;
>>> __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE);
>>> }
>>> @@ -572,7 +606,7 @@ int qcow2_message(struct dm_target *ti, unsigned
>>> int argc, char **argv,
>>> struct qcow2_target *tgt = to_qcow2_target(ti);
>>> int ret = -EPERM;
>>> u32 val, val2;
>>> - int efd;
>>> + int efd, depth = 0;
>>> if (!capable(CAP_SYS_ADMIN))
>>> goto out;
>>> @@ -652,11 +686,15 @@ int qcow2_message(struct dm_target *ti,
>>> unsigned int argc, char **argv,
>>> } else if (!strcmp(argv[0], "merge_backward")) {
>>> /* argc >= 2 */
>>> if (!strcmp(argv[1], "start")) {
>>> - if (argc != 3 || kstrtoint(argv[2], 10, &efd) || efd < 0) {
>>> + if (argc < 3 || argc > 4 || kstrtoint(argv[2], 10, &efd)
>>> || efd < 0) {
>>> ret = -EINVAL;
>>> goto out_unlock;
>>> }
>>> - ret = qcow2_merge_backward_start(tgt, efd);
>>> + if (argc == 4 && kstrtou32(argv[3], 10, &depth)) {
>>> + ret = -EINVAL;
>>> + goto out_unlock;
>>> + }
>>> + ret = qcow2_merge_backward_start(tgt, efd, depth);
>>> } else if (!strcmp(argv[1], "complete")) {
>>> if (argc != 2) {
>>> ret = -EINVAL;
>>> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h
>>> index 5aa00c6a5ebd5..c2b2193561461 100644
>>> --- a/drivers/md/dm-qcow2.h
>>> +++ b/drivers/md/dm-qcow2.h
>>> @@ -172,6 +172,7 @@ enum qcow2_backward_merge_stage {
>>> struct qcow2_backward_merge {
>>> struct work_struct work;
>>> + struct qcow2 *qcow2;
>>> enum qcow2_backward_merge_state state;
>>> int error;
>>> struct eventfd_ctx *eventfd_ctx;
>>
>
--
Best regards, Pavel Tikhomirov
Senior Software Developer, Virtuozzo.
More information about the Devel
mailing list