From ptikhomirov at virtuozzo.com Mon Mar 3 10:24:31 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:24:31 +0800 Subject: [Devel] [PATCH v3 VZ9 0/5] dm-qcow2: make backward merge asynchronous Message-ID: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> That can be useful for restarting qemu process while allowing backward merging to run asynchronously in kernel. v2: rebase on top of vz9.80.19, make completion event consistent, fix deadlock when cancel after start and before work run v3: weaken locking in progress printing a bit to decrease possible lock contention https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Pavel Tikhomirov (5): dm-qcow2: fix warning about wrong printk format for size_t dm-qcow2: cleanup error handling in qcow2_merge_backward dm-qcow2: make merge_backward command asynchronous dm-qcow2: add merge_backward set_eventfd command dm-qcow2: add merge_backward progress command drivers/md/dm-qcow2-cmd.c | 278 ++++++++++++++++++++++++++++++++--- drivers/md/dm-qcow2-map.c | 4 +- drivers/md/dm-qcow2-target.c | 6 + drivers/md/dm-qcow2.h | 35 +++++ 4 files changed, 297 insertions(+), 26 deletions(-) -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 10:24:32 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:24:32 +0800 Subject: [Devel] [PATCH v3 VZ9 1/5] dm-qcow2: fix warning about wrong printk format for size_t In-Reply-To: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> References: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303072506.1206960-2-ptikhomirov@virtuozzo.com> In file included from ./include/linux/kernel.h:20, from ./include/linux/list.h:9, from ./include/linux/preempt.h:12, from ./include/linux/spinlock.h:56, from drivers/md/dm-qcow2-map.c:5: drivers/md/dm-qcow2-map.c: In function 'process_compressed_read': ./include/linux/kern_levels.h:5:25: warning: format '%d' 
expects argument of type 'int', but argument 3 has type 'size_t' {aka 'long unsigned int'} [-Wformat=] 5 | #define KERN_SOH "\001" /* ASCII Start Of Header */ | ^~~~~~ ./include/linux/printk.h:497:25: note: in definition of macro 'printk_index_wrap' 497 | _p_func(_fmt, ##__VA_ARGS__); \ | ^~~~ ./include/linux/printk.h:568:9: note: in expansion of macro 'printk' 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~ ./include/linux/kern_levels.h:11:25: note: in expansion of macro 'KERN_SOH' 11 | #define KERN_ERR KERN_SOH "3" /* error conditions */ | ^~~~~~~~ ./include/linux/printk.h:568:16: note: in expansion of macro 'KERN_ERR' 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~ drivers/md/dm-qcow2.h:215:33: note: in expansion of macro 'pr_err' 215 | #define QC_ERR(dmti, fmt, ...) pr_err (QCOW2_FMT(fmt), \ | ^~~~~~ drivers/md/dm-qcow2-map.c:3691:41: note: in expansion of macro 'QC_ERR' 3691 | QC_ERR(qcow2->tgt->ti, | ^~~~~~ While on it fix line wrap alignment. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Rebase on top of vz9.80.19, "%lu" is also incorrect, see Documentation/core-api/printk-formats.rst. 
--- drivers/md/dm-qcow2-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c index 7a1312a74e9fb..f7cb036bb416e 100644 --- a/drivers/md/dm-qcow2-map.c +++ b/drivers/md/dm-qcow2-map.c @@ -3689,8 +3689,8 @@ static void process_compressed_read(struct list_head *read_list, buf = kvmalloc(qcow2->clu_size + dctxlen, GFP_NOIO); if (!buf) { QC_ERR(qcow2->tgt->ti, - "can not allocate decompression buffer:%lu", - qcow2->clu_size + dctxlen); + "can not allocate decompression buffer:%zu", + qcow2->clu_size + dctxlen); end_qios(read_list, BLK_STS_RESOURCE); return; } -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 10:24:33 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:24:33 +0800 Subject: [Devel] [PATCH v3 VZ9 2/5] dm-qcow2: cleanup error handling in qcow2_merge_backward In-Reply-To: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> References: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303072506.1206960-3-ptikhomirov@virtuozzo.com> The label "out" is excess, lets remove it in accordance with: "If there is no cleanup needed then just return directly." 
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#centralized-exiting-of-functions https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov --- drivers/md/dm-qcow2-cmd.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 6dc7e07220557..7b4b0ee68ad9f 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -166,18 +166,14 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; int ret, ret2; - ret = -ENOENT; if (!lower) - goto out; - ret = -EACCES; + return -ENOENT; if (!(lower->file->f_mode & FMODE_WRITE)) - goto out; - ret = -EOPNOTSUPP; + return -EACCES; if (qcow2->clu_size != lower->clu_size) - goto out; - ret = -EBADSLT; + return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) - goto out; + return -EBADSLT; /* * Break all COW clus at L1 level. Otherwise, later * there would be problems with unusing them: @@ -187,13 +183,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - goto out; + return ret; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - goto out; + return ret; } set_backward_merge_in_process(tgt, qcow2, true); @@ -204,7 +200,7 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - goto out; + return ret; } tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -216,8 +212,8 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); qcow2_destroy(qcow2); -out: - return ret; + + return 0; } ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 
10:24:34 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:24:34 +0800 Subject: [Devel] [PATCH v3 VZ9 3/5] dm-qcow2: make merge_backward command asynchronous In-Reply-To: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> References: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303072506.1206960-4-ptikhomirov@virtuozzo.com> This adds merge_backward "start", "complete" and "cancel" commands. By that we are able to split single merge_backward into two stages: start asynchronous merging and completion. That can be useful for restarting qemu process while allowing backward merging to run asynchronously in kernel. The "start" command runs merging preparations in workqueue work. After it finishes, the "complete" command can be called to finish the process and actually replace the top qcow2 with its lower. The "cancel" command forces the work to stop and flushes it. In case we are in completion waiting state already and there is no work running, the "cancel" command also reverts merging preparations. Locking: Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" and "complete" commands are fully under this lock, and the "cancel" operation takes the lock explicitly and releases it for work flushing. The work also takes the lock but only when updating tgt->backward_merge data. For checks, if the work was canceled in the middle, we read the state without locking as we don't modify the state there, also we would re-check the state again before exiting the work function under lock. Now on target suspend we "cancel" currently running backward merge, previously we were just hanging until backward merge has finished for possibly a long time, cancelling seems cleaner. Though we don't really expect hypervisor suspending the target in the middle of backward merge that it by itself started. 
https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Cancel from BACKWARD_MERGE_START state should not try to stop the work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. --- drivers/md/dm-qcow2-cmd.c | 142 +++++++++++++++++++++++++++++++---- drivers/md/dm-qcow2-target.c | 6 ++ drivers/md/dm-qcow2.h | 19 +++++ 3 files changed, 153 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 7b4b0ee68ad9f..04a992f3ebba6 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); + static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) { @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); for (pos = 0; pos < end; pos += step) { - if (fatal_signal_pending(current)) { + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; } @@ -161,10 +163,11 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } -static int qcow2_merge_backward(struct qcow2_target *tgt) +static int qcow2_merge_backward_start(struct qcow2_target *tgt) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; - int ret, ret2; + + lockdep_assert_held(&tgt->ctl_mutex); if (!lower) return -ENOENT; @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) return -EBADSLT; + + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) + return -EBUSY; + tgt->backward_merge.state = BACKWARD_MERGE_START; + tgt->backward_merge.error = 0; + + schedule_work(&tgt->backward_merge.work); + return 0; +} 
+ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); + +void qcow2_merge_backward_work(struct work_struct *work) +{ + struct qcow2_target *tgt = container_of(work, struct qcow2_target, + backward_merge.work); + struct qcow2 *qcow2, *lower; + int ret, ret2; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + tgt->backward_merge.state = BACKWARD_MERGE_RUN; + mutex_unlock(&tgt->ctl_mutex); + + qcow2 = tgt->top; + lower = qcow2->lower; + /* * Break all COW clus at L1 level. Otherwise, later * there would be problems with unusing them: @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - return ret; + goto out_err; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - return ret; + goto out_err; } set_backward_merge_in_process(tgt, qcow2, true); @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - return ret; } + +out_err: + mutex_lock(&tgt->ctl_mutex); + if (ret) { + /* Error */ + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = ret; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + /* Merge is canceled */ + set_backward_merge_in_process(tgt, qcow2, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = -EINTR; + } else { + /* Finish merge */ + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + } + mutex_unlock(&tgt->ctl_mutex); +} + +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) +{ + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; + int ret; + + lockdep_assert_held(&tgt->ctl_mutex); + + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) + 
return -EBUSY; + tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ qcow2_inflight_ref_switch(tgt); /* Pending qios */ qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ qcow2->lower = NULL; - ret2 = qcow2_set_image_file_features(qcow2, false); - if (ret2 < 0) - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); + ret = qcow2_set_image_file_features(qcow2, false); + if (ret < 0) + QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); qcow2_destroy(qcow2); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + return 0; } -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); + +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) +{ + bool flush = false; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { + tgt->backward_merge.state = BACKWARD_MERGE_STOP; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { + set_backward_merge_in_process(tgt, tgt->top, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + } + mutex_unlock(&tgt->ctl_mutex); + + if (flush) + flush_work(&tgt->backward_merge.work); +} + +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) +{ + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; +} static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_get_event(tgt, result, maxlen); goto out; + } else if (!strcmp(argv[0], "merge_backward")) { + if (argc != 2) { + ret = -EINVAL; 
+ goto out; + } + if (!strcmp(argv[1], "cancel")) { + qcow2_merge_backward_cancel(tgt); + ret = 0; + goto out; + } } - ret = mutex_lock_killable(&tgt->ctl_mutex); - if (ret) - goto out; + mutex_lock(&tgt->ctl_mutex); if (!strcmp(argv[0], "get_errors")) { ret = qcow2_get_errors(tgt, result, maxlen); @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcmp(argv[0], "merge_forward")) { ret = qcow2_merge_forward(tgt); } else if (!strcmp(argv[0], "merge_backward")) { - ret = qcow2_merge_backward(tgt); + if (argc != 2) { + ret = -EINVAL; + mutex_unlock(&tgt->ctl_mutex); + goto out; + } + if (!strcmp(argv[1], "start")) { + ret = qcow2_merge_backward_start(tgt); + } else if (!strcmp(argv[1], "complete")) { + ret = qcow2_merge_backward_complete(tgt); + } else { + ret = -ENOTTY; + } } else { ret = -ENOTTY; } diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c index 540c03cb3c44f..6e2e583ba0b8b 100644 --- a/drivers/md/dm-qcow2-target.c +++ b/drivers/md/dm-qcow2-target.c @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) mutex_lock(&tgt->ctl_mutex); tgt->service_operations_allowed = allowed; mutex_unlock(&tgt->ctl_mutex); + if (!allowed) + qcow2_merge_backward_cancel(tgt); } static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) { @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) /* Now kill the queue */ destroy_workqueue(tgt->wq); } + qcow2_merge_backward_cancel(tgt); mempool_destroy(tgt->qio_pool); mempool_destroy(tgt->qrq_pool); @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); ti->private = tgt; tgt->ti = ti; + + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); + qcow2_set_service_operations(ti, false); return tgt; diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index 
a89fe3db2196d..bebfdc50ed6d4 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -149,6 +149,20 @@ struct md_page { struct list_head wpc_readers_wait_list; }; +enum qcow2_backward_merge_state { + BACKWARD_MERGE_STOPPED = 0, + BACKWARD_MERGE_START, + BACKWARD_MERGE_RUN, + BACKWARD_MERGE_WAIT_COMPLETION, + BACKWARD_MERGE_STOP, +}; + +struct qcow2_backward_merge { + struct work_struct work; + enum qcow2_backward_merge_state state; + int error; +}; + struct qcow2_target { struct dm_target *ti; #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ @@ -180,6 +194,8 @@ struct qcow2_target { struct work_struct event_work; spinlock_t event_lock; struct mutex ctl_mutex; + + struct qcow2_backward_merge backward_merge; }; enum { @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); int qcow2_truncate_safe(struct file *file, loff_t new_len); +void qcow2_merge_backward_work(struct work_struct *work); +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); + static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) { return ti->private; -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 10:24:35 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:24:35 +0800 Subject: [Devel] [PATCH v3 VZ9 4/5] dm-qcow2: add merge_backward set_eventfd command In-Reply-To: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> References: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303072506.1206960-5-ptikhomirov@virtuozzo.com> This eventfd can be used to get an event when merge_backward start work have finished and is waiting for completion. Note: The eventfd can be changed even while work is running. Locking: The backward_merge.eventfd_ctx is protected from being released by tgt->ctl_mutex. 
https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Always report that work finished, e.g. also on error or then it was canceled, this should be more consistent from the userspace perspective. --- drivers/md/dm-qcow2-cmd.c | 39 ++++++++++++++++++++++++++++++++++++++- drivers/md/dm-qcow2.h | 2 ++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 04a992f3ebba6..7f9c582778d5f 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "dm-qcow2.h" @@ -197,6 +199,8 @@ void qcow2_merge_backward_work(struct work_struct *work) mutex_lock(&tgt->ctl_mutex); if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); mutex_unlock(&tgt->ctl_mutex); return; } @@ -249,6 +253,8 @@ void qcow2_merge_backward_work(struct work_struct *work) /* Finish merge */ tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; } + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); mutex_unlock(&tgt->ctl_mutex); } @@ -312,6 +318,24 @@ static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; } +#define QCOW2_FILE_UNBIND -1 + +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd) +{ + struct eventfd_ctx *ctx = NULL; + + ctx = efd == QCOW2_FILE_UNBIND ? 
NULL : eventfd_ctx_fdget(efd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&tgt->ctl_mutex); + swap(ctx, tgt->backward_merge.eventfd_ctx); + if (ctx) + eventfd_ctx_put(ctx); + mutex_unlock(&tgt->ctl_mutex); + return 0; +} + static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { struct qcow2 *qcow2; @@ -470,14 +494,27 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, ret = qcow2_get_event(tgt, result, maxlen); goto out; } else if (!strcmp(argv[0], "merge_backward")) { - if (argc != 2) { + if (argc < 2) { ret = -EINVAL; goto out; } if (!strcmp(argv[1], "cancel")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } qcow2_merge_backward_cancel(tgt); ret = 0; goto out; + } else if (!strcmp(argv[1], "set_eventfd")) { + int efd; + + if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + goto out; } } diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index bebfdc50ed6d4..c4956e3fd0eb7 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "dm-core.h" #define DM_MSG_PREFIX "qcow2" @@ -161,6 +162,7 @@ struct qcow2_backward_merge { struct work_struct work; enum qcow2_backward_merge_state state; int error; + struct eventfd_ctx *eventfd_ctx; }; struct qcow2_target { -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 10:24:36 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:24:36 +0800 Subject: [Devel] [PATCH v3 VZ9 5/5] dm-qcow2: add merge_backward progress command In-Reply-To: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> References: <20250303072506.1206960-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303072506.1206960-6-ptikhomirov@virtuozzo.com> This allows to see progress of backward merge. 
It shows the stage we are at and for iterative stages it provides progress in form of how many iteratious are done and how many iterations there are in total. Locking: The progress data consistency is protected by tgt->ctl_mutex, we always update stage and error consistently under lock. Inside iterative stages for progress updating we have xchg instead of lock so that changes to progress are atomic and imply memory barrier (this way we would not see progress greater than max_progress in progress reporting), but at the same time there is less contention on tgt->ctl_mutex. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v3: Adress Kostya's review comments: move progress printing out of lock, remove excess updates of max_progress, make progress updates without lock. --- drivers/md/dm-qcow2-cmd.c | 83 +++++++++++++++++++++++++++++++++++++++ drivers/md/dm-qcow2.h | 14 +++++++ 2 files changed, 97 insertions(+) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 7f9c582778d5f..b9d37e78b7577 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -54,6 +54,10 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress); +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress); static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, @@ -66,7 +70,10 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); + backward_merge_update_max_progress(tgt, end); for (pos = 0; pos < end; pos += step) { + backward_merge_update_progress(tgt, pos); + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; @@ -165,6 +172,66 @@ static void set_backward_merge_in_process(struct 
qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } +static void __backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + tgt->backward_merge.stage = stage; + tgt->backward_merge.progress = 0; + tgt->backward_merge.max_progress = 0; +} + +static void backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + mutex_lock(&tgt->ctl_mutex); + __backward_merge_update_stage(tgt, stage); + mutex_unlock(&tgt->ctl_mutex); +} + +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress) +{ + xchg(&tgt->backward_merge.max_progress, max_progress); +} + +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress) +{ + xchg(&tgt->backward_merge.progress, progress); +} + +char *backward_merge_stage_names[] = { + "none", + "break_l1cow", + "set_dirty", + "running", + "waiting_completion", + "completing", + "fail", +}; + +static int qcow2_merge_backward_progress(struct qcow2_target *tgt, + char *result, unsigned int maxlen) +{ + struct qcow2_backward_merge backward_merge; + unsigned int sz = 0; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(backward_merge_stage_names) != BACKWARD_MERGE_STAGE_MAX); + + mutex_lock(&tgt->ctl_mutex); + backward_merge = tgt->backward_merge; + mutex_unlock(&tgt->ctl_mutex); + + ret = DMEMIT("stage=%s\nprogress=%lld\nmax_progress=%lld\nerror=%d\n", + backward_merge_stage_names[backward_merge.stage], + backward_merge.progress, + backward_merge.max_progress, + backward_merge.error); + + return ret ? 
1 : 0; +} + static int qcow2_merge_backward_start(struct qcow2_target *tgt) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; @@ -205,6 +272,7 @@ void qcow2_merge_backward_work(struct work_struct *work) return; } tgt->backward_merge.state = BACKWARD_MERGE_RUN; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_BREAK_L1COW); mutex_unlock(&tgt->ctl_mutex); qcow2 = tgt->top; @@ -222,6 +290,7 @@ void qcow2_merge_backward_work(struct work_struct *work) goto out_err; } + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_SET_DIRTY); ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); @@ -230,6 +299,7 @@ void qcow2_merge_backward_work(struct work_struct *work) set_backward_merge_in_process(tgt, qcow2, true); /* Start merge */ + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING); ret = qcow2_merge_common(tgt); if (ret) { set_backward_merge_in_process(tgt, qcow2, false); @@ -244,14 +314,17 @@ void qcow2_merge_backward_work(struct work_struct *work) /* Error */ tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = ret; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { /* Merge is canceled */ set_backward_merge_in_process(tgt, qcow2, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = -EINTR; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else { /* Finish merge */ tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_WAITING_COMPLETION); } if (tgt->backward_merge.eventfd_ctx) eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); @@ -267,6 +340,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) return -EBUSY; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_COMPLETING); tgt->top = lower; 
smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -280,6 +354,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) qcow2_destroy(qcow2); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); return 0; } @@ -306,6 +381,7 @@ void qcow2_merge_backward_cancel(struct qcow2_target *tgt) } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { set_backward_merge_in_process(tgt, tgt->top, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); } mutex_unlock(&tgt->ctl_mutex); @@ -515,6 +591,13 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_merge_backward_set_eventfd(tgt, efd); goto out; + } else if (!strcmp(argv[1], "progress")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_progress(tgt, result, maxlen); + goto out; } } diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index c4956e3fd0eb7..ed7cf79348052 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -158,11 +158,25 @@ enum qcow2_backward_merge_state { BACKWARD_MERGE_STOP, }; +enum qcow2_backward_merge_stage { + BACKWARD_MERGE_STAGE_NONE = 0, + BACKWARD_MERGE_STAGE_BREAK_L1COW, + BACKWARD_MERGE_STAGE_SET_DIRTY, + BACKWARD_MERGE_STAGE_RUNNING, + BACKWARD_MERGE_STAGE_WAITING_COMPLETION, + BACKWARD_MERGE_STAGE_COMPLETING, + BACKWARD_MERGE_STAGE_FAIL, + BACKWARD_MERGE_STAGE_MAX, +}; + struct qcow2_backward_merge { struct work_struct work; enum qcow2_backward_merge_state state; int error; struct eventfd_ctx *eventfd_ctx; + enum qcow2_backward_merge_stage stage; + long long progress; + long long max_progress; }; struct qcow2_target { -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 10:37:04 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 15:37:04 +0800 Subject: [Devel] [PATCH VZ9] vhost/vsock: remove unused variable i in 
VHOST_RESET_OWNER ioctl Message-ID: <20250303073842.1209656-1-ptikhomirov@virtuozzo.com> Fixes compilation warning: drivers/vhost/vsock.c: In function ?vhost_vsock_reset_owner?: drivers/vhost/vsock.c:846:16: warning: unused variable ?i? [-Wunused-variable] 846 | size_t i; | ^ Fixes: ad35221ad1341 ("vhost/vsock: add VHOST_RESET_OWNER ioctl") Signed-off-by: Pavel Tikhomirov Feature: vhost-vsock: VHOST_RESET_OWNER ioctl --- drivers/vhost/vsock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 3654fa0fd5849..36750e163052f 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -843,7 +843,6 @@ static int vhost_vsock_reset_owner(struct vhost_vsock *vsock) { struct vhost_iotlb *umem; long err; - size_t i; mutex_lock(&vsock->dev.mutex); err = vhost_dev_check_owner(&vsock->dev); -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 12:37:20 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 17:37:20 +0800 Subject: [Devel] [PATCH v4 VZ9 0/5] dm-qcow2: make backward merge asyncronous Message-ID: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> That can be usefull for restarting qemu process while allowing backward merging to run asyncronously in kernel. 
v2: rebase on top of vz9.80.19, make completion event consistent, fix deadlock when cancel after start and before work run v3: weaken locking in progress printing a bit to decrease possible lock contention v4: signal that we are at completion waiting on change of eventfd https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Pavel Tikhomirov (5): dm-qcow2: fix warning about wrong printk format for size_t dm-qcow2: cleanup error handling in qcow2_merge_backward dm-qcow2: make merge_backward command asyncronous dm-qcow2: add merge_backward set_eventfd command dm-qcow2: add merge_backward progress command drivers/md/dm-qcow2-cmd.c | 281 ++++++++++++++++++++++++++++++++--- drivers/md/dm-qcow2-map.c | 4 +- drivers/md/dm-qcow2-target.c | 6 + drivers/md/dm-qcow2.h | 35 +++++ 4 files changed, 300 insertions(+), 26 deletions(-) -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 12:37:21 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 17:37:21 +0800 Subject: [Devel] [PATCH v4 VZ9 1/5] dm-qcow2: fix warning about wrong printk format for size_t In-Reply-To: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303093802.1233834-2-ptikhomirov@virtuozzo.com> In file included from ./include/linux/kernel.h:20, from ./include/linux/list.h:9, from ./include/linux/preempt.h:12, from ./include/linux/spinlock.h:56, from drivers/md/dm-qcow2-map.c:5: drivers/md/dm-qcow2-map.c: In function ?process_compressed_read?: ./include/linux/kern_levels.h:5:25: warning: format ?%d? expects argument of type ?int?, but argument 3 has type ?size_t? {aka ?long unsigned int?} [-Wformat=] 5 | #define KERN_SOH "\001" /* ASCII Start Of Header */ | ^~~~~~ ./include/linux/printk.h:497:25: note: in definition of macro ?printk_index_wrap? 497 | _p_func(_fmt, ##__VA_ARGS__); \ | ^~~~ ./include/linux/printk.h:568:9: note: in expansion of macro ?printk? 
568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~ ./include/linux/kern_levels.h:11:25: note: in expansion of macro ?KERN_SOH? 11 | #define KERN_ERR KERN_SOH "3" /* error conditions */ | ^~~~~~~~ ./include/linux/printk.h:568:16: note: in expansion of macro ?KERN_ERR? 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~ drivers/md/dm-qcow2.h:215:33: note: in expansion of macro ?pr_err? 215 | #define QC_ERR(dmti, fmt, ...) pr_err (QCOW2_FMT(fmt), \ | ^~~~~~ drivers/md/dm-qcow2-map.c:3691:41: note: in expansion of macro ?QC_ERR? 3691 | QC_ERR(qcow2->tgt->ti, | ^~~~~~ While on it fix line wrap alignment. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Rebase on top of vz9.80.19, "%lu" is also incorrect, see Documentation/core-api/printk-formats.rst. --- drivers/md/dm-qcow2-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c index 7a1312a74e9fb..f7cb036bb416e 100644 --- a/drivers/md/dm-qcow2-map.c +++ b/drivers/md/dm-qcow2-map.c @@ -3689,8 +3689,8 @@ static void process_compressed_read(struct list_head *read_list, buf = kvmalloc(qcow2->clu_size + dctxlen, GFP_NOIO); if (!buf) { QC_ERR(qcow2->tgt->ti, - "can not allocate decompression buffer:%lu", - qcow2->clu_size + dctxlen); + "can not allocate decompression buffer:%zu", + qcow2->clu_size + dctxlen); end_qios(read_list, BLK_STS_RESOURCE); return; } -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 12:37:22 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 17:37:22 +0800 Subject: [Devel] [PATCH v4 VZ9 2/5] dm-qcow2: cleanup error handling in qcow2_merge_backward In-Reply-To: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303093802.1233834-3-ptikhomirov@virtuozzo.com> The label "out" is excess, lets remove it in accordance with: "If there is no cleanup needed then 
just return directly." https://www.kernel.org/doc/html/v4.10/process/coding-style.html#centralized-exiting-of-functions https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov --- drivers/md/dm-qcow2-cmd.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 6dc7e07220557..7b4b0ee68ad9f 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -166,18 +166,14 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; int ret, ret2; - ret = -ENOENT; if (!lower) - goto out; - ret = -EACCES; + return -ENOENT; if (!(lower->file->f_mode & FMODE_WRITE)) - goto out; - ret = -EOPNOTSUPP; + return -EACCES; if (qcow2->clu_size != lower->clu_size) - goto out; - ret = -EBADSLT; + return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) - goto out; + return -EBADSLT; /* * Break all COW clus at L1 level. 
Otherwise, later * there would be problems with unusing them: @@ -187,13 +183,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - goto out; + return ret; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - goto out; + return ret; } set_backward_merge_in_process(tgt, qcow2, true); @@ -204,7 +200,7 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - goto out; + return ret; } tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -216,8 +212,8 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); qcow2_destroy(qcow2); -out: - return ret; + + return 0; } ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 12:37:23 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 17:37:23 +0800 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> This adds merge_backward "start", "complete" and "cancel" commands. By that we are able to split single merge_backward into two stages: start asyncronous merging and completion. That can be usefull for restarting qemu process while allowing backward merging to run asyncronously in kernel. The "start" command runs merging preparations in workqueue work. After it finishes, the "complete" command can be called to finish the process and actually replace the top qcow2 with it's lower. The "cancel" command forces the work to stop and flushes it. 
In case we are in completion waiting state already and there is no work running, the "cancel" command also reverts merging preparations. Locking: Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" and "complete" commands are fully under this lock, and the "cancel" operation takes the lock explicitly and releases it for work flushing. The work also takes the lock but only when updating tgt->backward_merge data. For checks, if the work was caneled in the middle, we read the state without locking as we don't modify the state there, also we would re-check the state again before exiting the work function under lock. Now on target suspend we "cancel" currently running backward merge, previously we were just hanging untill backward merge have been finished for possibly a long time, cancelling seems cleaner. Though we don't really expect hypervisor suspending the target in the middle of backward merge that it by itself started. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Cancel from BACKWARD_MERGE_START state should not try to stop the work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. 
--- drivers/md/dm-qcow2-cmd.c | 142 +++++++++++++++++++++++++++++++---- drivers/md/dm-qcow2-target.c | 6 ++ drivers/md/dm-qcow2.h | 19 +++++ 3 files changed, 153 insertions(+), 14 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 7b4b0ee68ad9f..04a992f3ebba6 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); + static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) { @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); for (pos = 0; pos < end; pos += step) { - if (fatal_signal_pending(current)) { + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; } @@ -161,10 +163,11 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } -static int qcow2_merge_backward(struct qcow2_target *tgt) +static int qcow2_merge_backward_start(struct qcow2_target *tgt) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; - int ret, ret2; + + lockdep_assert_held(&tgt->ctl_mutex); if (!lower) return -ENOENT; @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) return -EBADSLT; + + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) + return -EBUSY; + tgt->backward_merge.state = BACKWARD_MERGE_START; + tgt->backward_merge.error = 0; + + schedule_work(&tgt->backward_merge.work); + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); + +void qcow2_merge_backward_work(struct work_struct *work) +{ + struct qcow2_target *tgt = container_of(work, struct qcow2_target, + backward_merge.work); + struct qcow2 *qcow2, 
*lower; + int ret, ret2; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + tgt->backward_merge.state = BACKWARD_MERGE_RUN; + mutex_unlock(&tgt->ctl_mutex); + + qcow2 = tgt->top; + lower = qcow2->lower; + /* * Break all COW clus at L1 level. Otherwise, later * there would be problems with unusing them: @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - return ret; + goto out_err; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - return ret; + goto out_err; } set_backward_merge_in_process(tgt, qcow2, true); @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - return ret; } + +out_err: + mutex_lock(&tgt->ctl_mutex); + if (ret) { + /* Error */ + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = ret; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + /* Merge is canceled */ + set_backward_merge_in_process(tgt, qcow2, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = -EINTR; + } else { + /* Finish merge */ + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + } + mutex_unlock(&tgt->ctl_mutex); +} + +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) +{ + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; + int ret; + + lockdep_assert_held(&tgt->ctl_mutex); + + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) + return -EBUSY; + tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ qcow2_inflight_ref_switch(tgt); /* Pending qios */ qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ qcow2->lower = NULL; - ret2 = 
qcow2_set_image_file_features(qcow2, false); - if (ret2 < 0) - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); + ret = qcow2_set_image_file_features(qcow2, false); + if (ret < 0) + QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); qcow2_destroy(qcow2); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + return 0; } -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); + +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) +{ + bool flush = false; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { + tgt->backward_merge.state = BACKWARD_MERGE_STOP; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { + set_backward_merge_in_process(tgt, tgt->top, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + } + mutex_unlock(&tgt->ctl_mutex); + + if (flush) + flush_work(&tgt->backward_merge.work); +} + +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) +{ + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; +} static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_get_event(tgt, result, maxlen); goto out; + } else if (!strcmp(argv[0], "merge_backward")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + if (!strcmp(argv[1], "cancel")) { + qcow2_merge_backward_cancel(tgt); + ret = 0; + goto out; + } } - ret = mutex_lock_killable(&tgt->ctl_mutex); - if (ret) - goto out; + mutex_lock(&tgt->ctl_mutex); if 
(!strcmp(argv[0], "get_errors")) { ret = qcow2_get_errors(tgt, result, maxlen); @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcmp(argv[0], "merge_forward")) { ret = qcow2_merge_forward(tgt); } else if (!strcmp(argv[0], "merge_backward")) { - ret = qcow2_merge_backward(tgt); + if (argc != 2) { + ret = -EINVAL; + mutex_unlock(&tgt->ctl_mutex); + goto out; + } + if (!strcmp(argv[1], "start")) { + ret = qcow2_merge_backward_start(tgt); + } else if (!strcmp(argv[1], "complete")) { + ret = qcow2_merge_backward_complete(tgt); + } else { + ret = -ENOTTY; + } } else { ret = -ENOTTY; } diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c index 540c03cb3c44f..6e2e583ba0b8b 100644 --- a/drivers/md/dm-qcow2-target.c +++ b/drivers/md/dm-qcow2-target.c @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) mutex_lock(&tgt->ctl_mutex); tgt->service_operations_allowed = allowed; mutex_unlock(&tgt->ctl_mutex); + if (!allowed) + qcow2_merge_backward_cancel(tgt); } static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) { @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) /* Now kill the queue */ destroy_workqueue(tgt->wq); } + qcow2_merge_backward_cancel(tgt); mempool_destroy(tgt->qio_pool); mempool_destroy(tgt->qrq_pool); @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); ti->private = tgt; tgt->ti = ti; + + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); + qcow2_set_service_operations(ti, false); return tgt; diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index a89fe3db2196d..bebfdc50ed6d4 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -149,6 +149,20 @@ struct md_page { struct list_head wpc_readers_wait_list; }; +enum qcow2_backward_merge_state { + BACKWARD_MERGE_STOPPED = 0, + 
BACKWARD_MERGE_START, + BACKWARD_MERGE_RUN, + BACKWARD_MERGE_WAIT_COMPLETION, + BACKWARD_MERGE_STOP, +}; + +struct qcow2_backward_merge { + struct work_struct work; + enum qcow2_backward_merge_state state; + int error; +}; + struct qcow2_target { struct dm_target *ti; #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ @@ -180,6 +194,8 @@ struct qcow2_target { struct work_struct event_work; spinlock_t event_lock; struct mutex ctl_mutex; + + struct qcow2_backward_merge backward_merge; }; enum { @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); int qcow2_truncate_safe(struct file *file, loff_t new_len); +void qcow2_merge_backward_work(struct work_struct *work); +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); + static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) { return ti->private; -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 12:37:24 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 17:37:24 +0800 Subject: [Devel] [PATCH v4 VZ9 4/5] dm-qcow2: add merge_backward set_eventfd command In-Reply-To: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> Message-ID: <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> This eventfd can be used to get an event when merge_backward start work have finished and is waiting for completion. Note: The eventfd can be changed even while work is running. Locking: The backward_merge.eventfd_ctx is protected from being released by tgt->ctl_mutex. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Always report that work finished, e.g. also on error or then it was canceled, this should be more consistent from the userspace perspective. v4: Address Andrey's reveiw: signal that we are at completion waiting on change of eventfd. 
--- drivers/md/dm-qcow2-cmd.c | 42 ++++++++++++++++++++++++++++++++++++++- drivers/md/dm-qcow2.h | 2 ++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 04a992f3ebba6..f16b4f731ca5a 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "dm-qcow2.h" @@ -197,6 +199,8 @@ void qcow2_merge_backward_work(struct work_struct *work) mutex_lock(&tgt->ctl_mutex); if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); mutex_unlock(&tgt->ctl_mutex); return; } @@ -249,6 +253,8 @@ void qcow2_merge_backward_work(struct work_struct *work) /* Finish merge */ tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; } + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); mutex_unlock(&tgt->ctl_mutex); } @@ -312,6 +318,27 @@ static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; } +#define QCOW2_FILE_UNBIND -1 + +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd) +{ + struct eventfd_ctx *ctx = NULL; + + ctx = efd == QCOW2_FILE_UNBIND ? 
NULL : eventfd_ctx_fdget(efd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + mutex_lock(&tgt->ctl_mutex); + swap(ctx, tgt->backward_merge.eventfd_ctx); + if (ctx) + eventfd_ctx_put(ctx); + if (tgt->backward_merge.eventfd_ctx && + tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + mutex_unlock(&tgt->ctl_mutex); + return 0; +} + static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { struct qcow2 *qcow2; @@ -470,14 +497,27 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, ret = qcow2_get_event(tgt, result, maxlen); goto out; } else if (!strcmp(argv[0], "merge_backward")) { - if (argc != 2) { + if (argc < 2) { ret = -EINVAL; goto out; } if (!strcmp(argv[1], "cancel")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } qcow2_merge_backward_cancel(tgt); ret = 0; goto out; + } else if (!strcmp(argv[1], "set_eventfd")) { + int efd; + + if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + goto out; } } diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index bebfdc50ed6d4..c4956e3fd0eb7 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "dm-core.h" #define DM_MSG_PREFIX "qcow2" @@ -161,6 +162,7 @@ struct qcow2_backward_merge { struct work_struct work; enum qcow2_backward_merge_state state; int error; + struct eventfd_ctx *eventfd_ctx; }; struct qcow2_target { -- 2.48.1 From ptikhomirov at virtuozzo.com Mon Mar 3 12:37:25 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Mon, 3 Mar 2025 17:37:25 +0800 Subject: [Devel] [PATCH v4 VZ9 5/5] dm-qcow2: add merge_backward progress command In-Reply-To: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> Message-ID: 
<20250303093802.1233834-6-ptikhomirov@virtuozzo.com> This allows to see progress of backward merge. It shows the stage we are at and for iterative stages it provides progress in form of how many iteratious are done and how many iterations there are in total. Locking: The progress data consistency is protected by tgt->ctl_mutex, we always update stage and error consistently under lock. Inside iterative stages for progress updating we have xchg instead of lock so that changes to progress are atomic and imply memory barrier (this way we would not see progress greater than max_progress in progress reporting), but at the same time there is less contention on tgt->ctl_mutex. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v3: Address Kostya's review comments: move progress printing out of lock, remove excess updates of max_progress, make progress updates without lock. --- drivers/md/dm-qcow2-cmd.c | 83 +++++++++++++++++++++++++++++++++++++++ drivers/md/dm-qcow2.h | 14 +++++++ 2 files changed, 97 insertions(+) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index f16b4f731ca5a..4d50b2f9284e4 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -54,6 +54,10 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress); +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress); static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, @@ -66,7 +70,10 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); + backward_merge_update_max_progress(tgt, end); for (pos = 0; pos < end; pos += step) { + backward_merge_update_progress(tgt, pos); + if 
(qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; @@ -165,6 +172,66 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } +static void __backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + tgt->backward_merge.stage = stage; + tgt->backward_merge.progress = 0; + tgt->backward_merge.max_progress = 0; +} + +static void backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + mutex_lock(&tgt->ctl_mutex); + __backward_merge_update_stage(tgt, stage); + mutex_unlock(&tgt->ctl_mutex); +} + +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress) +{ + xchg(&tgt->backward_merge.max_progress, max_progress); +} + +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress) +{ + xchg(&tgt->backward_merge.progress, progress); +} + +char *backward_merge_stage_names[] = { + "none", + "break_l1cow", + "set_dirty", + "running", + "waiting_completion", + "completing", + "fail", +}; + +static int qcow2_merge_backward_progress(struct qcow2_target *tgt, + char *result, unsigned int maxlen) +{ + struct qcow2_backward_merge backward_merge; + unsigned int sz = 0; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(backward_merge_stage_names) != BACKWARD_MERGE_STAGE_MAX); + + mutex_lock(&tgt->ctl_mutex); + backward_merge = tgt->backward_merge; + mutex_unlock(&tgt->ctl_mutex); + + ret = DMEMIT("stage=%s\nprogress=%lld\nmax_progress=%lld\nerror=%d\n", + backward_merge_stage_names[backward_merge.stage], + backward_merge.progress, + backward_merge.max_progress, + backward_merge.error); + + return ret ? 
1 : 0; +} + static int qcow2_merge_backward_start(struct qcow2_target *tgt) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; @@ -205,6 +272,7 @@ void qcow2_merge_backward_work(struct work_struct *work) return; } tgt->backward_merge.state = BACKWARD_MERGE_RUN; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_BREAK_L1COW); mutex_unlock(&tgt->ctl_mutex); qcow2 = tgt->top; @@ -222,6 +290,7 @@ void qcow2_merge_backward_work(struct work_struct *work) goto out_err; } + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_SET_DIRTY); ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); @@ -230,6 +299,7 @@ void qcow2_merge_backward_work(struct work_struct *work) set_backward_merge_in_process(tgt, qcow2, true); /* Start merge */ + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING); ret = qcow2_merge_common(tgt); if (ret) { set_backward_merge_in_process(tgt, qcow2, false); @@ -244,14 +314,17 @@ void qcow2_merge_backward_work(struct work_struct *work) /* Error */ tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = ret; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { /* Merge is canceled */ set_backward_merge_in_process(tgt, qcow2, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = -EINTR; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else { /* Finish merge */ tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_WAITING_COMPLETION); } if (tgt->backward_merge.eventfd_ctx) eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); @@ -267,6 +340,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) return -EBUSY; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_COMPLETING); tgt->top = lower; 
smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -280,6 +354,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) qcow2_destroy(qcow2); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); return 0; } @@ -306,6 +381,7 @@ void qcow2_merge_backward_cancel(struct qcow2_target *tgt) } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { set_backward_merge_in_process(tgt, tgt->top, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); } mutex_unlock(&tgt->ctl_mutex); @@ -518,6 +594,13 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_merge_backward_set_eventfd(tgt, efd); goto out; + } else if (!strcmp(argv[1], "progress")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_progress(tgt, result, maxlen); + goto out; } } diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index c4956e3fd0eb7..ed7cf79348052 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -158,11 +158,25 @@ enum qcow2_backward_merge_state { BACKWARD_MERGE_STOP, }; +enum qcow2_backward_merge_stage { + BACKWARD_MERGE_STAGE_NONE = 0, + BACKWARD_MERGE_STAGE_BREAK_L1COW, + BACKWARD_MERGE_STAGE_SET_DIRTY, + BACKWARD_MERGE_STAGE_RUNNING, + BACKWARD_MERGE_STAGE_WAITING_COMPLETION, + BACKWARD_MERGE_STAGE_COMPLETING, + BACKWARD_MERGE_STAGE_FAIL, + BACKWARD_MERGE_STAGE_MAX, +}; + struct qcow2_backward_merge { struct work_struct work; enum qcow2_backward_merge_state state; int error; struct eventfd_ctx *eventfd_ctx; + enum qcow2_backward_merge_stage stage; + long long progress; + long long max_progress; }; struct qcow2_target { -- 2.48.1 From andrey.zhadchenko at virtuozzo.com Tue Mar 4 13:51:36 2025 From: andrey.zhadchenko at virtuozzo.com (Andrey Zhadchenko) Date: Tue, 4 Mar 2025 11:51:36 +0100 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make 
merge_backward command asyncronous In-Reply-To: <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> Message-ID: <8c5f114e-29c8-440b-bd3e-b5fbe1e39c82@virtuozzo.com> On 3/3/25 10:37, Pavel Tikhomirov wrote: > This adds merge_backward "start", "complete" and "cancel" commands. By > that we are able to split single merge_backward into two stages: start > asyncronous merging and completion. That can be usefull for restarting > qemu process while allowing backward merging to run asyncronously in > kernel. > > The "start" command runs merging preparations in workqueue work. After > it finishes, the "complete" command can be called to finish the process > and actually replace the top qcow2 with it's lower. The "cancel" command > forces the work to stop and flushes it. In case we are in completion > waiting state already and there is no work running, the "cancel" command > also reverts merging preparations. > > Locking: > > Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" > and "complete" commands are fully under this lock, and the "cancel" > operation takes the lock explicitly and releases it for work flushing. > The work also takes the lock but only when updating tgt->backward_merge > data. For checks, if the work was caneled in the middle, we read the > state without locking as we don't modify the state there, also we would > re-check the state again before exiting the work function under lock. > > Now on target suspend we "cancel" currently running backward merge, > previously we were just hanging untill backward merge have been > finished for possibly a long time, cancelling seems cleaner. Though we > don't really expect hypervisor suspending the target in the middle of > backward merge that it by itself started. 
> > https://virtuozzo.atlassian.net/browse/VSTOR-100466 > Signed-off-by: Pavel Tikhomirov > > -- > v2: Cancel from BACKWARD_MERGE_START state should not try to stop the > work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. > --- > drivers/md/dm-qcow2-cmd.c | 142 +++++++++++++++++++++++++++++++---- > drivers/md/dm-qcow2-target.c | 6 ++ > drivers/md/dm-qcow2.h | 19 +++++ > 3 files changed, 153 insertions(+), 14 deletions(-) > > diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c > index 7b4b0ee68ad9f..04a992f3ebba6 100644 > --- a/drivers/md/dm-qcow2-cmd.c > +++ b/drivers/md/dm-qcow2-cmd.c > @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, > wake_up(&tgt->service_wq); > } > > +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); > + > static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, > loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) > { > @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, > WRITE_ONCE(service_status, BLK_STS_OK); > > for (pos = 0; pos < end; pos += step) { > - if (fatal_signal_pending(current)) { > + if (qcow2_backward_merge_should_stop(tgt)) { > ret = -EINTR; > break; > } > @@ -161,10 +163,11 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, > qcow2_submit_embedded_qios(tgt, &list); > } > > -static int qcow2_merge_backward(struct qcow2_target *tgt) > +static int qcow2_merge_backward_start(struct qcow2_target *tgt) > { > struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; > - int ret, ret2; > + > + lockdep_assert_held(&tgt->ctl_mutex); > > if (!lower) > return -ENOENT; > @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) > return -EOPNOTSUPP; > if (lower->hdr.size < qcow2->hdr.size) > return -EBADSLT; > + > + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) > + return -EBUSY; > + tgt->backward_merge.state = 
BACKWARD_MERGE_START; > + tgt->backward_merge.error = 0; > + > + schedule_work(&tgt->backward_merge.work); Does this imply we potentially occupy one of the workers of the global pool for the indefinite amount of time? What if we run as much as nworkers (probably ncpus) merges simultaneously? > + return 0; > +} > +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); > + > +void qcow2_merge_backward_work(struct work_struct *work) > +{ > + struct qcow2_target *tgt = container_of(work, struct qcow2_target, > + backward_merge.work); > + struct qcow2 *qcow2, *lower; > + int ret, ret2; > + > + mutex_lock(&tgt->ctl_mutex); > + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { > + mutex_unlock(&tgt->ctl_mutex); > + return; > + } > + tgt->backward_merge.state = BACKWARD_MERGE_RUN; > + mutex_unlock(&tgt->ctl_mutex); > + > + qcow2 = tgt->top; > + lower = qcow2->lower; > + > /* > * Break all COW clus at L1 level. Otherwise, later > * there would be problems with unusing them: > @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) > ret = qcow2_break_l1cow(tgt); > if (ret) { > QC_ERR(tgt->ti, "Can't break L1 COW"); > - return ret; > + goto out_err; > } > > ret = qcow2_set_image_file_features(lower, true); > if (ret) { > QC_ERR(tgt->ti, "Can't set dirty bit"); > - return ret; > + goto out_err; > } > set_backward_merge_in_process(tgt, qcow2, true); > > @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) > ret2 = qcow2_set_image_file_features(lower, false); > if (ret2 < 0) > QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); > - return ret; > } > + > +out_err: > + mutex_lock(&tgt->ctl_mutex); > + if (ret) { > + /* Error */ > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + tgt->backward_merge.error = ret; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { > + /* Merge is canceled */ > + set_backward_merge_in_process(tgt, qcow2, false); > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > 
+ tgt->backward_merge.error = -EINTR; > + } else { > + /* Finish merge */ > + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; > + } > + mutex_unlock(&tgt->ctl_mutex); > +} > + > +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) > +{ > + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; > + int ret; > + > + lockdep_assert_held(&tgt->ctl_mutex); > + > + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) > + return -EBUSY; > + > tgt->top = lower; > smp_wmb(); /* Pairs with qcow2_ref_inc() */ > qcow2_inflight_ref_switch(tgt); /* Pending qios */ > qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ > qcow2->lower = NULL; > > - ret2 = qcow2_set_image_file_features(qcow2, false); > - if (ret2 < 0) > - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); > + ret = qcow2_set_image_file_features(qcow2, false); > + if (ret < 0) > + QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); > qcow2_destroy(qcow2); > > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + > return 0; > } > -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); > +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); > + > +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) > +{ > + bool flush = false; > + > + mutex_lock(&tgt->ctl_mutex); > + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { > + mutex_unlock(&tgt->ctl_mutex); > + return; > + } > + > + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + flush = true; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { > + tgt->backward_merge.state = BACKWARD_MERGE_STOP; > + flush = true; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { > + flush = true; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { > + set_backward_merge_in_process(tgt, tgt->top, false); > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + } > + 
mutex_unlock(&tgt->ctl_mutex); > + > + if (flush) > + flush_work(&tgt->backward_merge.work); > +} > + > +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) > +{ > + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; > +} > > static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) > { > @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, > } > ret = qcow2_get_event(tgt, result, maxlen); > goto out; > + } else if (!strcmp(argv[0], "merge_backward")) { > + if (argc != 2) { > + ret = -EINVAL; > + goto out; > + } > + if (!strcmp(argv[1], "cancel")) { > + qcow2_merge_backward_cancel(tgt); > + ret = 0; > + goto out; > + } > } > > - ret = mutex_lock_killable(&tgt->ctl_mutex); > - if (ret) > - goto out; > + mutex_lock(&tgt->ctl_mutex); > > if (!strcmp(argv[0], "get_errors")) { > ret = qcow2_get_errors(tgt, result, maxlen); > @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, > } else if (!strcmp(argv[0], "merge_forward")) { > ret = qcow2_merge_forward(tgt); > } else if (!strcmp(argv[0], "merge_backward")) { > - ret = qcow2_merge_backward(tgt); > + if (argc != 2) { > + ret = -EINVAL; > + mutex_unlock(&tgt->ctl_mutex); > + goto out; > + } > + if (!strcmp(argv[1], "start")) { > + ret = qcow2_merge_backward_start(tgt); > + } else if (!strcmp(argv[1], "complete")) { > + ret = qcow2_merge_backward_complete(tgt); > + } else { > + ret = -ENOTTY; > + } > } else { > ret = -ENOTTY; > } > diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c > index 540c03cb3c44f..6e2e583ba0b8b 100644 > --- a/drivers/md/dm-qcow2-target.c > +++ b/drivers/md/dm-qcow2-target.c > @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) > mutex_lock(&tgt->ctl_mutex); > tgt->service_operations_allowed = allowed; > mutex_unlock(&tgt->ctl_mutex); > + if (!allowed) > + qcow2_merge_backward_cancel(tgt); > 
} > static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) > { > @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) > /* Now kill the queue */ > destroy_workqueue(tgt->wq); > } > + qcow2_merge_backward_cancel(tgt); > > mempool_destroy(tgt->qio_pool); > mempool_destroy(tgt->qrq_pool); > @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) > timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); > ti->private = tgt; > tgt->ti = ti; > + > + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); > + > qcow2_set_service_operations(ti, false); > > return tgt; > diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h > index a89fe3db2196d..bebfdc50ed6d4 100644 > --- a/drivers/md/dm-qcow2.h > +++ b/drivers/md/dm-qcow2.h > @@ -149,6 +149,20 @@ struct md_page { > struct list_head wpc_readers_wait_list; > }; > > +enum qcow2_backward_merge_state { > + BACKWARD_MERGE_STOPPED = 0, > + BACKWARD_MERGE_START, > + BACKWARD_MERGE_RUN, > + BACKWARD_MERGE_WAIT_COMPLETION, > + BACKWARD_MERGE_STOP, > +}; > + > +struct qcow2_backward_merge { > + struct work_struct work; > + enum qcow2_backward_merge_state state; > + int error; > +}; > + > struct qcow2_target { > struct dm_target *ti; > #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ > @@ -180,6 +194,8 @@ struct qcow2_target { > struct work_struct event_work; > spinlock_t event_lock; > struct mutex ctl_mutex; > + > + struct qcow2_backward_merge backward_merge; > }; > > enum { > @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); > void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); > int qcow2_truncate_safe(struct file *file, loff_t new_len); > > +void qcow2_merge_backward_work(struct work_struct *work); > +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); > + > static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) > { > return 
ti->private; From ptikhomirov at virtuozzo.com Tue Mar 4 14:32:02 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Tue, 4 Mar 2025 19:32:02 +0800 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <8c5f114e-29c8-440b-bd3e-b5fbe1e39c82@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <8c5f114e-29c8-440b-bd3e-b5fbe1e39c82@virtuozzo.com> Message-ID: On 3/4/25 18:51, Andrey Zhadchenko wrote: > > > On 3/3/25 10:37, Pavel Tikhomirov wrote: >> This adds merge_backward "start", "complete" and "cancel" commands. By >> that we are able to split single merge_backward into two stages: start >> asyncronous merging and completion. That can be usefull for restarting >> qemu process while allowing backward merging to run asyncronously in >> kernel. >> >> The "start" command runs merging preparations in workqueue work. After >> it finishes, the "complete" command can be called to finish the process >> and actually replace the top qcow2 with it's lower. The "cancel" command >> forces the work to stop and flushes it. In case we are in completion >> waiting state already and there is no work running, the "cancel" command >> also reverts merging preparations. >> >> Locking: >> >> Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" >> and "complete" commands are fully under this lock, and the "cancel" >> operation takes the lock explicitly and releases it for work flushing. >> The work also takes the lock but only when updating tgt->backward_merge >> data. For checks, if the work was caneled in the middle, we read the >> state without locking as we don't modify the state there, also we would >> re-check the state again before exiting the work function under lock. 
>> >> Now on target suspend we "cancel" currently running backward merge, >> previously we were just hanging untill backward merge have been >> finished for possibly a long time, cancelling seems cleaner. Though we >> don't really expect hypervisor suspending the target in the middle of >> backward merge that it by itself started. >> >> https://virtuozzo.atlassian.net/browse/VSTOR-100466 >> Signed-off-by: Pavel Tikhomirov >> >> -- >> v2: Cancel from BACKWARD_MERGE_START state should not try to stop the >> work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. >> --- >> ? drivers/md/dm-qcow2-cmd.c??? | 142 +++++++++++++++++++++++++++++++---- >> ? drivers/md/dm-qcow2-target.c |?? 6 ++ >> ? drivers/md/dm-qcow2.h??????? |? 19 +++++ >> ? 3 files changed, 153 insertions(+), 14 deletions(-) >> >> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c >> index 7b4b0ee68ad9f..04a992f3ebba6 100644 >> --- a/drivers/md/dm-qcow2-cmd.c >> +++ b/drivers/md/dm-qcow2-cmd.c >> @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target >> *tgt, struct qio *qio, >> ????? wake_up(&tgt->service_wq); >> ? } >> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); >> + >> ? static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 >> *qcow2, >> ??????????? loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) >> ? { >> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >> *tgt, struct qcow2 *qcow2, >> ????? WRITE_ONCE(service_status, BLK_STS_OK); >> ????? for (pos = 0; pos < end; pos += step) { >> -??????? if (fatal_signal_pending(current)) { >> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >> ????????????? ret = -EINTR; >> ????????????? break; >> ????????? } >> @@ -161,10 +163,11 @@ static void set_backward_merge_in_process(struct >> qcow2_target *tgt, >> ????? qcow2_submit_embedded_qios(tgt, &list); >> ? 
} >> -static int qcow2_merge_backward(struct qcow2_target *tgt) >> +static int qcow2_merge_backward_start(struct qcow2_target *tgt) >> ? { >> ????? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >> -??? int ret, ret2; >> + >> +??? lockdep_assert_held(&tgt->ctl_mutex); >> ????? if (!lower) >> ????????? return -ENOENT; >> @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct >> qcow2_target *tgt) >> ????????? return -EOPNOTSUPP; >> ????? if (lower->hdr.size < qcow2->hdr.size) >> ????????? return -EBADSLT; >> + >> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) >> +??????? return -EBUSY; >> +??? tgt->backward_merge.state = BACKWARD_MERGE_START; >> +??? tgt->backward_merge.error = 0; >> + >> +??? schedule_work(&tgt->backward_merge.work); > > Does this imply we potentially occupy one of the workers of the global > pool for the indefinite amount of time? What if we run as much as > nworkers (probably ncpus) merges simultaneously? System_wq has 1024*NCPU execution contexts: > ``@max_active`` determines the maximum number of execution contexts per CPU > The maximum limit for ``@max_active`` is 2048 and the default value used when 0 is specified is 1024. If we try to run ~1024 works per cpu at the same time we might have a problem, and will need to either swithch to our own work-queue or create explicit kernel thread for each merge. As flushing system-wide workqueues is now deprecated we are also fine with long running work in system_wq and not in system_long_wq, but we can move it to system_long_wq just to be on the safe side. * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. What do you think? > >> +??? 
return 0; >> +} >> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); >> + >> +void qcow2_merge_backward_work(struct work_struct *work) >> +{ >> +??? struct qcow2_target *tgt = container_of(work, struct qcow2_target, >> +??????????????????????? backward_merge.work); >> +??? struct qcow2 *qcow2, *lower; >> +??? int ret, ret2; >> + >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_START) { >> +??????? mutex_unlock(&tgt->ctl_mutex); >> +??????? return; >> +??? } >> +??? tgt->backward_merge.state = BACKWARD_MERGE_RUN; >> +??? mutex_unlock(&tgt->ctl_mutex); >> + >> +??? qcow2 = tgt->top; >> +??? lower = qcow2->lower; >> + >> ????? /* >> ?????? * Break all COW clus at L1 level. Otherwise, later >> ?????? * there would be problems with unusing them: >> @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct >> qcow2_target *tgt) >> ????? ret = qcow2_break_l1cow(tgt); >> ????? if (ret) { >> ????????? QC_ERR(tgt->ti, "Can't break L1 COW"); >> -??????? return ret; >> +??????? goto out_err; >> ????? } >> ????? ret = qcow2_set_image_file_features(lower, true); >> ????? if (ret) { >> ????????? QC_ERR(tgt->ti, "Can't set dirty bit"); >> -??????? return ret; >> +??????? goto out_err; >> ????? } >> ????? set_backward_merge_in_process(tgt, qcow2, true); >> @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct >> qcow2_target *tgt) >> ????????? ret2 = qcow2_set_image_file_features(lower, false); >> ????????? if (ret2 < 0) >> ????????????? QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); >> -??????? return ret; >> ????? } >> + >> +out_err: >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? if (ret) { >> +??????? /* Error */ >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??????? tgt->backward_merge.error = ret; >> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >> +??????? /* Merge is canceled */ >> +??????? set_backward_merge_in_process(tgt, qcow2, false); >> +??????? 
tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??????? tgt->backward_merge.error = -EINTR; >> +??? } else { >> +??????? /* Finish merge */ >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; >> +??? } >> +??? mutex_unlock(&tgt->ctl_mutex); >> +} >> + >> +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) >> +{ >> +??? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >> +??? int ret; >> + >> +??? lockdep_assert_held(&tgt->ctl_mutex); >> + >> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) >> +??????? return -EBUSY; >> + >> ????? tgt->top = lower; >> ????? smp_wmb(); /* Pairs with qcow2_ref_inc() */ >> ????? qcow2_inflight_ref_switch(tgt); /* Pending qios */ >> ????? qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ >> ????? qcow2->lower = NULL; >> -??? ret2 = qcow2_set_image_file_features(qcow2, false); >> -??? if (ret2 < 0) >> -??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); >> +??? ret = qcow2_set_image_file_features(qcow2, false); >> +??? if (ret < 0) >> +??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); >> ????? qcow2_destroy(qcow2); >> +??? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> + >> ????? return 0; >> ? } >> -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); >> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); >> + >> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) >> +{ >> +??? bool flush = false; >> + >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { >> +??????? mutex_unlock(&tgt->ctl_mutex); >> +??????? return; >> +??? } >> + >> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_START) { >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??????? flush = true; >> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOP; >> +??????? flush = true; >> +??? 
} else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >> +??????? flush = true; >> +??? } else if (tgt->backward_merge.state == >> BACKWARD_MERGE_WAIT_COMPLETION) { >> +??????? set_backward_merge_in_process(tgt, tgt->top, false); >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??? } >> +??? mutex_unlock(&tgt->ctl_mutex); >> + >> +??? if (flush) >> +??????? flush_work(&tgt->backward_merge.work); >> +} >> + >> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) >> +{ >> +??? return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; >> +} >> ? static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 >> img_id, u8 *ref_index) >> ? { >> @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, unsigned >> int argc, char **argv, >> ????????? } >> ????????? ret = qcow2_get_event(tgt, result, maxlen); >> ????????? goto out; >> +??? } else if (!strcmp(argv[0], "merge_backward")) { >> +??????? if (argc != 2) { >> +??????????? ret = -EINVAL; >> +??????????? goto out; >> +??????? } >> +??????? if (!strcmp(argv[1], "cancel")) { >> +??????????? qcow2_merge_backward_cancel(tgt); >> +??????????? ret = 0; >> +??????????? goto out; >> +??????? } >> ????? } >> -??? ret = mutex_lock_killable(&tgt->ctl_mutex); >> -??? if (ret) >> -??????? goto out; >> +??? mutex_lock(&tgt->ctl_mutex); >> ????? if (!strcmp(argv[0], "get_errors")) { >> ????????? ret = qcow2_get_errors(tgt, result, maxlen); >> @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned >> int argc, char **argv, >> ????? } else if (!strcmp(argv[0], "merge_forward")) { >> ????????? ret = qcow2_merge_forward(tgt); >> ????? } else if (!strcmp(argv[0], "merge_backward")) { >> -??????? ret = qcow2_merge_backward(tgt); >> +??????? if (argc != 2) { >> +??????????? ret = -EINVAL; >> +??????????? mutex_unlock(&tgt->ctl_mutex); >> +??????????? goto out; >> +??????? } >> +??????? if (!strcmp(argv[1], "start")) { >> +??????????? 
ret = qcow2_merge_backward_start(tgt); >> +??????? } else if (!strcmp(argv[1], "complete")) { >> +??????????? ret = qcow2_merge_backward_complete(tgt); >> +??????? } else { >> +??????????? ret = -ENOTTY; >> +??????? } >> ????? } else { >> ????????? ret = -ENOTTY; >> ????? } >> diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c >> index 540c03cb3c44f..6e2e583ba0b8b 100644 >> --- a/drivers/md/dm-qcow2-target.c >> +++ b/drivers/md/dm-qcow2-target.c >> @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct >> dm_target *ti, bool allowed) >> ????? mutex_lock(&tgt->ctl_mutex); >> ????? tgt->service_operations_allowed = allowed; >> ????? mutex_unlock(&tgt->ctl_mutex); >> +??? if (!allowed) >> +??????? qcow2_merge_backward_cancel(tgt); >> ? } >> ? static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) >> ? { >> @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target >> *tgt) >> ????????? /* Now kill the queue */ >> ????????? destroy_workqueue(tgt->wq); >> ????? } >> +??? qcow2_merge_backward_cancel(tgt); >> ????? mempool_destroy(tgt->qio_pool); >> ????? mempool_destroy(tgt->qrq_pool); >> @@ -494,6 +497,9 @@ static struct qcow2_target >> *alloc_qcow2_target(struct dm_target *ti) >> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >> ????? ti->private = tgt; >> ????? tgt->ti = ti; >> + >> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >> + >> ????? qcow2_set_service_operations(ti, false); >> ????? return tgt; >> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >> index a89fe3db2196d..bebfdc50ed6d4 100644 >> --- a/drivers/md/dm-qcow2.h >> +++ b/drivers/md/dm-qcow2.h >> @@ -149,6 +149,20 @@ struct md_page { >> ????? struct list_head wpc_readers_wait_list; >> ? }; >> +enum qcow2_backward_merge_state { >> +??? BACKWARD_MERGE_STOPPED = 0, >> +??? BACKWARD_MERGE_START, >> +??? BACKWARD_MERGE_RUN, >> +??? BACKWARD_MERGE_WAIT_COMPLETION, >> +??? 
BACKWARD_MERGE_STOP, >> +}; >> + >> +struct qcow2_backward_merge { >> +??? struct work_struct work; >> +??? enum qcow2_backward_merge_state state; >> +??? int error; >> +}; >> + >> ? struct qcow2_target { >> ????? struct dm_target *ti; >> ? #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from >> blk_mq_init_sched() */ >> @@ -180,6 +194,8 @@ struct qcow2_target { >> ????? struct work_struct event_work; >> ????? spinlock_t event_lock; >> ????? struct mutex ctl_mutex; >> + >> +??? struct qcow2_backward_merge backward_merge; >> ? }; >> ? enum { >> @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target >> *tgt); >> ? void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct >> qcow2 *qcow2); >> ? int qcow2_truncate_safe(struct file *file, loff_t new_len); >> +void qcow2_merge_backward_work(struct work_struct *work); >> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); >> + >> ? static inline struct qcow2_target *to_qcow2_target(struct dm_target >> *ti) >> ? { >> ????? return ti->private; > -- Best regards, Pavel Tikhomirov Senior Software Developer, Virtuozzo. From andrey.zhadchenko at virtuozzo.com Tue Mar 4 14:41:12 2025 From: andrey.zhadchenko at virtuozzo.com (Andrey Zhadchenko) Date: Tue, 4 Mar 2025 12:41:12 +0100 Subject: [Devel] [PATCH v4 VZ9 4/5] dm-qcow2: add merge_backward set_eventfd command In-Reply-To: <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> Message-ID: <4c1515c5-3f97-473f-903a-f3777c3724a2@virtuozzo.com> On 3/3/25 10:37, Pavel Tikhomirov wrote: > This eventfd can be used to get an event when merge_backward start work > have finished and is waiting for completion. > > Note: The eventfd can be changed even while work is running. > > Locking: > > The backward_merge.eventfd_ctx is protected from being released by > tgt->ctl_mutex. 
> > https://virtuozzo.atlassian.net/browse/VSTOR-100466 > Signed-off-by: Pavel Tikhomirov > > -- > v2: Always report that work finished, e.g. also on error or then it was > canceled, this should be more consistent from the userspace perspective. > v4: Address Andrey's reveiw: signal that we are at completion waiting on > change of eventfd. > --- > drivers/md/dm-qcow2-cmd.c | 42 ++++++++++++++++++++++++++++++++++++++- > drivers/md/dm-qcow2.h | 2 ++ > 2 files changed, 43 insertions(+), 1 deletion(-) > > diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c > index 04a992f3ebba6..f16b4f731ca5a 100644 > --- a/drivers/md/dm-qcow2-cmd.c > +++ b/drivers/md/dm-qcow2-cmd.c > @@ -5,6 +5,8 @@ > #include > #include > #include > +#include > +#include > #include > #include "dm-qcow2.h" > > @@ -197,6 +199,8 @@ void qcow2_merge_backward_work(struct work_struct *work) > > mutex_lock(&tgt->ctl_mutex); > if (tgt->backward_merge.state != BACKWARD_MERGE_START) { > + if (tgt->backward_merge.eventfd_ctx) > + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); > mutex_unlock(&tgt->ctl_mutex); > return; > } > @@ -249,6 +253,8 @@ void qcow2_merge_backward_work(struct work_struct *work) > /* Finish merge */ > tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; > } > + if (tgt->backward_merge.eventfd_ctx) > + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); It would be a bit better if we also set a different values for error or success, but it is not necessary, as either complete will fail or we do get_progress and see error > mutex_unlock(&tgt->ctl_mutex); > } > > @@ -312,6 +318,27 @@ static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) > return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; > } > > +#define QCOW2_FILE_UNBIND -1 > + > +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd) > +{ > + struct eventfd_ctx *ctx = NULL; > + > + ctx = efd == QCOW2_FILE_UNBIND ? 
NULL : eventfd_ctx_fdget(efd); > + if (IS_ERR(ctx)) > + return PTR_ERR(ctx); > + > + mutex_lock(&tgt->ctl_mutex); > + swap(ctx, tgt->backward_merge.eventfd_ctx); > + if (ctx) > + eventfd_ctx_put(ctx); > + if (tgt->backward_merge.eventfd_ctx && > + tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) > + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); > + mutex_unlock(&tgt->ctl_mutex); > + return 0; > +} > + > static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) > { > struct qcow2 *qcow2; > @@ -470,14 +497,27 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, > ret = qcow2_get_event(tgt, result, maxlen); > goto out; > } else if (!strcmp(argv[0], "merge_backward")) { > - if (argc != 2) { > + if (argc < 2) { > ret = -EINVAL; > goto out; > } > if (!strcmp(argv[1], "cancel")) { > + if (argc != 2) { > + ret = -EINVAL; > + goto out; > + } > qcow2_merge_backward_cancel(tgt); > ret = 0; > goto out; > + } else if (!strcmp(argv[1], "set_eventfd")) { > + int efd; > + > + if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { > + ret = -EINVAL; > + goto out; > + } > + ret = qcow2_merge_backward_set_eventfd(tgt, efd); > + goto out; > } > } > > diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h > index bebfdc50ed6d4..c4956e3fd0eb7 100644 > --- a/drivers/md/dm-qcow2.h > +++ b/drivers/md/dm-qcow2.h > @@ -5,6 +5,7 @@ > #include > #include > #include > +#include > #include "dm-core.h" > > #define DM_MSG_PREFIX "qcow2" > @@ -161,6 +162,7 @@ struct qcow2_backward_merge { > struct work_struct work; > enum qcow2_backward_merge_state state; > int error; > + struct eventfd_ctx *eventfd_ctx; > }; > > struct qcow2_target { From andrey.zhadchenko at virtuozzo.com Tue Mar 4 14:48:32 2025 From: andrey.zhadchenko at virtuozzo.com (Andrey Zhadchenko) Date: Tue, 4 Mar 2025 12:48:32 +0100 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: References: 
<20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <8c5f114e-29c8-440b-bd3e-b5fbe1e39c82@virtuozzo.com> Message-ID: <08f205b7-120a-4811-90f5-6145d9d6d059@virtuozzo.com> On 3/4/25 12:32, Pavel Tikhomirov wrote: > > > On 3/4/25 18:51, Andrey Zhadchenko wrote: >> >> >> On 3/3/25 10:37, Pavel Tikhomirov wrote: >>> This adds merge_backward "start", "complete" and "cancel" commands. By >>> that we are able to split single merge_backward into two stages: start >>> asyncronous merging and completion. That can be usefull for restarting >>> qemu process while allowing backward merging to run asyncronously in >>> kernel. >>> >>> The "start" command runs merging preparations in workqueue work. After >>> it finishes, the "complete" command can be called to finish the process >>> and actually replace the top qcow2 with it's lower. The "cancel" command >>> forces the work to stop and flushes it. In case we are in completion >>> waiting state already and there is no work running, the "cancel" command >>> also reverts merging preparations. >>> >>> Locking: >>> >>> Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" >>> and "complete" commands are fully under this lock, and the "cancel" >>> operation takes the lock explicitly and releases it for work flushing. >>> The work also takes the lock but only when updating tgt->backward_merge >>> data. For checks, if the work was caneled in the middle, we read the >>> state without locking as we don't modify the state there, also we would >>> re-check the state again before exiting the work function under lock. >>> >>> Now on target suspend we "cancel" currently running backward merge, >>> previously we were just hanging untill backward merge have been >>> finished for possibly a long time, cancelling seems cleaner. Though we >>> don't really expect hypervisor suspending the target in the middle of >>> backward merge that it by itself started. 
>>> >>> https://virtuozzo.atlassian.net/browse/VSTOR-100466 >>> Signed-off-by: Pavel Tikhomirov >>> >>> -- >>> v2: Cancel from BACKWARD_MERGE_START state should not try to stop the >>> work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. >>> --- >>> ? drivers/md/dm-qcow2-cmd.c??? | 142 +++++++++++++++++++++++++++++++---- >>> ? drivers/md/dm-qcow2-target.c |?? 6 ++ >>> ? drivers/md/dm-qcow2.h??????? |? 19 +++++ >>> ? 3 files changed, 153 insertions(+), 14 deletions(-) >>> >>> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c >>> index 7b4b0ee68ad9f..04a992f3ebba6 100644 >>> --- a/drivers/md/dm-qcow2-cmd.c >>> +++ b/drivers/md/dm-qcow2-cmd.c >>> @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target >>> *tgt, struct qio *qio, >>> ????? wake_up(&tgt->service_wq); >>> ? } >>> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); >>> + >>> ? static int qcow2_service_iter(struct qcow2_target *tgt, struct >>> qcow2 *qcow2, >>> ??????????? loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) >>> ? { >>> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >>> *tgt, struct qcow2 *qcow2, >>> ????? WRITE_ONCE(service_status, BLK_STS_OK); >>> ????? for (pos = 0; pos < end; pos += step) { >>> -??????? if (fatal_signal_pending(current)) { >>> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >>> ????????????? ret = -EINTR; >>> ????????????? break; >>> ????????? } >>> @@ -161,10 +163,11 @@ static void >>> set_backward_merge_in_process(struct qcow2_target *tgt, >>> ????? qcow2_submit_embedded_qios(tgt, &list); >>> ? } >>> -static int qcow2_merge_backward(struct qcow2_target *tgt) >>> +static int qcow2_merge_backward_start(struct qcow2_target *tgt) >>> ? { >>> ????? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >>> -??? int ret, ret2; >>> + >>> +??? lockdep_assert_held(&tgt->ctl_mutex); >>> ????? if (!lower) >>> ????????? 
return -ENOENT; >>> @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct >>> qcow2_target *tgt) >>> ????????? return -EOPNOTSUPP; >>> ????? if (lower->hdr.size < qcow2->hdr.size) >>> ????????? return -EBADSLT; >>> + >>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) >>> +??????? return -EBUSY; >>> +??? tgt->backward_merge.state = BACKWARD_MERGE_START; >>> +??? tgt->backward_merge.error = 0; >>> + >>> +??? schedule_work(&tgt->backward_merge.work); >> >> Does this imply we potentially occupy one of the workers of the global >> pool for the indefinite amount of time? What if we run as much as >> nworkers (probably ncpus) merges simultaneously? > > System_wq has 1024*NCPU execution contexts: > > > ``@max_active`` determines the maximum number of execution contexts > per CPU > > > The maximum limit for ``@max_active`` is 2048 and the default value used > when 0 is specified is 1024. > > If we try to run ~1024 works per cpu at the same time we might have a > problem, and will need to either swithch to our own work-queue or create > explicit kernel thread for each merge. > > > As flushing system-wide workqueues is now deprecated we are also fine > with long running work in system_wq and not in system_long_wq, but we > can move it to system_long_wq just to be on the safe side. > > ?* system_wq is the one used by schedule[_delayed]_work[_on](). > ?* Multi-CPU multi-threaded.? There are users which expect relatively > ?* short queue flush time.? Don't queue works which can run for too > ?* long. > > ?* system_long_wq is similar to system_wq but may host long running > ?* works.? Queue flushing might take relatively long. > > What do you think? In close approximation we can just run on dm-qcow2 workqueue, as each image allocates it's own wq, tgt->wq. And we only run 2 works there, general and flush. However Alexander's dm-ploop rework dropped workqueues in favor of threads, so I do not know if and when we intend to merge it into dm-qcow2. 
With given limits 1024 per cpu we probably could ignore constraints on global workqueue. Could you please also give me some links where I can read this? Couldn't find anything myself > >> >>> +??? return 0; >>> +} >>> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); >>> + >>> +void qcow2_merge_backward_work(struct work_struct *work) >>> +{ >>> +??? struct qcow2_target *tgt = container_of(work, struct qcow2_target, >>> +??????????????????????? backward_merge.work); >>> +??? struct qcow2 *qcow2, *lower; >>> +??? int ret, ret2; >>> + >>> +??? mutex_lock(&tgt->ctl_mutex); >>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_START) { >>> +??????? mutex_unlock(&tgt->ctl_mutex); >>> +??????? return; >>> +??? } >>> +??? tgt->backward_merge.state = BACKWARD_MERGE_RUN; >>> +??? mutex_unlock(&tgt->ctl_mutex); >>> + >>> +??? qcow2 = tgt->top; >>> +??? lower = qcow2->lower; >>> + >>> ????? /* >>> ?????? * Break all COW clus at L1 level. Otherwise, later >>> ?????? * there would be problems with unusing them: >>> @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct >>> qcow2_target *tgt) >>> ????? ret = qcow2_break_l1cow(tgt); >>> ????? if (ret) { >>> ????????? QC_ERR(tgt->ti, "Can't break L1 COW"); >>> -??????? return ret; >>> +??????? goto out_err; >>> ????? } >>> ????? ret = qcow2_set_image_file_features(lower, true); >>> ????? if (ret) { >>> ????????? QC_ERR(tgt->ti, "Can't set dirty bit"); >>> -??????? return ret; >>> +??????? goto out_err; >>> ????? } >>> ????? set_backward_merge_in_process(tgt, qcow2, true); >>> @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct >>> qcow2_target *tgt) >>> ????????? ret2 = qcow2_set_image_file_features(lower, false); >>> ????????? if (ret2 < 0) >>> ????????????? QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); >>> -??????? return ret; >>> ????? } >>> + >>> +out_err: >>> +??? mutex_lock(&tgt->ctl_mutex); >>> +??? if (ret) { >>> +??????? /* Error */ >>> +??????? 
tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??????? tgt->backward_merge.error = ret; >>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >>> +??????? /* Merge is canceled */ >>> +??????? set_backward_merge_in_process(tgt, qcow2, false); >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??????? tgt->backward_merge.error = -EINTR; >>> +??? } else { >>> +??????? /* Finish merge */ >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; >>> +??? } >>> +??? mutex_unlock(&tgt->ctl_mutex); >>> +} >>> + >>> +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) >>> +{ >>> +??? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >>> +??? int ret; >>> + >>> +??? lockdep_assert_held(&tgt->ctl_mutex); >>> + >>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) >>> +??????? return -EBUSY; >>> + >>> ????? tgt->top = lower; >>> ????? smp_wmb(); /* Pairs with qcow2_ref_inc() */ >>> ????? qcow2_inflight_ref_switch(tgt); /* Pending qios */ >>> ????? qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ >>> ????? qcow2->lower = NULL; >>> -??? ret2 = qcow2_set_image_file_features(qcow2, false); >>> -??? if (ret2 < 0) >>> -??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); >>> +??? ret = qcow2_set_image_file_features(qcow2, false); >>> +??? if (ret < 0) >>> +??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); >>> ????? qcow2_destroy(qcow2); >>> +??? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> + >>> ????? return 0; >>> ? } >>> -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); >>> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); >>> + >>> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) >>> +{ >>> +??? bool flush = false; >>> + >>> +??? mutex_lock(&tgt->ctl_mutex); >>> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { >>> +??????? mutex_unlock(&tgt->ctl_mutex); >>> +??????? return; >>> +??? 
} >>> + >>> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_START) { >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??????? flush = true; >>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOP; >>> +??????? flush = true; >>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >>> +??????? flush = true; >>> +??? } else if (tgt->backward_merge.state == >>> BACKWARD_MERGE_WAIT_COMPLETION) { >>> +??????? set_backward_merge_in_process(tgt, tgt->top, false); >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??? } >>> +??? mutex_unlock(&tgt->ctl_mutex); >>> + >>> +??? if (flush) >>> +??????? flush_work(&tgt->backward_merge.work); >>> +} >>> + >>> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) >>> +{ >>> +??? return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; >>> +} >>> ? static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 >>> img_id, u8 *ref_index) >>> ? { >>> @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, >>> unsigned int argc, char **argv, >>> ????????? } >>> ????????? ret = qcow2_get_event(tgt, result, maxlen); >>> ????????? goto out; >>> +??? } else if (!strcmp(argv[0], "merge_backward")) { >>> +??????? if (argc != 2) { >>> +??????????? ret = -EINVAL; >>> +??????????? goto out; >>> +??????? } >>> +??????? if (!strcmp(argv[1], "cancel")) { >>> +??????????? qcow2_merge_backward_cancel(tgt); >>> +??????????? ret = 0; >>> +??????????? goto out; >>> +??????? } >>> ????? } >>> -??? ret = mutex_lock_killable(&tgt->ctl_mutex); >>> -??? if (ret) >>> -??????? goto out; >>> +??? mutex_lock(&tgt->ctl_mutex); >>> ????? if (!strcmp(argv[0], "get_errors")) { >>> ????????? ret = qcow2_get_errors(tgt, result, maxlen); >>> @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned >>> int argc, char **argv, >>> ????? 
} else if (!strcmp(argv[0], "merge_forward")) { >>> ????????? ret = qcow2_merge_forward(tgt); >>> ????? } else if (!strcmp(argv[0], "merge_backward")) { >>> -??????? ret = qcow2_merge_backward(tgt); >>> +??????? if (argc != 2) { >>> +??????????? ret = -EINVAL; >>> +??????????? mutex_unlock(&tgt->ctl_mutex); >>> +??????????? goto out; >>> +??????? } >>> +??????? if (!strcmp(argv[1], "start")) { >>> +??????????? ret = qcow2_merge_backward_start(tgt); >>> +??????? } else if (!strcmp(argv[1], "complete")) { >>> +??????????? ret = qcow2_merge_backward_complete(tgt); >>> +??????? } else { >>> +??????????? ret = -ENOTTY; >>> +??????? } >>> ????? } else { >>> ????????? ret = -ENOTTY; >>> ????? } >>> diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c >>> index 540c03cb3c44f..6e2e583ba0b8b 100644 >>> --- a/drivers/md/dm-qcow2-target.c >>> +++ b/drivers/md/dm-qcow2-target.c >>> @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct >>> dm_target *ti, bool allowed) >>> ????? mutex_lock(&tgt->ctl_mutex); >>> ????? tgt->service_operations_allowed = allowed; >>> ????? mutex_unlock(&tgt->ctl_mutex); >>> +??? if (!allowed) >>> +??????? qcow2_merge_backward_cancel(tgt); >>> ? } >>> ? static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) >>> ? { >>> @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target >>> *tgt) >>> ????????? /* Now kill the queue */ >>> ????????? destroy_workqueue(tgt->wq); >>> ????? } >>> +??? qcow2_merge_backward_cancel(tgt); >>> ????? mempool_destroy(tgt->qio_pool); >>> ????? mempool_destroy(tgt->qrq_pool); >>> @@ -494,6 +497,9 @@ static struct qcow2_target >>> *alloc_qcow2_target(struct dm_target *ti) >>> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >>> ????? ti->private = tgt; >>> ????? tgt->ti = ti; >>> + >>> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >>> + >>> ????? qcow2_set_service_operations(ti, false); >>> ????? 
return tgt; >>> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >>> index a89fe3db2196d..bebfdc50ed6d4 100644 >>> --- a/drivers/md/dm-qcow2.h >>> +++ b/drivers/md/dm-qcow2.h >>> @@ -149,6 +149,20 @@ struct md_page { >>> ????? struct list_head wpc_readers_wait_list; >>> ? }; >>> +enum qcow2_backward_merge_state { >>> +??? BACKWARD_MERGE_STOPPED = 0, >>> +??? BACKWARD_MERGE_START, >>> +??? BACKWARD_MERGE_RUN, >>> +??? BACKWARD_MERGE_WAIT_COMPLETION, >>> +??? BACKWARD_MERGE_STOP, >>> +}; >>> + >>> +struct qcow2_backward_merge { >>> +??? struct work_struct work; >>> +??? enum qcow2_backward_merge_state state; >>> +??? int error; >>> +}; >>> + >>> ? struct qcow2_target { >>> ????? struct dm_target *ti; >>> ? #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from >>> blk_mq_init_sched() */ >>> @@ -180,6 +194,8 @@ struct qcow2_target { >>> ????? struct work_struct event_work; >>> ????? spinlock_t event_lock; >>> ????? struct mutex ctl_mutex; >>> + >>> +??? struct qcow2_backward_merge backward_merge; >>> ? }; >>> ? enum { >>> @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target >>> *tgt); >>> ? void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct >>> qcow2 *qcow2); >>> ? int qcow2_truncate_safe(struct file *file, loff_t new_len); >>> +void qcow2_merge_backward_work(struct work_struct *work); >>> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); >>> + >>> ? static inline struct qcow2_target *to_qcow2_target(struct dm_target >>> *ti) >>> ? { >>> ????? 
return ti->private; >> > From alexander.atanasov at virtuozzo.com Tue Mar 4 14:55:47 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Tue, 4 Mar 2025 13:55:47 +0200 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> Message-ID: <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> On 3.03.25 11:37, Pavel Tikhomirov wrote: > This adds merge_backward "start", "complete" and "cancel" commands. By > that we are able to split single merge_backward into two stages: start > asynchronous merging and completion. That can be useful for restarting > qemu process while allowing backward merging to run asynchronously in > kernel. > > The "start" command runs merging preparations in workqueue work. After > it finishes, the "complete" command can be called to finish the process > and actually replace the top qcow2 with its lower. The "cancel" command > forces the work to stop and flushes it. In case we are in completion > waiting state already and there is no work running, the "cancel" command > also reverts merging preparations. > > Locking: > > Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" > and "complete" commands are fully under this lock, and the "cancel" > operation takes the lock explicitly and releases it for work flushing. > The work also takes the lock but only when updating tgt->backward_merge > data. For checks, if the work was canceled in the middle, we read the > state without locking as we don't modify the state there, also we would > re-check the state again before exiting the work function under lock. > > Now on target suspend we "cancel" currently running backward merge, > previously we were just hanging until backward merge has > finished for possibly a long time, cancelling seems cleaner.
Though we > don't really expect hypervisor suspending the target in the middle of > backward merge that it by itself started. > > https://virtuozzo.atlassian.net/browse/VSTOR-100466 > Signed-off-by: Pavel Tikhomirov > > -- > v2: Cancel from BACKWARD_MERGE_START state should not try to stop the > work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. > --- > drivers/md/dm-qcow2-cmd.c | 142 +++++++++++++++++++++++++++++++---- > drivers/md/dm-qcow2-target.c | 6 ++ > drivers/md/dm-qcow2.h | 19 +++++ > 3 files changed, 153 insertions(+), 14 deletions(-) > > diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c > index 7b4b0ee68ad9f..04a992f3ebba6 100644 > --- a/drivers/md/dm-qcow2-cmd.c > +++ b/drivers/md/dm-qcow2-cmd.c > @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, > wake_up(&tgt->service_wq); > } > > +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); > + > static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, > loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) > { > @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, > WRITE_ONCE(service_status, BLK_STS_OK); > > for (pos = 0; pos < end; pos += step) { > - if (fatal_signal_pending(current)) { > + if (qcow2_backward_merge_should_stop(tgt)) { > ret = -EINTR; > break; > } Is it okay to remove termination on signal - here and the killable mutex? Without signal handling it can prevent clean shutdown or leave it stuck if something goes wrong in the code. 
> @@ -161,10 +163,11 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, > qcow2_submit_embedded_qios(tgt, &list); > } > > -static int qcow2_merge_backward(struct qcow2_target *tgt) > +static int qcow2_merge_backward_start(struct qcow2_target *tgt) > { > struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; > - int ret, ret2; > + > + lockdep_assert_held(&tgt->ctl_mutex); > > if (!lower) > return -ENOENT; > @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) > return -EOPNOTSUPP; > if (lower->hdr.size < qcow2->hdr.size) > return -EBADSLT; > + > + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) > + return -EBUSY; > + tgt->backward_merge.state = BACKWARD_MERGE_START; > + tgt->backward_merge.error = 0; > + > + schedule_work(&tgt->backward_merge.work); > + return 0; > +} > +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); > + > +void qcow2_merge_backward_work(struct work_struct *work) > +{ > + struct qcow2_target *tgt = container_of(work, struct qcow2_target, > + backward_merge.work); > + struct qcow2 *qcow2, *lower; > + int ret, ret2; > + > + mutex_lock(&tgt->ctl_mutex); > + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { > + mutex_unlock(&tgt->ctl_mutex); > + return; > + } > + tgt->backward_merge.state = BACKWARD_MERGE_RUN; > + mutex_unlock(&tgt->ctl_mutex); > + > + qcow2 = tgt->top; > + lower = qcow2->lower; > + > /* > * Break all COW clus at L1 level. 
Otherwise, later > * there would be problems with unusing them: > @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) > ret = qcow2_break_l1cow(tgt); > if (ret) { > QC_ERR(tgt->ti, "Can't break L1 COW"); > - return ret; > + goto out_err; > } > > ret = qcow2_set_image_file_features(lower, true); > if (ret) { > QC_ERR(tgt->ti, "Can't set dirty bit"); > - return ret; > + goto out_err; > } > set_backward_merge_in_process(tgt, qcow2, true); > > @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) > ret2 = qcow2_set_image_file_features(lower, false); > if (ret2 < 0) > QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); > - return ret; > } > + > +out_err: > + mutex_lock(&tgt->ctl_mutex); > + if (ret) { > + /* Error */ > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + tgt->backward_merge.error = ret; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { > + /* Merge is canceled */ > + set_backward_merge_in_process(tgt, qcow2, false); > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + tgt->backward_merge.error = -EINTR; > + } else { > + /* Finish merge */ > + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; > + } > + mutex_unlock(&tgt->ctl_mutex); > +} > + > +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) > +{ > + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; > + int ret; > + > + lockdep_assert_held(&tgt->ctl_mutex); > + > + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) > + return -EBUSY; > + > tgt->top = lower; > smp_wmb(); /* Pairs with qcow2_ref_inc() */ > qcow2_inflight_ref_switch(tgt); /* Pending qios */ > qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ > qcow2->lower = NULL; > > - ret2 = qcow2_set_image_file_features(qcow2, false); > - if (ret2 < 0) > - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); > + ret = qcow2_set_image_file_features(qcow2, false); > + if (ret < 0) > + QC_ERR(tgt->ti, "Can't unuse 
merged img (%d)", ret); > qcow2_destroy(qcow2); > > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + > return 0; > } > -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); > +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); > + > +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) > +{ > + bool flush = false; > + > + mutex_lock(&tgt->ctl_mutex); > + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { > + mutex_unlock(&tgt->ctl_mutex); > + return; > + } > + > + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + flush = true; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { > + tgt->backward_merge.state = BACKWARD_MERGE_STOP; > + flush = true; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { > + flush = true; > + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { > + set_backward_merge_in_process(tgt, tgt->top, false); > + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; > + } > + mutex_unlock(&tgt->ctl_mutex); > + > + if (flush) > + flush_work(&tgt->backward_merge.work); > +} > + > +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) > +{ > + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; > +} > > static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) > { > @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, > } > ret = qcow2_get_event(tgt, result, maxlen); > goto out; > + } else if (!strcmp(argv[0], "merge_backward")) { > + if (argc != 2) { > + ret = -EINVAL; > + goto out; > + } > + if (!strcmp(argv[1], "cancel")) { > + qcow2_merge_backward_cancel(tgt); > + ret = 0; > + goto out; > + } > } > > - ret = mutex_lock_killable(&tgt->ctl_mutex); > - if (ret) > - goto out; > + mutex_lock(&tgt->ctl_mutex); > > if (!strcmp(argv[0], "get_errors")) { > ret = qcow2_get_errors(tgt, result, maxlen); > @@ 
-388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, > } else if (!strcmp(argv[0], "merge_forward")) { > ret = qcow2_merge_forward(tgt); > } else if (!strcmp(argv[0], "merge_backward")) { > - ret = qcow2_merge_backward(tgt); > + if (argc != 2) { > + ret = -EINVAL; > + mutex_unlock(&tgt->ctl_mutex); > + goto out; > + } > + if (!strcmp(argv[1], "start")) { > + ret = qcow2_merge_backward_start(tgt); > + } else if (!strcmp(argv[1], "complete")) { > + ret = qcow2_merge_backward_complete(tgt); > + } else { > + ret = -ENOTTY; > + } > } else { > ret = -ENOTTY; > } > diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c > index 540c03cb3c44f..6e2e583ba0b8b 100644 > --- a/drivers/md/dm-qcow2-target.c > +++ b/drivers/md/dm-qcow2-target.c > @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) > mutex_lock(&tgt->ctl_mutex); > tgt->service_operations_allowed = allowed; > mutex_unlock(&tgt->ctl_mutex); > + if (!allowed) > + qcow2_merge_backward_cancel(tgt); > } > static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) > { > @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) > /* Now kill the queue */ > destroy_workqueue(tgt->wq); > } > + qcow2_merge_backward_cancel(tgt); > > mempool_destroy(tgt->qio_pool); > mempool_destroy(tgt->qrq_pool); > @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) > timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); > ti->private = tgt; > tgt->ti = ti; > + > + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); > + > qcow2_set_service_operations(ti, false); > > return tgt; > diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h > index a89fe3db2196d..bebfdc50ed6d4 100644 > --- a/drivers/md/dm-qcow2.h > +++ b/drivers/md/dm-qcow2.h > @@ -149,6 +149,20 @@ struct md_page { > struct list_head wpc_readers_wait_list; > }; > > +enum qcow2_backward_merge_state { > + 
BACKWARD_MERGE_STOPPED = 0, nit: this init is excess > + BACKWARD_MERGE_START, > + BACKWARD_MERGE_RUN, > + BACKWARD_MERGE_WAIT_COMPLETION, > + BACKWARD_MERGE_STOP, > +}; > + > +struct qcow2_backward_merge { > + struct work_struct work; > + enum qcow2_backward_merge_state state; > + int error; > +}; May be add merge error to values returned in qcow2_get_errors, for the users that use dm events interface. > + > struct qcow2_target { > struct dm_target *ti; > #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ > @@ -180,6 +194,8 @@ struct qcow2_target { > struct work_struct event_work; > spinlock_t event_lock; > struct mutex ctl_mutex; > + > + struct qcow2_backward_merge backward_merge; > }; > > enum { > @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); > void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); > int qcow2_truncate_safe(struct file *file, loff_t new_len); > > +void qcow2_merge_backward_work(struct work_struct *work); > +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); > + > static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) > { > return ti->private; -- Regards, Alexander Atanasov From andrey.zhadchenko at virtuozzo.com Tue Mar 4 15:08:10 2025 From: andrey.zhadchenko at virtuozzo.com (Andrey Zhadchenko) Date: Tue, 4 Mar 2025 13:08:10 +0100 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> Message-ID: <0fec20f3-d006-40f9-9112-225ca1deba1c@virtuozzo.com> On 3/4/25 12:55, Alexander Atanasov wrote: > On 3.03.25 11:37, Pavel Tikhomirov wrote: >> This adds merge_backward "start", "complete" and "cancel" commands. 
By >> that we are able to split single merge_backward into two stages: start >> asynchronous merging and completion. That can be useful for restarting >> qemu process while allowing backward merging to run asynchronously in >> kernel. >> >> The "start" command runs merging preparations in workqueue work. After >> it finishes, the "complete" command can be called to finish the process >> and actually replace the top qcow2 with its lower. The "cancel" command >> forces the work to stop and flushes it. In case we are in completion >> waiting state already and there is no work running, the "cancel" command >> also reverts merging preparations. >> >> Locking: >> >> Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" >> and "complete" commands are fully under this lock, and the "cancel" >> operation takes the lock explicitly and releases it for work flushing. >> The work also takes the lock but only when updating tgt->backward_merge >> data. For checks, if the work was canceled in the middle, we read the >> state without locking as we don't modify the state there, also we would >> re-check the state again before exiting the work function under lock. >> >> Now on target suspend we "cancel" currently running backward merge, >> previously we were just hanging until backward merge has >> finished for possibly a long time, cancelling seems cleaner. Though we >> don't really expect hypervisor suspending the target in the middle of >> backward merge that it by itself started. >> >> https://virtuozzo.atlassian.net/browse/VSTOR-100466 >> Signed-off-by: Pavel Tikhomirov >> >> -- >> v2: Cancel from BACKWARD_MERGE_START state should not try to stop the >> work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. >> --- >> ? drivers/md/dm-qcow2-cmd.c??? | 142 +++++++++++++++++++++++++++++++---- >> ? drivers/md/dm-qcow2-target.c |?? 6 ++ >> ? drivers/md/dm-qcow2.h??????? |? 19 +++++ >> ?
3 files changed, 153 insertions(+), 14 deletions(-) >> >> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c >> index 7b4b0ee68ad9f..04a992f3ebba6 100644 >> --- a/drivers/md/dm-qcow2-cmd.c >> +++ b/drivers/md/dm-qcow2-cmd.c >> @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target >> *tgt, struct qio *qio, >> ????? wake_up(&tgt->service_wq); >> ? } >> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); >> + >> ? static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 >> *qcow2, >> ??????????? loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) >> ? { >> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >> *tgt, struct qcow2 *qcow2, >> ????? WRITE_ONCE(service_status, BLK_STS_OK); >> ????? for (pos = 0; pos < end; pos += step) { >> -??????? if (fatal_signal_pending(current)) { >> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >> ????????????? ret = -EINTR; >> ????????????? break; >> ????????? } > > Is it okay to remove termination on signal - here and the killable > mutex? Without signal handling it can prevent clean shutdown or leave it > stuck if something goes wrong in the code. qcow2_service_iter() is now running in a workqueue, so I do not think signals can reach us there > >> @@ -161,10 +163,11 @@ static void set_backward_merge_in_process(struct >> qcow2_target *tgt, >> ????? qcow2_submit_embedded_qios(tgt, &list); >> ? } >> -static int qcow2_merge_backward(struct qcow2_target *tgt) >> +static int qcow2_merge_backward_start(struct qcow2_target *tgt) >> ? { >> ????? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >> -??? int ret, ret2; >> + >> +??? lockdep_assert_held(&tgt->ctl_mutex); >> ????? if (!lower) >> ????????? return -ENOENT; >> @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct >> qcow2_target *tgt) >> ????????? return -EOPNOTSUPP; >> ????? if (lower->hdr.size < qcow2->hdr.size) >> ????????? return -EBADSLT; >> + >> +??? 
if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) >> +??????? return -EBUSY; >> +??? tgt->backward_merge.state = BACKWARD_MERGE_START; >> +??? tgt->backward_merge.error = 0; >> + >> +??? schedule_work(&tgt->backward_merge.work); >> +??? return 0; >> +} >> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); >> + >> +void qcow2_merge_backward_work(struct work_struct *work) >> +{ >> +??? struct qcow2_target *tgt = container_of(work, struct qcow2_target, >> +??????????????????????? backward_merge.work); >> +??? struct qcow2 *qcow2, *lower; >> +??? int ret, ret2; >> + >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_START) { >> +??????? mutex_unlock(&tgt->ctl_mutex); >> +??????? return; >> +??? } >> +??? tgt->backward_merge.state = BACKWARD_MERGE_RUN; >> +??? mutex_unlock(&tgt->ctl_mutex); >> + >> +??? qcow2 = tgt->top; >> +??? lower = qcow2->lower; >> + >> ????? /* >> ?????? * Break all COW clus at L1 level. Otherwise, later >> ?????? * there would be problems with unusing them: >> @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct >> qcow2_target *tgt) >> ????? ret = qcow2_break_l1cow(tgt); >> ????? if (ret) { >> ????????? QC_ERR(tgt->ti, "Can't break L1 COW"); >> -??????? return ret; >> +??????? goto out_err; >> ????? } >> ????? ret = qcow2_set_image_file_features(lower, true); >> ????? if (ret) { >> ????????? QC_ERR(tgt->ti, "Can't set dirty bit"); >> -??????? return ret; >> +??????? goto out_err; >> ????? } >> ????? set_backward_merge_in_process(tgt, qcow2, true); >> @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct >> qcow2_target *tgt) >> ????????? ret2 = qcow2_set_image_file_features(lower, false); >> ????????? if (ret2 < 0) >> ????????????? QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); >> -??????? return ret; >> ????? } >> + >> +out_err: >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? if (ret) { >> +??????? /* Error */ >> +??????? 
tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??????? tgt->backward_merge.error = ret; >> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >> +??????? /* Merge is canceled */ >> +??????? set_backward_merge_in_process(tgt, qcow2, false); >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??????? tgt->backward_merge.error = -EINTR; >> +??? } else { >> +??????? /* Finish merge */ >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; >> +??? } >> +??? mutex_unlock(&tgt->ctl_mutex); >> +} >> + >> +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) >> +{ >> +??? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >> +??? int ret; >> + >> +??? lockdep_assert_held(&tgt->ctl_mutex); >> + >> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) >> +??????? return -EBUSY; >> + >> ????? tgt->top = lower; >> ????? smp_wmb(); /* Pairs with qcow2_ref_inc() */ >> ????? qcow2_inflight_ref_switch(tgt); /* Pending qios */ >> ????? qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ >> ????? qcow2->lower = NULL; >> -??? ret2 = qcow2_set_image_file_features(qcow2, false); >> -??? if (ret2 < 0) >> -??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); >> +??? ret = qcow2_set_image_file_features(qcow2, false); >> +??? if (ret < 0) >> +??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); >> ????? qcow2_destroy(qcow2); >> +??? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> + >> ????? return 0; >> ? } >> -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); >> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); >> + >> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) >> +{ >> +??? bool flush = false; >> + >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { >> +??????? mutex_unlock(&tgt->ctl_mutex); >> +??????? return; >> +??? } >> + >> +??? 
if (tgt->backward_merge.state == BACKWARD_MERGE_START) { >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??????? flush = true; >> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOP; >> +??????? flush = true; >> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >> +??????? flush = true; >> +??? } else if (tgt->backward_merge.state == >> BACKWARD_MERGE_WAIT_COMPLETION) { >> +??????? set_backward_merge_in_process(tgt, tgt->top, false); >> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >> +??? } >> +??? mutex_unlock(&tgt->ctl_mutex); >> + >> +??? if (flush) >> +??????? flush_work(&tgt->backward_merge.work); >> +} >> + >> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) >> +{ >> +??? return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; >> +} >> ? static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 >> img_id, u8 *ref_index) >> ? { >> @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, unsigned >> int argc, char **argv, >> ????????? } >> ????????? ret = qcow2_get_event(tgt, result, maxlen); >> ????????? goto out; >> +??? } else if (!strcmp(argv[0], "merge_backward")) { >> +??????? if (argc != 2) { >> +??????????? ret = -EINVAL; >> +??????????? goto out; >> +??????? } >> +??????? if (!strcmp(argv[1], "cancel")) { >> +??????????? qcow2_merge_backward_cancel(tgt); >> +??????????? ret = 0; >> +??????????? goto out; >> +??????? } >> ????? } >> -??? ret = mutex_lock_killable(&tgt->ctl_mutex); >> -??? if (ret) >> -??????? goto out; >> +??? mutex_lock(&tgt->ctl_mutex); >> ????? if (!strcmp(argv[0], "get_errors")) { >> ????????? ret = qcow2_get_errors(tgt, result, maxlen); >> @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned >> int argc, char **argv, >> ????? } else if (!strcmp(argv[0], "merge_forward")) { >> ????????? ret = qcow2_merge_forward(tgt); >> ????? 
} else if (!strcmp(argv[0], "merge_backward")) { >> -??????? ret = qcow2_merge_backward(tgt); >> +??????? if (argc != 2) { >> +??????????? ret = -EINVAL; >> +??????????? mutex_unlock(&tgt->ctl_mutex); >> +??????????? goto out; >> +??????? } >> +??????? if (!strcmp(argv[1], "start")) { >> +??????????? ret = qcow2_merge_backward_start(tgt); >> +??????? } else if (!strcmp(argv[1], "complete")) { >> +??????????? ret = qcow2_merge_backward_complete(tgt); >> +??????? } else { >> +??????????? ret = -ENOTTY; >> +??????? } >> ????? } else { >> ????????? ret = -ENOTTY; >> ????? } >> diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c >> index 540c03cb3c44f..6e2e583ba0b8b 100644 >> --- a/drivers/md/dm-qcow2-target.c >> +++ b/drivers/md/dm-qcow2-target.c >> @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct >> dm_target *ti, bool allowed) >> ????? mutex_lock(&tgt->ctl_mutex); >> ????? tgt->service_operations_allowed = allowed; >> ????? mutex_unlock(&tgt->ctl_mutex); >> +??? if (!allowed) >> +??????? qcow2_merge_backward_cancel(tgt); >> ? } >> ? static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) >> ? { >> @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target >> *tgt) >> ????????? /* Now kill the queue */ >> ????????? destroy_workqueue(tgt->wq); >> ????? } >> +??? qcow2_merge_backward_cancel(tgt); >> ????? mempool_destroy(tgt->qio_pool); >> ????? mempool_destroy(tgt->qrq_pool); >> @@ -494,6 +497,9 @@ static struct qcow2_target >> *alloc_qcow2_target(struct dm_target *ti) >> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >> ????? ti->private = tgt; >> ????? tgt->ti = ti; >> + >> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >> + >> ????? qcow2_set_service_operations(ti, false); >> ????? 
return tgt; >> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >> index a89fe3db2196d..bebfdc50ed6d4 100644 >> --- a/drivers/md/dm-qcow2.h >> +++ b/drivers/md/dm-qcow2.h >> @@ -149,6 +149,20 @@ struct md_page { >> ????? struct list_head wpc_readers_wait_list; >> ? }; >> +enum qcow2_backward_merge_state { >> +??? BACKWARD_MERGE_STOPPED = 0, > > nit: this init is excess > >> +??? BACKWARD_MERGE_START, >> +??? BACKWARD_MERGE_RUN, >> +??? BACKWARD_MERGE_WAIT_COMPLETION, >> +??? BACKWARD_MERGE_STOP, >> +}; >> + >> +struct qcow2_backward_merge { >> +??? struct work_struct work; >> +??? enum qcow2_backward_merge_state state; >> +??? int error; >> +}; > > May be add merge error to values returned in qcow2_get_errors, > for the users that use dm events interface. > >> + >> ? struct qcow2_target { >> ????? struct dm_target *ti; >> ? #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from >> blk_mq_init_sched() */ >> @@ -180,6 +194,8 @@ struct qcow2_target { >> ????? struct work_struct event_work; >> ????? spinlock_t event_lock; >> ????? struct mutex ctl_mutex; >> + >> +??? struct qcow2_backward_merge backward_merge; >> ? }; >> ? enum { >> @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target >> *tgt); >> ? void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct >> qcow2 *qcow2); >> ? int qcow2_truncate_safe(struct file *file, loff_t new_len); >> +void qcow2_merge_backward_work(struct work_struct *work); >> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); >> + >> ? static inline struct qcow2_target *to_qcow2_target(struct dm_target >> *ti) >> ? { >> ????? 
return ti->private; > > > From alexander.atanasov at virtuozzo.com Tue Mar 4 15:11:37 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Tue, 4 Mar 2025 14:11:37 +0200 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <0fec20f3-d006-40f9-9112-225ca1deba1c@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> <0fec20f3-d006-40f9-9112-225ca1deba1c@virtuozzo.com> Message-ID: <1cd2bd7d-8b27-449b-bff9-035896af30b1@virtuozzo.com> On 4.03.25 14:08, Andrey Zhadchenko wrote: > > > On 3/4/25 12:55, Alexander Atanasov wrote: >> On 3.03.25 11:37, Pavel Tikhomirov wrote: >>> This adds merge_backward "start", "complete" and "cancel" commands. By >>> that we are able to split single merge_backward into two stages: start >>> asyncronous merging and completion. That can be usefull for restarting >>> qemu process while allowing backward merging to run asyncronously in >>> kernel. >>> >>> The "start" command runs merging preparations in workqueue work. After >>> it finishes, the "complete" command can be called to finish the process >>> and actually replace the top qcow2 with it's lower. The "cancel" command >>> forces the work to stop and flushes it. In case we are in completion >>> waiting state already and there is no work running, the "cancel" command >>> also reverts merging preparations. >>> >>> Locking: >>> >>> Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" >>> and "complete" commands are fully under this lock, and the "cancel" >>> operation takes the lock explicitly and releases it for work flushing. >>> The work also takes the lock but only when updating tgt->backward_merge >>> data. 
For checks, if the work was caneled in the middle, we read the >>> state without locking as we don't modify the state there, also we would >>> re-check the state again before exiting the work function under lock. >>> >>> Now on target suspend we "cancel" currently running backward merge, >>> previously we were just hanging untill backward merge have been >>> finished for possibly a long time, cancelling seems cleaner. Though we >>> don't really expect hypervisor suspending the target in the middle of >>> backward merge that it by itself started. >>> >>> https://virtuozzo.atlassian.net/browse/VSTOR-100466 >>> Signed-off-by: Pavel Tikhomirov >>> >>> -- >>> v2: Cancel from BACKWARD_MERGE_START state should not try to stop the >>> work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. >>> --- >>> ? drivers/md/dm-qcow2-cmd.c??? | 142 +++++++++++++++++++++++++++++++---- >>> ? drivers/md/dm-qcow2-target.c |?? 6 ++ >>> ? drivers/md/dm-qcow2.h??????? |? 19 +++++ >>> ? 3 files changed, 153 insertions(+), 14 deletions(-) >>> >>> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c >>> index 7b4b0ee68ad9f..04a992f3ebba6 100644 >>> --- a/drivers/md/dm-qcow2-cmd.c >>> +++ b/drivers/md/dm-qcow2-cmd.c >>> @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target >>> *tgt, struct qio *qio, >>> ????? wake_up(&tgt->service_wq); >>> ? } >>> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); >>> + >>> ? static int qcow2_service_iter(struct qcow2_target *tgt, struct >>> qcow2 *qcow2, >>> ??????????? loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) >>> ? { >>> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >>> *tgt, struct qcow2 *qcow2, >>> ????? WRITE_ONCE(service_status, BLK_STS_OK); >>> ????? for (pos = 0; pos < end; pos += step) { >>> -??????? if (fatal_signal_pending(current)) { >>> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >>> ????????????? ret = -EINTR; >>> ????????????? 
break; >>> ????????? } >> >> Is it okay to remove termination on signal - here and the killable >> mutex? Without signal handling it can prevent clean shutdown or leave it >> stuck if something goes wrong in the code. > > qcow2_service_iter() is now running in a workqueue, so I do not think > signals can reach us there You are right about the wq so this leaves the question only about the the mutex. >> >>> @@ -161,10 +163,11 @@ static void >>> set_backward_merge_in_process(struct qcow2_target *tgt, >>> ????? qcow2_submit_embedded_qios(tgt, &list); >>> ? } >>> -static int qcow2_merge_backward(struct qcow2_target *tgt) >>> +static int qcow2_merge_backward_start(struct qcow2_target *tgt) >>> ? { >>> ????? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >>> -??? int ret, ret2; >>> + >>> +??? lockdep_assert_held(&tgt->ctl_mutex); >>> ????? if (!lower) >>> ????????? return -ENOENT; >>> @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct >>> qcow2_target *tgt) >>> ????????? return -EOPNOTSUPP; >>> ????? if (lower->hdr.size < qcow2->hdr.size) >>> ????????? return -EBADSLT; >>> + >>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) >>> +??????? return -EBUSY; >>> +??? tgt->backward_merge.state = BACKWARD_MERGE_START; >>> +??? tgt->backward_merge.error = 0; >>> + >>> +??? schedule_work(&tgt->backward_merge.work); >>> +??? return 0; >>> +} >>> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); >>> + >>> +void qcow2_merge_backward_work(struct work_struct *work) >>> +{ >>> +??? struct qcow2_target *tgt = container_of(work, struct qcow2_target, >>> +??????????????????????? backward_merge.work); >>> +??? struct qcow2 *qcow2, *lower; >>> +??? int ret, ret2; >>> + >>> +??? mutex_lock(&tgt->ctl_mutex); >>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_START) { >>> +??????? mutex_unlock(&tgt->ctl_mutex); >>> +??????? return; >>> +??? } >>> +??? tgt->backward_merge.state = BACKWARD_MERGE_RUN; >>> +??? 
mutex_unlock(&tgt->ctl_mutex); >>> + >>> +??? qcow2 = tgt->top; >>> +??? lower = qcow2->lower; >>> + >>> ????? /* >>> ?????? * Break all COW clus at L1 level. Otherwise, later >>> ?????? * there would be problems with unusing them: >>> @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct >>> qcow2_target *tgt) >>> ????? ret = qcow2_break_l1cow(tgt); >>> ????? if (ret) { >>> ????????? QC_ERR(tgt->ti, "Can't break L1 COW"); >>> -??????? return ret; >>> +??????? goto out_err; >>> ????? } >>> ????? ret = qcow2_set_image_file_features(lower, true); >>> ????? if (ret) { >>> ????????? QC_ERR(tgt->ti, "Can't set dirty bit"); >>> -??????? return ret; >>> +??????? goto out_err; >>> ????? } >>> ????? set_backward_merge_in_process(tgt, qcow2, true); >>> @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct >>> qcow2_target *tgt) >>> ????????? ret2 = qcow2_set_image_file_features(lower, false); >>> ????????? if (ret2 < 0) >>> ????????????? QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); >>> -??????? return ret; >>> ????? } >>> + >>> +out_err: >>> +??? mutex_lock(&tgt->ctl_mutex); >>> +??? if (ret) { >>> +??????? /* Error */ >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??????? tgt->backward_merge.error = ret; >>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >>> +??????? /* Merge is canceled */ >>> +??????? set_backward_merge_in_process(tgt, qcow2, false); >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??????? tgt->backward_merge.error = -EINTR; >>> +??? } else { >>> +??????? /* Finish merge */ >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; >>> +??? } >>> +??? mutex_unlock(&tgt->ctl_mutex); >>> +} >>> + >>> +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) >>> +{ >>> +??? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >>> +??? int ret; >>> + >>> +??? lockdep_assert_held(&tgt->ctl_mutex); >>> + >>> +??? 
if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) >>> +??????? return -EBUSY; >>> + >>> ????? tgt->top = lower; >>> ????? smp_wmb(); /* Pairs with qcow2_ref_inc() */ >>> ????? qcow2_inflight_ref_switch(tgt); /* Pending qios */ >>> ????? qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ >>> ????? qcow2->lower = NULL; >>> -??? ret2 = qcow2_set_image_file_features(qcow2, false); >>> -??? if (ret2 < 0) >>> -??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); >>> +??? ret = qcow2_set_image_file_features(qcow2, false); >>> +??? if (ret < 0) >>> +??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); >>> ????? qcow2_destroy(qcow2); >>> +??? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> + >>> ????? return 0; >>> ? } >>> -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); >>> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); >>> + >>> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) >>> +{ >>> +??? bool flush = false; >>> + >>> +??? mutex_lock(&tgt->ctl_mutex); >>> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { >>> +??????? mutex_unlock(&tgt->ctl_mutex); >>> +??????? return; >>> +??? } >>> + >>> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_START) { >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??????? flush = true; >>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOP; >>> +??????? flush = true; >>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >>> +??????? flush = true; >>> +??? } else if (tgt->backward_merge.state == >>> BACKWARD_MERGE_WAIT_COMPLETION) { >>> +??????? set_backward_merge_in_process(tgt, tgt->top, false); >>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>> +??? } >>> +??? mutex_unlock(&tgt->ctl_mutex); >>> + >>> +??? if (flush) >>> +??????? 
flush_work(&tgt->backward_merge.work); >>> +} >>> + >>> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) >>> +{ >>> +??? return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; >>> +} >>> ? static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 >>> img_id, u8 *ref_index) >>> ? { >>> @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, >>> unsigned int argc, char **argv, >>> ????????? } >>> ????????? ret = qcow2_get_event(tgt, result, maxlen); >>> ????????? goto out; >>> +??? } else if (!strcmp(argv[0], "merge_backward")) { >>> +??????? if (argc != 2) { >>> +??????????? ret = -EINVAL; >>> +??????????? goto out; >>> +??????? } >>> +??????? if (!strcmp(argv[1], "cancel")) { >>> +??????????? qcow2_merge_backward_cancel(tgt); >>> +??????????? ret = 0; >>> +??????????? goto out; >>> +??????? } >>> ????? } >>> -??? ret = mutex_lock_killable(&tgt->ctl_mutex); >>> -??? if (ret) >>> -??????? goto out; >>> +??? mutex_lock(&tgt->ctl_mutex); >>> ????? if (!strcmp(argv[0], "get_errors")) { >>> ????????? ret = qcow2_get_errors(tgt, result, maxlen); >>> @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, unsigned >>> int argc, char **argv, >>> ????? } else if (!strcmp(argv[0], "merge_forward")) { >>> ????????? ret = qcow2_merge_forward(tgt); >>> ????? } else if (!strcmp(argv[0], "merge_backward")) { >>> -??????? ret = qcow2_merge_backward(tgt); >>> +??????? if (argc != 2) { >>> +??????????? ret = -EINVAL; >>> +??????????? mutex_unlock(&tgt->ctl_mutex); >>> +??????????? goto out; >>> +??????? } >>> +??????? if (!strcmp(argv[1], "start")) { >>> +??????????? ret = qcow2_merge_backward_start(tgt); >>> +??????? } else if (!strcmp(argv[1], "complete")) { >>> +??????????? ret = qcow2_merge_backward_complete(tgt); >>> +??????? } else { >>> +??????????? ret = -ENOTTY; >>> +??????? } >>> ????? } else { >>> ????????? ret = -ENOTTY; >>> ????? 
} >>> diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c >>> index 540c03cb3c44f..6e2e583ba0b8b 100644 >>> --- a/drivers/md/dm-qcow2-target.c >>> +++ b/drivers/md/dm-qcow2-target.c >>> @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct >>> dm_target *ti, bool allowed) >>> ????? mutex_lock(&tgt->ctl_mutex); >>> ????? tgt->service_operations_allowed = allowed; >>> ????? mutex_unlock(&tgt->ctl_mutex); >>> +??? if (!allowed) >>> +??????? qcow2_merge_backward_cancel(tgt); >>> ? } >>> ? static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) >>> ? { >>> @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target >>> *tgt) >>> ????????? /* Now kill the queue */ >>> ????????? destroy_workqueue(tgt->wq); >>> ????? } >>> +??? qcow2_merge_backward_cancel(tgt); >>> ????? mempool_destroy(tgt->qio_pool); >>> ????? mempool_destroy(tgt->qrq_pool); >>> @@ -494,6 +497,9 @@ static struct qcow2_target >>> *alloc_qcow2_target(struct dm_target *ti) >>> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >>> ????? ti->private = tgt; >>> ????? tgt->ti = ti; >>> + >>> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >>> + >>> ????? qcow2_set_service_operations(ti, false); >>> ????? return tgt; >>> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >>> index a89fe3db2196d..bebfdc50ed6d4 100644 >>> --- a/drivers/md/dm-qcow2.h >>> +++ b/drivers/md/dm-qcow2.h >>> @@ -149,6 +149,20 @@ struct md_page { >>> ????? struct list_head wpc_readers_wait_list; >>> ? }; >>> +enum qcow2_backward_merge_state { >>> +??? BACKWARD_MERGE_STOPPED = 0, >> >> nit: this init is excess >> >>> +??? BACKWARD_MERGE_START, >>> +??? BACKWARD_MERGE_RUN, >>> +??? BACKWARD_MERGE_WAIT_COMPLETION, >>> +??? BACKWARD_MERGE_STOP, >>> +}; >>> + >>> +struct qcow2_backward_merge { >>> +??? struct work_struct work; >>> +??? enum qcow2_backward_merge_state state; >>> +??? 
int error; >>> +}; >> >> May be add merge error to values returned in qcow2_get_errors, >> for the users that use dm events interface. >> >>> + >>> ? struct qcow2_target { >>> ????? struct dm_target *ti; >>> ? #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from >>> blk_mq_init_sched() */ >>> @@ -180,6 +194,8 @@ struct qcow2_target { >>> ????? struct work_struct event_work; >>> ????? spinlock_t event_lock; >>> ????? struct mutex ctl_mutex; >>> + >>> +??? struct qcow2_backward_merge backward_merge; >>> ? }; >>> ? enum { >>> @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target >>> *tgt); >>> ? void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct >>> qcow2 *qcow2); >>> ? int qcow2_truncate_safe(struct file *file, loff_t new_len); >>> +void qcow2_merge_backward_work(struct work_struct *work); >>> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); >>> + >>> ? static inline struct qcow2_target *to_qcow2_target(struct dm_target >>> *ti) >>> ? { >>> ????? return ti->private; >> >> >> > -- Regards, Alexander Atanasov From ptikhomirov at virtuozzo.com Wed Mar 5 06:28:21 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 11:28:21 +0800 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <08f205b7-120a-4811-90f5-6145d9d6d059@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <8c5f114e-29c8-440b-bd3e-b5fbe1e39c82@virtuozzo.com> <08f205b7-120a-4811-90f5-6145d9d6d059@virtuozzo.com> Message-ID: On 3/4/25 19:48, Andrey Zhadchenko wrote: > > > On 3/4/25 12:32, Pavel Tikhomirov wrote: >> >> >> On 3/4/25 18:51, Andrey Zhadchenko wrote: >>> >>> >>> On 3/3/25 10:37, Pavel Tikhomirov wrote: >>>> This adds merge_backward "start", "complete" and "cancel" commands. By >>>> that we are able to split single merge_backward into two stages: start >>>> asyncronous merging and completion. 
That can be usefull for restarting >>>> qemu process while allowing backward merging to run asyncronously in >>>> kernel. >>>> >>>> The "start" command runs merging preparations in workqueue work. After >>>> it finishes, the "complete" command can be called to finish the process >>>> and actually replace the top qcow2 with it's lower. The "cancel" >>>> command >>>> forces the work to stop and flushes it. In case we are in completion >>>> waiting state already and there is no work running, the "cancel" >>>> command >>>> also reverts merging preparations. >>>> >>>> Locking: >>>> >>>> Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" >>>> and "complete" commands are fully under this lock, and the "cancel" >>>> operation takes the lock explicitly and releases it for work flushing. >>>> The work also takes the lock but only when updating tgt->backward_merge >>>> data. For checks, if the work was caneled in the middle, we read the >>>> state without locking as we don't modify the state there, also we would >>>> re-check the state again before exiting the work function under lock. >>>> >>>> Now on target suspend we "cancel" currently running backward merge, >>>> previously we were just hanging untill backward merge have been >>>> finished for possibly a long time, cancelling seems cleaner. Though we >>>> don't really expect hypervisor suspending the target in the middle of >>>> backward merge that it by itself started. >>>> >>>> https://virtuozzo.atlassian.net/browse/VSTOR-100466 >>>> Signed-off-by: Pavel Tikhomirov >>>> >>>> -- >>>> v2: Cancel from BACKWARD_MERGE_START state should not try to stop the >>>> work via BACKWARD_MERGE_STOP state, else we will deadlock in this >>>> state. >>>> --- >>>> ? drivers/md/dm-qcow2-cmd.c??? | 142 ++++++++++++++++++++++++++++++ >>>> +---- >>>> ? drivers/md/dm-qcow2-target.c |?? 6 ++ >>>> ? drivers/md/dm-qcow2.h??????? |? 19 +++++ >>>> ? 
3 files changed, 153 insertions(+), 14 deletions(-) >>>> >>>> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c >>>> index 7b4b0ee68ad9f..04a992f3ebba6 100644 >>>> --- a/drivers/md/dm-qcow2-cmd.c >>>> +++ b/drivers/md/dm-qcow2-cmd.c >>>> @@ -52,6 +52,8 @@ static void service_qio_endio(struct qcow2_target >>>> *tgt, struct qio *qio, >>>> ????? wake_up(&tgt->service_wq); >>>> ? } >>>> +static bool qcow2_backward_merge_should_stop(struct qcow2_target >>>> *tgt); >>>> + >>>> ? static int qcow2_service_iter(struct qcow2_target *tgt, struct >>>> qcow2 *qcow2, >>>> ??????????? loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) >>>> ? { >>>> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >>>> *tgt, struct qcow2 *qcow2, >>>> ????? WRITE_ONCE(service_status, BLK_STS_OK); >>>> ????? for (pos = 0; pos < end; pos += step) { >>>> -??????? if (fatal_signal_pending(current)) { >>>> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >>>> ????????????? ret = -EINTR; >>>> ????????????? break; >>>> ????????? } >>>> @@ -161,10 +163,11 @@ static void >>>> set_backward_merge_in_process(struct qcow2_target *tgt, >>>> ????? qcow2_submit_embedded_qios(tgt, &list); >>>> ? } >>>> -static int qcow2_merge_backward(struct qcow2_target *tgt) >>>> +static int qcow2_merge_backward_start(struct qcow2_target *tgt) >>>> ? { >>>> ????? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >>>> -??? int ret, ret2; >>>> + >>>> +??? lockdep_assert_held(&tgt->ctl_mutex); >>>> ????? if (!lower) >>>> ????????? return -ENOENT; >>>> @@ -174,6 +177,35 @@ static int qcow2_merge_backward(struct >>>> qcow2_target *tgt) >>>> ????????? return -EOPNOTSUPP; >>>> ????? if (lower->hdr.size < qcow2->hdr.size) >>>> ????????? return -EBADSLT; >>>> + >>>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) >>>> +??????? return -EBUSY; >>>> +??? tgt->backward_merge.state = BACKWARD_MERGE_START; >>>> +??? tgt->backward_merge.error = 0; >>>> + >>>> +??? 
schedule_work(&tgt->backward_merge.work); >>> >>> Does this imply we potentially occupy one of the workers of the >>> global pool for the indefinite amount of time? What if we run as much >>> as nworkers (probably ncpus) merges simultaneously? >> >> System_wq has 1024*NCPU execution contexts: >> >> ?> ``@max_active`` determines the maximum number of execution contexts >> per CPU >> >> ?> The maximum limit for ``@max_active`` is 2048 and the default value >> used >> when 0 is specified is 1024. >> >> If we try to run ~1024 works per cpu at the same time we might have a >> problem, and will need to either swithch to our own work-queue or >> create explicit kernel thread for each merge. >> >> >> As flushing system-wide workqueues is now deprecated we are also fine >> with long running work in system_wq and not in system_long_wq, but we >> can move it to system_long_wq just to be on the safe side. >> >> ??* system_wq is the one used by schedule[_delayed]_work[_on](). >> ??* Multi-CPU multi-threaded.? There are users which expect relatively >> ??* short queue flush time.? Don't queue works which can run for too >> ??* long. >> >> ??* system_long_wq is similar to system_wq but may host long running >> ??* works.? Queue flushing might take relatively long. >> >> What do you think? > > In close approximation we can just run on dm-qcow2 workqueue, as each > image allocates it's own wq, tgt->wq. And we only run 2 works there, > general and flush. > However Alexander's dm-ploop rework dropped workqueues in favor of > threads, so I do not know if and when we intend to merge it into dm-qcow2. > > With given limits 1024 per cpu we probably could ignore constraints on > global workqueue. Could you please also give me some links where I can > read this? 
Couldn't find anything myself Strange, both citations are grepable in both mainstream and vz9 trees: linux$ git grep "system_wq is the one used by schedule" include/linux/workqueue.h: * system_wq is the one used by schedule[_delayed]_work[_on](). https://github.com/torvalds/linux/blob/48a5eed9ad584315c30ed35204510536235ce402/include/linux/workqueue.h#L430 linux$ git grep "determines the maximum number of execution contexts" Documentation/core-api/workqueue.rst:``@max_active`` determines the maximum number of execution contexts per https://github.com/torvalds/linux/blob/48a5eed9ad584315c30ed35204510536235ce402/Documentation/core-api/workqueue.rst?plain=1#L242 In VZ9: kernel-vz9$ git grep "system_wq is the one used by schedule" include/linux/workqueue.h: * system_wq is the one used by schedule[_delayed]_work[_on](). kernel-vz9$ git grep "determines the maximum number of execution contexts" Documentation/core-api/workqueue.rst:``@max_active`` determines the maximum number of execution contexts > >> >>> >>>> +??? return 0; >>>> +} >>>> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); >>>> + >>>> +void qcow2_merge_backward_work(struct work_struct *work) >>>> +{ >>>> +??? struct qcow2_target *tgt = container_of(work, struct qcow2_target, >>>> +??????????????????????? backward_merge.work); >>>> +??? struct qcow2 *qcow2, *lower; >>>> +??? int ret, ret2; >>>> + >>>> +??? mutex_lock(&tgt->ctl_mutex); >>>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_START) { >>>> +??????? mutex_unlock(&tgt->ctl_mutex); >>>> +??????? return; >>>> +??? } >>>> +??? tgt->backward_merge.state = BACKWARD_MERGE_RUN; >>>> +??? mutex_unlock(&tgt->ctl_mutex); >>>> + >>>> +??? qcow2 = tgt->top; >>>> +??? lower = qcow2->lower; >>>> + >>>> ????? /* >>>> ?????? * Break all COW clus at L1 level. Otherwise, later >>>> ?????? * there would be problems with unusing them: >>>> @@ -183,13 +215,13 @@ static int qcow2_merge_backward(struct >>>> qcow2_target *tgt) >>>> ????? 
ret = qcow2_break_l1cow(tgt); >>>> ????? if (ret) { >>>> ????????? QC_ERR(tgt->ti, "Can't break L1 COW"); >>>> -??????? return ret; >>>> +??????? goto out_err; >>>> ????? } >>>> ????? ret = qcow2_set_image_file_features(lower, true); >>>> ????? if (ret) { >>>> ????????? QC_ERR(tgt->ti, "Can't set dirty bit"); >>>> -??????? return ret; >>>> +??????? goto out_err; >>>> ????? } >>>> ????? set_backward_merge_in_process(tgt, qcow2, true); >>>> @@ -200,22 +232,85 @@ static int qcow2_merge_backward(struct >>>> qcow2_target *tgt) >>>> ????????? ret2 = qcow2_set_image_file_features(lower, false); >>>> ????????? if (ret2 < 0) >>>> ????????????? QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); >>>> -??????? return ret; >>>> ????? } >>>> + >>>> +out_err: >>>> +??? mutex_lock(&tgt->ctl_mutex); >>>> +??? if (ret) { >>>> +??????? /* Error */ >>>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>>> +??????? tgt->backward_merge.error = ret; >>>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >>>> +??????? /* Merge is canceled */ >>>> +??????? set_backward_merge_in_process(tgt, qcow2, false); >>>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>>> +??????? tgt->backward_merge.error = -EINTR; >>>> +??? } else { >>>> +??????? /* Finish merge */ >>>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; >>>> +??? } >>>> +??? mutex_unlock(&tgt->ctl_mutex); >>>> +} >>>> + >>>> +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) >>>> +{ >>>> +??? struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; >>>> +??? int ret; >>>> + >>>> +??? lockdep_assert_held(&tgt->ctl_mutex); >>>> + >>>> +??? if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) >>>> +??????? return -EBUSY; >>>> + >>>> ????? tgt->top = lower; >>>> ????? smp_wmb(); /* Pairs with qcow2_ref_inc() */ >>>> ????? qcow2_inflight_ref_switch(tgt); /* Pending qios */ >>>> ????? 
qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ >>>> ????? qcow2->lower = NULL; >>>> -??? ret2 = qcow2_set_image_file_features(qcow2, false); >>>> -??? if (ret2 < 0) >>>> -??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); >>>> +??? ret = qcow2_set_image_file_features(qcow2, false); >>>> +??? if (ret < 0) >>>> +??????? QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); >>>> ????? qcow2_destroy(qcow2); >>>> +??? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>>> + >>>> ????? return 0; >>>> ? } >>>> -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); >>>> +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); >>>> + >>>> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) >>>> +{ >>>> +??? bool flush = false; >>>> + >>>> +??? mutex_lock(&tgt->ctl_mutex); >>>> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { >>>> +??????? mutex_unlock(&tgt->ctl_mutex); >>>> +??????? return; >>>> +??? } >>>> + >>>> +??? if (tgt->backward_merge.state == BACKWARD_MERGE_START) { >>>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>>> +??????? flush = true; >>>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { >>>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOP; >>>> +??????? flush = true; >>>> +??? } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { >>>> +??????? flush = true; >>>> +??? } else if (tgt->backward_merge.state == >>>> BACKWARD_MERGE_WAIT_COMPLETION) { >>>> +??????? set_backward_merge_in_process(tgt, tgt->top, false); >>>> +??????? tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; >>>> +??? } >>>> +??? mutex_unlock(&tgt->ctl_mutex); >>>> + >>>> +??? if (flush) >>>> +??????? flush_work(&tgt->backward_merge.work); >>>> +} >>>> + >>>> +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) >>>> +{ >>>> +??? return READ_ONCE(tgt->backward_merge.state) == >>>> BACKWARD_MERGE_STOP; >>>> +} >>>> ? 
static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 >>>> img_id, u8 *ref_index) >>>> ? { >>>> @@ -374,11 +469,19 @@ int qcow2_message(struct dm_target *ti, >>>> unsigned int argc, char **argv, >>>> ????????? } >>>> ????????? ret = qcow2_get_event(tgt, result, maxlen); >>>> ????????? goto out; >>>> +??? } else if (!strcmp(argv[0], "merge_backward")) { >>>> +??????? if (argc != 2) { >>>> +??????????? ret = -EINVAL; >>>> +??????????? goto out; >>>> +??????? } >>>> +??????? if (!strcmp(argv[1], "cancel")) { >>>> +??????????? qcow2_merge_backward_cancel(tgt); >>>> +??????????? ret = 0; >>>> +??????????? goto out; >>>> +??????? } >>>> ????? } >>>> -??? ret = mutex_lock_killable(&tgt->ctl_mutex); >>>> -??? if (ret) >>>> -??????? goto out; >>>> +??? mutex_lock(&tgt->ctl_mutex); >>>> ????? if (!strcmp(argv[0], "get_errors")) { >>>> ????????? ret = qcow2_get_errors(tgt, result, maxlen); >>>> @@ -388,7 +491,18 @@ int qcow2_message(struct dm_target *ti, >>>> unsigned int argc, char **argv, >>>> ????? } else if (!strcmp(argv[0], "merge_forward")) { >>>> ????????? ret = qcow2_merge_forward(tgt); >>>> ????? } else if (!strcmp(argv[0], "merge_backward")) { >>>> -??????? ret = qcow2_merge_backward(tgt); >>>> +??????? if (argc != 2) { >>>> +??????????? ret = -EINVAL; >>>> +??????????? mutex_unlock(&tgt->ctl_mutex); >>>> +??????????? goto out; >>>> +??????? } >>>> +??????? if (!strcmp(argv[1], "start")) { >>>> +??????????? ret = qcow2_merge_backward_start(tgt); >>>> +??????? } else if (!strcmp(argv[1], "complete")) { >>>> +??????????? ret = qcow2_merge_backward_complete(tgt); >>>> +??????? } else { >>>> +??????????? ret = -ENOTTY; >>>> +??????? } >>>> ????? } else { >>>> ????????? ret = -ENOTTY; >>>> ????? 
} >>>> diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2- >>>> target.c >>>> index 540c03cb3c44f..6e2e583ba0b8b 100644 >>>> --- a/drivers/md/dm-qcow2-target.c >>>> +++ b/drivers/md/dm-qcow2-target.c >>>> @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct >>>> dm_target *ti, bool allowed) >>>> ????? mutex_lock(&tgt->ctl_mutex); >>>> ????? tgt->service_operations_allowed = allowed; >>>> ????? mutex_unlock(&tgt->ctl_mutex); >>>> +??? if (!allowed) >>>> +??????? qcow2_merge_backward_cancel(tgt); >>>> ? } >>>> ? static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) >>>> ? { >>>> @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct >>>> qcow2_target *tgt) >>>> ????????? /* Now kill the queue */ >>>> ????????? destroy_workqueue(tgt->wq); >>>> ????? } >>>> +??? qcow2_merge_backward_cancel(tgt); >>>> ????? mempool_destroy(tgt->qio_pool); >>>> ????? mempool_destroy(tgt->qrq_pool); >>>> @@ -494,6 +497,9 @@ static struct qcow2_target >>>> *alloc_qcow2_target(struct dm_target *ti) >>>> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >>>> ????? ti->private = tgt; >>>> ????? tgt->ti = ti; >>>> + >>>> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >>>> + >>>> ????? qcow2_set_service_operations(ti, false); >>>> ????? return tgt; >>>> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >>>> index a89fe3db2196d..bebfdc50ed6d4 100644 >>>> --- a/drivers/md/dm-qcow2.h >>>> +++ b/drivers/md/dm-qcow2.h >>>> @@ -149,6 +149,20 @@ struct md_page { >>>> ????? struct list_head wpc_readers_wait_list; >>>> ? }; >>>> +enum qcow2_backward_merge_state { >>>> +??? BACKWARD_MERGE_STOPPED = 0, >>>> +??? BACKWARD_MERGE_START, >>>> +??? BACKWARD_MERGE_RUN, >>>> +??? BACKWARD_MERGE_WAIT_COMPLETION, >>>> +??? BACKWARD_MERGE_STOP, >>>> +}; >>>> + >>>> +struct qcow2_backward_merge { >>>> +??? struct work_struct work; >>>> +??? enum qcow2_backward_merge_state state; >>>> +??? int error; >>>> +}; >>>> + >>>> ? 
struct qcow2_target { >>>> ????? struct dm_target *ti; >>>> ? #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from >>>> blk_mq_init_sched() */ >>>> @@ -180,6 +194,8 @@ struct qcow2_target { >>>> ????? struct work_struct event_work; >>>> ????? spinlock_t event_lock; >>>> ????? struct mutex ctl_mutex; >>>> + >>>> +??? struct qcow2_backward_merge backward_merge; >>>> ? }; >>>> ? enum { >>>> @@ -375,6 +391,9 @@ int qcow2_inflight_ref_switch(struct >>>> qcow2_target *tgt); >>>> ? void qcow2_flush_deferred_activity(struct qcow2_target *tgt, >>>> struct qcow2 *qcow2); >>>> ? int qcow2_truncate_safe(struct file *file, loff_t new_len); >>>> +void qcow2_merge_backward_work(struct work_struct *work); >>>> +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); >>>> + >>>> ? static inline struct qcow2_target *to_qcow2_target(struct >>>> dm_target *ti) >>>> ? { >>>> ????? return ti->private; >>> >> > -- Best regards, Pavel Tikhomirov Senior Software Developer, Virtuozzo. From ptikhomirov at virtuozzo.com Wed Mar 5 06:53:32 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 11:53:32 +0800 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> Message-ID: <275703d9-3ddd-42a3-b7c6-01710096220c@virtuozzo.com> >> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >> *tgt, struct qcow2 *qcow2, >> ????? WRITE_ONCE(service_status, BLK_STS_OK); >> ????? for (pos = 0; pos < end; pos += step) { >> -??????? if (fatal_signal_pending(current)) { >> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >> ????????????? ret = -EINTR; >> ????????????? break; >> ????????? } > > Is it okay to remove termination on signal - here and the killable > mutex? 
Without signal handling it can prevent clean shutdown or leave it > stuck if something goes wrong in the code. Sorry, I should've probably explained it in commit message as a note. Yes, it's ok, and it is intentional: 1) Now this code always runs in workqueue, so we change the way how it can be interrupted to checking the state variable. Sending signal to workqueue worker thread does not have much sense as when signal comes worker may have already switched to execute different work. 2) Now the ctl_mutex is not held for a long time anymore, so there is no point in taking it with interruptible primitive as we won't wait long for mutex. >> @@ -494,6 +497,9 @@ static struct qcow2_target >> *alloc_qcow2_target(struct dm_target *ti) >> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >> ????? ti->private = tgt; >> ????? tgt->ti = ti; >> + >> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >> + >> ????? qcow2_set_service_operations(ti, false); >> ????? return tgt; >> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >> index a89fe3db2196d..bebfdc50ed6d4 100644 >> --- a/drivers/md/dm-qcow2.h >> +++ b/drivers/md/dm-qcow2.h >> @@ -149,6 +149,20 @@ struct md_page { >> ????? struct list_head wpc_readers_wait_list; >> ? }; >> +enum qcow2_backward_merge_state { >> +??? BACKWARD_MERGE_STOPPED = 0, > > nit: this init is excess Agreed. I will remove it here. Note: that in case of same thing in qcow2_backward_merge_stage it is intentional to identify that the enum values are used as an array index, and starting from 0 is important. > >> +??? BACKWARD_MERGE_START, >> +??? BACKWARD_MERGE_RUN, >> +??? BACKWARD_MERGE_WAIT_COMPLETION, >> +??? BACKWARD_MERGE_STOP, >> +}; >> + >> +struct qcow2_backward_merge { >> +??? struct work_struct work; >> +??? enum qcow2_backward_merge_state state; >> +??? int error; >> +}; > > May be add merge error to values returned in qcow2_get_errors, > for the users that use dm events interface. > Ok, will do. 
From ptikhomirov at virtuozzo.com Wed Mar 5 07:50:29 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 12:50:29 +0800 Subject: [Devel] [PATCH v4 VZ9 4/5] dm-qcow2: add merge_backward set_eventfd command In-Reply-To: <4c1515c5-3f97-473f-903a-f3777c3724a2@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> <4c1515c5-3f97-473f-903a-f3777c3724a2@virtuozzo.com> Message-ID: <199da249-bfc1-437f-900f-229df38a1a16@virtuozzo.com> On 3/4/25 19:41, Andrey Zhadchenko wrote: > > > On 3/3/25 10:37, Pavel Tikhomirov wrote: >> This eventfd can be used to get an event when merge_backward start work >> have finished and is waiting for completion. >> >> Note: The eventfd can be changed even while work is running. >> >> Locking: >> >> The backward_merge.eventfd_ctx is protected from being released by >> tgt->ctl_mutex. >> >> https://virtuozzo.atlassian.net/browse/VSTOR-100466 >> Signed-off-by: Pavel Tikhomirov >> >> -- >> v2: Always report that work finished, e.g. also on error or then it was >> canceled, this should be more consistent from the userspace perspective. >> v4: Address Andrey's reveiw: signal that we are at completion waiting on >> change of eventfd. >> --- >> ? drivers/md/dm-qcow2-cmd.c | 42 ++++++++++++++++++++++++++++++++++++++- >> ? drivers/md/dm-qcow2.h???? |? 2 ++ >> ? 2 files changed, 43 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c >> index 04a992f3ebba6..f16b4f731ca5a 100644 >> --- a/drivers/md/dm-qcow2-cmd.c >> +++ b/drivers/md/dm-qcow2-cmd.c >> @@ -5,6 +5,8 @@ >> ? #include >> ? #include >> ? #include >> +#include >> +#include >> ? #include >> ? #include "dm-qcow2.h" >> @@ -197,6 +199,8 @@ void qcow2_merge_backward_work(struct work_struct >> *work) >> ????? mutex_lock(&tgt->ctl_mutex); >> ????? if (tgt->backward_merge.state != BACKWARD_MERGE_START) { >> +??????? 
if (tgt->backward_merge.eventfd_ctx) >> +??????????? eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); >> ????????? mutex_unlock(&tgt->ctl_mutex); >> ????????? return; >> ????? } >> @@ -249,6 +253,8 @@ void qcow2_merge_backward_work(struct work_struct >> *work) >> ????????? /* Finish merge */ >> ????????? tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; >> ????? } >> +??? if (tgt->backward_merge.eventfd_ctx) >> +??????? eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); > > It would be a bit better if we also set a different values for error or > success, but it is not necessary, as either complete will fail or we do > get_progress and see error I don't think that it is such a good idea 1) The commit https://github.com/torvalds/linux/commit/3652117f854819a1 removes second argument of eventfd_signal in mainstream kernel. 2) In `man 2 eventfd` examples one can see that read from eventfd reads the sum of all previous writes, so even if we pass something meaningful (n) there instead of 1, it would be hard to distinguish such a case from writing 1 n times. > >> ????? mutex_unlock(&tgt->ctl_mutex); >> ? } >> @@ -312,6 +318,27 @@ static bool >> qcow2_backward_merge_should_stop(struct qcow2_target *tgt) >> ????? return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; >> ? } >> +#define QCOW2_FILE_UNBIND -1 >> + >> +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, >> int efd) >> +{ >> +??? struct eventfd_ctx *ctx = NULL; >> + >> +??? ctx = efd == QCOW2_FILE_UNBIND ? NULL : eventfd_ctx_fdget(efd); >> +??? if (IS_ERR(ctx)) >> +??????? return PTR_ERR(ctx); >> + >> +??? mutex_lock(&tgt->ctl_mutex); >> +??? swap(ctx, tgt->backward_merge.eventfd_ctx); >> +??? if (ctx) >> +??????? eventfd_ctx_put(ctx); >> +??? if (tgt->backward_merge.eventfd_ctx && >> +??????? tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) >> +??????? eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); >> +??? mutex_unlock(&tgt->ctl_mutex); >> +??? 
return 0; >> +} >> + >> ? static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 >> img_id, u8 *ref_index) >> ? { >> ????? struct qcow2 *qcow2; >> @@ -470,14 +497,27 @@ int qcow2_message(struct dm_target *ti, unsigned >> int argc, char **argv, >> ????????? ret = qcow2_get_event(tgt, result, maxlen); >> ????????? goto out; >> ????? } else if (!strcmp(argv[0], "merge_backward")) { >> -??????? if (argc != 2) { >> +??????? if (argc < 2) { >> ????????????? ret = -EINVAL; >> ????????????? goto out; >> ????????? } >> ????????? if (!strcmp(argv[1], "cancel")) { >> +??????????? if (argc != 2) { >> +??????????????? ret = -EINVAL; >> +??????????????? goto out; >> +??????????? } >> ????????????? qcow2_merge_backward_cancel(tgt); >> ????????????? ret = 0; >> ????????????? goto out; >> +??????? } else if (!strcmp(argv[1], "set_eventfd")) { >> +??????????? int efd; >> + >> +??????????? if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { >> +??????????????? ret = -EINVAL; >> +??????????????? goto out; >> +??????????? } >> +??????????? ret = qcow2_merge_backward_set_eventfd(tgt, efd); >> +??????????? goto out; >> ????????? } >> ????? } >> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >> index bebfdc50ed6d4..c4956e3fd0eb7 100644 >> --- a/drivers/md/dm-qcow2.h >> +++ b/drivers/md/dm-qcow2.h >> @@ -5,6 +5,7 @@ >> ? #include >> ? #include >> ? #include >> +#include >> ? #include "dm-core.h" >> ? #define DM_MSG_PREFIX "qcow2" >> @@ -161,6 +162,7 @@ struct qcow2_backward_merge { >> ????? struct work_struct work; >> ????? enum qcow2_backward_merge_state state; >> ????? int error; >> +??? struct eventfd_ctx *eventfd_ctx; >> ? }; >> ? struct qcow2_target { > -- Best regards, Pavel Tikhomirov Senior Software Developer, Virtuozzo. 
From alexander.atanasov at virtuozzo.com Wed Mar 5 11:36:36 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Wed, 5 Mar 2025 10:36:36 +0200 Subject: [Devel] [PATCH v4 VZ9 3/5] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <275703d9-3ddd-42a3-b7c6-01710096220c@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-4-ptikhomirov@virtuozzo.com> <2a877212-77d1-47f5-a6b6-10d3f8c54488@virtuozzo.com> <275703d9-3ddd-42a3-b7c6-01710096220c@virtuozzo.com> Message-ID: On 5.03.25 5:53, Pavel Tikhomirov wrote: > >>> @@ -63,7 +65,7 @@ static int qcow2_service_iter(struct qcow2_target >>> *tgt, struct qcow2 *qcow2, >>> ????? WRITE_ONCE(service_status, BLK_STS_OK); >>> ????? for (pos = 0; pos < end; pos += step) { >>> -??????? if (fatal_signal_pending(current)) { >>> +??????? if (qcow2_backward_merge_should_stop(tgt)) { >>> ????????????? ret = -EINTR; >>> ????????????? break; >>> ????????? } >> >> Is it okay to remove termination on signal - here and the killable >> mutex? Without signal handling it can prevent clean shutdown or leave it >> stuck if something goes wrong in the code. > > Sorry, I should've probably explained it in commit message as a note. > > Yes, it's ok, and it is intentional: > > 1) Now this code always runs in workqueue, so we change the way how it > can be interrupted to checking the state variable. Sending signal to > workqueue worker thread does not have much sense as when signal comes > worker may have already switched to execute different work. > > 2) Now the ctl_mutex is not held for a long time anymore, so there is no > point in taking it with interruptible primitive as we won't wait long > for mutex. Ok. Makes sense. Note - If merge can not be stopped with SIGKILL, then stop/cancelation should be implemented somewhere in userspace unless that change of behaviour is acceptable. 
> >>> @@ -494,6 +497,9 @@ static struct qcow2_target >>> *alloc_qcow2_target(struct dm_target *ti) >>> ????? timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); >>> ????? ti->private = tgt; >>> ????? tgt->ti = ti; >>> + >>> +??? INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); >>> + >>> ????? qcow2_set_service_operations(ti, false); >>> ????? return tgt; >>> diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h >>> index a89fe3db2196d..bebfdc50ed6d4 100644 >>> --- a/drivers/md/dm-qcow2.h >>> +++ b/drivers/md/dm-qcow2.h >>> @@ -149,6 +149,20 @@ struct md_page { >>> ????? struct list_head wpc_readers_wait_list; >>> ? }; >>> +enum qcow2_backward_merge_state { >>> +??? BACKWARD_MERGE_STOPPED = 0, >> >> nit: this init is excess > > Agreed. I will remove it here. > > Note: that in case of same thing in qcow2_backward_merge_stage it is > intentional to identify that the enum values are used as an array index, > and starting from 0 is important. Agree. for array it can used as a hint to preserve specific values and/o/ order when it is changed and it is required. Either zero init or a comment can be used. -- Regards, Alexander Atanasov From alexander.atanasov at virtuozzo.com Wed Mar 5 11:42:19 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Wed, 5 Mar 2025 10:42:19 +0200 Subject: [Devel] [PATCH v4 VZ9 4/5] dm-qcow2: add merge_backward set_eventfd command In-Reply-To: <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> Message-ID: <23cb94f3-3d98-438b-b99a-56a7a46eafa8@virtuozzo.com> On 3.03.25 11:37, Pavel Tikhomirov wrote: > This eventfd can be used to get an event when merge_backward start work > have finished and is waiting for completion. What is the benefit of using eventfd versus devicemapper events? 
-- Regards, Alexander Atanasov From ptikhomirov at virtuozzo.com Wed Mar 5 12:11:18 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 17:11:18 +0800 Subject: [Devel] [PATCH v4 VZ9 4/5] dm-qcow2: add merge_backward set_eventfd command In-Reply-To: <23cb94f3-3d98-438b-b99a-56a7a46eafa8@virtuozzo.com> References: <20250303093802.1233834-1-ptikhomirov@virtuozzo.com> <20250303093802.1233834-5-ptikhomirov@virtuozzo.com> <23cb94f3-3d98-438b-b99a-56a7a46eafa8@virtuozzo.com> Message-ID: <0a0b217a-7cbb-495a-a883-fdf8bb4c3109@virtuozzo.com> On 3/5/25 16:42, Alexander Atanasov wrote: > On 3.03.25 11:37, Pavel Tikhomirov wrote: >> This eventfd can be used to get an event when merge_backward start work >> have finished and is waiting for completion. > > What is the benefit of using eventfd versus devicemapper events? In my understanding the benefit of eventfd compared to qcow2_get_event is that we can poll on eventfd but with qcow2_get_event we can only busy-wait. But maybe I'm missing something, is there devicemapper native way to poll events? > -- Best regards, Pavel Tikhomirov Senior Software Developer, Virtuozzo. From ptikhomirov at virtuozzo.com Wed Mar 5 14:45:22 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 19:45:22 +0800 Subject: [Devel] [PATCH v5 VZ9 1/4] dm-qcow2: fix warning about wrong printk format for size_t In-Reply-To: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> Message-ID: <20250305114644.1765112-2-ptikhomirov@virtuozzo.com> In file included from ./include/linux/kernel.h:20, from ./include/linux/list.h:9, from ./include/linux/preempt.h:12, from ./include/linux/spinlock.h:56, from drivers/md/dm-qcow2-map.c:5: drivers/md/dm-qcow2-map.c: In function ?process_compressed_read?: ./include/linux/kern_levels.h:5:25: warning: format ?%d? expects argument of type ?int?, but argument 3 has type ?size_t? 
{aka ?long unsigned int?} [-Wformat=] 5 | #define KERN_SOH "\001" /* ASCII Start Of Header */ | ^~~~~~ ./include/linux/printk.h:497:25: note: in definition of macro ?printk_index_wrap? 497 | _p_func(_fmt, ##__VA_ARGS__); \ | ^~~~ ./include/linux/printk.h:568:9: note: in expansion of macro ?printk? 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~ ./include/linux/kern_levels.h:11:25: note: in expansion of macro ?KERN_SOH? 11 | #define KERN_ERR KERN_SOH "3" /* error conditions */ | ^~~~~~~~ ./include/linux/printk.h:568:16: note: in expansion of macro ?KERN_ERR? 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~ drivers/md/dm-qcow2.h:215:33: note: in expansion of macro ?pr_err? 215 | #define QC_ERR(dmti, fmt, ...) pr_err (QCOW2_FMT(fmt), \ | ^~~~~~ drivers/md/dm-qcow2-map.c:3691:41: note: in expansion of macro ?QC_ERR? 3691 | QC_ERR(qcow2->tgt->ti, | ^~~~~~ While on it fix line wrap alignment. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Rebase on top of vz9.80.19, "%lu" is also incorrect, see Documentation/core-api/printk-formats.rst. 
--- drivers/md/dm-qcow2-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c index 7a1312a74e9fb..f7cb036bb416e 100644 --- a/drivers/md/dm-qcow2-map.c +++ b/drivers/md/dm-qcow2-map.c @@ -3689,8 +3689,8 @@ static void process_compressed_read(struct list_head *read_list, buf = kvmalloc(qcow2->clu_size + dctxlen, GFP_NOIO); if (!buf) { QC_ERR(qcow2->tgt->ti, - "can not allocate decompression buffer:%lu", - qcow2->clu_size + dctxlen); + "can not allocate decompression buffer:%zu", + qcow2->clu_size + dctxlen); end_qios(read_list, BLK_STS_RESOURCE); return; } -- 2.48.1 From ptikhomirov at virtuozzo.com Wed Mar 5 14:45:21 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 19:45:21 +0800 Subject: [Devel] [PATCH v5 VZ9 0/4] dm-qcow2: make backward merge asyncronous Message-ID: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> That can be usefull for restarting qemu process while allowing backward merging to run asyncronously in kernel. v2: rebase on top of vz9.80.19, make completion event consistent, fix deadlock when cancel after start and before work run v3: weaken locking in progress printing a bit to decrease possible lock contention v5: add "start" stage, remove excess enum init, backward merge error to qcow2_get_errors, note about signals handling removal, merge eventfd and start, release eventfd after work finishes. Note: I didn't yet reworked bash test and ploop part. 
https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Pavel Tikhomirov (4): dm-qcow2: fix warning about wrong printk format for size_t dm-qcow2: cleanup error handling in qcow2_merge_backward dm-qcow2: make merge_backward command asyncronous dm-qcow2: add merge_backward progress command drivers/md/dm-qcow2-cmd.c | 323 ++++++++++++++++++++++++++++++++--- drivers/md/dm-qcow2-map.c | 4 +- drivers/md/dm-qcow2-target.c | 6 + drivers/md/dm-qcow2.h | 36 ++++ 4 files changed, 341 insertions(+), 28 deletions(-) -- 2.48.1 From ptikhomirov at virtuozzo.com Wed Mar 5 14:45:23 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 19:45:23 +0800 Subject: [Devel] [PATCH v5 VZ9 2/4] dm-qcow2: cleanup error handling in qcow2_merge_backward In-Reply-To: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> Message-ID: <20250305114644.1765112-3-ptikhomirov@virtuozzo.com> The label "out" is excess, let's remove it in accordance with: "If there is no cleanup needed then just return directly." 
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#centralized-exiting-of-functions https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov --- drivers/md/dm-qcow2-cmd.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 6dc7e07220557..7b4b0ee68ad9f 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -166,18 +166,14 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; int ret, ret2; - ret = -ENOENT; if (!lower) - goto out; - ret = -EACCES; + return -ENOENT; if (!(lower->file->f_mode & FMODE_WRITE)) - goto out; - ret = -EOPNOTSUPP; + return -EACCES; if (qcow2->clu_size != lower->clu_size) - goto out; - ret = -EBADSLT; + return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) - goto out; + return -EBADSLT; /* * Break all COW clus at L1 level. Otherwise, later * there would be problems with unusing them: @@ -187,13 +183,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - goto out; + return ret; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - goto out; + return ret; } set_backward_merge_in_process(tgt, qcow2, true); @@ -204,7 +200,7 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - goto out; + return ret; } tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -216,8 +212,8 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); qcow2_destroy(qcow2); -out: - return ret; + + return 0; } ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); -- 2.48.1 From ptikhomirov at virtuozzo.com Wed Mar 5 
14:45:24 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 19:45:24 +0800 Subject: [Devel] [PATCH v5 VZ9 3/4] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> Message-ID: <20250305114644.1765112-4-ptikhomirov@virtuozzo.com> This adds merge_backward "start", "complete", "update_eventfd" and "cancel" commands. By that we are able to split single merge_backward into two stages: start asynchronous merging and completion. That can be useful for restarting qemu process while allowing backward merging to run asynchronously in kernel. The "start" command runs merging preparations in workqueue work. After it finishes it sends event to eventfd set on "start", receiving event on eventfd the "complete" command can be called to finish the process and actually replace the top qcow2 with its lower. In case work encounters any errors or "cancel" request it will also send event to eventfd, calling "complete" after that will fail. Basically userspace is guaranteed to receive event from eventfd in any case after start. The "cancel" command forces the work to stop and flushes it. In case we are in completion waiting state already and there is no work running, the "cancel" command also reverts merging preparations. The "update_eventfd" command can be used to update eventfd for currently running merge, e.g. in case old eventfd was lost for some reason. This command on success guarantees that the caller will receive event from the new eventfd. If "update_eventfd" fails with -EBUSY, it means that there is no currently running merge in progress. Locking: Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" and "complete" commands are fully under this lock, and the "cancel" operation takes the lock explicitly and releases it for work flushing. 
The work also takes the lock but only when updating tgt->backward_merge data. For checks, if the work was canceled in the middle, we read the state without locking as we don't modify the state there, also we would re-check the state again before exiting the work function under lock. Now on target suspend we "cancel" currently running backward merge, previously we were just hanging until backward merge has finished for possibly a long time, cancelling seems cleaner. Though we don't really expect hypervisor suspending the target in the middle of backward merge that it by itself started. The backward_merge.eventfd_ctx is also protected from being released by tgt->ctl_mutex. Note: After this patch the backward merge runs in a workqueue and also the tgt->ctl_mutex is not held for a long time anymore, so we remove interruptible mutex wait, and replace pending signal checks in the middle of backward merge with checking "should stop" state. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Cancel from BACKWARD_MERGE_START state should not try to stop the work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. Always report that work finished, e.g. also on error or when it was canceled, this should be more consistent from the userspace perspective. v5: Address Alexander's and Andrey's review: remove excess enum first element init, add backward merge error to qcow2_get_errors, add note about signals handling removal, merge eventfd into this patch and rework it to be easier for userspace to use. Unbind eventfd after sending event. 
--- drivers/md/dm-qcow2-cmd.c | 222 ++++++++++++++++++++++++++++++++--- drivers/md/dm-qcow2-target.c | 6 + drivers/md/dm-qcow2.h | 21 ++++ 3 files changed, 233 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 7b4b0ee68ad9f..56f2d3e285cdb 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "dm-qcow2.h" @@ -17,8 +19,10 @@ static int qcow2_get_errors(struct qcow2_target *tgt, char *result, unsigned int sz = 0; int ret; - ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n", - wants_check, tgt->md_writeback_error, tgt->truncate_error); + ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n" + "merge_backward_error=%d\n", + wants_check, tgt->md_writeback_error, tgt->truncate_error, + tgt->backward_merge.error); return ret ? 1 : 0; } @@ -52,6 +56,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); + static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) { @@ -63,7 +69,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); for (pos = 0; pos < end; pos += step) { - if (fatal_signal_pending(current)) { + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; } @@ -161,10 +167,14 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } -static int qcow2_merge_backward(struct qcow2_target *tgt) +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd); + +static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; - int ret, ret2; + int ret; + + 
lockdep_assert_held(&tgt->ctl_mutex); if (!lower) return -ENOENT; @@ -174,6 +184,43 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) return -EBADSLT; + + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) + return -EBUSY; + + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + if (ret) + return ret; + + tgt->backward_merge.state = BACKWARD_MERGE_START; + tgt->backward_merge.error = 0; + + schedule_work(&tgt->backward_merge.work); + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); + +void qcow2_merge_backward_work(struct work_struct *work) +{ + struct qcow2_target *tgt = container_of(work, struct qcow2_target, + backward_merge.work); + struct qcow2 *qcow2, *lower; + int ret, ret2; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + qcow2_merge_backward_set_eventfd(tgt, -1); + mutex_unlock(&tgt->ctl_mutex); + return; + } + tgt->backward_merge.state = BACKWARD_MERGE_RUN; + mutex_unlock(&tgt->ctl_mutex); + + qcow2 = tgt->top; + lower = qcow2->lower; + /* * Break all COW clus at L1 level. 
Otherwise, later * there would be problems with unusing them: @@ -183,13 +230,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - return ret; + goto out_err; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - return ret; + goto out_err; } set_backward_merge_in_process(tgt, qcow2, true); @@ -200,22 +247,129 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - return ret; } + +out_err: + mutex_lock(&tgt->ctl_mutex); + if (ret) { + /* Error */ + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = ret; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + /* Merge is canceled */ + set_backward_merge_in_process(tgt, qcow2, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = -EINTR; + } else { + /* Finish merge */ + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + } + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + qcow2_merge_backward_set_eventfd(tgt, -1); + mutex_unlock(&tgt->ctl_mutex); +} + +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) +{ + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; + int ret; + + lockdep_assert_held(&tgt->ctl_mutex); + + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) + return -EBUSY; + tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ qcow2_inflight_ref_switch(tgt); /* Pending qios */ qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ qcow2->lower = NULL; - ret2 = qcow2_set_image_file_features(qcow2, false); - if (ret2 < 0) - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); + ret = qcow2_set_image_file_features(qcow2, false); + if (ret < 0) + QC_ERR(tgt->ti, 
"Can't unuse merged img (%d)", ret); qcow2_destroy(qcow2); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); + +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) +{ + bool flush = false; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { + tgt->backward_merge.state = BACKWARD_MERGE_STOP; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { + set_backward_merge_in_process(tgt, tgt->top, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + } + mutex_unlock(&tgt->ctl_mutex); + + if (flush) + flush_work(&tgt->backward_merge.work); +} + +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) +{ + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; +} + +#define QCOW2_FILE_UNBIND -1 + +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd) +{ + struct eventfd_ctx *ctx = NULL; + + lockdep_assert_held(&tgt->ctl_mutex); + + ctx = efd == QCOW2_FILE_UNBIND ? 
NULL : eventfd_ctx_fdget(efd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + swap(ctx, tgt->backward_merge.eventfd_ctx); + if (ctx) + eventfd_ctx_put(ctx); + + return 0; +} + +static int qcow2_merge_backward_update_eventfd(struct qcow2_target *tgt, int efd) +{ + int ret; + + mutex_lock(&tgt->ctl_mutex); + if (efd != QCOW2_FILE_UNBIND && + (tgt->backward_merge.state != BACKWARD_MERGE_START || + tgt->backward_merge.state != BACKWARD_MERGE_RUN)) { + mutex_unlock(&tgt->ctl_mutex); + return -EBUSY; + } + + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + if (ret) { + mutex_unlock(&tgt->ctl_mutex); + return ret; + } + + mutex_unlock(&tgt->ctl_mutex); return 0; } -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { @@ -337,6 +491,7 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, struct qcow2_target *tgt = to_qcow2_target(ti); int ret = -EPERM; u32 val, val2; + int efd; if (!capable(CAP_SYS_ADMIN)) goto out; @@ -374,11 +529,30 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_get_event(tgt, result, maxlen); goto out; + } else if (!strcmp(argv[0], "merge_backward")) { + if (argc < 2) { + ret = -EINVAL; + goto out; + } + if (!strcmp(argv[1], "cancel")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + qcow2_merge_backward_cancel(tgt); + ret = 0; + goto out; + } else if (!strcmp(argv[1], "update_eventfd")) { + if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_update_eventfd(tgt, efd); + goto out; + } } - ret = mutex_lock_killable(&tgt->ctl_mutex); - if (ret) - goto out; + mutex_lock(&tgt->ctl_mutex); if (!strcmp(argv[0], "get_errors")) { ret = qcow2_get_errors(tgt, result, maxlen); @@ -388,11 +562,27 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcmp(argv[0], "merge_forward")) { ret = qcow2_merge_forward(tgt); } 
else if (!strcmp(argv[0], "merge_backward")) { - ret = qcow2_merge_backward(tgt); + /* argc >= 2 */ + if (!strcmp(argv[1], "start")) { + if (argc != 3 || kstrtoint(argv[2], 10, &efd) || efd < 0) { + ret = -EINVAL; + goto out_unlock; + } + ret = qcow2_merge_backward_start(tgt, efd); + } else if (!strcmp(argv[1], "complete")) { + if (argc != 2) { + ret = -EINVAL; + goto out_unlock; + } + ret = qcow2_merge_backward_complete(tgt); + } else { + ret = -ENOTTY; + } } else { ret = -ENOTTY; } +out_unlock: mutex_unlock(&tgt->ctl_mutex); out: return ret; diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c index 540c03cb3c44f..6e2e583ba0b8b 100644 --- a/drivers/md/dm-qcow2-target.c +++ b/drivers/md/dm-qcow2-target.c @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) mutex_lock(&tgt->ctl_mutex); tgt->service_operations_allowed = allowed; mutex_unlock(&tgt->ctl_mutex); + if (!allowed) + qcow2_merge_backward_cancel(tgt); } static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) { @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) /* Now kill the queue */ destroy_workqueue(tgt->wq); } + qcow2_merge_backward_cancel(tgt); mempool_destroy(tgt->qio_pool); mempool_destroy(tgt->qrq_pool); @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); ti->private = tgt; tgt->ti = ti; + + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); + qcow2_set_service_operations(ti, false); return tgt; diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index a89fe3db2196d..ca43e13d35c34 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "dm-core.h" #define DM_MSG_PREFIX "qcow2" @@ -149,6 +150,21 @@ struct md_page { struct list_head wpc_readers_wait_list; }; +enum qcow2_backward_merge_state { + BACKWARD_MERGE_STOPPED, + 
BACKWARD_MERGE_START, + BACKWARD_MERGE_RUN, + BACKWARD_MERGE_WAIT_COMPLETION, + BACKWARD_MERGE_STOP, +}; + +struct qcow2_backward_merge { + struct work_struct work; + enum qcow2_backward_merge_state state; + int error; + struct eventfd_ctx *eventfd_ctx; +}; + struct qcow2_target { struct dm_target *ti; #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ @@ -180,6 +196,8 @@ struct qcow2_target { struct work_struct event_work; spinlock_t event_lock; struct mutex ctl_mutex; + + struct qcow2_backward_merge backward_merge; }; enum { @@ -375,6 +393,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); int qcow2_truncate_safe(struct file *file, loff_t new_len); +void qcow2_merge_backward_work(struct work_struct *work); +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); + static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) { return ti->private; -- 2.48.1 From ptikhomirov at virtuozzo.com Wed Mar 5 14:45:25 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Wed, 5 Mar 2025 19:45:25 +0800 Subject: [Devel] [PATCH v5 VZ9 4/4] dm-qcow2: add merge_backward progress command In-Reply-To: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> Message-ID: <20250305114644.1765112-5-ptikhomirov@virtuozzo.com> This allows to see progress of backward merge. It shows the stage we are at and for iterative stages it provides progress in form of how many iterations are done and how many iterations there are in total. Locking: The progress data consistency is protected by tgt->ctl_mutex, we always update stage and error consistently under lock. 
Inside iterative stages for progress updating we have xchg instead of lock so that changes to progress are atomic and imply memory barrier (this way we would not see progress greater than max_progress in progress reporting), but at the same time there is less contention on tgt->ctl_mutex. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v3: Address Kostya's review comments: move progress printing out of lock, remove excess updates of max_progress, make progress updates without lock. v5: Add "start" stage to distinguish already scheduled work. --- drivers/md/dm-qcow2-cmd.c | 85 +++++++++++++++++++++++++++++++++++++++ drivers/md/dm-qcow2.h | 15 +++++++ 2 files changed, 100 insertions(+) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 56f2d3e285cdb..78b31a2b664a5 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -56,6 +56,10 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress); +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress); static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, @@ -68,7 +72,10 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); + backward_merge_update_max_progress(tgt, end); for (pos = 0; pos < end; pos += step) { + backward_merge_update_progress(tgt, pos); + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; @@ -167,6 +174,67 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } +static void __backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + tgt->backward_merge.stage = stage; + 
tgt->backward_merge.progress = 0; + tgt->backward_merge.max_progress = 0; +} + +static void backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + mutex_lock(&tgt->ctl_mutex); + __backward_merge_update_stage(tgt, stage); + mutex_unlock(&tgt->ctl_mutex); +} + +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress) +{ + xchg(&tgt->backward_merge.max_progress, max_progress); +} + +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress) +{ + xchg(&tgt->backward_merge.progress, progress); +} + +char *backward_merge_stage_names[] = { + "none", + "start", + "break_l1cow", + "set_dirty", + "running", + "waiting_completion", + "completing", + "fail", +}; + +static int qcow2_merge_backward_progress(struct qcow2_target *tgt, + char *result, unsigned int maxlen) +{ + struct qcow2_backward_merge backward_merge; + unsigned int sz = 0; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(backward_merge_stage_names) != BACKWARD_MERGE_STAGE_MAX); + + mutex_lock(&tgt->ctl_mutex); + backward_merge = tgt->backward_merge; + mutex_unlock(&tgt->ctl_mutex); + + ret = DMEMIT("stage=%s\nprogress=%lld\nmax_progress=%lld\nerror=%d\n", + backward_merge_stage_names[backward_merge.stage], + backward_merge.progress, + backward_merge.max_progress, + backward_merge.error); + + return ret ? 
1 : 0; +} + static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd); static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) @@ -193,6 +261,7 @@ static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) return ret; tgt->backward_merge.state = BACKWARD_MERGE_START; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_START); tgt->backward_merge.error = 0; schedule_work(&tgt->backward_merge.work); @@ -216,6 +285,7 @@ void qcow2_merge_backward_work(struct work_struct *work) return; } tgt->backward_merge.state = BACKWARD_MERGE_RUN; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_BREAK_L1COW); mutex_unlock(&tgt->ctl_mutex); qcow2 = tgt->top; @@ -233,6 +303,7 @@ void qcow2_merge_backward_work(struct work_struct *work) goto out_err; } + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_SET_DIRTY); ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); @@ -241,6 +312,7 @@ void qcow2_merge_backward_work(struct work_struct *work) set_backward_merge_in_process(tgt, qcow2, true); /* Start merge */ + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING); ret = qcow2_merge_common(tgt); if (ret) { set_backward_merge_in_process(tgt, qcow2, false); @@ -255,14 +327,17 @@ void qcow2_merge_backward_work(struct work_struct *work) /* Error */ tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = ret; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { /* Merge is canceled */ set_backward_merge_in_process(tgt, qcow2, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = -EINTR; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else { /* Finish merge */ tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_WAITING_COMPLETION); } if 
(tgt->backward_merge.eventfd_ctx) eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); @@ -279,6 +354,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) return -EBUSY; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_COMPLETING); tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -292,6 +368,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) qcow2_destroy(qcow2); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); return 0; } @@ -318,6 +395,7 @@ void qcow2_merge_backward_cancel(struct qcow2_target *tgt) } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { set_backward_merge_in_process(tgt, tgt->top, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); } mutex_unlock(&tgt->ctl_mutex); @@ -549,6 +627,13 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_merge_backward_update_eventfd(tgt, efd); goto out; + } else if (!strcmp(argv[1], "progress")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_progress(tgt, result, maxlen); + goto out; } } diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index ca43e13d35c34..5aa00c6a5ebd5 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -158,11 +158,26 @@ enum qcow2_backward_merge_state { BACKWARD_MERGE_STOP, }; +enum qcow2_backward_merge_stage { + BACKWARD_MERGE_STAGE_NONE = 0, + BACKWARD_MERGE_STAGE_START, + BACKWARD_MERGE_STAGE_BREAK_L1COW, + BACKWARD_MERGE_STAGE_SET_DIRTY, + BACKWARD_MERGE_STAGE_RUNNING, + BACKWARD_MERGE_STAGE_WAITING_COMPLETION, + BACKWARD_MERGE_STAGE_COMPLETING, + BACKWARD_MERGE_STAGE_FAIL, + BACKWARD_MERGE_STAGE_MAX, +}; + struct qcow2_backward_merge { struct work_struct work; enum qcow2_backward_merge_state state; int error; 
struct eventfd_ctx *eventfd_ctx; + enum qcow2_backward_merge_stage stage; + long long progress; + long long max_progress; }; struct qcow2_target { -- 2.48.1 From ptikhomirov at virtuozzo.com Thu Mar 6 08:21:09 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Thu, 6 Mar 2025 13:21:09 +0800 Subject: [Devel] [PATCH v6 VZ9 3/4] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <20250305114644.1765112-4-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-4-ptikhomirov@virtuozzo.com> Message-ID: <20250306052121.1930830-1-ptikhomirov@virtuozzo.com> This adds merge_backward "start", "complete", "update_eventfd" and "cancel" commands. By that we are able to split single merge_backward into two stages: start asynchronous merging and completion. That can be useful for restarting qemu process while allowing backward merging to run asynchronously in kernel. The "start" command runs merging preparations in workqueue work. After it finishes it sends event to eventfd set on "start", receiving event on eventfd the "complete" command can be called to finish the process and actually replace the top qcow2 with its lower. In case work encounters any errors or "cancel" request it will also send event to eventfd, calling "complete" after that will fail. Basically userspace is guaranteed to receive event from eventfd in any case after start. The "cancel" command forces the work to stop and flushes it. In case we are in completion waiting state already and there is no work running, the "cancel" command also reverts merging preparations. The "update_eventfd" command can be used to update eventfd for currently running merge, e.g. in case old eventfd was lost for some reason. This command on success guarantees that the caller will receive event from the new eventfd. If "update_eventfd" fails with -EBUSY, it means that there is no currently running merge in progress. Locking: Data in tgt->backward_merge is protected by tgt->ctl_mutex. 
The "start" and "complete" commands are fully under this lock, and the "cancel" operation takes the lock explicitly and releases it for work flushing. The work also takes the lock but only when updating tgt->backward_merge data. For checks, if the work was canceled in the middle, we read the state without locking as we don't modify the state there, also we would re-check the state again before exiting the work function under lock. Now on target suspend we "cancel" currently running backward merge, previously we were just hanging until backward merge has been finished for possibly a long time, cancelling seems cleaner. Though we don't really expect hypervisor suspending the target in the middle of backward merge that it by itself started. The backward_merge.eventfd_ctx is also protected from being released by tgt->ctl_mutex. Note: After this patch the backward merge runs in a workqueue and also the tgt->ctl_mutex is not held for a long time anymore, so we remove interruptible mutex wait, and replace pending signal checks in the middle of backward merge with checking "should stop" state. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov -- v2: Cancel from BACKWARD_MERGE_START state should not try to stop the work via BACKWARD_MERGE_STOP state, else we will deadlock in this state. Always report that work finished, e.g. also on error or when it was canceled, this should be more consistent from the userspace perspective. v5: Address Alexander's and Andrey's review: remove excess enum first element init, add backward merge error to qcow2_get_errors, add note about signals handling removal, merge eventfd into this patch and rework it to be easier for userspace to use. Unbind eventfd after sending event. v6: Fix condition in update_eventfd. 
--- drivers/md/dm-qcow2-cmd.c | 222 ++++++++++++++++++++++++++++++++--- drivers/md/dm-qcow2-target.c | 6 + drivers/md/dm-qcow2.h | 21 ++++ 3 files changed, 233 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 7b4b0ee68ad9f..cd416ffc18140 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "dm-qcow2.h" @@ -17,8 +19,10 @@ static int qcow2_get_errors(struct qcow2_target *tgt, char *result, unsigned int sz = 0; int ret; - ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n", - wants_check, tgt->md_writeback_error, tgt->truncate_error); + ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n" + "merge_backward_error=%d\n", + wants_check, tgt->md_writeback_error, tgt->truncate_error, + tgt->backward_merge.error); return ret ? 1 : 0; } @@ -52,6 +56,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); + static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) { @@ -63,7 +69,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); for (pos = 0; pos < end; pos += step) { - if (fatal_signal_pending(current)) { + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; } @@ -161,10 +167,14 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } -static int qcow2_merge_backward(struct qcow2_target *tgt) +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd); + +static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; - int ret, ret2; + int ret; + + 
lockdep_assert_held(&tgt->ctl_mutex); if (!lower) return -ENOENT; @@ -174,6 +184,43 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) return -EBADSLT; + + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) + return -EBUSY; + + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + if (ret) + return ret; + + tgt->backward_merge.state = BACKWARD_MERGE_START; + tgt->backward_merge.error = 0; + + schedule_work(&tgt->backward_merge.work); + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); + +void qcow2_merge_backward_work(struct work_struct *work) +{ + struct qcow2_target *tgt = container_of(work, struct qcow2_target, + backward_merge.work); + struct qcow2 *qcow2, *lower; + int ret, ret2; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + qcow2_merge_backward_set_eventfd(tgt, -1); + mutex_unlock(&tgt->ctl_mutex); + return; + } + tgt->backward_merge.state = BACKWARD_MERGE_RUN; + mutex_unlock(&tgt->ctl_mutex); + + qcow2 = tgt->top; + lower = qcow2->lower; + /* * Break all COW clus at L1 level. 
Otherwise, later * there would be problems with unusing them: @@ -183,13 +230,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - return ret; + goto out_err; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - return ret; + goto out_err; } set_backward_merge_in_process(tgt, qcow2, true); @@ -200,22 +247,129 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - return ret; } + +out_err: + mutex_lock(&tgt->ctl_mutex); + if (ret) { + /* Error */ + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = ret; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + /* Merge is canceled */ + set_backward_merge_in_process(tgt, qcow2, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = -EINTR; + } else { + /* Finish merge */ + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + } + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + qcow2_merge_backward_set_eventfd(tgt, -1); + mutex_unlock(&tgt->ctl_mutex); +} + +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) +{ + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; + int ret; + + lockdep_assert_held(&tgt->ctl_mutex); + + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) + return -EBUSY; + tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ qcow2_inflight_ref_switch(tgt); /* Pending qios */ qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ qcow2->lower = NULL; - ret2 = qcow2_set_image_file_features(qcow2, false); - if (ret2 < 0) - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); + ret = qcow2_set_image_file_features(qcow2, false); + if (ret < 0) + QC_ERR(tgt->ti, 
"Can't unuse merged img (%d)", ret); qcow2_destroy(qcow2); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); + +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) +{ + bool flush = false; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { + tgt->backward_merge.state = BACKWARD_MERGE_STOP; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { + set_backward_merge_in_process(tgt, tgt->top, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + } + mutex_unlock(&tgt->ctl_mutex); + + if (flush) + flush_work(&tgt->backward_merge.work); +} + +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) +{ + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; +} + +#define QCOW2_FILE_UNBIND -1 + +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd) +{ + struct eventfd_ctx *ctx = NULL; + + lockdep_assert_held(&tgt->ctl_mutex); + + ctx = efd == QCOW2_FILE_UNBIND ? 
NULL : eventfd_ctx_fdget(efd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + swap(ctx, tgt->backward_merge.eventfd_ctx); + if (ctx) + eventfd_ctx_put(ctx); + + return 0; +} + +static int qcow2_merge_backward_update_eventfd(struct qcow2_target *tgt, int efd) +{ + int ret; + + mutex_lock(&tgt->ctl_mutex); + if (efd != QCOW2_FILE_UNBIND && + (tgt->backward_merge.state != BACKWARD_MERGE_START && + tgt->backward_merge.state != BACKWARD_MERGE_RUN)) { + mutex_unlock(&tgt->ctl_mutex); + return -EBUSY; + } + + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + if (ret) { + mutex_unlock(&tgt->ctl_mutex); + return ret; + } + + mutex_unlock(&tgt->ctl_mutex); return 0; } -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { @@ -337,6 +491,7 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, struct qcow2_target *tgt = to_qcow2_target(ti); int ret = -EPERM; u32 val, val2; + int efd; if (!capable(CAP_SYS_ADMIN)) goto out; @@ -374,11 +529,30 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_get_event(tgt, result, maxlen); goto out; + } else if (!strcmp(argv[0], "merge_backward")) { + if (argc < 2) { + ret = -EINVAL; + goto out; + } + if (!strcmp(argv[1], "cancel")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + qcow2_merge_backward_cancel(tgt); + ret = 0; + goto out; + } else if (!strcmp(argv[1], "update_eventfd")) { + if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_update_eventfd(tgt, efd); + goto out; + } } - ret = mutex_lock_killable(&tgt->ctl_mutex); - if (ret) - goto out; + mutex_lock(&tgt->ctl_mutex); if (!strcmp(argv[0], "get_errors")) { ret = qcow2_get_errors(tgt, result, maxlen); @@ -388,11 +562,27 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcmp(argv[0], "merge_forward")) { ret = qcow2_merge_forward(tgt); } 
else if (!strcmp(argv[0], "merge_backward")) { - ret = qcow2_merge_backward(tgt); + /* argc >= 2 */ + if (!strcmp(argv[1], "start")) { + if (argc != 3 || kstrtoint(argv[2], 10, &efd) || efd < 0) { + ret = -EINVAL; + goto out_unlock; + } + ret = qcow2_merge_backward_start(tgt, efd); + } else if (!strcmp(argv[1], "complete")) { + if (argc != 2) { + ret = -EINVAL; + goto out_unlock; + } + ret = qcow2_merge_backward_complete(tgt); + } else { + ret = -ENOTTY; + } } else { ret = -ENOTTY; } +out_unlock: mutex_unlock(&tgt->ctl_mutex); out: return ret; diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c index 540c03cb3c44f..6e2e583ba0b8b 100644 --- a/drivers/md/dm-qcow2-target.c +++ b/drivers/md/dm-qcow2-target.c @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) mutex_lock(&tgt->ctl_mutex); tgt->service_operations_allowed = allowed; mutex_unlock(&tgt->ctl_mutex); + if (!allowed) + qcow2_merge_backward_cancel(tgt); } static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) { @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) /* Now kill the queue */ destroy_workqueue(tgt->wq); } + qcow2_merge_backward_cancel(tgt); mempool_destroy(tgt->qio_pool); mempool_destroy(tgt->qrq_pool); @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); ti->private = tgt; tgt->ti = ti; + + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); + qcow2_set_service_operations(ti, false); return tgt; diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index a89fe3db2196d..ca43e13d35c34 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "dm-core.h" #define DM_MSG_PREFIX "qcow2" @@ -149,6 +150,21 @@ struct md_page { struct list_head wpc_readers_wait_list; }; +enum qcow2_backward_merge_state { + BACKWARD_MERGE_STOPPED, + 
BACKWARD_MERGE_START, + BACKWARD_MERGE_RUN, + BACKWARD_MERGE_WAIT_COMPLETION, + BACKWARD_MERGE_STOP, +}; + +struct qcow2_backward_merge { + struct work_struct work; + enum qcow2_backward_merge_state state; + int error; + struct eventfd_ctx *eventfd_ctx; +}; + struct qcow2_target { struct dm_target *ti; #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ @@ -180,6 +196,8 @@ struct qcow2_target { struct work_struct event_work; spinlock_t event_lock; struct mutex ctl_mutex; + + struct qcow2_backward_merge backward_merge; }; enum { @@ -375,6 +393,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); int qcow2_truncate_safe(struct file *file, loff_t new_len); +void qcow2_merge_backward_work(struct work_struct *work); +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); + static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) { return ti->private; -- 2.48.1 From ptikhomirov at virtuozzo.com Thu Mar 6 08:29:02 2025 From: ptikhomirov at virtuozzo.com (Pavel Tikhomirov) Date: Thu, 6 Mar 2025 13:29:02 +0800 Subject: [Devel] [PATCH v6 VZ9 3/4] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <20250306052121.1930830-1-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-4-ptikhomirov@virtuozzo.com> <20250306052121.1930830-1-ptikhomirov@virtuozzo.com> Message-ID: <28d942c8-4864-479a-9ede-06ae025a63cb@virtuozzo.com> Note: This was detected by updated test: https://bitbucket.org/virtuozzocore/dm-qcow2-merge-backward-testing/src/master/test-v5.sh From andrey.zhadchenko at virtuozzo.com Thu Mar 6 11:13:33 2025 From: andrey.zhadchenko at virtuozzo.com (Andrey Zhadchenko) Date: Thu, 6 Mar 2025 09:13:33 +0100 Subject: [Devel] [PATCH v5 VZ9 0/4] dm-qcow2: make backward merge asyncronous In-Reply-To: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> References: <20250305114644.1765112-1-ptikhomirov@virtuozzo.com> 
Message-ID: Reviewed-by: Andrey Zhadchenko On 3/5/25 12:45, Pavel Tikhomirov wrote: > That can be usefull for restarting qemu process while allowing backward > merging to run asyncronously in kernel. > > v2: rebase on top of vz9.80.19, make completion event consistent, fix > deadlock when cancel after start and before work run > v3: weaken locking in progress printing a bit to decrease possible lock > contention > v5: add "start" stage, remove excess enum init, backward merge error to > qcow2_get_errors, note about signals handling removal, merge eventfd and > start, release eventfd after work finishes. > > Note: I didn't yet reworked bash test and ploop part. > > https://virtuozzo.atlassian.net/browse/VSTOR-100466 > Signed-off-by: Pavel Tikhomirov > > Pavel Tikhomirov (4): > dm-qcow2: fix warning about wrong printk format for size_t > dm-qcow2: cleanup error handling in qcow2_merge_backward > dm-qcow2: make merge_backward command asyncronous > dm-qcow2: add merge_backward progress command > > drivers/md/dm-qcow2-cmd.c | 323 ++++++++++++++++++++++++++++++++--- > drivers/md/dm-qcow2-map.c | 4 +- > drivers/md/dm-qcow2-target.c | 6 + > drivers/md/dm-qcow2.h | 36 ++++ > 4 files changed, 341 insertions(+), 28 deletions(-) > From khorenko at virtuozzo.com Mon Mar 10 20:05:16 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Mon, 10 Mar 2025 18:05:16 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] vhost/vsock: remove unused variable i in VHOST_RESET_OWNER ioctl In-Reply-To: <20250303073842.1209656-1-ptikhomirov@virtuozzo.com> Message-ID: <202503101705.52AH5GRZ1231517@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 5720376e819e0208821e2624cf9e79c80ec7215a Author: Pavel Tikhomirov Date: Mon Mar 3 15:37:04 2025 +0800 vhost/vsock: remove unused variable i in VHOST_RESET_OWNER ioctl Fixes compilation warning: 
drivers/vhost/vsock.c: In function ???vhost_vsock_reset_owner???: drivers/vhost/vsock.c:846:16: warning: unused variable ???i??? [-Wunused-variable] 846 | size_t i; | ^ Fixes: ad35221ad1341 ("vhost/vsock: add VHOST_RESET_OWNER ioctl") Signed-off-by: Pavel Tikhomirov Feature: vhost-vsock: VHOST_RESET_OWNER ioctl --- drivers/vhost/vsock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 3654fa0fd584..36750e163052 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -843,7 +843,6 @@ static int vhost_vsock_reset_owner(struct vhost_vsock *vsock) { struct vhost_iotlb *umem; long err; - size_t i; mutex_lock(&vsock->dev.mutex); err = vhost_dev_check_owner(&vsock->dev); From khorenko at virtuozzo.com Mon Mar 10 20:15:42 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Mon, 10 Mar 2025 18:15:42 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] dm-qcow2: fix warning about wrong printk format for size_t In-Reply-To: <20250305114644.1765112-2-ptikhomirov@virtuozzo.com> Message-ID: <202503101715.52AHFgAK1232888@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 2f6638650970b1fe73b0f737ca8d26a6fb67ab2e Author: Pavel Tikhomirov Date: Wed Mar 5 19:45:22 2025 +0800 dm-qcow2: fix warning about wrong printk format for size_t In file included from ./include/linux/kernel.h:20, from ./include/linux/list.h:9, from ./include/linux/preempt.h:12, from ./include/linux/spinlock.h:56, from drivers/md/dm-qcow2-map.c:5: drivers/md/dm-qcow2-map.c: In function ???process_compressed_read???: ./include/linux/kern_levels.h:5:25: warning: format ???%d??? expects argument of type ???int???, but argument 3 has type ???size_t??? 
{aka ???long unsigned int???} [-Wformat=] 5 | #define KERN_SOH "\001" /* ASCII Start Of Header */ | ^~~~~~ ./include/linux/printk.h:497:25: note: in definition of macro ???printk_index_wrap??? 497 | _p_func(_fmt, ##__VA_ARGS__); \ | ^~~~ ./include/linux/printk.h:568:9: note: in expansion of macro ???printk??? 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~ ./include/linux/kern_levels.h:11:25: note: in expansion of macro ???KERN_SOH??? 11 | #define KERN_ERR KERN_SOH "3" /* error conditions */ | ^~~~~~~~ ./include/linux/printk.h:568:16: note: in expansion of macro ???KERN_ERR??? 568 | printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__) | ^~~~~~~~ drivers/md/dm-qcow2.h:215:33: note: in expansion of macro ???pr_err??? 215 | #define QC_ERR(dmti, fmt, ...) pr_err (QCOW2_FMT(fmt), \ | ^~~~~~ drivers/md/dm-qcow2-map.c:3691:41: note: in expansion of macro ???QC_ERR??? 3691 | QC_ERR(qcow2->tgt->ti, | ^~~~~~ While on it fix line wrap alignment. Fixes: f3662b758e84 ("dm-qcow2: fixup printk argument type") https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Feature: dm-qcow2: block device over QCOW2 files driver -- v2: Rebase on top of vz9.80.19, "%lu" is also incorrect, see Documentation/core-api/printk-formats.rst. 
--- drivers/md/dm-qcow2-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c index 7a1312a74e9f..f7cb036bb416 100644 --- a/drivers/md/dm-qcow2-map.c +++ b/drivers/md/dm-qcow2-map.c @@ -3689,8 +3689,8 @@ static void process_compressed_read(struct list_head *read_list, buf = kvmalloc(qcow2->clu_size + dctxlen, GFP_NOIO); if (!buf) { QC_ERR(qcow2->tgt->ti, - "can not allocate decompression buffer:%lu", - qcow2->clu_size + dctxlen); + "can not allocate decompression buffer:%zu", + qcow2->clu_size + dctxlen); end_qios(read_list, BLK_STS_RESOURCE); return; } From khorenko at virtuozzo.com Mon Mar 10 20:17:44 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Mon, 10 Mar 2025 18:17:44 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] dm-qcow2: cleanup error handling in qcow2_merge_backward In-Reply-To: <20250305114644.1765112-3-ptikhomirov@virtuozzo.com> Message-ID: <202503101717.52AHHi9p1233923@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit b91360e9e9e5df5f6a8f9e3b4ab85b0adb598f7e Author: Pavel Tikhomirov Date: Wed Mar 5 19:45:23 2025 +0800 dm-qcow2: cleanup error handling in qcow2_merge_backward The label "out" is excess, lets remove it in accordance with: "If there is no cleanup needed then just return directly." 
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#centralized-exiting-of-functions https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Feature: dm-qcow2: block device over QCOW2 files driver --- drivers/md/dm-qcow2-cmd.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 6dc7e0722055..7b4b0ee68ad9 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -166,18 +166,14 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; int ret, ret2; - ret = -ENOENT; if (!lower) - goto out; - ret = -EACCES; + return -ENOENT; if (!(lower->file->f_mode & FMODE_WRITE)) - goto out; - ret = -EOPNOTSUPP; + return -EACCES; if (qcow2->clu_size != lower->clu_size) - goto out; - ret = -EBADSLT; + return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) - goto out; + return -EBADSLT; /* * Break all COW clus at L1 level. 
Otherwise, later * there would be problems with unusing them: @@ -187,13 +183,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - goto out; + return ret; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - goto out; + return ret; } set_backward_merge_in_process(tgt, qcow2, true); @@ -204,7 +200,7 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - goto out; + return ret; } tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -216,8 +212,8 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); qcow2_destroy(qcow2); -out: - return ret; + + return 0; } ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); From khorenko at virtuozzo.com Mon Mar 10 20:39:25 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Mon, 10 Mar 2025 18:39:25 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] dm-qcow2: make merge_backward command asyncronous In-Reply-To: <20250306052121.1930830-1-ptikhomirov@virtuozzo.com> Message-ID: <202503101739.52AHdPed1236859@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 0946cf907557e9c22294b0831bf760608e184416 Author: Pavel Tikhomirov Date: Thu Mar 6 13:21:09 2025 +0800 dm-qcow2: make merge_backward command asyncronous This adds merge_backward "start", "complete", "update_eventfd" and "cancel" commands. By that we are able to split single merge_backward into two stages: start asyncronous merging and completion. That can be usefull for restarting qemu process while allowing backward merging to run asyncronously in kernel. 
The "start" command runs merging preparations in workqueue work. After it finishes it sends event to eventfd set on "start", receiving event on eventfd the "complete" command can be called to finish the process and actually replace the top qcow2 with it's lower. In case work encounters any errors or "cancel" request it will also send event to eventfd, calling "complete" after that will fail. Basically userspace is guaranteed to receive event from eventfd in any case after start. The "cancel" command forces the work to stop and flushes it. In case we are in completion waiting state already and there is no work running, the "cancel" command also reverts merging preparations. The "update_eventfd" command can be used to update eventfd for currently running merge, e.g. in case old eventfd was lost for some reason. This command on success guarantees that the caller will receive event from the new eventfd. If "update_eventfd" fails with -EBUSY, it means that there is no currently running merge in progress. Locking: Data in tgt->backward_merge is protected by tgt->ctl_mutex. The "start" and "complete" commands are fully under this lock, and the "cancel" operation takes the lock explicitly and releases it for work flushing. The work also takes the lock but only when updating tgt->backward_merge data. For checks, if the work was caneled in the middle, we read the state without locking as we don't modify the state there, also we would re-check the state again before exiting the work function under lock. Now on target suspend we "cancel" currently running backward merge, previously we were just hanging untill backward merge have been finished for possibly a long time, cancelling seems cleaner. Though we don't really expect hypervisor suspending the target in the middle of backward merge that it by itself started. The backward_merge.eventfd_ctx is also protected from being released by tgt->ctl_mutex. 
Note: After this patch the backward merge runs in a workqueue and also the tgt->ctl_mutex is not held for a long time anymore, so we remove interruptible mutex wait, and replace pending signal checks in the middle of backward merge with checking "should stop" state. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Reviewed-by: Andrey Zhadchenko ====== Patchset description: dm-qcow2: make backward merge asyncronous That can be usefull for restarting qemu process while allowing backward merging to run asyncronously in kernel. Feature: dm-qcow2: block device over QCOW2 files driver --- drivers/md/dm-qcow2-cmd.c | 222 +++++++++++++++++++++++++++++++++++++++---- drivers/md/dm-qcow2-target.c | 6 ++ drivers/md/dm-qcow2.h | 21 ++++ 3 files changed, 233 insertions(+), 16 deletions(-) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index 7b4b0ee68ad9..cd416ffc1814 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include "dm-qcow2.h" @@ -17,8 +19,10 @@ static int qcow2_get_errors(struct qcow2_target *tgt, char *result, unsigned int sz = 0; int ret; - ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n", - wants_check, tgt->md_writeback_error, tgt->truncate_error); + ret = DMEMIT("wants_check=%d\nmd_writeback_error=%d\ntruncate_error=%d\n" + "merge_backward_error=%d\n", + wants_check, tgt->md_writeback_error, tgt->truncate_error, + tgt->backward_merge.error); return ret ? 
1 : 0; } @@ -52,6 +56,8 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); + static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, loff_t end, loff_t step, unsigned int bi_op, u8 qio_flags) { @@ -63,7 +69,7 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); for (pos = 0; pos < end; pos += step) { - if (fatal_signal_pending(current)) { + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; } @@ -161,10 +167,14 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, qcow2_submit_embedded_qios(tgt, &list); } -static int qcow2_merge_backward(struct qcow2_target *tgt) +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd); + +static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) { struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; - int ret, ret2; + int ret; + + lockdep_assert_held(&tgt->ctl_mutex); if (!lower) return -ENOENT; @@ -174,6 +184,43 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) return -EOPNOTSUPP; if (lower->hdr.size < qcow2->hdr.size) return -EBADSLT; + + if (tgt->backward_merge.state != BACKWARD_MERGE_STOPPED) + return -EBUSY; + + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + if (ret) + return ret; + + tgt->backward_merge.state = BACKWARD_MERGE_START; + tgt->backward_merge.error = 0; + + schedule_work(&tgt->backward_merge.work); + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_start, ERRNO); + +void qcow2_merge_backward_work(struct work_struct *work) +{ + struct qcow2_target *tgt = container_of(work, struct qcow2_target, + backward_merge.work); + struct qcow2 *qcow2, *lower; + int ret, ret2; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state != BACKWARD_MERGE_START) { + if (tgt->backward_merge.eventfd_ctx) + 
eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + qcow2_merge_backward_set_eventfd(tgt, -1); + mutex_unlock(&tgt->ctl_mutex); + return; + } + tgt->backward_merge.state = BACKWARD_MERGE_RUN; + mutex_unlock(&tgt->ctl_mutex); + + qcow2 = tgt->top; + lower = qcow2->lower; + /* * Break all COW clus at L1 level. Otherwise, later * there would be problems with unusing them: @@ -183,13 +230,13 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret = qcow2_break_l1cow(tgt); if (ret) { QC_ERR(tgt->ti, "Can't break L1 COW"); - return ret; + goto out_err; } ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); - return ret; + goto out_err; } set_backward_merge_in_process(tgt, qcow2, true); @@ -200,22 +247,129 @@ static int qcow2_merge_backward(struct qcow2_target *tgt) ret2 = qcow2_set_image_file_features(lower, false); if (ret2 < 0) QC_ERR(tgt->ti, "Can't unuse lower (%d)", ret2); - return ret; } + +out_err: + mutex_lock(&tgt->ctl_mutex); + if (ret) { + /* Error */ + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = ret; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + /* Merge is canceled */ + set_backward_merge_in_process(tgt, qcow2, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + tgt->backward_merge.error = -EINTR; + } else { + /* Finish merge */ + tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + } + if (tgt->backward_merge.eventfd_ctx) + eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); + qcow2_merge_backward_set_eventfd(tgt, -1); + mutex_unlock(&tgt->ctl_mutex); +} + +static int qcow2_merge_backward_complete(struct qcow2_target *tgt) +{ + struct qcow2 *qcow2 = tgt->top, *lower = qcow2->lower; + int ret; + + lockdep_assert_held(&tgt->ctl_mutex); + + if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) + return -EBUSY; + tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ qcow2_inflight_ref_switch(tgt); /* 
Pending qios */ qcow2_flush_deferred_activity(tgt, qcow2); /* Delayed md pages */ qcow2->lower = NULL; - ret2 = qcow2_set_image_file_features(qcow2, false); - if (ret2 < 0) - QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret2); + ret = qcow2_set_image_file_features(qcow2, false); + if (ret < 0) + QC_ERR(tgt->ti, "Can't unuse merged img (%d)", ret); qcow2_destroy(qcow2); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + + return 0; +} +ALLOW_ERROR_INJECTION(qcow2_merge_backward_complete, ERRNO); + +void qcow2_merge_backward_cancel(struct qcow2_target *tgt) +{ + bool flush = false; + + mutex_lock(&tgt->ctl_mutex); + if (tgt->backward_merge.state == BACKWARD_MERGE_STOPPED) { + mutex_unlock(&tgt->ctl_mutex); + return; + } + + if (tgt->backward_merge.state == BACKWARD_MERGE_START) { + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_RUN) { + tgt->backward_merge.state = BACKWARD_MERGE_STOP; + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { + flush = true; + } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { + set_backward_merge_in_process(tgt, tgt->top, false); + tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + } + mutex_unlock(&tgt->ctl_mutex); + + if (flush) + flush_work(&tgt->backward_merge.work); +} + +static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt) +{ + return READ_ONCE(tgt->backward_merge.state) == BACKWARD_MERGE_STOP; +} + +#define QCOW2_FILE_UNBIND -1 + +static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd) +{ + struct eventfd_ctx *ctx = NULL; + + lockdep_assert_held(&tgt->ctl_mutex); + + ctx = efd == QCOW2_FILE_UNBIND ? 
NULL : eventfd_ctx_fdget(efd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + swap(ctx, tgt->backward_merge.eventfd_ctx); + if (ctx) + eventfd_ctx_put(ctx); + + return 0; +} + +static int qcow2_merge_backward_update_eventfd(struct qcow2_target *tgt, int efd) +{ + int ret; + + mutex_lock(&tgt->ctl_mutex); + if (efd != QCOW2_FILE_UNBIND && + (tgt->backward_merge.state != BACKWARD_MERGE_START && + tgt->backward_merge.state != BACKWARD_MERGE_RUN)) { + mutex_unlock(&tgt->ctl_mutex); + return -EBUSY; + } + + ret = qcow2_merge_backward_set_eventfd(tgt, efd); + if (ret) { + mutex_unlock(&tgt->ctl_mutex); + return ret; + } + + mutex_unlock(&tgt->ctl_mutex); return 0; } -ALLOW_ERROR_INJECTION(qcow2_merge_backward, ERRNO); static struct qcow2 *qcow2_get_img(struct qcow2_target *tgt, u32 img_id, u8 *ref_index) { @@ -337,6 +491,7 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, struct qcow2_target *tgt = to_qcow2_target(ti); int ret = -EPERM; u32 val, val2; + int efd; if (!capable(CAP_SYS_ADMIN)) goto out; @@ -374,11 +529,30 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_get_event(tgt, result, maxlen); goto out; + } else if (!strcmp(argv[0], "merge_backward")) { + if (argc < 2) { + ret = -EINVAL; + goto out; + } + if (!strcmp(argv[1], "cancel")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + qcow2_merge_backward_cancel(tgt); + ret = 0; + goto out; + } else if (!strcmp(argv[1], "update_eventfd")) { + if (argc != 3 || kstrtoint(argv[2], 10, &efd)) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_update_eventfd(tgt, efd); + goto out; + } } - ret = mutex_lock_killable(&tgt->ctl_mutex); - if (ret) - goto out; + mutex_lock(&tgt->ctl_mutex); if (!strcmp(argv[0], "get_errors")) { ret = qcow2_get_errors(tgt, result, maxlen); @@ -388,11 +562,27 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcmp(argv[0], "merge_forward")) { ret = qcow2_merge_forward(tgt); } 
else if (!strcmp(argv[0], "merge_backward")) { - ret = qcow2_merge_backward(tgt); + /* argc >= 2 */ + if (!strcmp(argv[1], "start")) { + if (argc != 3 || kstrtoint(argv[2], 10, &efd) || efd < 0) { + ret = -EINVAL; + goto out_unlock; + } + ret = qcow2_merge_backward_start(tgt, efd); + } else if (!strcmp(argv[1], "complete")) { + if (argc != 2) { + ret = -EINVAL; + goto out_unlock; + } + ret = qcow2_merge_backward_complete(tgt); + } else { + ret = -ENOTTY; + } } else { ret = -ENOTTY; } +out_unlock: mutex_unlock(&tgt->ctl_mutex); out: return ret; diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c index 540c03cb3c44..6e2e583ba0b8 100644 --- a/drivers/md/dm-qcow2-target.c +++ b/drivers/md/dm-qcow2-target.c @@ -25,6 +25,8 @@ static void qcow2_set_service_operations(struct dm_target *ti, bool allowed) mutex_lock(&tgt->ctl_mutex); tgt->service_operations_allowed = allowed; mutex_unlock(&tgt->ctl_mutex); + if (!allowed) + qcow2_merge_backward_cancel(tgt); } static void qcow2_set_wants_suspend(struct dm_target *ti, bool wants) { @@ -251,6 +253,7 @@ static void qcow2_tgt_destroy(struct qcow2_target *tgt) /* Now kill the queue */ destroy_workqueue(tgt->wq); } + qcow2_merge_backward_cancel(tgt); mempool_destroy(tgt->qio_pool); mempool_destroy(tgt->qrq_pool); @@ -494,6 +497,9 @@ static struct qcow2_target *alloc_qcow2_target(struct dm_target *ti) timer_setup(&tgt->enospc_timer, qcow2_enospc_timer, 0); ti->private = tgt; tgt->ti = ti; + + INIT_WORK(&tgt->backward_merge.work, qcow2_merge_backward_work); + qcow2_set_service_operations(ti, false); return tgt; diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index a89fe3db2196..ca43e13d35c3 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "dm-core.h" #define DM_MSG_PREFIX "qcow2" @@ -149,6 +150,21 @@ struct md_page { struct list_head wpc_readers_wait_list; }; +enum qcow2_backward_merge_state { + BACKWARD_MERGE_STOPPED, + 
BACKWARD_MERGE_START, + BACKWARD_MERGE_RUN, + BACKWARD_MERGE_WAIT_COMPLETION, + BACKWARD_MERGE_STOP, +}; + +struct qcow2_backward_merge { + struct work_struct work; + enum qcow2_backward_merge_state state; + int error; + struct eventfd_ctx *eventfd_ctx; +}; + struct qcow2_target { struct dm_target *ti; #define QCOW2_QRQ_POOL_SIZE 512 /* Twice nr_requests from blk_mq_init_sched() */ @@ -180,6 +196,8 @@ struct qcow2_target { struct work_struct event_work; spinlock_t event_lock; struct mutex ctl_mutex; + + struct qcow2_backward_merge backward_merge; }; enum { @@ -375,6 +393,9 @@ int qcow2_inflight_ref_switch(struct qcow2_target *tgt); void qcow2_flush_deferred_activity(struct qcow2_target *tgt, struct qcow2 *qcow2); int qcow2_truncate_safe(struct file *file, loff_t new_len); +void qcow2_merge_backward_work(struct work_struct *work); +void qcow2_merge_backward_cancel(struct qcow2_target *tgt); + static inline struct qcow2_target *to_qcow2_target(struct dm_target *ti) { return ti->private; From khorenko at virtuozzo.com Mon Mar 10 20:39:26 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Mon, 10 Mar 2025 18:39:26 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] dm-qcow2: add merge_backward progress command In-Reply-To: <20250305114644.1765112-5-ptikhomirov@virtuozzo.com> Message-ID: <202503101739.52AHdQIF1236930@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 4a2265edbd0d2e6313e7a145a6499a99192f2795 Author: Pavel Tikhomirov Date: Wed Mar 5 19:45:25 2025 +0800 dm-qcow2: add merge_backward progress command This allows to see progress of backward merge. It shows the stage we are at and for iterative stages it provides progress in form of how many iteratious are done and how many iterations there are in total. 
Locking: The progress data consistency is protected by tgt->ctl_mutex, we always update stage and error consistently under lock. Inside iterative stages for progress updating we have xchg instead of lock so that changes to progress are atomic and imply memory barrier (this way we would not see progress greater than max_progress in progress reporting), but at the same time there is less contention on tgt->ctl_mutex. https://virtuozzo.atlassian.net/browse/VSTOR-100466 Signed-off-by: Pavel Tikhomirov Reviewed-by: Andrey Zhadchenko ====== Patchset description: dm-qcow2: make backward merge asyncronous That can be usefull for restarting qemu process while allowing backward merging to run asyncronously in kernel. Feature: dm-qcow2: block device over QCOW2 files driver --- drivers/md/dm-qcow2-cmd.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/md/dm-qcow2.h | 15 +++++++++ 2 files changed, 100 insertions(+) diff --git a/drivers/md/dm-qcow2-cmd.c b/drivers/md/dm-qcow2-cmd.c index cd416ffc1814..89828f64cfe3 100644 --- a/drivers/md/dm-qcow2-cmd.c +++ b/drivers/md/dm-qcow2-cmd.c @@ -56,6 +56,10 @@ static void service_qio_endio(struct qcow2_target *tgt, struct qio *qio, wake_up(&tgt->service_wq); } +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress); +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress); static bool qcow2_backward_merge_should_stop(struct qcow2_target *tgt); static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, @@ -68,7 +72,10 @@ static int qcow2_service_iter(struct qcow2_target *tgt, struct qcow2 *qcow2, WRITE_ONCE(service_status, BLK_STS_OK); + backward_merge_update_max_progress(tgt, end); for (pos = 0; pos < end; pos += step) { + backward_merge_update_progress(tgt, pos); + if (qcow2_backward_merge_should_stop(tgt)) { ret = -EINTR; break; @@ -167,6 +174,67 @@ static void set_backward_merge_in_process(struct qcow2_target *tgt, 
qcow2_submit_embedded_qios(tgt, &list); } +static void __backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + tgt->backward_merge.stage = stage; + tgt->backward_merge.progress = 0; + tgt->backward_merge.max_progress = 0; +} + +static void backward_merge_update_stage(struct qcow2_target *tgt, + enum qcow2_backward_merge_stage stage) +{ + mutex_lock(&tgt->ctl_mutex); + __backward_merge_update_stage(tgt, stage); + mutex_unlock(&tgt->ctl_mutex); +} + +static void backward_merge_update_max_progress(struct qcow2_target *tgt, + long long max_progress) +{ + xchg(&tgt->backward_merge.max_progress, max_progress); +} + +static void backward_merge_update_progress(struct qcow2_target *tgt, + long long progress) +{ + xchg(&tgt->backward_merge.progress, progress); +} + +char *backward_merge_stage_names[] = { + "none", + "start", + "break_l1cow", + "set_dirty", + "running", + "waiting_completion", + "completing", + "fail", +}; + +static int qcow2_merge_backward_progress(struct qcow2_target *tgt, + char *result, unsigned int maxlen) +{ + struct qcow2_backward_merge backward_merge; + unsigned int sz = 0; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(backward_merge_stage_names) != BACKWARD_MERGE_STAGE_MAX); + + mutex_lock(&tgt->ctl_mutex); + backward_merge = tgt->backward_merge; + mutex_unlock(&tgt->ctl_mutex); + + ret = DMEMIT("stage=%s\nprogress=%lld\nmax_progress=%lld\nerror=%d\n", + backward_merge_stage_names[backward_merge.stage], + backward_merge.progress, + backward_merge.max_progress, + backward_merge.error); + + return ret ? 
1 : 0; +} + static int qcow2_merge_backward_set_eventfd(struct qcow2_target *tgt, int efd); static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) @@ -193,6 +261,7 @@ static int qcow2_merge_backward_start(struct qcow2_target *tgt, int efd) return ret; tgt->backward_merge.state = BACKWARD_MERGE_START; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_START); tgt->backward_merge.error = 0; schedule_work(&tgt->backward_merge.work); @@ -216,6 +285,7 @@ void qcow2_merge_backward_work(struct work_struct *work) return; } tgt->backward_merge.state = BACKWARD_MERGE_RUN; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_BREAK_L1COW); mutex_unlock(&tgt->ctl_mutex); qcow2 = tgt->top; @@ -233,6 +303,7 @@ void qcow2_merge_backward_work(struct work_struct *work) goto out_err; } + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_SET_DIRTY); ret = qcow2_set_image_file_features(lower, true); if (ret) { QC_ERR(tgt->ti, "Can't set dirty bit"); @@ -241,6 +312,7 @@ void qcow2_merge_backward_work(struct work_struct *work) set_backward_merge_in_process(tgt, qcow2, true); /* Start merge */ + backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_RUNNING); ret = qcow2_merge_common(tgt); if (ret) { set_backward_merge_in_process(tgt, qcow2, false); @@ -255,14 +327,17 @@ void qcow2_merge_backward_work(struct work_struct *work) /* Error */ tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = ret; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else if (tgt->backward_merge.state == BACKWARD_MERGE_STOP) { /* Merge is canceled */ set_backward_merge_in_process(tgt, qcow2, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; tgt->backward_merge.error = -EINTR; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_FAIL); } else { /* Finish merge */ tgt->backward_merge.state = BACKWARD_MERGE_WAIT_COMPLETION; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_WAITING_COMPLETION); } if 
(tgt->backward_merge.eventfd_ctx) eventfd_signal(tgt->backward_merge.eventfd_ctx, 1); @@ -279,6 +354,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) if (tgt->backward_merge.state != BACKWARD_MERGE_WAIT_COMPLETION) return -EBUSY; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_COMPLETING); tgt->top = lower; smp_wmb(); /* Pairs with qcow2_ref_inc() */ @@ -292,6 +368,7 @@ static int qcow2_merge_backward_complete(struct qcow2_target *tgt) qcow2_destroy(qcow2); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); return 0; } @@ -318,6 +395,7 @@ void qcow2_merge_backward_cancel(struct qcow2_target *tgt) } else if (tgt->backward_merge.state == BACKWARD_MERGE_WAIT_COMPLETION) { set_backward_merge_in_process(tgt, tgt->top, false); tgt->backward_merge.state = BACKWARD_MERGE_STOPPED; + __backward_merge_update_stage(tgt, BACKWARD_MERGE_STAGE_NONE); } mutex_unlock(&tgt->ctl_mutex); @@ -549,6 +627,13 @@ int qcow2_message(struct dm_target *ti, unsigned int argc, char **argv, } ret = qcow2_merge_backward_update_eventfd(tgt, efd); goto out; + } else if (!strcmp(argv[1], "progress")) { + if (argc != 2) { + ret = -EINVAL; + goto out; + } + ret = qcow2_merge_backward_progress(tgt, result, maxlen); + goto out; } } diff --git a/drivers/md/dm-qcow2.h b/drivers/md/dm-qcow2.h index ca43e13d35c3..5aa00c6a5ebd 100644 --- a/drivers/md/dm-qcow2.h +++ b/drivers/md/dm-qcow2.h @@ -158,11 +158,26 @@ enum qcow2_backward_merge_state { BACKWARD_MERGE_STOP, }; +enum qcow2_backward_merge_stage { + BACKWARD_MERGE_STAGE_NONE = 0, + BACKWARD_MERGE_STAGE_START, + BACKWARD_MERGE_STAGE_BREAK_L1COW, + BACKWARD_MERGE_STAGE_SET_DIRTY, + BACKWARD_MERGE_STAGE_RUNNING, + BACKWARD_MERGE_STAGE_WAITING_COMPLETION, + BACKWARD_MERGE_STAGE_COMPLETING, + BACKWARD_MERGE_STAGE_FAIL, + BACKWARD_MERGE_STAGE_MAX, +}; + struct qcow2_backward_merge { struct work_struct work; enum qcow2_backward_merge_state state; int error; struct 
eventfd_ctx *eventfd_ctx; + enum qcow2_backward_merge_stage stage; + long long progress; + long long max_progress; }; struct qcow2_target { From khorenko at virtuozzo.com Tue Mar 11 16:55:07 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 14:55:07 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse: clear splice buffers from response to a killed request In-Reply-To: <20250228144726.29188-1-kui.liu@virtuozzo.com> Message-ID: <202503111355.52BDt7LG1253406@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit b29b3a7a24abe13d2aee4a256c13f3a045b65d1a Author: Liu Kui Date: Fri Feb 28 22:47:26 2025 +0800 fs/fuse: clear splice buffers from response to a killed request Normally we just ignore the response to a killed request, however if there are splice buffers returned with the response, we must clear these splice buffers before returning them to userspace. 
Fixed #VSTOR-100385 https://virtuozzo.atlassian.net/browse/VSTOR-100385 Signed-off-by: Liu Kui Acked-by: Alexey Kuznetsov Feature: fuse: enhanced splice support --- fs/fuse/dev.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cc6b9f348cf6..118613f17b10 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1986,6 +1986,11 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, { unsigned reqsize = sizeof(struct fuse_out_header); + if (unlikely(args->killed)) { + cs->req->out.h.error = -EIO; + return 0; + } + reqsize += fuse_len_args(args->out_numargs, args->out_args); if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) @@ -2081,6 +2086,9 @@ static int copy_out_splices(struct fuse_copy_state *cs, struct fuse_args *args, int ioff = pipe->bufs[tail & mask].offset; int ilen = pipe->bufs[tail & mask].len; + if (unlikely(args->killed)) + goto skip_copy; + while (ilen > 0) { int copy = ilen; @@ -2105,6 +2113,7 @@ static int copy_out_splices(struct fuse_copy_state *cs, struct fuse_args *args, ioff += copy; ilen -= copy; } +skip_copy: put_page(ipage); pipe->bufs[tail & mask].ops = NULL; pipe->bufs[tail & mask].page = NULL; @@ -2119,7 +2128,9 @@ static int copy_out_splices(struct fuse_copy_state *cs, struct fuse_args *args, } } - if (args->page_zeroing && didx < ap->num_pages) { + if (unlikely(args->killed)) { + cs->req->out.h.error = -EIO; + } else if (args->page_zeroing && didx < ap->num_pages) { if (doff < dend) { void *dst = kmap_atomic(dpage); @@ -2159,6 +2170,11 @@ static int copy_out_krpczc(struct fuse_copy_state *cs, struct fuse_args *args, void *dst; int err; + if (unlikely(args->killed)) { + cs->req->out.h.error = -EIO; + return 0; + } + if (args->out_numargs != 1 || !args->out_pages) return -EINVAL; @@ -2338,10 +2354,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, if (!req->args->page_replace) cs->move_pages = 0; - if (req->args->killed) { - 
err = 0; - req->out.h.error = -EIO; - } else if (oh.error == FUSE_OUT_SPLICES) { + if (oh.error == FUSE_OUT_SPLICES) { req->out.h.error = 0; err = copy_out_splices(cs, req->args, nbytes); } else if (oh.error == FUSE_OUT_KRPCZC) { From khorenko at virtuozzo.com Tue Mar 11 19:33:39 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:39 +0100 Subject: [Devel] [PATCH vz9 1/6] ms/xfrm: interface: use DEV_STATS_INC() In-Reply-To: <20250311163344.1534122-1-khorenko@virtuozzo.com> References: <20250311163344.1534122-1-khorenko@virtuozzo.com> Message-ID: <20250311163344.1534122-2-khorenko@virtuozzo.com> From: Sabrina Dubroca JIRA: https://issues.redhat.com/browse/RHEL-31751 commit f7c4e3e5d4f6609b4725a97451948ca2e425379a Author: Eric Dumazet Date: Tue Sep 5 13:23:03 2023 +0000 xfrm: interface: use DEV_STATS_INC() syzbot/KCSAN reported data-races in xfrm whenever dev->stats fields are updated. It appears all of these updates can happen from multiple cpus. Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. 
BUG: KCSAN: data-race in xfrmi_xmit / xfrmi_xmit read-write to 0xffff88813726b160 of 8 bytes by task 23986 on cpu 1: xfrmi_xmit+0x74e/0xb20 net/xfrm/xfrm_interface_core.c:583 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 __dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 dev_queue_xmit include/linux/netdevice.h:3082 [inline] neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1581 neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0x74a/0x850 net/ipv4/ip_output.c:230 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:293 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:432 dst_output include/net/dst.h:458 [inline] ip_local_out net/ipv4/ip_output.c:127 [inline] ip_send_skb+0x72/0xe0 net/ipv4/ip_output.c:1487 udp_send_skb+0x6a4/0x990 net/ipv4/udp.c:963 udp_sendmsg+0x1249/0x12d0 net/ipv4/udp.c:1246 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:840 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2540 ___sys_sendmsg net/socket.c:2594 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2680 __do_sys_sendmmsg net/socket.c:2709 [inline] __se_sys_sendmmsg net/socket.c:2706 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read-write to 0xffff88813726b160 of 8 bytes by task 23987 on cpu 0: xfrmi_xmit+0x74e/0xb20 net/xfrm/xfrm_interface_core.c:583 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 __dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 dev_queue_xmit 
include/linux/netdevice.h:3082 [inline] neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1581 neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0x74a/0x850 net/ipv4/ip_output.c:230 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:293 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:432 dst_output include/net/dst.h:458 [inline] ip_local_out net/ipv4/ip_output.c:127 [inline] ip_send_skb+0x72/0xe0 net/ipv4/ip_output.c:1487 udp_send_skb+0x6a4/0x990 net/ipv4/udp.c:963 udp_sendmsg+0x1249/0x12d0 net/ipv4/udp.c:1246 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:840 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2540 ___sys_sendmsg net/socket.c:2594 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2680 __do_sys_sendmmsg net/socket.c:2709 [inline] __se_sys_sendmmsg net/socket.c:2706 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00000000000010d7 -> 0x00000000000010d8 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 23987 Comm: syz-executor.5 Not tainted 6.5.0-syzkaller-10885-g0468be89b3fa #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 6b50b0bac39b13db9edce8899ee4de363dc95ef9) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- net/xfrm/xfrm_interface_core.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 
3baf81d9974c..47e502310c5e 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -334,8 +334,8 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) skb->dev = dev; if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_dropped); return 0; } @@ -380,7 +380,6 @@ static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; @@ -427,7 +426,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) tdev = dst->dev; if (tdev == dev) { - stats->collisions++; + DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; @@ -466,13 +465,13 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { - stats->tx_errors++; - stats->tx_aborted_errors++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); @@ -482,7 +481,6 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; @@ -499,7 +497,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } 
skb_dst_set(skb, dst); @@ -515,7 +513,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); @@ -534,8 +532,8 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_err: - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:33:38 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:38 +0100 Subject: [Devel] [PATCH vz9 0/6] net: esp: fix bad handling of pages from page_pool Message-ID: <20250311163344.1534122-1-khorenko@virtuozzo.com> We have experienced multiple crashes due to memory corruptions, which happend after multiple kernel complains about "BUG: Bad page state". Using debug commit dba1b8a7ab68 ("mm/page_pool: catch page_pool memory leaks") we have caught the detailed reason: page dumped because: page_pool leak which should probably be fixed with ms commit: c3198822c6cb ("net: esp: fix bad handling of pages from page_pool") RHEL9.5 has already backported this commit along with several others, so let's backport all of those commits in order to simplify later rebase. 
https://virtuozzo.atlassian.net/browse/ASUP-1064 https://virtuozzo.atlassian.net/browse/VSTOR-101702 Sabrina Dubroca (6): ms/xfrm: interface: use DEV_STATS_INC() ms/xfrm: fix a data-race in xfrm_gen_index() ms/xfrm: annotate data-race around use_time ms/xfrm: fix a data-race in xfrm_lookup_with_ifid() ms/net: skbuff: don't include to ms/net: esp: fix bad handling of pages from page_pool include/linux/skbuff.h | 15 ++++++++++-- include/net/netns/xfrm.h | 1 + include/net/page_pool/types.h | 2 -- net/core/page_pool.c | 39 ----------------------------- net/core/skbuff.c | 45 ++++++++++++++++++++++++++++++++-- net/ipv4/esp4.c | 8 +++--- net/ipv6/esp6.c | 8 +++--- net/xfrm/xfrm_interface_core.c | 22 ++++++++--------- net/xfrm/xfrm_policy.c | 19 +++++++------- net/xfrm/xfrm_state.c | 10 ++++---- 10 files changed, 90 insertions(+), 79 deletions(-) -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:33:40 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:40 +0100 Subject: [Devel] [PATCH vz9 2/6] ms/xfrm: fix a data-race in xfrm_gen_index() In-Reply-To: <20250311163344.1534122-1-khorenko@virtuozzo.com> References: <20250311163344.1534122-1-khorenko@virtuozzo.com> Message-ID: <20250311163344.1534122-3-khorenko@virtuozzo.com> From: Sabrina Dubroca JIRA: https://issues.redhat.com/browse/RHEL-31751 commit 3e4bc23926b83c3c67e5f61ae8571602754131a6 Author: Eric Dumazet Date: Fri Sep 8 18:13:59 2023 +0000 xfrm: fix a data-race in xfrm_gen_index() xfrm_gen_index() mutual exclusion uses net->xfrm.xfrm_policy_lock. This means we must use a per-netns idx_generator variable, instead of a static one. Alternative would be to use an atomic variable. 
syzbot reported: BUG: KCSAN: data-race in xfrm_sk_policy_insert / xfrm_sk_policy_insert write to 0xffffffff87005938 of 4 bytes by task 29466 on cpu 0: xfrm_gen_index net/xfrm/xfrm_policy.c:1385 [inline] xfrm_sk_policy_insert+0x262/0x640 net/xfrm/xfrm_policy.c:2347 xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffffff87005938 of 4 bytes by task 29460 on cpu 1: xfrm_sk_policy_insert+0x13e/0x640 xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00006ad8 -> 0x00006b18 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 29460 Comm: syz-executor.1 Not tainted 6.5.0-rc5-syzkaller-00243-g9106536c1aa3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Fixes: 1121994c803f ("netns xfrm: policy insertion in netns") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Herbert Xu 
Acked-by: Herbert Xu Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 1a178039befae55a1518ce39da04e221b60b8544) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- include/net/netns/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index bd7c3be4af5d..423b52eca908 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -50,6 +50,7 @@ struct netns_xfrm { struct list_head policy_all; struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; + unsigned int idx_generator; struct hlist_head policy_inexact[XFRM_POLICY_MAX]; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index aa7f6615ecc5..2f615a3cc70b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1375,8 +1375,6 @@ EXPORT_SYMBOL(xfrm_policy_hash_rebuild); * of an absolute inpredictability of ordering of rules. This will not pass. 
*/ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { - static u32 idx_generator; - for (;;) { struct hlist_head *list; struct xfrm_policy *p; @@ -1384,8 +1382,8 @@ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) int found; if (!index) { - idx = (idx_generator | dir); - idx_generator += 8; + idx = (net->xfrm.idx_generator | dir); + net->xfrm.idx_generator += 8; } else { idx = index; index = 0; -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:33:41 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:41 +0100 Subject: [Devel] [PATCH vz9 3/6] ms/xfrm: annotate data-race around use_time In-Reply-To: <20250311163344.1534122-1-khorenko@virtuozzo.com> References: <20250311163344.1534122-1-khorenko@virtuozzo.com> Message-ID: <20250311163344.1534122-4-khorenko@virtuozzo.com> From: Sabrina Dubroca JIRA: https://issues.redhat.com/browse/RHEL-31751 commit 0a9e5794b21e2d1303759ff8fe5f9215db7757ba Author: Eric Dumazet Date: Thu Jan 26 11:21:30 2023 +0000 xfrm: annotate data-race around use_time KCSAN reported multiple cpus can update use_time at the same time. Adds READ_ONCE()/WRITE_ONCE() annotations. Note that 32bit arches are not fully protected, but they will probably no longer be supported/used in 2106. 
BUG: KCSAN: data-race in __xfrm_policy_check / __xfrm_policy_check write to 0xffff88813e7ec108 of 8 bytes by interrupt on cpu 0: __xfrm_policy_check+0x6ae/0x17f0 net/xfrm/xfrm_policy.c:3664 __xfrm_policy_check2 include/net/xfrm.h:1174 [inline] xfrm_policy_check include/net/xfrm.h:1179 [inline] xfrm6_policy_check+0x2e9/0x320 include/net/xfrm.h:1189 udpv6_queue_rcv_one_skb+0x48/0xa30 net/ipv6/udp.c:703 udpv6_queue_rcv_skb+0x2d6/0x310 net/ipv6/udp.c:792 udp6_unicast_rcv_skb+0x16b/0x190 net/ipv6/udp.c:935 __udp6_lib_rcv+0x84b/0x9b0 net/ipv6/udp.c:1020 udpv6_rcv+0x4b/0x50 net/ipv6/udp.c:1133 ip6_protocol_deliver_rcu+0x99e/0x1020 net/ipv6/ip6_input.c:439 ip6_input_finish net/ipv6/ip6_input.c:484 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] ip6_input+0xca/0x180 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:454 [inline] ip6_rcv_finish+0x1e9/0x2d0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:302 [inline] ipv6_rcv+0x85/0x140 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5482 [inline] __netif_receive_skb+0x8b/0x1b0 net/core/dev.c:5596 process_backlog+0x23f/0x3b0 net/core/dev.c:5924 __napi_poll+0x65/0x390 net/core/dev.c:6485 napi_poll net/core/dev.c:6552 [inline] net_rx_action+0x37e/0x730 net/core/dev.c:6663 __do_softirq+0xf2/0x2c7 kernel/softirq.c:571 do_softirq+0xb1/0xf0 kernel/softirq.c:472 __local_bh_enable_ip+0x6f/0x80 kernel/softirq.c:396 __raw_read_unlock_bh include/linux/rwlock_api_smp.h:257 [inline] _raw_read_unlock_bh+0x17/0x20 kernel/locking/spinlock.c:284 wg_socket_send_skb_to_peer+0x107/0x120 drivers/net/wireguard/socket.c:184 wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline] wg_packet_tx_worker+0x142/0x360 drivers/net/wireguard/send.c:276 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 write to 0xffff88813e7ec108 of 8 bytes by interrupt on cpu 
1: __xfrm_policy_check+0x6ae/0x17f0 net/xfrm/xfrm_policy.c:3664 __xfrm_policy_check2 include/net/xfrm.h:1174 [inline] xfrm_policy_check include/net/xfrm.h:1179 [inline] xfrm6_policy_check+0x2e9/0x320 include/net/xfrm.h:1189 udpv6_queue_rcv_one_skb+0x48/0xa30 net/ipv6/udp.c:703 udpv6_queue_rcv_skb+0x2d6/0x310 net/ipv6/udp.c:792 udp6_unicast_rcv_skb+0x16b/0x190 net/ipv6/udp.c:935 __udp6_lib_rcv+0x84b/0x9b0 net/ipv6/udp.c:1020 udpv6_rcv+0x4b/0x50 net/ipv6/udp.c:1133 ip6_protocol_deliver_rcu+0x99e/0x1020 net/ipv6/ip6_input.c:439 ip6_input_finish net/ipv6/ip6_input.c:484 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] ip6_input+0xca/0x180 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:454 [inline] ip6_rcv_finish+0x1e9/0x2d0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:302 [inline] ipv6_rcv+0x85/0x140 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5482 [inline] __netif_receive_skb+0x8b/0x1b0 net/core/dev.c:5596 process_backlog+0x23f/0x3b0 net/core/dev.c:5924 __napi_poll+0x65/0x390 net/core/dev.c:6485 napi_poll net/core/dev.c:6552 [inline] net_rx_action+0x37e/0x730 net/core/dev.c:6663 __do_softirq+0xf2/0x2c7 kernel/softirq.c:571 do_softirq+0xb1/0xf0 kernel/softirq.c:472 __local_bh_enable_ip+0x6f/0x80 kernel/softirq.c:396 __raw_read_unlock_bh include/linux/rwlock_api_smp.h:257 [inline] _raw_read_unlock_bh+0x17/0x20 kernel/locking/spinlock.c:284 wg_socket_send_skb_to_peer+0x107/0x120 drivers/net/wireguard/socket.c:184 wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline] wg_packet_tx_worker+0x142/0x360 drivers/net/wireguard/send.c:276 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0000000063c62d6f -> 0x0000000063c62d70 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 4185 Comm: kworker/1:2 Tainted: G W 
6.2.0-rc4-syzkaller-00009-gd532dd102151-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: wg-crypt-wg0 wg_packet_tx_worker Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit c137a781a237e3be2fb178b0d65d1385ac6d5068) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- net/xfrm/xfrm_policy.c | 11 +++++++---- net/xfrm/xfrm_state.c | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2f615a3cc70b..52d45c5a7fb0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -3670,7 +3670,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. 
*/ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3685,7 +3686,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. */ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 821e4ea52f3f..f9e56d481562 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -595,7 +595,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } if (x->lft.hard_use_expires_seconds) { time64_t tmo = x->lft.hard_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -617,7 +617,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } if (x->lft.soft_use_expires_seconds) { time64_t tmo = x->lft.soft_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + (READ_ONCE(x->curlft.use_time) ? 
: now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) @@ -1902,7 +1902,7 @@ int xfrm_state_update(struct xfrm_state *x) hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); - if (x1->curlft.use_time) + if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { @@ -1936,8 +1936,8 @@ int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_curlft(x); - if (!x->curlft.use_time) - x->curlft.use_time = ktime_get_real_seconds(); + if (!READ_ONCE(x->curlft.use_time)) + WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:33:42 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:42 +0100 Subject: [Devel] [PATCH vz9 4/6] ms/xfrm: fix a data-race in xfrm_lookup_with_ifid() In-Reply-To: <20250311163344.1534122-1-khorenko@virtuozzo.com> References: <20250311163344.1534122-1-khorenko@virtuozzo.com> Message-ID: <20250311163344.1534122-5-khorenko@virtuozzo.com> From: Sabrina Dubroca JIRA: https://issues.redhat.com/browse/RHEL-31751 commit de5724ca38fd5e442bae9c1fab31942b6544012d Author: Eric Dumazet Date: Wed Oct 11 10:24:29 2023 +0000 xfrm: fix a data-race in xfrm_lookup_with_ifid() syzbot complains about a race in xfrm_lookup_with_ifid() [1] When preparing commit 0a9e5794b21e ("xfrm: annotate data-race around use_time") I thought xfrm_lookup_with_ifid() was modifying a still private structure. 
[1] BUG: KCSAN: data-race in xfrm_lookup_with_ifid / xfrm_lookup_with_ifid write to 0xffff88813ea41108 of 8 bytes by task 8150 on cpu 1: xfrm_lookup_with_ifid+0xce7/0x12d0 net/xfrm/xfrm_policy.c:3218 xfrm_lookup net/xfrm/xfrm_policy.c:3270 [inline] xfrm_lookup_route+0x3b/0x100 net/xfrm/xfrm_policy.c:3281 ip6_dst_lookup_flow+0x98/0xc0 net/ipv6/ip6_output.c:1246 send6+0x241/0x3c0 drivers/net/wireguard/socket.c:139 wg_socket_send_skb_to_peer+0xbd/0x130 drivers/net/wireguard/socket.c:178 wg_socket_send_buffer_to_peer+0xd6/0x100 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation drivers/net/wireguard/send.c:40 [inline] wg_packet_handshake_send_worker+0x10c/0x150 drivers/net/wireguard/send.c:51 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2703 worker_thread+0x525/0x730 kernel/workqueue.c:2784 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 write to 0xffff88813ea41108 of 8 bytes by task 15867 on cpu 0: xfrm_lookup_with_ifid+0xce7/0x12d0 net/xfrm/xfrm_policy.c:3218 xfrm_lookup net/xfrm/xfrm_policy.c:3270 [inline] xfrm_lookup_route+0x3b/0x100 net/xfrm/xfrm_policy.c:3281 ip6_dst_lookup_flow+0x98/0xc0 net/ipv6/ip6_output.c:1246 send6+0x241/0x3c0 drivers/net/wireguard/socket.c:139 wg_socket_send_skb_to_peer+0xbd/0x130 drivers/net/wireguard/socket.c:178 wg_socket_send_buffer_to_peer+0xd6/0x100 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation drivers/net/wireguard/send.c:40 [inline] wg_packet_handshake_send_worker+0x10c/0x150 drivers/net/wireguard/send.c:51 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2703 worker_thread+0x525/0x730 kernel/workqueue.c:2784 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 value 
changed: 0x00000000651cd9d1 -> 0x00000000651cd9d2 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15867 Comm: kworker/u4:58 Not tainted 6.6.0-rc4-syzkaller-00016-g5e62ed3b1c8a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/06/2023 Workqueue: wg-kex-wg2 wg_packet_handshake_send_worker Fixes: 0a9e5794b21e ("xfrm: annotate data-race around use_time") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit a8d0aa5863643d67a2a61a9e9d6135a24d248f37) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 52d45c5a7fb0..ce785a556cef 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3221,7 +3221,7 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, } for (i = 0; i < num_pols; i++) - pols[i]->curlft.use_time = ktime_get_real_seconds(); + WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds()); if (num_xfrms < 0) { /* Prohibit the flow */ -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:33:44 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:44 +0100 Subject: [Devel] [PATCH vz9 6/6] ms/net: esp: fix bad handling of pages from page_pool In-Reply-To: <20250311163344.1534122-1-khorenko@virtuozzo.com> References: <20250311163344.1534122-1-khorenko@virtuozzo.com> Message-ID: <20250311163344.1534122-7-khorenko@virtuozzo.com> From: Sabrina Dubroca JIRA: https://issues.redhat.com/browse/RHEL-31751 commit c3198822c6cb9fb588e446540485669cc81c5d34 Author: Dragos Tatulea Date: Fri Mar 8 17:26:00 2024 +0200 net: esp: fix bad handling of pages from page_pool When the skb is reorganized during esp_output (!esp->inline), the pages coming from the original skb 
fragments are supposed to be released back to the system through put_page. But if the skb fragment pages are originating from a page_pool, calling put_page on them will trigger a page_pool leak which will eventually result in a crash. This leak can be easily observed when using CONFIG_DEBUG_VM and doing ipsec + gre (non offloaded) forwarding: BUG: Bad page state in process ksoftirqd/16 pfn:1451b6 page:00000000de2b8d32 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1451b6000 pfn:0x1451b6 flags: 0x200000000000000(node=0|zone=2) page_type: 0xffffffff() raw: 0200000000000000 dead000000000040 ffff88810d23c000 0000000000000000 raw: 00000001451b6000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: page_pool leak Modules linked in: ip_gre gre mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay zram zsmalloc fuse [last unloaded: mlx5_core] CPU: 16 PID: 96 Comm: ksoftirqd/16 Not tainted 6.8.0-rc4+ #22 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x36/0x50 bad_page+0x70/0xf0 free_unref_page_prepare+0x27a/0x460 free_unref_page+0x38/0x120 esp_ssg_unref.isra.0+0x15f/0x200 esp_output_tail+0x66d/0x780 esp_xmit+0x2c5/0x360 validate_xmit_xfrm+0x313/0x370 ? validate_xmit_skb+0x1d/0x330 validate_xmit_skb_list+0x4c/0x70 sch_direct_xmit+0x23e/0x350 __dev_queue_xmit+0x337/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x25e/0x580 iptunnel_xmit+0x19b/0x240 ip_tunnel_xmit+0x5fb/0xb60 ipgre_xmit+0x14d/0x280 [ip_gre] dev_hard_start_xmit+0xc3/0x1c0 __dev_queue_xmit+0x208/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x1ca/0x580 ip_sublist_rcv_finish+0x32/0x40 ip_sublist_rcv+0x1b2/0x1f0 ? 
ip_rcv_finish_core.constprop.0+0x460/0x460 ip_list_rcv+0x103/0x130 __netif_receive_skb_list_core+0x181/0x1e0 netif_receive_skb_list_internal+0x1b3/0x2c0 napi_gro_receive+0xc8/0x200 gro_cell_poll+0x52/0x90 __napi_poll+0x25/0x1a0 net_rx_action+0x28e/0x300 __do_softirq+0xc3/0x276 ? sort_range+0x20/0x20 run_ksoftirqd+0x1e/0x30 smpboot_thread_fn+0xa6/0x130 kthread+0xcd/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x31/0x50 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 The suggested fix is to introduce a new wrapper (skb_page_unref) that covers page refcounting for page_pool pages as well. Cc: stable at vger.kernel.org Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reported-and-tested-by: Anatoli N.Chechelnickiy Reported-by: Ian Kumlien Link: https://lore.kernel.org/netdev/CAA85sZvvHtrpTQRqdaOx6gd55zPAVsqMYk_Lwh4Md5knTq7AyA at mail.gmail.com Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Reviewed-by: Jakub Kicinski Acked-by: Ilias Apalodimas Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 419aa90c9f4d29f48693335be3a66409d5561357) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- include/linux/skbuff.h | 10 ++++++++++ net/ipv4/esp4.c | 8 ++++---- net/ipv6/esp6.c | 8 ++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9c140d57d051..7e913bc1d21b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3331,6 +3331,16 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) bool napi_pp_put_page(struct page *page, bool napi_safe); +static inline void +skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) +{ +#ifdef CONFIG_PAGE_POOL + if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) + return; +#endif + put_page(page); +} + static inline void napi_frag_unref(skb_frag_t 
*frag, bool recycle, bool napi_safe) { diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 96902b32adca..c18a1a23f957 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -95,7 +95,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; @@ -116,7 +116,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(skb, sg_page(sg), false); } #ifdef CONFIG_INET_ESPINTCP @@ -262,7 +262,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -643,7 +643,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b906a85f8e2e..979aba172f46 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -112,7 +112,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; @@ -132,7 +132,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(skb, 
sg_page(sg), false); } #ifdef CONFIG_INET6_ESPINTCP @@ -295,7 +295,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); kfree(tmp); esp_output_encap_csum(skb); @@ -678,7 +678,7 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:33:43 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:33:43 +0100 Subject: [Devel] [PATCH vz9 5/6] ms/net: skbuff: don't include to In-Reply-To: <20250311163344.1534122-1-khorenko@virtuozzo.com> References: <20250311163344.1534122-1-khorenko@virtuozzo.com> Message-ID: <20250311163344.1534122-6-khorenko@virtuozzo.com> From: Sabrina Dubroca JIRA: https://issues.redhat.com/browse/RHEL-31751 Conflicts: context around #include in net/core/skbuff.c commit 75eaf63ea7afeafd026ffef03bdc69e31f10829b Author: Alexander Lobakin Date: Fri Aug 4 20:05:25 2023 +0200 net: skbuff: don't include to Currently, touching triggers a rebuild of more than half of the kernel. That's because it's included in . And each new include to page_pool/types.h adds more [useless] data for the toolchain to process per each source file from that pile. In commit 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling"), Matteo included it to be able to call a couple of functions defined there. Then, in commit 57f05bc2ab24 ("page_pool: keep pp info as long as page pool owns the page") one of the calls was removed, so only one was left. It's the call to page_pool_return_skb_page() in napi_frag_unref(). The function is external and doesn't have any dependencies. Having very niche page_pool_types.h included only for that looks like an overkill. 
As %PP_SIGNATURE is not local to page_pool.c (was only in the early submissions), nothing holds this function there. Teleport page_pool_return_skb_page() to skbuff.c, just next to the main consumer, skb_pp_recycle(), and rename it to napi_pp_put_page(), as it doesn't work with skbs at all and the former name tells nothing. The #if guards here are only to not compile and have it in the vmlinux when not needed -- both call sites are already guarded. Now, touching page_pool_types.h only triggers rebuilding of the drivers using it and a couple of core networking files. Suggested-by: Jakub Kicinski # make skbuff.h less heavy Suggested-by: Alexander Duyck # move to skbuff.c Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-3-aleksander.lobakin at intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 06fe287412e5781c2073b0a53d4d83fe020cf4b5) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- include/linux/skbuff.h | 5 ++-- include/net/page_pool/types.h | 2 -- net/core/page_pool.c | 39 ------------------------------ net/core/skbuff.c | 45 +++++++++++++++++++++++++++++++++-- 4 files changed, 46 insertions(+), 45 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6744d96a854..9c140d57d051 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -38,7 +38,6 @@ #include #include #include -#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -3330,13 +3329,15 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +bool napi_pp_put_page(struct page *page, bool napi_safe); + static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL - if (recycle && page_pool_return_skb_page(page, napi_safe)) + if 
(recycle && napi_pp_put_page(page, napi_safe)) return; #endif put_page(page); diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index efb5733ba31d..070ccbc375b3 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -185,8 +185,6 @@ struct page_pool { struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); -bool page_pool_return_skb_page(struct page *page, bool napi_safe); - struct page_pool *page_pool_create(const struct page_pool_params *params); struct xdp_mem_info; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e4d44ad61921..6667cbf68009 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -947,42 +947,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); - -bool page_pool_return_skb_page(struct page *page, bool napi_safe) -{ - struct napi_struct *napi; - struct page_pool *pp; - bool allow_direct; - - page = compound_head(page); - - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; - - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. 
- * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. - */ - page_pool_put_full_page(pp, page, allow_direct); - - return true; -} -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6afcb1aab87b..7bab2941e8ef 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -72,7 +72,7 @@ #include #include #include -#include +#include #include #include @@ -853,11 +853,52 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + struct napi_struct *napi; + struct page_pool *pp; + bool allow_direct; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + */ + napi = READ_ONCE(pp->p.napi); + allow_direct = napi_safe && napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. 
+ */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_free_head(struct sk_buff *skb, bool napi_safe) -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:34:59 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:34:59 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/xfrm: interface: use DEV_STATS_INC() In-Reply-To: <20250311163344.1534122-2-khorenko@virtuozzo.com> Message-ID: <202503111634.52BGYxtd1534198@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 62ac42c0bb9f589bda5042f1c610afe378bd388d Author: Sabrina Dubroca Date: Thu Apr 11 10:04:27 2024 +0200 ms/xfrm: interface: use DEV_STATS_INC() JIRA: https://issues.redhat.com/browse/RHEL-31751 commit f7c4e3e5d4f6609b4725a97451948ca2e425379a Author: Eric Dumazet Date: Tue Sep 5 13:23:03 2023 +0000 xfrm: interface: use DEV_STATS_INC() syzbot/KCSAN reported data-races in xfrm whenever dev->stats fields are updated. It appears all of these updates can happen from multiple cpus. Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. 
BUG: KCSAN: data-race in xfrmi_xmit / xfrmi_xmit read-write to 0xffff88813726b160 of 8 bytes by task 23986 on cpu 1: xfrmi_xmit+0x74e/0xb20 net/xfrm/xfrm_interface_core.c:583 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 __dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 dev_queue_xmit include/linux/netdevice.h:3082 [inline] neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1581 neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0x74a/0x850 net/ipv4/ip_output.c:230 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:293 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:432 dst_output include/net/dst.h:458 [inline] ip_local_out net/ipv4/ip_output.c:127 [inline] ip_send_skb+0x72/0xe0 net/ipv4/ip_output.c:1487 udp_send_skb+0x6a4/0x990 net/ipv4/udp.c:963 udp_sendmsg+0x1249/0x12d0 net/ipv4/udp.c:1246 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:840 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2540 ___sys_sendmsg net/socket.c:2594 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2680 __do_sys_sendmmsg net/socket.c:2709 [inline] __se_sys_sendmmsg net/socket.c:2706 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read-write to 0xffff88813726b160 of 8 bytes by task 23987 on cpu 0: xfrmi_xmit+0x74e/0xb20 net/xfrm/xfrm_interface_core.c:583 __netdev_start_xmit include/linux/netdevice.h:4889 [inline] netdev_start_xmit include/linux/netdevice.h:4903 [inline] xmit_one net/core/dev.c:3544 [inline] dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 __dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 dev_queue_xmit 
include/linux/netdevice.h:3082 [inline] neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1581 neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0x74a/0x850 net/ipv4/ip_output.c:230 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:318 NF_HOOK_COND include/linux/netfilter.h:293 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:432 dst_output include/net/dst.h:458 [inline] ip_local_out net/ipv4/ip_output.c:127 [inline] ip_send_skb+0x72/0xe0 net/ipv4/ip_output.c:1487 udp_send_skb+0x6a4/0x990 net/ipv4/udp.c:963 udp_sendmsg+0x1249/0x12d0 net/ipv4/udp.c:1246 inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:840 sock_sendmsg_nosec net/socket.c:730 [inline] sock_sendmsg net/socket.c:753 [inline] ____sys_sendmsg+0x37c/0x4d0 net/socket.c:2540 ___sys_sendmsg net/socket.c:2594 [inline] __sys_sendmmsg+0x269/0x500 net/socket.c:2680 __do_sys_sendmmsg net/socket.c:2709 [inline] __se_sys_sendmmsg net/socket.c:2706 [inline] __x64_sys_sendmmsg+0x57/0x60 net/socket.c:2706 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00000000000010d7 -> 0x00000000000010d8 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 23987 Comm: syz-executor.5 Not tainted 6.5.0-syzkaller-10885-g0468be89b3fa #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 6b50b0bac39b13db9edce8899ee4de363dc95ef9) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- net/xfrm/xfrm_interface_core.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 
3baf81d9974c..47e502310c5e 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -334,8 +334,8 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) skb->dev = dev; if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_dropped); return 0; } @@ -380,7 +380,6 @@ static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; @@ -427,7 +426,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) tdev = dst->dev; if (tdev == dev) { - stats->collisions++; + DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; @@ -466,13 +465,13 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { - stats->tx_errors++; - stats->tx_aborted_errors++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); @@ -482,7 +481,6 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; @@ -499,7 +497,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } 
skb_dst_set(skb, dst); @@ -515,7 +513,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); @@ -534,8 +532,8 @@ static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_err: - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } From khorenko at virtuozzo.com Tue Mar 11 19:35:26 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:35:26 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/xfrm: fix a data-race in xfrm_gen_index() In-Reply-To: <20250311163344.1534122-3-khorenko@virtuozzo.com> Message-ID: <202503111635.52BGZQQc1534235@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit fc4a07295cdf335abfdd1a13646b6862c9ebc2d8 Author: Sabrina Dubroca Date: Thu Apr 11 10:04:27 2024 +0200 ms/xfrm: fix a data-race in xfrm_gen_index() JIRA: https://issues.redhat.com/browse/RHEL-31751 commit 3e4bc23926b83c3c67e5f61ae8571602754131a6 Author: Eric Dumazet Date: Fri Sep 8 18:13:59 2023 +0000 xfrm: fix a data-race in xfrm_gen_index() xfrm_gen_index() mutual exclusion uses net->xfrm.xfrm_policy_lock. This means we must use a per-netns idx_generator variable, instead of a static one. Alternative would be to use an atomic variable. 
syzbot reported: BUG: KCSAN: data-race in xfrm_sk_policy_insert / xfrm_sk_policy_insert write to 0xffffffff87005938 of 4 bytes by task 29466 on cpu 0: xfrm_gen_index net/xfrm/xfrm_policy.c:1385 [inline] xfrm_sk_policy_insert+0x262/0x640 net/xfrm/xfrm_policy.c:2347 xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffffffff87005938 of 4 bytes by task 29460 on cpu 1: xfrm_sk_policy_insert+0x13e/0x640 xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 __sys_setsockopt+0x1c9/0x230 net/socket.c:2263 __do_sys_setsockopt net/socket.c:2274 [inline] __se_sys_setsockopt net/socket.c:2271 [inline] __x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x00006ad8 -> 0x00006b18 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 29460 Comm: syz-executor.1 Not tainted 6.5.0-rc5-syzkaller-00243-g9106536c1aa3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Fixes: 1121994c803f ("netns xfrm: policy insertion in netns") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Herbert Xu 
Acked-by: Herbert Xu Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 1a178039befae55a1518ce39da04e221b60b8544) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- include/net/netns/xfrm.h | 1 + net/xfrm/xfrm_policy.c | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index bd7c3be4af5d..423b52eca908 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -50,6 +50,7 @@ struct netns_xfrm { struct list_head policy_all; struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; + unsigned int idx_generator; struct hlist_head policy_inexact[XFRM_POLICY_MAX]; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index aa7f6615ecc5..2f615a3cc70b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1375,8 +1375,6 @@ EXPORT_SYMBOL(xfrm_policy_hash_rebuild); * of an absolute inpredictability of ordering of rules. This will not pass. 
*/ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { - static u32 idx_generator; - for (;;) { struct hlist_head *list; struct xfrm_policy *p; @@ -1384,8 +1382,8 @@ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) int found; if (!index) { - idx = (idx_generator | dir); - idx_generator += 8; + idx = (net->xfrm.idx_generator | dir); + net->xfrm.idx_generator += 8; } else { idx = index; index = 0; From khorenko at virtuozzo.com Tue Mar 11 19:35:52 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:35:52 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/xfrm: annotate data-race around use_time In-Reply-To: <20250311163344.1534122-4-khorenko@virtuozzo.com> Message-ID: <202503111635.52BGZqxb1534287@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit e3a186b220eac01b476c6281d266174e5c29cc34 Author: Sabrina Dubroca Date: Thu Apr 11 10:04:27 2024 +0200 ms/xfrm: annotate data-race around use_time JIRA: https://issues.redhat.com/browse/RHEL-31751 commit 0a9e5794b21e2d1303759ff8fe5f9215db7757ba Author: Eric Dumazet Date: Thu Jan 26 11:21:30 2023 +0000 xfrm: annotate data-race around use_time KCSAN reported multiple cpus can update use_time at the same time. Adds READ_ONCE()/WRITE_ONCE() annotations. Note that 32bit arches are not fully protected, but they will probably no longer be supported/used in 2106. 
BUG: KCSAN: data-race in __xfrm_policy_check / __xfrm_policy_check write to 0xffff88813e7ec108 of 8 bytes by interrupt on cpu 0: __xfrm_policy_check+0x6ae/0x17f0 net/xfrm/xfrm_policy.c:3664 __xfrm_policy_check2 include/net/xfrm.h:1174 [inline] xfrm_policy_check include/net/xfrm.h:1179 [inline] xfrm6_policy_check+0x2e9/0x320 include/net/xfrm.h:1189 udpv6_queue_rcv_one_skb+0x48/0xa30 net/ipv6/udp.c:703 udpv6_queue_rcv_skb+0x2d6/0x310 net/ipv6/udp.c:792 udp6_unicast_rcv_skb+0x16b/0x190 net/ipv6/udp.c:935 __udp6_lib_rcv+0x84b/0x9b0 net/ipv6/udp.c:1020 udpv6_rcv+0x4b/0x50 net/ipv6/udp.c:1133 ip6_protocol_deliver_rcu+0x99e/0x1020 net/ipv6/ip6_input.c:439 ip6_input_finish net/ipv6/ip6_input.c:484 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] ip6_input+0xca/0x180 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:454 [inline] ip6_rcv_finish+0x1e9/0x2d0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:302 [inline] ipv6_rcv+0x85/0x140 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5482 [inline] __netif_receive_skb+0x8b/0x1b0 net/core/dev.c:5596 process_backlog+0x23f/0x3b0 net/core/dev.c:5924 __napi_poll+0x65/0x390 net/core/dev.c:6485 napi_poll net/core/dev.c:6552 [inline] net_rx_action+0x37e/0x730 net/core/dev.c:6663 __do_softirq+0xf2/0x2c7 kernel/softirq.c:571 do_softirq+0xb1/0xf0 kernel/softirq.c:472 __local_bh_enable_ip+0x6f/0x80 kernel/softirq.c:396 __raw_read_unlock_bh include/linux/rwlock_api_smp.h:257 [inline] _raw_read_unlock_bh+0x17/0x20 kernel/locking/spinlock.c:284 wg_socket_send_skb_to_peer+0x107/0x120 drivers/net/wireguard/socket.c:184 wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline] wg_packet_tx_worker+0x142/0x360 drivers/net/wireguard/send.c:276 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 write to 0xffff88813e7ec108 of 8 bytes by interrupt on cpu 
1: __xfrm_policy_check+0x6ae/0x17f0 net/xfrm/xfrm_policy.c:3664 __xfrm_policy_check2 include/net/xfrm.h:1174 [inline] xfrm_policy_check include/net/xfrm.h:1179 [inline] xfrm6_policy_check+0x2e9/0x320 include/net/xfrm.h:1189 udpv6_queue_rcv_one_skb+0x48/0xa30 net/ipv6/udp.c:703 udpv6_queue_rcv_skb+0x2d6/0x310 net/ipv6/udp.c:792 udp6_unicast_rcv_skb+0x16b/0x190 net/ipv6/udp.c:935 __udp6_lib_rcv+0x84b/0x9b0 net/ipv6/udp.c:1020 udpv6_rcv+0x4b/0x50 net/ipv6/udp.c:1133 ip6_protocol_deliver_rcu+0x99e/0x1020 net/ipv6/ip6_input.c:439 ip6_input_finish net/ipv6/ip6_input.c:484 [inline] NF_HOOK include/linux/netfilter.h:302 [inline] ip6_input+0xca/0x180 net/ipv6/ip6_input.c:493 dst_input include/net/dst.h:454 [inline] ip6_rcv_finish+0x1e9/0x2d0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:302 [inline] ipv6_rcv+0x85/0x140 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5482 [inline] __netif_receive_skb+0x8b/0x1b0 net/core/dev.c:5596 process_backlog+0x23f/0x3b0 net/core/dev.c:5924 __napi_poll+0x65/0x390 net/core/dev.c:6485 napi_poll net/core/dev.c:6552 [inline] net_rx_action+0x37e/0x730 net/core/dev.c:6663 __do_softirq+0xf2/0x2c7 kernel/softirq.c:571 do_softirq+0xb1/0xf0 kernel/softirq.c:472 __local_bh_enable_ip+0x6f/0x80 kernel/softirq.c:396 __raw_read_unlock_bh include/linux/rwlock_api_smp.h:257 [inline] _raw_read_unlock_bh+0x17/0x20 kernel/locking/spinlock.c:284 wg_socket_send_skb_to_peer+0x107/0x120 drivers/net/wireguard/socket.c:184 wg_packet_create_data_done drivers/net/wireguard/send.c:251 [inline] wg_packet_tx_worker+0x142/0x360 drivers/net/wireguard/send.c:276 process_one_work+0x3d3/0x720 kernel/workqueue.c:2289 worker_thread+0x618/0xa70 kernel/workqueue.c:2436 kthread+0x1a9/0x1e0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x0000000063c62d6f -> 0x0000000063c62d70 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 4185 Comm: kworker/1:2 Tainted: G W 
6.2.0-rc4-syzkaller-00009-gd532dd102151-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: wg-crypt-wg0 wg_packet_tx_worker Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Cc: Arnd Bergmann Acked-by: Arnd Bergmann Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit c137a781a237e3be2fb178b0d65d1385ac6d5068) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- net/xfrm/xfrm_policy.c | 11 +++++++---- net/xfrm/xfrm_state.c | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 2f615a3cc70b..52d45c5a7fb0 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t) } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + - (xp->curlft.use_time ? : xp->curlft.add_time) - now; + (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; @@ -3670,7 +3670,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, return 1; } - pol->curlft.use_time = ktime_get_real_seconds(); + /* This lockless write can happen from different cpus. 
*/ + WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds()); pols[0] = pol; npols++; @@ -3685,7 +3686,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, xfrm_pol_put(pols[0]); return 0; } - pols[1]->curlft.use_time = ktime_get_real_seconds(); + /* This write can happen from different cpus. */ + WRITE_ONCE(pols[1]->curlft.use_time, + ktime_get_real_seconds()); npols++; } } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 821e4ea52f3f..f9e56d481562 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -595,7 +595,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } if (x->lft.hard_use_expires_seconds) { time64_t tmo = x->lft.hard_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) @@ -617,7 +617,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) } if (x->lft.soft_use_expires_seconds) { time64_t tmo = x->lft.soft_use_expires_seconds + - (x->curlft.use_time ? : now) - now; + (READ_ONCE(x->curlft.use_time) ? 
: now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) @@ -1902,7 +1902,7 @@ int xfrm_state_update(struct xfrm_state *x) hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); - if (x1->curlft.use_time) + if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { @@ -1936,8 +1936,8 @@ int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_curlft(x); - if (!x->curlft.use_time) - x->curlft.use_time = ktime_get_real_seconds(); + if (!READ_ONCE(x->curlft.use_time)) + WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { From khorenko at virtuozzo.com Tue Mar 11 19:36:12 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:36:12 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/xfrm: fix a data-race in xfrm_lookup_with_ifid() In-Reply-To: <20250311163344.1534122-5-khorenko@virtuozzo.com> Message-ID: <202503111636.52BGaCmN1534320@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 02b59edf11b859ab70e8711ece7df6bee3ea2072 Author: Sabrina Dubroca Date: Thu Apr 11 10:04:27 2024 +0200 ms/xfrm: fix a data-race in xfrm_lookup_with_ifid() JIRA: https://issues.redhat.com/browse/RHEL-31751 commit de5724ca38fd5e442bae9c1fab31942b6544012d Author: Eric Dumazet Date: Wed Oct 11 10:24:29 2023 +0000 xfrm: fix a data-race in xfrm_lookup_with_ifid() syzbot complains about a race in xfrm_lookup_with_ifid() [1] When preparing commit 0a9e5794b21e ("xfrm: annotate data-race around use_time") I thought xfrm_lookup_with_ifid() was modifying a still private structure. 
[1] BUG: KCSAN: data-race in xfrm_lookup_with_ifid / xfrm_lookup_with_ifid write to 0xffff88813ea41108 of 8 bytes by task 8150 on cpu 1: xfrm_lookup_with_ifid+0xce7/0x12d0 net/xfrm/xfrm_policy.c:3218 xfrm_lookup net/xfrm/xfrm_policy.c:3270 [inline] xfrm_lookup_route+0x3b/0x100 net/xfrm/xfrm_policy.c:3281 ip6_dst_lookup_flow+0x98/0xc0 net/ipv6/ip6_output.c:1246 send6+0x241/0x3c0 drivers/net/wireguard/socket.c:139 wg_socket_send_skb_to_peer+0xbd/0x130 drivers/net/wireguard/socket.c:178 wg_socket_send_buffer_to_peer+0xd6/0x100 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation drivers/net/wireguard/send.c:40 [inline] wg_packet_handshake_send_worker+0x10c/0x150 drivers/net/wireguard/send.c:51 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2703 worker_thread+0x525/0x730 kernel/workqueue.c:2784 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 write to 0xffff88813ea41108 of 8 bytes by task 15867 on cpu 0: xfrm_lookup_with_ifid+0xce7/0x12d0 net/xfrm/xfrm_policy.c:3218 xfrm_lookup net/xfrm/xfrm_policy.c:3270 [inline] xfrm_lookup_route+0x3b/0x100 net/xfrm/xfrm_policy.c:3281 ip6_dst_lookup_flow+0x98/0xc0 net/ipv6/ip6_output.c:1246 send6+0x241/0x3c0 drivers/net/wireguard/socket.c:139 wg_socket_send_skb_to_peer+0xbd/0x130 drivers/net/wireguard/socket.c:178 wg_socket_send_buffer_to_peer+0xd6/0x100 drivers/net/wireguard/socket.c:200 wg_packet_send_handshake_initiation drivers/net/wireguard/send.c:40 [inline] wg_packet_handshake_send_worker+0x10c/0x150 drivers/net/wireguard/send.c:51 process_one_work kernel/workqueue.c:2630 [inline] process_scheduled_works+0x5b8/0xa30 kernel/workqueue.c:2703 worker_thread+0x525/0x730 kernel/workqueue.c:2784 kthread+0x1d7/0x210 kernel/kthread.c:388 ret_from_fork+0x48/0x60 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 value 
changed: 0x00000000651cd9d1 -> 0x00000000651cd9d2 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 15867 Comm: kworker/u4:58 Not tainted 6.6.0-rc4-syzkaller-00016-g5e62ed3b1c8a #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/06/2023 Workqueue: wg-kex-wg2 wg_packet_handshake_send_worker Fixes: 0a9e5794b21e ("xfrm: annotate data-race around use_time") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit a8d0aa5863643d67a2a61a9e9d6135a24d248f37) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 52d45c5a7fb0..ce785a556cef 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3221,7 +3221,7 @@ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, } for (i = 0; i < num_pols; i++) - pols[i]->curlft.use_time = ktime_get_real_seconds(); + WRITE_ONCE(pols[i]->curlft.use_time, ktime_get_real_seconds()); if (num_xfrms < 0) { /* Prohibit the flow */ From khorenko at virtuozzo.com Tue Mar 11 19:36:33 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:36:33 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/net: skbuff: don't include to In-Reply-To: <20250311163344.1534122-6-khorenko@virtuozzo.com> Message-ID: <202503111636.52BGaXtM1534355@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 7fc62078afccd5b99ea1a1a5090e8aa7ce274344 Author: Sabrina Dubroca Date: Thu Apr 11 10:04:27 2024 +0200 ms/net: skbuff: don't include to JIRA: https://issues.redhat.com/browse/RHEL-31751 Conflicts: context around #include in net/core/skbuff.c commit 
75eaf63ea7afeafd026ffef03bdc69e31f10829b Author: Alexander Lobakin Date: Fri Aug 4 20:05:25 2023 +0200 net: skbuff: don't include to Currently, touching triggers a rebuild of more than half of the kernel. That's because it's included in . And each new include to page_pool/types.h adds more [useless] data for the toolchain to process per each source file from that pile. In commit 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling"), Matteo included it to be able to call a couple of functions defined there. Then, in commit 57f05bc2ab24 ("page_pool: keep pp info as long as page pool owns the page") one of the calls was removed, so only one was left. It's the call to page_pool_return_skb_page() in napi_frag_unref(). The function is external and doesn't have any dependencies. Having very niche page_pool_types.h included only for that looks like an overkill. As %PP_SIGNATURE is not local to page_pool.c (was only in the early submissions), nothing holds this function there. Teleport page_pool_return_skb_page() to skbuff.c, just next to the main consumer, skb_pp_recycle(), and rename it to napi_pp_put_page(), as it doesn't work with skbs at all and the former name tells nothing. The #if guards here are only to not compile and have it in the vmlinux when not needed -- both call sites are already guarded. Now, touching page_pool_types.h only triggers rebuilding of the drivers using it and a couple of core networking files. 
Suggested-by: Jakub Kicinski # make skbuff.h less heavy Suggested-by: Alexander Duyck # move to skbuff.c Signed-off-by: Alexander Lobakin Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/20230804180529.2483231-3-aleksander.lobakin at intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 06fe287412e5781c2073b0a53d4d83fe020cf4b5) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- include/linux/skbuff.h | 5 +++-- include/net/page_pool/types.h | 2 -- net/core/page_pool.c | 39 ------------------------------------- net/core/skbuff.c | 45 +++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 46 insertions(+), 45 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6744d96a854..9c140d57d051 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -38,7 +38,6 @@ #include #include #include -#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif @@ -3330,13 +3329,15 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +bool napi_pp_put_page(struct page *page, bool napi_safe); + static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { struct page *page = skb_frag_page(frag); #ifdef CONFIG_PAGE_POOL - if (recycle && page_pool_return_skb_page(page, napi_safe)) + if (recycle && napi_pp_put_page(page, napi_safe)) return; #endif put_page(page); diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index efb5733ba31d..070ccbc375b3 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -185,8 +185,6 @@ struct page_pool { struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp); -bool page_pool_return_skb_page(struct page *page, bool napi_safe); - struct 
page_pool *page_pool_create(const struct page_pool_params *params); struct xdp_mem_info; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index e4d44ad61921..6667cbf68009 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -947,42 +947,3 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); - -bool page_pool_return_skb_page(struct page *page, bool napi_safe) -{ - struct napi_struct *napi; - struct page_pool *pp; - bool allow_direct; - - page = compound_head(page); - - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) - return false; - - pp = page->pp; - - /* Allow direct recycle if we have reasons to believe that we are - * in the same context as the consumer would run, so there's - * no possible race. - */ - napi = READ_ONCE(pp->p.napi); - allow_direct = napi_safe && napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); - - /* Driver set this to memory recycling info. Reset it on recycle. - * This will *not* work for NIC using a split-page memory model. - * The page will be returned to the pool here regardless of the - * 'flipped' fragment being in use or not. 
- */ - page_pool_put_full_page(pp, page, allow_direct); - - return true; -} -EXPORT_SYMBOL(page_pool_return_skb_page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6afcb1aab87b..7bab2941e8ef 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -72,7 +72,7 @@ #include #include #include -#include +#include #include #include @@ -853,11 +853,52 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + struct napi_struct *napi; + struct page_pool *pp; + bool allow_direct; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + */ + napi = READ_ONCE(pp->p.napi); + allow_direct = napi_safe && napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. 
+ */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_free_head(struct sk_buff *skb, bool napi_safe) From khorenko at virtuozzo.com Tue Mar 11 19:36:53 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:36:53 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/net: esp: fix bad handling of pages from page_pool In-Reply-To: <20250311163344.1534122-7-khorenko@virtuozzo.com> Message-ID: <202503111636.52BGaruX1534389@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.19 ------> commit 4b9f03c19938b2ee14eeb9a218025e923ea3cbf1 Author: Sabrina Dubroca Date: Thu Apr 11 10:04:27 2024 +0200 ms/net: esp: fix bad handling of pages from page_pool JIRA: https://issues.redhat.com/browse/RHEL-31751 commit c3198822c6cb9fb588e446540485669cc81c5d34 Author: Dragos Tatulea Date: Fri Mar 8 17:26:00 2024 +0200 net: esp: fix bad handling of pages from page_pool When the skb is reorganized during esp_output (!esp->inline), the pages coming from the original skb fragments are supposed to be released back to the system through put_page. But if the skb fragment pages are originating from a page_pool, calling put_page on them will trigger a page_pool leak which will eventually result in a crash. 
This leak can be easily observed when using CONFIG_DEBUG_VM and doing ipsec + gre (non offloaded) forwarding: BUG: Bad page state in process ksoftirqd/16 pfn:1451b6 page:00000000de2b8d32 refcount:0 mapcount:0 mapping:0000000000000000 index:0x1451b6000 pfn:0x1451b6 flags: 0x200000000000000(node=0|zone=2) page_type: 0xffffffff() raw: 0200000000000000 dead000000000040 ffff88810d23c000 0000000000000000 raw: 00000001451b6000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: page_pool leak Modules linked in: ip_gre gre mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat nf_nat xt_addrtype br_netfilter rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core overlay zram zsmalloc fuse [last unloaded: mlx5_core] CPU: 16 PID: 96 Comm: ksoftirqd/16 Not tainted 6.8.0-rc4+ #22 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x36/0x50 bad_page+0x70/0xf0 free_unref_page_prepare+0x27a/0x460 free_unref_page+0x38/0x120 esp_ssg_unref.isra.0+0x15f/0x200 esp_output_tail+0x66d/0x780 esp_xmit+0x2c5/0x360 validate_xmit_xfrm+0x313/0x370 ? validate_xmit_skb+0x1d/0x330 validate_xmit_skb_list+0x4c/0x70 sch_direct_xmit+0x23e/0x350 __dev_queue_xmit+0x337/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x25e/0x580 iptunnel_xmit+0x19b/0x240 ip_tunnel_xmit+0x5fb/0xb60 ipgre_xmit+0x14d/0x280 [ip_gre] dev_hard_start_xmit+0xc3/0x1c0 __dev_queue_xmit+0x208/0xba0 ? nf_hook_slow+0x3f/0xd0 ip_finish_output2+0x1ca/0x580 ip_sublist_rcv_finish+0x32/0x40 ip_sublist_rcv+0x1b2/0x1f0 ? ip_rcv_finish_core.constprop.0+0x460/0x460 ip_list_rcv+0x103/0x130 __netif_receive_skb_list_core+0x181/0x1e0 netif_receive_skb_list_internal+0x1b3/0x2c0 napi_gro_receive+0xc8/0x200 gro_cell_poll+0x52/0x90 __napi_poll+0x25/0x1a0 net_rx_action+0x28e/0x300 __do_softirq+0xc3/0x276 ? 
sort_range+0x20/0x20 run_ksoftirqd+0x1e/0x30 smpboot_thread_fn+0xa6/0x130 kthread+0xcd/0x100 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x31/0x50 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork_asm+0x11/0x20 The suggested fix is to introduce a new wrapper (skb_page_unref) that covers page refcounting for page_pool pages as well. Cc: stable at vger.kernel.org Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reported-and-tested-by: Anatoli N.Chechelnickiy Reported-by: Ian Kumlien Link: https://lore.kernel.org/netdev/CAA85sZvvHtrpTQRqdaOx6gd55zPAVsqMYk_Lwh4Md5knTq7AyA at mail.gmail.com Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Reviewed-by: Jakub Kicinski Acked-by: Ilias Apalodimas Signed-off-by: Steffen Klassert Signed-off-by: Sabrina Dubroca https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from CentOS Stream commit 419aa90c9f4d29f48693335be3a66409d5561357) Signed-off-by: Konstantin Khorenko Feature: fix ms/xfrm --- include/linux/skbuff.h | 10 ++++++++++ net/ipv4/esp4.c | 8 ++++---- net/ipv6/esp6.c | 8 ++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9c140d57d051..7e913bc1d21b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3331,6 +3331,16 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) bool napi_pp_put_page(struct page *page, bool napi_safe); +static inline void +skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe) +{ +#ifdef CONFIG_PAGE_POOL + if (skb->pp_recycle && napi_pp_put_page(page, napi_safe)) + return; +#endif + put_page(page); +} + static inline void napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) { diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 96902b32adca..c18a1a23f957 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -95,7 +95,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct 
scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; @@ -116,7 +116,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(skb, sg_page(sg), false); } #ifdef CONFIG_INET_ESPINTCP @@ -262,7 +262,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); kfree(tmp); if (xo && (xo->flags & XFRM_DEV_RESUME)) { @@ -643,7 +643,7 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * } if (sg != dsg) - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b906a85f8e2e..979aba172f46 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -112,7 +112,7 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, __alignof__(struct scatterlist)); } -static void esp_ssg_unref(struct xfrm_state *x, void *tmp) +static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) { struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; @@ -132,7 +132,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) */ if (req->src != req->dst) for (sg = sg_next(req->src); sg; sg = sg_next(sg)) - put_page(sg_page(sg)); + skb_page_unref(skb, sg_page(sg), false); } #ifdef CONFIG_INET6_ESPINTCP @@ -295,7 +295,7 @@ static void esp_output_done(struct crypto_async_request *base, int err) } tmp = ESP_SKB_CB(skb)->tmp; - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); kfree(tmp); esp_output_encap_csum(skb); @@ -678,7 +678,7 @@ 
int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } if (sg != dsg) - esp_ssg_unref(x, tmp); + esp_ssg_unref(x, tmp, skb); if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) err = esp_output_tail_tcp(x, skb); From khorenko at virtuozzo.com Tue Mar 11 19:39:38 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:39:38 +0100 Subject: [Devel] [PATCH vz9] ms/mm/page_pool: catch page_pool memory leaks Message-ID: <20250311163938.1534439-1-khorenko@virtuozzo.com> From: Jesper Dangaard Brouer Pages belonging to a page_pool (PP) instance must be freed through the PP APIs in-order to correctly release any DMA mappings and release refcnt on the DMA device when freeing PP instance. When PP release a page (page_pool_release_page) the page->pp_magic value is cleared. This patch detect a leaked PP page in free_page_is_bad() via unexpected state of page->pp_magic value being PP_SIGNATURE. We choose to report and treat it as a bad page. It would be possible to release the page via returning it to the PP instance as the page->pp pointer is likely still valid. Notice this code is only activated when either compiled with CONFIG_DEBUG_VM or boot cmdline debug_pagealloc=on, and CONFIG_PAGE_POOL. Reduced example output of leak with PP_SIGNATURE = dead000000000040: BUG: Bad page state in process swapper/4 pfn:141fa6 page:000000006dbf8062 refcount:0 mapcount:0 mapping:0000000000000000 index:0x141fa6000 pfn:0x141fa6 flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 002fffff80000000 dead000000000040 ffff88814888a000 0000000000000000 raw: 0000000141fa6000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: page_pool leak [...] 
Call Trace: dump_stack_lvl+0x32/0x50 bad_page+0x70/0xf0 free_unref_page_prepare+0x263/0x430 free_unref_page+0x34/0x130 mlx5e_free_rx_mpwqe+0x190/0x1c0 [mlx5_core] mlx5e_post_rx_mpwqes+0x1ac/0x280 [mlx5_core] mlx5e_napi_poll+0x12b/0x710 [mlx5_core] ? skb_free_head+0x4f/0x90 __napi_poll+0x2b/0x1c0 net_rx_action+0x27b/0x360 The advantage is the Call Trace directly points to the function leaking the PP page, which in this case is an on purpose bug introduced into the mlx5 driver to test this code change. Currently PP will periodically in page_pool_release_retry() printk warning "stalled pool shutdown" which cannot be directly corrolated to leaking and might as well be a false positive due to SKBs being stuck on a socket for an extended period. After this patch we should be able to remove this printk. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller This warning helped while investigating the following bug: https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from commit dba1b8a7ab6853a84bf3afdbeac1c2f2370d3444) Signed-off-by: Konstantin Khorenko Feature: fix ms/mm --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b8c5556f460..fdda76685561 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1293,6 +1293,9 @@ static inline bool page_expected_state(struct page *page, page_ref_count(page) | #ifdef CONFIG_MEMCG page->memcg_data | +#endif +#ifdef CONFIG_PAGE_POOL + ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | #endif (page->flags & check_flags))) return false; @@ -1319,6 +1322,10 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) #ifdef CONFIG_MEMCG if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; +#endif +#ifdef CONFIG_PAGE_POOL + if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + bad_reason = "page_pool leak"; #endif return bad_reason; } -- 2.43.0 From khorenko at virtuozzo.com Tue Mar 11 19:40:19 2025 From: 
khorenko at virtuozzo.com (Konstantin Khorenko) Date: Tue, 11 Mar 2025 17:40:19 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] ms/mm/page_pool: catch page_pool memory leaks In-Reply-To: <20250311163938.1534439-1-khorenko@virtuozzo.com> Message-ID: <202503111640.52BGeJHh1534517@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.21 ------> commit a7ece4829001b3285d9861c0e29af55d9d925065 Author: Jesper Dangaard Brouer Date: Fri Nov 24 11:16:52 2023 +0100 ms/mm/page_pool: catch page_pool memory leaks Pages belonging to a page_pool (PP) instance must be freed through the PP APIs in-order to correctly release any DMA mappings and release refcnt on the DMA device when freeing PP instance. When PP release a page (page_pool_release_page) the page->pp_magic value is cleared. This patch detect a leaked PP page in free_page_is_bad() via unexpected state of page->pp_magic value being PP_SIGNATURE. We choose to report and treat it as a bad page. It would be possible to release the page via returning it to the PP instance as the page->pp pointer is likely still valid. Notice this code is only activated when either compiled with CONFIG_DEBUG_VM or boot cmdline debug_pagealloc=on, and CONFIG_PAGE_POOL. Reduced example output of leak with PP_SIGNATURE = dead000000000040: BUG: Bad page state in process swapper/4 pfn:141fa6 page:000000006dbf8062 refcount:0 mapcount:0 mapping:0000000000000000 index:0x141fa6000 pfn:0x141fa6 flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 002fffff80000000 dead000000000040 ffff88814888a000 0000000000000000 raw: 0000000141fa6000 0000000000000001 00000000ffffffff 0000000000000000 page dumped because: page_pool leak [...] 
Call Trace: dump_stack_lvl+0x32/0x50 bad_page+0x70/0xf0 free_unref_page_prepare+0x263/0x430 free_unref_page+0x34/0x130 mlx5e_free_rx_mpwqe+0x190/0x1c0 [mlx5_core] mlx5e_post_rx_mpwqes+0x1ac/0x280 [mlx5_core] mlx5e_napi_poll+0x12b/0x710 [mlx5_core] ? skb_free_head+0x4f/0x90 __napi_poll+0x2b/0x1c0 net_rx_action+0x27b/0x360 The advantage is the Call Trace directly points to the function leaking the PP page, which in this case is an on purpose bug introduced into the mlx5 driver to test this code change. Currently PP will periodically in page_pool_release_retry() printk warning "stalled pool shutdown" which cannot be directly corrolated to leaking and might as well be a false positive due to SKBs being stuck on a socket for an extended period. After this patch we should be able to remove this printk. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller This warning helped while investigating the following bug: https://virtuozzo.atlassian.net/browse/VSTOR-101702 (cherry picked from commit dba1b8a7ab6853a84bf3afdbeac1c2f2370d3444) Signed-off-by: Konstantin Khorenko Feature: fix ms/mm --- mm/page_alloc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b8c5556f460..fdda76685561 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1293,6 +1293,9 @@ static inline bool page_expected_state(struct page *page, page_ref_count(page) | #ifdef CONFIG_MEMCG page->memcg_data | +#endif +#ifdef CONFIG_PAGE_POOL + ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) | #endif (page->flags & check_flags))) return false; @@ -1319,6 +1322,10 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) #ifdef CONFIG_MEMCG if (unlikely(page->memcg_data)) bad_reason = "page still charged to cgroup"; +#endif +#ifdef CONFIG_PAGE_POOL + if (unlikely((page->pp_magic & ~0x3UL) == PP_SIGNATURE)) + bad_reason = "page_pool leak"; #endif return bad_reason; } From kui.liu at virtuozzo.com Fri Mar 21 04:59:11 2025 From: kui.liu at 
virtuozzo.com (Liu Kui) Date: Fri, 21 Mar 2025 09:59:11 +0800 Subject: [Devel] [PATCH VZ9] fs/fuse kio: add safety check in kpcs_dev_ioctl() Message-ID: <20250321015911.89339-1-kui.liu@virtuozzo.com> Apparently fc->kio.ctx needs to be checked before being used. However the check should be done in a way that can avoid a race condition between kpcs_dev_ioctl() and fuse_conn_destroy() where both can run concurrently Related to #VSTOR-102040 Signed-off-by: Liu Kui --- fs/fuse/inode.c | 4 +--- fs/fuse/kio/pcs/pcs_cluster.c | 4 ++++ fs/fuse/kio/pcs/pcs_cluster.h | 3 +++ fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 29 +++++++++++++++++++++++------ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b27422d1ee38..a22e0ffb3a8f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2221,10 +2221,8 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; - if (fc->kio.op) { /* At this point all pending kio must be completed. */ + if (fc->kio.op) /* At this point all pending kio must be completed. 
*/ fc->kio.op->conn_fini(fm); - fc->kio.ctx = NULL; - } fuse_abort_conn(fc); fuse_wait_aborted(fc); diff --git a/fs/fuse/kio/pcs/pcs_cluster.c b/fs/fuse/kio/pcs/pcs_cluster.c index c87313b90ab3..710087c83fe6 100644 --- a/fs/fuse/kio/pcs/pcs_cluster.c +++ b/fs/fuse/kio/pcs/pcs_cluster.c @@ -603,6 +603,8 @@ int pcs_cluster_init(struct pcs_fuse_cluster *pfc, struct workqueue_struct *wq, INIT_LIST_HEAD(&pfc->list); pfc->fc = fc; + atomic_set(&pfc->refcnt, 1); + init_waitqueue_head(&pfc->waitq); /* core init */ if (pcs_cc_init(&pfc->cc, wq, info->cluster_name, &attr)) @@ -617,6 +619,8 @@ int pcs_cluster_init(struct pcs_fuse_cluster *pfc, struct workqueue_struct *wq, void pcs_cluster_fini(struct pcs_fuse_cluster *pfc) { + if (!atomic_dec_and_test(&pfc->refcnt)) + wait_event(pfc->waitq, atomic_read(&pfc->refcnt) == 0); pcs_cc_fini(&pfc->cc); kvfree(pfc); } diff --git a/fs/fuse/kio/pcs/pcs_cluster.h b/fs/fuse/kio/pcs/pcs_cluster.h index 8693d1bf38d7..914d1ad7865c 100644 --- a/fs/fuse/kio/pcs/pcs_cluster.h +++ b/fs/fuse/kio/pcs/pcs_cluster.h @@ -54,6 +54,9 @@ struct pcs_fuse_cluster { struct list_head list; struct pcs_cluster_core cc; struct fuse_conn *fc; + + atomic_t refcnt; + wait_queue_head_t waitq; }; struct pcs_fuse_work { diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c index 8da9550cc156..e3049ddaa091 100644 --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c @@ -296,17 +296,19 @@ static int kpcs_conn_init(struct fuse_mount *fm) static void kpcs_conn_fini(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + struct pcs_fuse_cluster *pfc = READ_ONCE(fc->kio.ctx); - if (!fc->kio.ctx) + if (!pfc) return; - TRACE("%s fc:%p\n", __FUNCTION__, fc); - unregister_client(fc->kio.ctx); + + WRITE_ONCE(fc->kio.ctx, NULL); + unregister_client(pfc); synchronize_rcu(); flush_workqueue(pcs_wq); flush_workqueue(pcs_cpu_wq); flush_workqueue(pcs_cleanup_wq); - pcs_cluster_fini((struct pcs_fuse_cluster *) 
fc->kio.ctx); + pcs_cluster_fini(pfc); if (fc->ktrace) fuse_ktrace_remove(fc); @@ -1921,10 +1923,21 @@ static int kpcs_ioctl(struct file *file, struct inode *inode, unsigned int cmd, static int kpcs_dev_ioctl(struct fuse_conn *fc, unsigned int cmd, unsigned long arg, int len) { - struct pcs_fuse_cluster *pfc = fc->kio.ctx; - struct pcs_cluster_core *cc = &pfc->cc; + struct pcs_fuse_cluster *pfc; + struct pcs_cluster_core *cc; int res; + rcu_read_lock(); + pfc = READ_ONCE(fc->kio.ctx); + if (!pfc) { + rcu_read_unlock(); + return -EINVAL; + } + atomic_inc(&pfc->refcnt); + rcu_read_unlock(); + + cc = &pfc->cc; + switch (cmd) { case PCS_IOC_KRPC_CREATE: { @@ -2007,6 +2020,10 @@ static int kpcs_dev_ioctl(struct fuse_conn *fc, unsigned int cmd, unsigned long res = -ENOIOCTLCMD; break; } + + if (atomic_dec_and_test(&pfc->refcnt)) + wake_up(&pfc->waitq); + return res; } -- 2.39.5 (Apple Git-154) From alexander.atanasov at virtuozzo.com Fri Mar 21 18:17:13 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Fri, 21 Mar 2025 17:17:13 +0200 Subject: [Devel] [PATCH vz9] dm-ploop: fix resize and grow to use the new way of updating md pages Message-ID: <20250321151715.1906502-1-alexander.atanasov@virtuozzo.com> We missed the fact that resize/grow touch md0 page to update ploop parameters, other pages are not linked so there is no issue. But in case there is a concurrent update to md0 piwb is not handled correctly, also using sync updates in parallel is not okay. To fix this update code to use the new mechanism with MD_UPDATING flag and instead of using sync operations pass the updates to runner threads. 
https://virtuozzo.atlassian.net/browse/VSTOR-101871 Signed-off-by: Alexander Atanasov --- drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++--------- drivers/md/dm-ploop.h | 7 ++++++- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index ef280a8b0f90..04e71c851b64 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio) ploop_dispatch_pios(ploop, NULL, &pio_list); } -static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) -{ - llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); -} - static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) { bool new = false; @@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, struct pio *pio; map_index_t *to; u8 level; + int ret = -ENOMEM; lockdep_assert_held(&ploop->bat_lock); @@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, if (WARN_ON(md->piwb)) { PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", md, md->piwb, md->piwb->type, type); + spin_unlock(&md->md_lock); + ret = -EBUSY; + goto err; } md->piwb = piwb; piwb->md = md; @@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, return 0; err: ploop_free_piwb(piwb); - return -ENOMEM; + return ret; } void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, @@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop *ploop, struct pio *pio) * another index instead of existing. If so, management of * old bat_entries[@clu] and of related holes_bitmap bit * is caller duty. 
+ * Caller must clear MD_UPDATING and comply to add_for_wb */ int ploop_prepare_reloc_index_wb(struct ploop *ploop, struct md_page **ret_md, u32 clu, u32 *dst_clu, - struct file *file) + struct file *file, + int *add_for_wb) { enum piwb_type type = PIWB_TYPE_ALLOC; u32 page_id = ploop_bat_clu_to_page_nr(clu); struct md_page *md = ploop_md_page_find(ploop, page_id); struct ploop_index_wb *piwb; int err; + bool add_to_wblist; if (dst_clu) type = PIWB_TYPE_RELOC; err = -EIO; + + spin_lock_irq(&ploop->bat_lock); - if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) { - PL_ERR("Unexpected md status: %lx", md->status); + spin_lock(&md->md_lock); + if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status) + || test_bit(MD_UPDATING, &md->status)) { + err = -EBUSY; + spin_unlock(&md->md_lock); goto out_error; + } else { + set_bit(MD_UPDATING, &md->status); } + spin_unlock(&md->md_lock); + err = ploop_prepare_bat_update(ploop, md, type); if (err) goto out_error; + add_to_wblist = ploop_md_make_dirty(ploop, md); + piwb = md->piwb; if (dst_clu) { @@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, spin_unlock_irq(&ploop->bat_lock); *ret_md = md; + *add_for_wb = add_to_wblist ? 
1 : 0; + return 0; out_reset: ploop_break_bat_update(ploop, md, piwb); out_error: + if (add_to_wblist) + clear_bit(MD_DIRTY, &md->status); + clear_bit(MD_UPDATING, &md->status); spin_unlock_irq(&ploop->bat_lock); + return err; } ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO); diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 46450cac8c7d..db4d92c9679a 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -578,6 +578,11 @@ static inline const char *ploop_device_name(struct ploop *ploop) return ploop->ti->table->md->disk->disk_name; } +static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) +{ + llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); +} + #define PL_FMT(fmt) "ploop: %s: " fmt "\n" #define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__) #define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__) @@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu, struct pio *pio, u8 level); extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, struct md_page **ret_md, u32 clu, u32 *dst_clu, - struct file *file); + struct file *file, int *add_for_wb); extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, struct ploop_index_wb *piwb); extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *); -- 2.43.0 From khorenko at virtuozzo.com Fri Mar 21 18:33:38 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 21 Mar 2025 16:33:38 +0100 Subject: [Devel] [PATCH vz9] dm-ploop: fix resize and grow to use the new way of updating md pages In-Reply-To: <20250321151715.1906502-1-alexander.atanasov@virtuozzo.com> References: <20250321151715.1906502-1-alexander.atanasov@virtuozzo.com> Message-ID: <94d00e05-1cc1-4e29-9d89-5f0ed468d59e@virtuozzo.com> Please, add reviewer to CC. 
-- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 21.03.2025 16:17, Alexander Atanasov wrote: > We missed the fact that resize/grow touch md0 page to update ploop > parameters, other pages are not linked so there is no issue. > But in case there is a concurrent update to md0 piwb is not handled > correctly, also using sync updates in parallel is not okay. > To fix this update code to use the new mechanism with MD_UPDATING > flag and instead of using sync operations pass the updates to > runner threads. > > https://virtuozzo.atlassian.net/browse/VSTOR-101871 > Signed-off-by: Alexander Atanasov > --- > drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++--------- > drivers/md/dm-ploop.h | 7 ++++++- > 2 files changed, 34 insertions(+), 10 deletions(-) > > diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c > index ef280a8b0f90..04e71c851b64 100644 > --- a/drivers/md/dm-ploop-map.c > +++ b/drivers/md/dm-ploop-map.c > @@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio) > ploop_dispatch_pios(ploop, NULL, &pio_list); > } > > -static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) > -{ > - llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); > -} > - > static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) > { > bool new = false; > @@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, > struct pio *pio; > map_index_t *to; > u8 level; > + int ret = -ENOMEM; > > lockdep_assert_held(&ploop->bat_lock); > > @@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, > if (WARN_ON(md->piwb)) { > PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", > md, md->piwb, md->piwb->type, type); > + spin_unlock(&md->md_lock); > + ret = -EBUSY; > + goto err; > } > md->piwb = piwb; > piwb->md = md; > @@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop 
*ploop, struct md_page *md, > return 0; > err: > ploop_free_piwb(piwb); > - return -ENOMEM; > + return ret; > } > > void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, > @@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop *ploop, struct pio *pio) > * another index instead of existing. If so, management of > * old bat_entries[@clu] and of related holes_bitmap bit > * is caller duty. > + * Caller must clear MD_UPDATING and comply to add_for_wb > */ > int ploop_prepare_reloc_index_wb(struct ploop *ploop, > struct md_page **ret_md, > u32 clu, u32 *dst_clu, > - struct file *file) > + struct file *file, > + int *add_for_wb) > { > enum piwb_type type = PIWB_TYPE_ALLOC; > u32 page_id = ploop_bat_clu_to_page_nr(clu); > struct md_page *md = ploop_md_page_find(ploop, page_id); > struct ploop_index_wb *piwb; > int err; > + bool add_to_wblist; > > if (dst_clu) > type = PIWB_TYPE_RELOC; > > err = -EIO; > + > + > spin_lock_irq(&ploop->bat_lock); > - if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) { > - PL_ERR("Unexpected md status: %lx", md->status); > + spin_lock(&md->md_lock); > + if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status) > + || test_bit(MD_UPDATING, &md->status)) { > + err = -EBUSY; > + spin_unlock(&md->md_lock); > goto out_error; > + } else { > + set_bit(MD_UPDATING, &md->status); > } > + spin_unlock(&md->md_lock); > + > err = ploop_prepare_bat_update(ploop, md, type); > if (err) > goto out_error; > > + add_to_wblist = ploop_md_make_dirty(ploop, md); > + > piwb = md->piwb; > > if (dst_clu) { > @@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, > spin_unlock_irq(&ploop->bat_lock); > > *ret_md = md; > + *add_for_wb = add_to_wblist ? 
1 : 0; > + > return 0; > > out_reset: > ploop_break_bat_update(ploop, md, piwb); > out_error: > + if (add_to_wblist) > + clear_bit(MD_DIRTY, &md->status); > + clear_bit(MD_UPDATING, &md->status); > spin_unlock_irq(&ploop->bat_lock); > + > return err; > } > ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO); > diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h > index 46450cac8c7d..db4d92c9679a 100644 > --- a/drivers/md/dm-ploop.h > +++ b/drivers/md/dm-ploop.h > @@ -578,6 +578,11 @@ static inline const char *ploop_device_name(struct ploop *ploop) > return ploop->ti->table->md->disk->disk_name; > } > > +static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) > +{ > + llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); > +} > + > #define PL_FMT(fmt) "ploop: %s: " fmt "\n" > #define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__) > #define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__) > @@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu, > struct pio *pio, u8 level); > extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, > struct md_page **ret_md, u32 clu, u32 *dst_clu, > - struct file *file); > + struct file *file, int *add_for_wb); > extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, > struct ploop_index_wb *piwb); > extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *); From alexander.atanasov at virtuozzo.com Fri Mar 21 18:37:46 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Fri, 21 Mar 2025 17:37:46 +0200 Subject: [Devel] [PATCH vz9] dm-ploop: fix resize and grow to use the new way of updating md pages In-Reply-To: <94d00e05-1cc1-4e29-9d89-5f0ed468d59e@virtuozzo.com> References: <20250321151715.1906502-1-alexander.atanasov@virtuozzo.com> <94d00e05-1cc1-4e29-9d89-5f0ed468d59e@virtuozzo.com> Message-ID: On 21.03.25 
17:33, Konstantin Khorenko wrote: > Please, add reviewer to CC. Whomever is available, please, review. > -- > Best regards, > > Konstantin Khorenko, > Virtuozzo Linux Kernel Team > > On 21.03.2025 16:17, Alexander Atanasov wrote: >> We missed the fact that resize/grow touch md0 page to update ploop >> parameters, other pages are not linked so there is no issue. >> But in case there is a concurrent update to md0 piwb is not handled >> correctly, also using sync updates in parallel is not okay. >> To fix this update code to use the new mechanism with MD_UPDATING >> flag and instead of using sync operations pass the updates to >> runner threads. >> >> https://virtuozzo.atlassian.net/browse/VSTOR-101871 >> Signed-off-by: Alexander Atanasov >> --- >> ? drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++--------- >> ? drivers/md/dm-ploop.h???? |? 7 ++++++- >> ? 2 files changed, 34 insertions(+), 10 deletions(-) >> >> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c >> index ef280a8b0f90..04e71c851b64 100644 >> --- a/drivers/md/dm-ploop-map.c >> +++ b/drivers/md/dm-ploop-map.c >> @@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct >> ploop *ploop, struct pio *pio) >> ????????? ploop_dispatch_pios(ploop, NULL, &pio_list); >> ? } >> -static void ploop_add_dirty_for_wb(struct ploop *ploop, struct >> md_page *md) >> -{ >> -??? llist_add((struct llist_node *)&md->wb_link, &ploop- >> >wb_batch_llist); >> -} >> - >> ? static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page >> *md) >> ? { >> ????? bool new = false; >> @@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop >> *ploop, struct md_page *md, >> ????? struct pio *pio; >> ????? map_index_t *to; >> ????? u8 level; >> +??? int ret = -ENOMEM; >> ????? lockdep_assert_held(&ploop->bat_lock); >> @@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop >> *ploop, struct md_page *md, >> ????? if (WARN_ON(md->piwb)) { >> ????????? 
PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", >> ????????????? md, md->piwb, md->piwb->type, type); >> +??????? spin_unlock(&md->md_lock); >> +??????? ret = -EBUSY; >> +??????? goto err; >> ????? } >> ????? md->piwb = piwb; >> ????? piwb->md = md; >> @@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop >> *ploop, struct md_page *md, >> ????? return 0; >> ? err: >> ????? ploop_free_piwb(piwb); >> -??? return -ENOMEM; >> +??? return ret; >> ? } >> ? void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, >> @@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop >> *ploop, struct pio *pio) >> ?? * another index instead of existing. If so, management of >> ?? * old bat_entries[@clu] and of related holes_bitmap bit >> ?? * is caller duty. >> + * Caller must clear MD_UPDATING and comply to add_for_wb >> ?? */ >> ? int ploop_prepare_reloc_index_wb(struct ploop *ploop, >> ?????????????????? struct md_page **ret_md, >> ?????????????????? u32 clu, u32 *dst_clu, >> -???????????????? struct file *file) >> +???????????????? struct file *file, >> +???????????????? int *add_for_wb) >> ? { >> ????? enum piwb_type type = PIWB_TYPE_ALLOC; >> ????? u32 page_id = ploop_bat_clu_to_page_nr(clu); >> ????? struct md_page *md = ploop_md_page_find(ploop, page_id); >> ????? struct ploop_index_wb *piwb; >> ????? int err; >> +??? bool add_to_wblist; >> ????? if (dst_clu) >> ????????? type = PIWB_TYPE_RELOC; >> ????? err = -EIO; >> + >> + >> ????? spin_lock_irq(&ploop->bat_lock); >> -??? if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, >> &md->status)) { >> -??????? PL_ERR("Unexpected md status: %lx", md->status); >> +??? spin_lock(&md->md_lock); >> +??? if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, >> &md->status) >> +??????? || test_bit(MD_UPDATING, &md->status)) { >> +??????? err = -EBUSY; >> +??????? spin_unlock(&md->md_lock); >> ????????? goto out_error; >> +??? } else { >> +??????? 
set_bit(MD_UPDATING, &md->status); >> ????? } >> +??? spin_unlock(&md->md_lock); >> + >> ????? err = ploop_prepare_bat_update(ploop, md, type); >> ????? if (err) >> ????????? goto out_error; >> +??? add_to_wblist = ploop_md_make_dirty(ploop, md); >> + >> ????? piwb = md->piwb; >> ????? if (dst_clu) { >> @@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop >> *ploop, >> ????? spin_unlock_irq(&ploop->bat_lock); >> ????? *ret_md = md; >> +??? *add_for_wb = add_to_wblist ? 1 : 0; >> + >> ????? return 0; >> ? out_reset: >> ????? ploop_break_bat_update(ploop, md, piwb); >> ? out_error: >> +??? if (add_to_wblist) >> +??????? clear_bit(MD_DIRTY, &md->status); >> +??? clear_bit(MD_UPDATING, &md->status); >> ????? spin_unlock_irq(&ploop->bat_lock); >> + >> ????? return err; >> ? } >> ? ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO); >> diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h >> index 46450cac8c7d..db4d92c9679a 100644 >> --- a/drivers/md/dm-ploop.h >> +++ b/drivers/md/dm-ploop.h >> @@ -578,6 +578,11 @@ static inline const char >> *ploop_device_name(struct ploop *ploop) >> ????? return ploop->ti->table->md->disk->disk_name; >> ? } >> +static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct >> md_page *md) >> +{ >> +??? llist_add((struct llist_node *)&md->wb_link, &ploop- >> >wb_batch_llist); >> +} >> + >> ? #define PL_FMT(fmt) "ploop: %s: " fmt "\n" >> ? #define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), >> ploop_device_name(ploop), ##__VA_ARGS__) >> ? #define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt), >> ploop_device_name(ploop), ##__VA_ARGS__) >> @@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop >> *ploop, u32 dst_clu, >> ????????????????????? struct pio *pio, u8 level); >> ? extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, >> ????????????????????? struct md_page **ret_md, u32 clu, u32 *dst_clu, >> -??????????????????? struct file *file); >> +??????????????????? 
struct file *file, int *add_for_wb); >> ? extern void ploop_break_bat_update(struct ploop *ploop, struct >> md_page *md, >> ???????????????????? struct ploop_index_wb *piwb); >> ? extern void ploop_index_wb_submit(struct ploop *, struct >> ploop_index_wb *); -- Regards, Alexander Atanasov From khorenko at virtuozzo.com Thu Mar 27 13:42:53 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Thu, 27 Mar 2025 11:42:53 +0100 Subject: [Devel] [PATCH vz9] dm-ploop: fix resize and grow to use the new way of updating md pages In-Reply-To: References: <20250321151715.1906502-1-alexander.atanasov@virtuozzo.com> <94d00e05-1cc1-4e29-9d89-5f0ed468d59e@virtuozzo.com> Message-ID: <98e582a6-57a1-44d2-af61-b00de26ba50d@virtuozzo.com> Andrey, can you please review the patch? Thank you. -- Best regards, Konstantin Khorenko, Virtuozzo Linux Kernel Team On 21.03.2025 16:37, Alexander Atanasov wrote: > On 21.03.25 17:33, Konstantin Khorenko wrote: >> Please, add reviewer to CC. > > > Whomever is available, please, review. > >> -- >> Best regards, >> >> Konstantin Khorenko, >> Virtuozzo Linux Kernel Team >> >> On 21.03.2025 16:17, Alexander Atanasov wrote: >>> We missed the fact that resize/grow touch md0 page to update ploop >>> parameters, other pages are not linked so there is no issue. >>> But in case there is a concurrent update to md0 piwb is not handled >>> correctly, also using sync updates in parallel is not okay. >>> To fix this update code to use the new mechanism with MD_UPDATING >>> flag and instead of using sync operations pass the updates to >>> runner threads. >>> >>> https://virtuozzo.atlassian.net/browse/VSTOR-101871 >>> Signed-off-by: Alexander Atanasov >>> --- >>> ? drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++--------- >>> ? drivers/md/dm-ploop.h???? |? 7 ++++++- >>> ? 
2 files changed, 34 insertions(+), 10 deletions(-) >>> >>> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c >>> index ef280a8b0f90..04e71c851b64 100644 >>> --- a/drivers/md/dm-ploop-map.c >>> +++ b/drivers/md/dm-ploop-map.c >>> @@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct >>> ploop *ploop, struct pio *pio) >>> ????????? ploop_dispatch_pios(ploop, NULL, &pio_list); >>> ? } >>> -static void ploop_add_dirty_for_wb(struct ploop *ploop, struct >>> md_page *md) >>> -{ >>> -??? llist_add((struct llist_node *)&md->wb_link, &ploop- >>>> wb_batch_llist); >>> -} >>> - >>> ? static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page >>> *md) >>> ? { >>> ????? bool new = false; >>> @@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop >>> *ploop, struct md_page *md, >>> ????? struct pio *pio; >>> ????? map_index_t *to; >>> ????? u8 level; >>> +??? int ret = -ENOMEM; >>> ????? lockdep_assert_held(&ploop->bat_lock); >>> @@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop >>> *ploop, struct md_page *md, >>> ????? if (WARN_ON(md->piwb)) { >>> ????????? PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", >>> ????????????? md, md->piwb, md->piwb->type, type); >>> +??????? spin_unlock(&md->md_lock); >>> +??????? ret = -EBUSY; >>> +??????? goto err; >>> ????? } >>> ????? md->piwb = piwb; >>> ????? piwb->md = md; >>> @@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop >>> *ploop, struct md_page *md, >>> ????? return 0; >>> ? err: >>> ????? ploop_free_piwb(piwb); >>> -??? return -ENOMEM; >>> +??? return ret; >>> ? } >>> ? void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, >>> @@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop >>> *ploop, struct pio *pio) >>> ?? * another index instead of existing. If so, management of >>> ?? * old bat_entries[@clu] and of related holes_bitmap bit >>> ?? * is caller duty. 
>>> + * Caller must clear MD_UPDATING and comply to add_for_wb >>> ?? */ >>> ? int ploop_prepare_reloc_index_wb(struct ploop *ploop, >>> ?????????????????? struct md_page **ret_md, >>> ?????????????????? u32 clu, u32 *dst_clu, >>> -???????????????? struct file *file) >>> +???????????????? struct file *file, >>> +???????????????? int *add_for_wb) >>> ? { >>> ????? enum piwb_type type = PIWB_TYPE_ALLOC; >>> ????? u32 page_id = ploop_bat_clu_to_page_nr(clu); >>> ????? struct md_page *md = ploop_md_page_find(ploop, page_id); >>> ????? struct ploop_index_wb *piwb; >>> ????? int err; >>> +??? bool add_to_wblist; >>> ????? if (dst_clu) >>> ????????? type = PIWB_TYPE_RELOC; >>> ????? err = -EIO; >>> + >>> + >>> ????? spin_lock_irq(&ploop->bat_lock); >>> -??? if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, >>> &md->status)) { >>> -??????? PL_ERR("Unexpected md status: %lx", md->status); >>> +??? spin_lock(&md->md_lock); >>> +??? if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, >>> &md->status) >>> +??????? || test_bit(MD_UPDATING, &md->status)) { >>> +??????? err = -EBUSY; >>> +??????? spin_unlock(&md->md_lock); >>> ????????? goto out_error; >>> +??? } else { >>> +??????? set_bit(MD_UPDATING, &md->status); >>> ????? } >>> +??? spin_unlock(&md->md_lock); >>> + >>> ????? err = ploop_prepare_bat_update(ploop, md, type); >>> ????? if (err) >>> ????????? goto out_error; >>> +??? add_to_wblist = ploop_md_make_dirty(ploop, md); >>> + >>> ????? piwb = md->piwb; >>> ????? if (dst_clu) { >>> @@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop >>> *ploop, >>> ????? spin_unlock_irq(&ploop->bat_lock); >>> ????? *ret_md = md; >>> +??? *add_for_wb = add_to_wblist ? 1 : 0; >>> + >>> ????? return 0; >>> ? out_reset: >>> ????? ploop_break_bat_update(ploop, md, piwb); >>> ? out_error: >>> +??? if (add_to_wblist) >>> +??????? clear_bit(MD_DIRTY, &md->status); >>> +??? clear_bit(MD_UPDATING, &md->status); >>> ????? 
spin_unlock_irq(&ploop->bat_lock); >>> + >>> ????? return err; >>> ? } >>> ? ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO); >>> diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h >>> index 46450cac8c7d..db4d92c9679a 100644 >>> --- a/drivers/md/dm-ploop.h >>> +++ b/drivers/md/dm-ploop.h >>> @@ -578,6 +578,11 @@ static inline const char >>> *ploop_device_name(struct ploop *ploop) >>> ????? return ploop->ti->table->md->disk->disk_name; >>> ? } >>> +static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct >>> md_page *md) >>> +{ >>> +??? llist_add((struct llist_node *)&md->wb_link, &ploop- >>>> wb_batch_llist); >>> +} >>> + >>> ? #define PL_FMT(fmt) "ploop: %s: " fmt "\n" >>> ? #define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), >>> ploop_device_name(ploop), ##__VA_ARGS__) >>> ? #define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt), >>> ploop_device_name(ploop), ##__VA_ARGS__) >>> @@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop >>> *ploop, u32 dst_clu, >>> ????????????????????? struct pio *pio, u8 level); >>> ? extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, >>> ????????????????????? struct md_page **ret_md, u32 clu, u32 *dst_clu, >>> -??????????????????? struct file *file); >>> +??????????????????? struct file *file, int *add_for_wb); >>> ? extern void ploop_break_bat_update(struct ploop *ploop, struct >>> md_page *md, >>> ???????????????????? struct ploop_index_wb *piwb); >>> ? 
extern void ploop_index_wb_submit(struct ploop *, struct >>> ploop_index_wb *); > > From khorenko at virtuozzo.com Thu Mar 27 14:11:07 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Thu, 27 Mar 2025 12:11:07 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse kio: add safety check in kpcs_dev_ioctl() In-Reply-To: <20250321015911.89339-1-kui.liu@virtuozzo.com> Message-ID: <202503271111.52RBB74G2638739@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.21 ------> commit 8fd436febdb4022fd5679e047b1f7bc4adfbe307 Author: Liu Kui Date: Fri Mar 21 09:59:11 2025 +0800 fs/fuse kio: add safety check in kpcs_dev_ioctl() Apparently fc->kio.ctx needs to be checked before being used. However the check should be done in a way that can avoid a race condition between kpcs_dev_ioctl() and fuse_conn_destroy() where both can run concurrently. https://virtuozzo.atlassian.net/browse/VSTOR-102040 Signed-off-by: Liu Kui Feature: fuse: kRPC - single RPC for kernel and userspace --- fs/fuse/inode.c | 4 +--- fs/fuse/kio/pcs/pcs_cluster.c | 4 ++++ fs/fuse/kio/pcs/pcs_cluster.h | 3 +++ fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 29 +++++++++++++++++++++++------ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b27422d1ee38..a22e0ffb3a8f 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -2221,10 +2221,8 @@ void fuse_conn_destroy(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; - if (fc->kio.op) { /* At this point all pending kio must be completed. */ + if (fc->kio.op) /* At this point all pending kio must be completed. 
*/ fc->kio.op->conn_fini(fm); - fc->kio.ctx = NULL; - } fuse_abort_conn(fc); fuse_wait_aborted(fc); diff --git a/fs/fuse/kio/pcs/pcs_cluster.c b/fs/fuse/kio/pcs/pcs_cluster.c index c87313b90ab3..710087c83fe6 100644 --- a/fs/fuse/kio/pcs/pcs_cluster.c +++ b/fs/fuse/kio/pcs/pcs_cluster.c @@ -603,6 +603,8 @@ int pcs_cluster_init(struct pcs_fuse_cluster *pfc, struct workqueue_struct *wq, INIT_LIST_HEAD(&pfc->list); pfc->fc = fc; + atomic_set(&pfc->refcnt, 1); + init_waitqueue_head(&pfc->waitq); /* core init */ if (pcs_cc_init(&pfc->cc, wq, info->cluster_name, &attr)) @@ -617,6 +619,8 @@ int pcs_cluster_init(struct pcs_fuse_cluster *pfc, struct workqueue_struct *wq, void pcs_cluster_fini(struct pcs_fuse_cluster *pfc) { + if (!atomic_dec_and_test(&pfc->refcnt)) + wait_event(pfc->waitq, atomic_read(&pfc->refcnt) == 0); pcs_cc_fini(&pfc->cc); kvfree(pfc); } diff --git a/fs/fuse/kio/pcs/pcs_cluster.h b/fs/fuse/kio/pcs/pcs_cluster.h index 8693d1bf38d7..914d1ad7865c 100644 --- a/fs/fuse/kio/pcs/pcs_cluster.h +++ b/fs/fuse/kio/pcs/pcs_cluster.h @@ -54,6 +54,9 @@ struct pcs_fuse_cluster { struct list_head list; struct pcs_cluster_core cc; struct fuse_conn *fc; + + atomic_t refcnt; + wait_queue_head_t waitq; }; struct pcs_fuse_work { diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c index 8da9550cc156..e3049ddaa091 100644 --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c @@ -296,17 +296,19 @@ static int kpcs_conn_init(struct fuse_mount *fm) static void kpcs_conn_fini(struct fuse_mount *fm) { struct fuse_conn *fc = fm->fc; + struct pcs_fuse_cluster *pfc = READ_ONCE(fc->kio.ctx); - if (!fc->kio.ctx) + if (!pfc) return; - TRACE("%s fc:%p\n", __FUNCTION__, fc); - unregister_client(fc->kio.ctx); + + WRITE_ONCE(fc->kio.ctx, NULL); + unregister_client(pfc); synchronize_rcu(); flush_workqueue(pcs_wq); flush_workqueue(pcs_cpu_wq); flush_workqueue(pcs_cleanup_wq); - pcs_cluster_fini((struct pcs_fuse_cluster *) 
fc->kio.ctx); + pcs_cluster_fini(pfc); if (fc->ktrace) fuse_ktrace_remove(fc); @@ -1921,10 +1923,21 @@ static int kpcs_ioctl(struct file *file, struct inode *inode, unsigned int cmd, static int kpcs_dev_ioctl(struct fuse_conn *fc, unsigned int cmd, unsigned long arg, int len) { - struct pcs_fuse_cluster *pfc = fc->kio.ctx; - struct pcs_cluster_core *cc = &pfc->cc; + struct pcs_fuse_cluster *pfc; + struct pcs_cluster_core *cc; int res; + rcu_read_lock(); + pfc = READ_ONCE(fc->kio.ctx); + if (!pfc) { + rcu_read_unlock(); + return -EINVAL; + } + atomic_inc(&pfc->refcnt); + rcu_read_unlock(); + + cc = &pfc->cc; + switch (cmd) { case PCS_IOC_KRPC_CREATE: { @@ -2007,6 +2020,10 @@ static int kpcs_dev_ioctl(struct fuse_conn *fc, unsigned int cmd, unsigned long res = -ENOIOCTLCMD; break; } + + if (atomic_dec_and_test(&pfc->refcnt)) + wake_up(&pfc->waitq); + return res; } From alexander.atanasov at virtuozzo.com Thu Mar 27 15:18:05 2025 From: alexander.atanasov at virtuozzo.com (Alexander Atanasov) Date: Thu, 27 Mar 2025 14:18:05 +0200 Subject: [Devel] [PATCH vz9 v2] dm-ploop: fix resize and grow to use the new way of updating md pages Message-ID: <20250327121809.2980593-1-alexander.atanasov@virtuozzo.com> We missed the fact that resize/grow touch md0 page to update ploop parameters, other pages are not linked so there is no issue. But in case there is a concurrent update to md0 piwb is not handled correctly, also using sync updates in parallel is not okay. To fix this update code to use the new mechanism with MD_UPDATING flag and instead of using sync operations pass the updates to runner threads. 
https://virtuozzo.atlassian.net/browse/VSTOR-101871 Signed-off-by: Alexander Atanasov --- drivers/md/dm-ploop-cmd.c | 39 ++++++++++++++++++++++++++++++++------- drivers/md/dm-ploop-map.c | 37 ++++++++++++++++++++++++++++--------- drivers/md/dm-ploop.h | 7 ++++++- 3 files changed, 66 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c index d2eb4797ab6e..7d79b900eb1a 100644 --- a/drivers/md/dm-ploop-cmd.c +++ b/drivers/md/dm-ploop-cmd.c @@ -286,6 +286,8 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, struct md_page *md; bool is_locked; int ret = 0; + int tries = 5; + int add_for_wb = 0; dst_clu = cmd->resize.dst_clu; @@ -308,6 +310,7 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, } spin_unlock_irq(&ploop->bat_lock); +reread: /* Read full clu sync */ ret = ploop_read_cluster_sync(ploop, pio, dst_clu); if (ret < 0) { @@ -316,8 +319,13 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, } ret = ploop_prepare_reloc_index_wb(ploop, &md, clu, &new_dst, - ploop_top_delta(ploop)->file); + ploop_top_delta(ploop)->file, + &add_for_wb); if (ret < 0) { + if (ret == -EBUSY && tries-- > 0) { + PL_ERR("md0 busy, retry:%d\n", tries); + goto reread; + } PL_ERR("reloc: can't prepare it: %d", ret); goto out; } @@ -332,13 +340,16 @@ static int ploop_grow_relocate_cluster(struct ploop *ploop, goto out; } - set_bit(MD_WRITEBACK, &md->status); init_completion(&comp); piwb->comp = ∁ piwb->comp_bi_status = &bi_status; /* Write new index on disk */ - ploop_index_wb_submit(ploop, piwb); + ploop_disable_writeback_delay(ploop); + if (add_for_wb) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); wait_for_completion(&comp); + ploop_enable_writeback_delay(ploop); ret = blk_status_to_errno(bi_status); if (ret) { @@ -378,12 +389,22 @@ static int ploop_grow_update_header(struct ploop *ploop, struct md_page *md; u64 sectors; int ret; + int tries = 5; + int add_for_wb = false; +retry: /* 
hdr is in the same page as bat_entries[0] index */ ret = ploop_prepare_reloc_index_wb(ploop, &md, 0, NULL, - ploop_top_delta(ploop)->file); - if (ret) + ploop_top_delta(ploop)->file, + &add_for_wb); + if (ret) { + if (ret == -EBUSY && tries-- > 0) { + PL_ERR("md0 busy, retry:%d\n", tries); + schedule(); + goto retry; + } return ret; + } piwb = md->piwb; size = (PLOOP_MAP_OFFSET + cmd->resize.nr_bat_entries); @@ -398,12 +419,16 @@ static int ploop_grow_update_header(struct ploop *ploop, offset = hdr->m_FirstBlockOffset = cpu_to_le32(first_block_off); kunmap_local(hdr); - set_bit(MD_WRITEBACK, &md->status); init_completion(&comp); piwb->comp = ∁ piwb->comp_bi_status = &bi_status; - ploop_index_wb_submit(ploop, piwb); + + ploop_disable_writeback_delay(ploop); + if (add_for_wb) + ploop_add_dirty_for_wb(ploop, md); + clear_bit(MD_UPDATING, &md->status); wait_for_completion(&comp); + ploop_enable_writeback_delay(ploop); ret = blk_status_to_errno(bi_status); if (!ret) { diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index ef280a8b0f90..04e71c851b64 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -613,11 +613,6 @@ static void ploop_unlink_completed_pio(struct ploop *ploop, struct pio *pio) ploop_dispatch_pios(ploop, NULL, &pio_list); } -static void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) -{ - llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); -} - static bool ploop_md_make_dirty(struct ploop *ploop, struct md_page *md) { bool new = false; @@ -1031,6 +1026,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, struct pio *pio; map_index_t *to; u8 level; + int ret = -ENOMEM; lockdep_assert_held(&ploop->bat_lock); @@ -1050,6 +1046,9 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, if (WARN_ON(md->piwb)) { PL_ERR("md %p has piwb: %p type:%d ourtype:%d\n", md, md->piwb, md->piwb->type, type); + spin_unlock(&md->md_lock); + ret = 
-EBUSY; + goto err; } md->piwb = piwb; piwb->md = md; @@ -1094,7 +1093,7 @@ static int ploop_prepare_bat_update(struct ploop *ploop, struct md_page *md, return 0; err: ploop_free_piwb(piwb); - return -ENOMEM; + return ret; } void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, @@ -2692,31 +2691,45 @@ static void ploop_handle_cleanup(struct ploop *ploop, struct pio *pio) * another index instead of existing. If so, management of * old bat_entries[@clu] and of related holes_bitmap bit * is caller duty. + * Caller must clear MD_UPDATING and comply to add_for_wb */ int ploop_prepare_reloc_index_wb(struct ploop *ploop, struct md_page **ret_md, u32 clu, u32 *dst_clu, - struct file *file) + struct file *file, + int *add_for_wb) { enum piwb_type type = PIWB_TYPE_ALLOC; u32 page_id = ploop_bat_clu_to_page_nr(clu); struct md_page *md = ploop_md_page_find(ploop, page_id); struct ploop_index_wb *piwb; int err; + bool add_to_wblist; if (dst_clu) type = PIWB_TYPE_RELOC; err = -EIO; + + spin_lock_irq(&ploop->bat_lock); - if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status)) { - PL_ERR("Unexpected md status: %lx", md->status); + spin_lock(&md->md_lock); + if (test_bit(MD_DIRTY, &md->status) || test_bit(MD_WRITEBACK, &md->status) + || test_bit(MD_UPDATING, &md->status)) { + err = -EBUSY; + spin_unlock(&md->md_lock); goto out_error; + } else { + set_bit(MD_UPDATING, &md->status); } + spin_unlock(&md->md_lock); + err = ploop_prepare_bat_update(ploop, md, type); if (err) goto out_error; + add_to_wblist = ploop_md_make_dirty(ploop, md); + piwb = md->piwb; if (dst_clu) { @@ -2734,12 +2747,18 @@ int ploop_prepare_reloc_index_wb(struct ploop *ploop, spin_unlock_irq(&ploop->bat_lock); *ret_md = md; + *add_for_wb = add_to_wblist ? 
1 : 0; + return 0; out_reset: ploop_break_bat_update(ploop, md, piwb); out_error: + if (add_to_wblist) + clear_bit(MD_DIRTY, &md->status); + clear_bit(MD_UPDATING, &md->status); spin_unlock_irq(&ploop->bat_lock); + return err; } ALLOW_ERROR_INJECTION(ploop_prepare_reloc_index_wb, ERRNO); diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 46450cac8c7d..db4d92c9679a 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -578,6 +578,11 @@ static inline const char *ploop_device_name(struct ploop *ploop) return ploop->ti->table->md->disk->disk_name; } +static inline void ploop_add_dirty_for_wb(struct ploop *ploop, struct md_page *md) +{ + llist_add((struct llist_node *)&md->wb_link, &ploop->wb_batch_llist); +} + #define PL_FMT(fmt) "ploop: %s: " fmt "\n" #define PL_ERR(fmt, ...) pr_err(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__) #define PL_ERR_ONCE(fmt, ...) pr_err_once(PL_FMT(fmt), ploop_device_name(ploop), ##__VA_ARGS__) @@ -612,7 +617,7 @@ extern void ploop_map_and_submit_rw(struct ploop *ploop, u32 dst_clu, struct pio *pio, u8 level); extern int ploop_prepare_reloc_index_wb(struct ploop *ploop, struct md_page **ret_md, u32 clu, u32 *dst_clu, - struct file *file); + struct file *file, int *add_for_wb); extern void ploop_break_bat_update(struct ploop *ploop, struct md_page *md, struct ploop_index_wb *piwb); extern void ploop_index_wb_submit(struct ploop *, struct ploop_index_wb *); -- 2.43.0 From kui.liu at virtuozzo.com Thu Mar 27 17:45:52 2025 From: kui.liu at virtuozzo.com (Liu Kui) Date: Thu, 27 Mar 2025 22:45:52 +0800 Subject: [Devel] [PATCH VZ9] fs/fuse kio: convert BUG_ON check to error handling Message-ID: <20250327144552.7578-1-kui.liu@virtuozzo.com> The BUG_ON condition here is legitimate that can happen during fuse connection teardown. So a fatal error should be returned instead of crash. 
Fixes #VSTOR-102865 Signed-off-by: Liu Kui --- fs/fuse/kio/pcs/pcs_krpc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c index 115bd67aa0d2..323205a3e2df 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.c +++ b/fs/fuse/kio/pcs/pcs_krpc.c @@ -486,7 +486,10 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se chunk->type = KRPC_CHUNK_TYPE_MR; chunk->mr = pcs_mr_get(&cc_from_krpc(krpc)->mrs, iocmsg->hdr_chunk.mr_id); - BUG_ON(!chunk->mr); + if (!chunk->mr) { + res = -ENXIO; + goto err_free_data_chunk; + } kreq->hdr_buf = (char *) kmap(pcs_umem_page(chunk->mr->umem, chunk->addr)); kreq->hdr_kv.iov_base = kreq->hdr_buf; @@ -595,7 +598,8 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se err_free_data_chunk: kreq_release_data_chunks(kreq); - pcs_mr_put(kreq->hdr_chunk.mr); + if (kreq->hdr_chunk.mr) + pcs_mr_put(kreq->hdr_chunk.mr); err_free_kreq: krpc_req_free(kreq); -- 2.39.5 (Apple Git-154) From kuznet at virtuozzo.com Thu Mar 27 20:59:20 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 01:59:20 +0800 Subject: [Devel] [PATCH VZ9] fs/fuse kio: convert BUG_ON check to error handling In-Reply-To: <20250327144552.7578-1-kui.liu@virtuozzo.com> References: <20250327144552.7578-1-kui.liu@virtuozzo.com> Message-ID: Ack On Thu, Mar 27, 2025 at 10:56?PM Liu Kui wrote: > > The BUG_ON condition here is legitimate that can happen during fuse > connection teardown. So a fatal error should be returned instead of > crash. 
> > Fixes #VSTOR-102865 > > Signed-off-by: Liu Kui > --- > fs/fuse/kio/pcs/pcs_krpc.c | 8 ++++++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c > index 115bd67aa0d2..323205a3e2df 100644 > --- a/fs/fuse/kio/pcs/pcs_krpc.c > +++ b/fs/fuse/kio/pcs/pcs_krpc.c > @@ -486,7 +486,10 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se > chunk->type = KRPC_CHUNK_TYPE_MR; > > chunk->mr = pcs_mr_get(&cc_from_krpc(krpc)->mrs, iocmsg->hdr_chunk.mr_id); > - BUG_ON(!chunk->mr); > + if (!chunk->mr) { > + res = -ENXIO; > + goto err_free_data_chunk; > + } > > kreq->hdr_buf = (char *) kmap(pcs_umem_page(chunk->mr->umem, chunk->addr)); > kreq->hdr_kv.iov_base = kreq->hdr_buf; > @@ -595,7 +598,8 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se > > err_free_data_chunk: > kreq_release_data_chunks(kreq); > - pcs_mr_put(kreq->hdr_chunk.mr); > + if (kreq->hdr_chunk.mr) > + pcs_mr_put(kreq->hdr_chunk.mr); > > err_free_kreq: > krpc_req_free(kreq); > -- > 2.39.5 (Apple Git-154) From khorenko at virtuozzo.com Fri Mar 28 00:19:15 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Thu, 27 Mar 2025 22:19:15 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse kio: convert BUG_ON check to error handling In-Reply-To: <20250327144552.7578-1-kui.liu@virtuozzo.com> Message-ID: <202503272119.52RLJF2r2666141@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.22 ------> commit c7c6b9d910e80fdbfb43e1abf057e6f00358f51b Author: Liu Kui Date: Thu Mar 27 22:45:52 2025 +0800 fs/fuse kio: convert BUG_ON check to error handling The BUG_ON condition here is legitimate that can happen during fuse connection teardown. So a fatal error should be returned instead of crash. 
Fixes: 0f8ffab52203 ("fs/fuse kio: implement pcs_krpc - export kernel RPC to userspace") https://virtuozzo.atlassian.net/browse/VSTOR-102865 Signed-off-by: Liu Kui Acked-by: Alexey Kuznetsov Feature: fuse: kRPC - single RPC for kernel and userspace --- fs/fuse/kio/pcs/pcs_krpc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c index 115bd67aa0d2..323205a3e2df 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.c +++ b/fs/fuse/kio/pcs/pcs_krpc.c @@ -486,7 +486,10 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se chunk->type = KRPC_CHUNK_TYPE_MR; chunk->mr = pcs_mr_get(&cc_from_krpc(krpc)->mrs, iocmsg->hdr_chunk.mr_id); - BUG_ON(!chunk->mr); + if (!chunk->mr) { + res = -ENXIO; + goto err_free_data_chunk; + } kreq->hdr_buf = (char *) kmap(pcs_umem_page(chunk->mr->umem, chunk->addr)); kreq->hdr_kv.iov_base = kreq->hdr_buf; @@ -595,7 +598,8 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se err_free_data_chunk: kreq_release_data_chunks(kreq); - pcs_mr_put(kreq->hdr_chunk.mr); + if (kreq->hdr_chunk.mr) + pcs_mr_put(kreq->hdr_chunk.mr); err_free_kreq: krpc_req_free(kreq); From kuznet at virtuozzo.com Fri Mar 28 15:00:42 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:00:42 +0800 Subject: [Devel] [PATCH VZ9 1/7] fs/fuse/kio: correct return of error from io backed to rpc Message-ID: Backport from user space. The bug is unlikely to see in kernel and we have never observe it. Yet, it is better to keep this place clean. Cloning comment from corresponsing user space commit: We used to have an ugly problem there, when ->done is called from IO backend which is not aware about rpc logic it considered all the errors as local. 
The damage at least on client->cs rpc users is severe, local errors are considered not inflicted by failing cluster neighbors, but blamed on local host, so that cluster recovery process is not triggered. It exposed itself big time on KRPC (in user space), but it can be dangerous for plain sock/rdma backends too. The bug is ancient, it was present since day zero, but we never noticed it, because error of this kind are very rare with TCP: write is non-blocking as rule, we must have filled sndbuf when the socket aborts and to lose error we should not have any uncompleted rpc requests, as error for them will trigger correct path. Not easy to see the bug, yet possible. Let's not overthink this, it is enough to fix the issue in client-cs path, which can be done with special kludge in cs_sent. NOTE: special ugly exception for PCS_ERR_NOMEM is inherited from older pcs_set_rpc_error which in fact serves the same function in rpc context. Affects: #VSTOR-100586 Signed-off-by: Alexey Kuznetsov --- fs/fuse/kio/pcs/pcs_cs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c index 0707575..ad398ac 100644 --- a/fs/fuse/kio/pcs/pcs_cs.c +++ b/fs/fuse/kio/pcs/pcs_cs.c @@ -551,6 +551,10 @@ static void cs_sent(struct pcs_msg *msg) { msg->done = cs_response_done; if (pcs_if_error(&msg->error)) { + if (msg->rpc && !msg->error.remote && msg->error.value != PCS_ERR_NOMEM) { + msg->error.remote = 1; + msg->error.offender = msg->rpc->peer_id; + } msg->done(msg); return; } -- 1.8.3.1 From kuznet at virtuozzo.com Fri Mar 28 15:00:47 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:00:47 +0800 Subject: [Devel] [PATCH VZ9 2/7] fs/fuse/kio: some pages were not unlocked while revoke Message-ID: Request migrates from fiq to fpq for awhile it is out of revocation lists. So, we must recheck revoke status. The bug is old. 
Affects: #VSTOR-100953 Signed-off-by: Alexey Kuznetsov --- fs/fuse/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 118613f..b437b8d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1421,6 +1421,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto out_end; } + if (args->page_cache && args->inode) { + struct fuse_inode *fi = get_fuse_inode(args->inode); + + if (test_bit(FUSE_I_INVAL_FILES, &fi->state) || args->killed) { + req->out.h.error = -EIO; + err = -EAGAIN; + goto out_end; + } + } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; -- 1.8.3.1 From kuznet at virtuozzo.com Fri Mar 28 15:00:53 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:00:53 +0800 Subject: [Devel] [PATCH VZ9 3/7] fs/fuse: keep req page refcnt sane Message-ID: It was _our_ ancient patch, fb7ae3cf4ca1e052335b94b86b9f43f09b9740f0, subj: "fuse: fuse_prepare_write() cannot handle page from killed request" The patch was entirely crazy, I would veto it a moment I have seen it, but obviously I was distracted and this abomination entered the kernel from behind. :-) So, do not mangle original queued request, only the requestor is allowed to do this. Do not put pages held by original request, only the requestor may do this. Yet, keep unlocking pages with proper error status, this is dirty and requires attention from the requestor to avoid double unlock, yet we do not have any alternative solution. Also, an ancient page leakage has been found, which was difficult to detect, since invalidate_inode_pages2 forcibly detached pages from page cache and they live in dormant state. 
Affects: #VSTOR-100953 Signed-off-by: Alexey Kuznetsov --- fs/fuse/file.c | 31 +++++++++++++++---------------- fs/fuse/fuse_i.h | 5 +---- fs/fuse/inode.c | 5 ----- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4b701af..cfc5da8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1115,7 +1115,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, } static int fuse_do_readpage(struct file *file, struct page *page, - bool page_needs_release, bool *killed_p) + bool *killed_p) { struct inode *inode = page->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); @@ -1128,7 +1128,6 @@ static int fuse_do_readpage(struct file *file, struct page *page, .ap.pages = &page, .ap.descs = &desc, .ap.args.page_cache = 1, - .ap.args.page_needs_release = page_needs_release, }; ssize_t res; u64 attr_ver; @@ -1179,7 +1178,7 @@ static int fuse_read_folio(struct file *file, struct folio *folio) if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page, false, &killed); + err = fuse_do_readpage(file, page, &killed); fuse_invalidate_atime(inode); out: if (!killed) @@ -1207,14 +1206,15 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; + int killed = args->killed; struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; struct inode *inode = args->inode; - if (args->killed) - goto killed; + if (unlikely(killed)) + err = -EIO; /* * Short read means EOF. 
If file size is larger, truncate it @@ -1225,14 +1225,15 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, for (i = 0; i < ap->num_pages; i++) { struct page *page = ap->pages[i]; - if (!err) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); + if (likely(!killed)) { + if (!err) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + } put_page(page); } -killed: fuse_invalidate_atime(inode); if (ia->ff) @@ -1255,7 +1256,6 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) ap->args.page_zeroing = true; ap->args.page_replace = true; ap->args.page_cache = 1; - ap->args.page_needs_release = false; /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { @@ -2732,7 +2732,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, zero_user_segment(page, 0, off); goto success; } - err = fuse_do_readpage(file, page, true, &killed); + err = fuse_do_readpage(file, page, &killed); if (err) goto cleanup; success: @@ -2740,10 +2740,9 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; cleanup: - if (!killed) { + if (!killed) unlock_page(page); - put_page(page); - } + put_page(page); error: return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 3548828..28c495c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -324,10 +324,7 @@ struct fuse_args { /** Request contains pages from page-cache */ unsigned page_cache:1; - /** Request pages need page_cache_release() */ - unsigned page_needs_release:1; - - /** Request was killed -- pages were released */ + /** Request was killed -- pages were unlocked */ unsigned killed:1; struct inode *io_inode; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a22e0ff..82baa1d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -502,12 +502,7 @@ void fuse_kill_requests(struct fuse_conn *fc, struct inode *inode, struct page *page = ia->ap.pages[i]; 
SetPageError(page); unlock_page(page); - if (req->args->page_needs_release) + put_page(page); - ia->ap.pages[i] = NULL; } - - ia->ap.num_pages = 0; } } EXPORT_SYMBOL_GPL(fuse_kill_requests); -- 1.8.3.1 From kuznet at virtuozzo.com Fri Mar 28 15:00:58 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:00:58 +0800 Subject: [Devel] [PATCH VZ9 4/7] fs/fuse/kio: keep kio requests on revoke list at all times Message-ID: Before kio request is submitted downstream it is off the revoke list. This was a bug, but it was directly required by bogus revoke logic which stole page cache pages from requests. So, we processed pages while the req was isolated from revocation, which would result in deadlock. Now we keep pages accounted as they should, so we can move adding to revocation list as early as possible. Note: also this means requests queued for expand/truncate were invisible to pending request list, now we fix this. Note2: we should consider adding request to revocation list immediately when it is created and becomes parsable. This would make fuse_invalidate_files() a lot simpler and less bug prone, yet there is danger to clash with place where abort is skipped intentionally due to being off the lists. This work is already done, but it is still not in a state appropriate for release. 
Affects: #VSTOR-100953 Signed-off-by: Alexey Kuznetsov --- fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 44 +++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c index e3049dd..1b3bc8f 100644 --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c @@ -849,14 +849,18 @@ static bool kqueue_insert(struct pcs_dentry_info *di, struct fuse_req *req) return true; } +static void kqueue_remove(struct pcs_dentry_info *di, struct fuse_req *req) +{ + spin_lock(&di->kq_lock); + list_del_init(&req->list); + spin_unlock(&di->kq_lock); +} + static inline int req_wait_grow_queue(struct pcs_fuse_req *r, off_t offset, size_t size) { struct pcs_dentry_info *di = get_pcs_inode(r->req.args->io_inode); struct fuse_inode *fi = get_fuse_inode(r->req.args->io_inode); - if (!kqueue_insert(di, &r->req)) - return -EIO; - BUG_ON(r->req.in.h.opcode != FUSE_WRITE && r->req.in.h.opcode != FUSE_FALLOCATE); fuse_write_dio_begin(fi); @@ -980,9 +984,10 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r) BUG(); } - if (!kqueue_insert(di, req)) + if (req->args->ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->args->ff->ff_state)) ret = -EIO; - else if (req->in.h.opcode == FUSE_READ || req->in.h.opcode == FUSE_FSYNC || req->in.h.opcode == FUSE_FLUSH) + else if (req->in.h.opcode == FUSE_READ || req->in.h.opcode == FUSE_FSYNC || + req->in.h.opcode == FUSE_FLUSH) fuse_read_dio_begin(fi); else fuse_write_dio_begin(fi); @@ -992,14 +997,13 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r) return ret; } -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) +static int pcs_fuse_submit_prepare(struct pcs_fuse_cluster *pfc, struct fuse_req *req) { struct pcs_fuse_req *r = pcs_req_from_fuse(req); struct fuse_args *args = req->args; struct fuse_inode *fi = get_fuse_inode(args->io_inode); struct pcs_dentry_info *di = pcs_inode_from_fuse(fi); - struct 
pcs_int_request* ireq; - int ret; + int ret = 0; BUG_ON(!di); BUG_ON(req->cache != pcs_fuse_req_cachep); @@ -1007,8 +1011,21 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) /* Init pcs_fuse_req */ memset(&r->exec, 0, sizeof(r->exec)); /* Use inline request structure */ - ireq = &r->exec.ireq; - ireq_init(di, ireq); + ireq_init(di, &r->exec.ireq); + + spin_lock(&di->lock); + if (!kqueue_insert(di, req)) + ret = -EIO; + spin_unlock(&di->lock); + return ret; +} + +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) +{ + struct pcs_fuse_req *r = pcs_req_from_fuse(req); + struct pcs_int_request *ireq = &r->exec.ireq; + struct pcs_dentry_info *di = ireq->dentry; + int ret; switch (req->in.h.opcode) { case FUSE_WRITE: @@ -1026,7 +1043,7 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) } break; case FUSE_FALLOCATE: { - struct fuse_fallocate_in *inarg = (void*) args->in_args[0].value; + struct fuse_fallocate_in *inarg = (void *)req->args->in_args[0].value; size_t sz = READ_ONCE(di->fileinfo.attr.size); if (pfc->fc->no_fallocate) { @@ -1062,7 +1079,7 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) * and as i_size is still not advanced all the following ones are. 
*/ WARN_ON_ONCE(inarg->offset + inarg->length > sz && - !inode_is_locked(&fi->inode)); + !inode_is_locked(req->args->io_inode)); } ret = pcs_fuse_prep_rw(r); @@ -1097,6 +1114,7 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) error: DTRACE("do fuse_request_end req:%p op:%d err:%d\n", req, req->in.h.opcode, req->out.h.error); + kqueue_remove(di, req); __fuse_request_end(req, false); return; @@ -1320,6 +1338,8 @@ static void kpcs_req_send(struct fuse_req *req, bool bg) refcount_inc(&req->count); __clear_bit(FR_PENDING, &req->flags); + pcs_fuse_submit_prepare(pfc, req); + pcs_fuse_submit(pfc, req); if (!bg) wait_event(req->waitq, -- 1.8.3.1 From kuznet at virtuozzo.com Fri Mar 28 15:01:03 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:01:03 +0800 Subject: [Devel] [PATCH VZ9 5/7] fs/fuse/kio: tidy up RPC_AFFINITY_RSS Message-ID: Also, enable it for rdma (tested with mellanox) and unix sockets. Now it provides essentially perfect affinity when socket contexts never hit lock contention and cache bouncing provided RSS and XPS are configured correctly. Change fallback when rx_cpu is not available from RPC_AFFINITY_RETENT to RPC_AFFINITY_FAIR_SPREAD. Unfortunatley, we cannot enable it by default, since enabling RSS/XPS is an advanced performance tuning. 
Also, change fallback when rx_cpu is unknown from RPC_AFFINITY_RETENT to RPC_AFFINITY_FAIR_SPREAD Signed-off-by: Alexey Kuznetsov --- fs/fuse/kio/pcs/pcs_rdma_io.c | 1 + fs/fuse/kio/pcs/pcs_rpc.c | 25 ++++++++++++++++--------- fs/fuse/kio/pcs/pcs_rpc.h | 1 + fs/fuse/kio/pcs/pcs_sock_io.c | 5 ++--- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c index 2755b13..d50f2c1 100644 --- a/fs/fuse/kio/pcs/pcs_rdma_io.c +++ b/fs/fuse/kio/pcs/pcs_rdma_io.c @@ -1096,6 +1096,7 @@ static void pcs_rdma_cq_comp_handler(struct ib_cq *cq, void *private) set_bit(PCS_RDMA_IO_CQE, &rio->io_flags); wake_up(&rio->waitq); + ep->rx_cpu = smp_processor_id(); pcs_rpc_kick_queue(ep); } diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c index b9774ce1..71c2a3b 100644 --- a/fs/fuse/kio/pcs/pcs_rpc.c +++ b/fs/fuse/kio/pcs/pcs_rpc.c @@ -339,6 +339,7 @@ void pcs_rpc_attach_new_ep(struct pcs_rpc * ep, struct pcs_rpc_engine * eng) atomic_set(&ep->netlat_cnt, 0); atomic64_set(&ep->netlat_avg, 0); ep->cpu = WORK_CPU_UNBOUND; + ep->rx_cpu = WORK_CPU_UNBOUND; ep->gc = NULL; if (eng->max_gc_index) @@ -863,27 +864,33 @@ static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle) ep->cpu = WORK_CPU_UNBOUND; } break; - case RPC_AFFINITY_RSS: - if (!(ep->flags & PCS_RPC_F_LOCAL) && ep->addr.type != PCS_ADDRTYPE_RDMA) - break; + case RPC_AFFINITY_RSS: { + int rx_cpu = READ_ONCE(ep->rx_cpu); + + if (rx_cpu != WORK_CPU_UNBOUND && ep->cpu != rx_cpu) + ep->cpu = rx_cpu; fallthrough; + } + case RPC_AFFINITY_FAIR_SPREAD: + if (ep->cpu == WORK_CPU_UNBOUND || + (time_is_before_jiffies(ep->cpu_stamp) && was_idle)) + pcs_rpc_cpu_select(ep); + break; case RPC_AFFINITY_RETENT: /* Naive socket-to-cpu binding approach */ - if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) { + if (ep->cpu == WORK_CPU_UNBOUND || + (time_is_before_jiffies(ep->cpu_stamp) && was_idle)) { ep->cpu_stamp = jiffies + rpc_cpu_time_slice; ep->cpu = 
smp_processor_id(); } break; case RPC_AFFINITY_SPREAD: - if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) { + if (ep->cpu == WORK_CPU_UNBOUND || + (time_is_before_jiffies(ep->cpu_stamp) && was_idle)) { ep->cpu_stamp = jiffies + rpc_cpu_time_slice; ep->cpu = pcs_rpc_cpu_next(); } break; - case RPC_AFFINITY_FAIR_SPREAD: - if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) - pcs_rpc_cpu_select(ep); - break; default: pr_err("Unknown affinity mode: %u\n", rpc_affinity_mode); } diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h index cb18557..0bafc8a 100644 --- a/fs/fuse/kio/pcs/pcs_rpc.h +++ b/fs/fuse/kio/pcs/pcs_rpc.h @@ -142,6 +142,7 @@ struct pcs_rpc int cpu; unsigned long cpu_stamp; struct delayed_work cpu_timer_work; /* reset cpu affinity after being idle */ + int rx_cpu; struct mutex mutex; u64 accounted; diff --git a/fs/fuse/kio/pcs/pcs_sock_io.c b/fs/fuse/kio/pcs/pcs_sock_io.c index 7c62f48..805b8f1 100644 --- a/fs/fuse/kio/pcs/pcs_sock_io.c +++ b/fs/fuse/kio/pcs/pcs_sock_io.c @@ -561,9 +561,8 @@ static void pcs_sk_kick_queue(struct sock *sk) sio = rcu_dereference_sk_user_data(sk); if (sio) { struct pcs_rpc *ep = sio->netio.parent; - TRACE(PEER_FMT" queue cpu=%d\n", PEER_ARGS(ep), smp_processor_id()); - if (rpc_affinity_mode == RPC_AFFINITY_RSS && !(ep->flags & PCS_RPC_F_LOCAL)) - ep->cpu = smp_processor_id(); + DTRACE(PEER_FMT" queue cpu=%d\n", PEER_ARGS(ep), smp_processor_id()); + ep->rx_cpu = smp_processor_id(); pcs_rpc_kick_queue(ep); } rcu_read_unlock(); -- 1.8.3.1 From kuznet at virtuozzo.com Fri Mar 28 15:01:08 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:01:08 +0800 Subject: [Devel] [PATCH VZ9 6/7] fs/fuse/kio: create krpc request in special thread Message-ID: Overhead of mapping rpc request is pretty high. And in cases when vstorage-mount even loop is saturated it makes sense to create shadow kernel thread which mm/files shared with user space. 
This is one series of patches (others are in user space), which increase raid read iops more than twice. It is en/disabled with module parameter "pcs_krpc_use_thread", which can be tuned at run time. Additionally, the patch fixes some old bugs of various fatality found during development and testing: buffer overflow and wrong error code returned. This part is the reason we push the patch to release, it is too intertangled with new request processing to be considered standalone. The patch is combo of two patches, merged together because the second one moves chunks of code around and it does not make sense to know about these bowel movement. Comment from the second patch: fs/fuse kio: properly return errors from sendmsg over kRPC Unifies the return of errors in sendmsg by passing the error as a result of krpc request completion. The error will then be returned to userspace in recvmsg. However a linux error will be returned as return value of ioctl call, whereas a pcs error will be returned as the result of recvmsg. 
Signed-off-by: Alexey Kuznetsov Signed-off-by: Liu Kui --- fs/fuse/kio/pcs/pcs_krpc.c | 145 ++++++++++++++++++++++++++++++++++++++------- fs/fuse/kio/pcs/pcs_krpc.h | 7 ++- 2 files changed, 130 insertions(+), 22 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c index 323205a..a0e0799 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.c +++ b/fs/fuse/kio/pcs/pcs_krpc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "pcs_types.h" @@ -24,6 +25,10 @@ module_param(pcs_krpc_csaccel, uint, 0644); MODULE_PARM_DESC(pcs_krpc_csaccel, "Enable krpc local cs bypass"); +unsigned int pcs_krpc_use_thread = 1; +module_param(pcs_krpc_use_thread, uint, 0644); +MODULE_PARM_DESC(pcs_krpc_use_thread, "Offload creating the request to a thread"); + extern unsigned int pcs_krpc_version; struct kmem_cache *krpc_req_cachep; @@ -82,7 +87,8 @@ static void krpc_req_complete(struct krpc_req *kreq, int error) comp->result = error; kreq_release_data_chunks(kreq); - pcs_mr_put(kreq->hdr_chunk.mr); + if (kreq->hdr_chunk.mr) + pcs_mr_put(kreq->hdr_chunk.mr); spin_lock(&krpc->lock); list_del(&kreq->link); @@ -292,7 +298,10 @@ static int pcs_krpc_ioctl_recv_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_re if (copy_to_user((void __user *)iocmsg->buf.addr, comp->_data_buf, comp->data_len)) res = -EFAULT; } else { /* response */ - iocmsg->result = comp->result; + if (comp->result >= 0) + iocmsg->result = comp->result; + else + res = comp->result; /* internal error */ } krpc_completion_free(comp); @@ -449,25 +458,23 @@ static int try_local_bypass(struct pcs_krpc *krpc, struct krpc_req *kreq) return 0; } -static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_sendmsg *iocmsg) +static int kreq_make_sendmsg(struct krpc_req *kreq) { - struct krpc_req *kreq; + struct pcs_krpc *krpc = kreq->krpc; struct pcs_msg *msg; struct pcs_krpc_buf_desc *chunk_bd; struct krpc_chunk *chunk; int res, i; struct bio_vec *bvec; + struct pcs_krpc_ioc_sendmsg 
*iocmsg; - kreq = krpc_req_alloc(); - if (!kreq) - return -ENOMEM; + iocmsg = &kreq->iocmsg; if (iocmsg->nr_data_chunks > NR_KRPC_DATA_CHUNKS_INLINE) { - kreq->data_chunks = kzalloc(iocmsg->nr_data_chunks, GFP_NOIO); - if (!kreq->data_chunks) { - res = -ENOMEM; - goto err_free_kreq; - } + kreq->data_chunks = kcalloc(iocmsg->nr_data_chunks, sizeof(struct krpc_chunk), + GFP_NOIO); + if (!kreq->data_chunks) + return -ENOMEM; } else { kreq->data_chunks = &kreq->inline_data_chunks[0]; } @@ -536,7 +543,7 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se chunk->addr = chunk_bdzc->offset; chunk->req = fuse_dev_find_request(chunk_bdzc->devfd, chunk_bdzc->unique); if (!chunk->req || chunk->req->args->killed) { - res = PCS_ERR_NET; + res = PCS_ERR_INV_PARAMS; goto err_free_data_chunk; } break; @@ -580,7 +587,6 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se goto err_free_data_chunk; } atomic_inc(&krpc->iocount); - kreq->krpc = pcs_krpc_get(krpc); list_add_tail(&kreq->link, &krpc->pending_queue); spin_unlock(&krpc->lock); @@ -600,12 +606,93 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se kreq_release_data_chunks(kreq); if (kreq->hdr_chunk.mr) pcs_mr_put(kreq->hdr_chunk.mr); - -err_free_kreq: - krpc_req_free(kreq); return res; } +static void kreq_submit(struct krpc_req *kreq) +{ + int res; + + res = kreq_make_sendmsg(kreq); + if (res) { + kreq->data_chunks = &kreq->inline_data_chunks[0]; + kreq->data_len = 0; + kreq->nr_data_chunks = 0; + kreq->nr_data_bvecs = 0; + kreq->hdr_chunk.mr = NULL; + kreq->completion.xid = kreq->iocmsg.xid; + kreq->completion.private = kreq; + INIT_LIST_HEAD(&kreq->link); + + krpc_req_complete(kreq, res); + } +} + +static int krpc_threadfn(void *data) +{ + struct pcs_krpc_set *krpcs = data; + + for (;;) { + struct llist_node *ll; + + set_current_state(TASK_INTERRUPTIBLE); + + ll = llist_del_all(&krpcs->req_llist); + + if (ll == NULL) { + if 
(kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + schedule(); + continue; + } + + __set_current_state(TASK_RUNNING); + + while (ll) { + struct llist_node *next = ll->next; + struct krpc_req *kreq = container_of(ll, struct krpc_req, llist_link); + + kreq_submit(kreq); + + ll = next; + } + } +} + +static int pcs_krpc_ioctl_send_msg(struct krpc_req *kreq) +{ + struct task_struct *tsk; + struct pcs_cluster_core *cc; + + if (pcs_krpc_use_thread) { + cc = container_of(kreq->krpc->krpcs, struct pcs_cluster_core, krpcs); + tsk = cc->krpcs.krpc_task; + if (unlikely(tsk == NULL)) { + tsk = kthread_create(krpc_threadfn, &cc->krpcs, "krpc_send"); + if (tsk && !IS_ERR(tsk)) { + cc->krpcs.krpc_task = get_task_struct(tsk); + mmget(current->mm); + tsk->mm = current->mm; + tsk->active_mm = current->mm; + atomic_inc(¤t->files->count); + tsk->files = current->files; + } + } + + if (likely(tsk)) { + llist_add(&kreq->llist_link, &kreq->krpc->krpcs->req_llist); + wake_up_process(tsk); + return 0; + } + } + + kreq_submit(kreq); + + return 0; +} + static int pcs_krpc_abort(struct pcs_krpc *krpc) { struct krpc_req *kreq, *tmp; @@ -733,12 +820,23 @@ static long pcs_krpc_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case PCS_KRPC_IOC_SEND_MSG: { - struct pcs_krpc_ioc_sendmsg req; + struct krpc_req *kreq; - if (copy_from_user(&req, (void __user *)arg, sizeof(req))) + kreq = krpc_req_alloc(); + if (!kreq) + return -ENOMEM; + + if (copy_from_user(&kreq->iocmsg, (void __user *)arg, sizeof(kreq->iocmsg))) { + krpc_req_free(kreq); return -EFAULT; + } - res = pcs_krpc_ioctl_send_msg(krpc, &req); + kreq->krpc = pcs_krpc_get(krpc); + res = pcs_krpc_ioctl_send_msg(kreq); + if (res) { + pcs_krpc_put(krpc); + krpc_req_free(kreq); + } break; } case PCS_KRPC_IOC_RECV_MSG: { @@ -1068,7 +1166,8 @@ void pcs_krpcset_init(struct pcs_krpc_set *krpcs) INIT_LIST_HEAD(&krpcs->list); krpcs->nkrpc = 0; - + krpcs->krpc_task = NULL; + 
init_llist_head(&krpcs->req_llist); spin_lock_init(&krpcs->lock); } @@ -1094,6 +1193,10 @@ void pcs_krpcset_fini(struct pcs_krpc_set *krpcs) } spin_unlock(&krpcs->lock); + if (krpcs->krpc_task) { + kthread_stop(krpcs->krpc_task); + put_task_struct(krpcs->krpc_task); + } BUG_ON(!list_empty(&krpcs->list)); BUG_ON(krpcs->nkrpc != 0); } diff --git a/fs/fuse/kio/pcs/pcs_krpc.h b/fs/fuse/kio/pcs/pcs_krpc.h index c6b867b..8021b02 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.h +++ b/fs/fuse/kio/pcs/pcs_krpc.h @@ -36,7 +36,9 @@ struct pcs_krpc_set { struct list_head list; unsigned int nkrpc; - spinlock_t lock; + spinlock_t lock; + struct task_struct *krpc_task; + struct llist_head req_llist; }; enum { @@ -127,6 +129,9 @@ struct krpc_req { struct bio_vec data_bvecs[KRPC_MAX_DATA_PAGES]; struct krpc_completion completion; + + struct llist_node llist_link; + struct pcs_krpc_ioc_sendmsg iocmsg; }; static inline u32 pcs_krpc_msg_size(u32 size, u8 flags) -- 1.8.3.1 From kuznet at virtuozzo.com Fri Mar 28 15:01:13 2025 From: kuznet at virtuozzo.com (Alexey Kuznetsov) Date: Fri, 28 Mar 2025 20:01:13 +0800 Subject: [Devel] [PATCH VZ9 7/7] fs/fuse/krpc: prevent krpc request from crossing reconnect Message-ID: Do it with already existing generation id. kreq stores genid of connection at time of enqueue, and if it races with reconnect, it is rejected. 
Signed-off-by: Alexey Kuznetsov --- fs/fuse/kio/pcs/pcs_krpc.c | 5 ++++- fs/fuse/kio/pcs/pcs_krpc.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c index a0e0799..786cef2 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.c +++ b/fs/fuse/kio/pcs/pcs_krpc.c @@ -581,7 +581,8 @@ static int kreq_make_sendmsg(struct krpc_req *kreq) msg->get_iter = krpc_msg_get_data; spin_lock(&krpc->lock); - if (krpc->state != PCS_KRPC_STATE_CONNECTED) { + if (krpc->state != PCS_KRPC_STATE_CONNECTED || + krpc->gen != kreq->gen) { spin_unlock(&krpc->lock); res = -ECONNABORTED; goto err_free_data_chunk; @@ -826,6 +827,8 @@ static long pcs_krpc_ioctl(struct file *file, unsigned int cmd, unsigned long ar if (!kreq) return -ENOMEM; + kreq->gen = ctx->gen; + if (copy_from_user(&kreq->iocmsg, (void __user *)arg, sizeof(kreq->iocmsg))) { krpc_req_free(kreq); return -EFAULT; diff --git a/fs/fuse/kio/pcs/pcs_krpc.h b/fs/fuse/kio/pcs/pcs_krpc.h index 8021b02..15d9f77 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.h +++ b/fs/fuse/kio/pcs/pcs_krpc.h @@ -130,6 +130,7 @@ struct krpc_req { struct krpc_completion completion; + u32 gen; struct llist_node llist_link; struct pcs_krpc_ioc_sendmsg iocmsg; }; -- 1.8.3.1 From khorenko at virtuozzo.com Fri Mar 28 23:50:58 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:50:58 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse/kio: correct return of error from io backed to rpc In-Reply-To: Message-ID: <202503282050.52SKowpB2711603@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit 1ac58e6cf72f21a007c1032d73e7ce7659836426 Author: Alexey Kuznetsov Date: Fri Mar 28 20:00:42 2025 +0800 fs/fuse/kio: correct return of error from io backed to rpc Backport from user space. The bug is unlikely to see in kernel and we have never observe it. 
Yet, it is better to keep this place clean. Cloning comment from corresponding user space commit: We used to have an ugly problem there, when ->done is called from IO backend which is not aware about rpc logic it considered all the errors as local. The damage at least on client->cs rpc users is severe, local errors are considered not inflicted by failing cluster neighbors, but blamed on local host, so that cluster recovery process is not triggered. It exposed itself big time on KRPC (in user space), but it can be dangerous for plain sock/rdma backends too. The bug is ancient, it was present since day zero, but we never noticed it, because errors of this kind are very rare with TCP: write is non-blocking as rule, we must have filled sndbuf when the socket aborts and to lose error we should not have any uncompleted rpc requests, as error for them will trigger correct path. Not easy to see the bug, yet possible. Let's not overthink this, it is enough to fix the issue in client-cs path, which can be done with special kludge in cs_sent. NOTE: special ugly exception for PCS_ERR_NOMEM is inherited from older pcs_set_rpc_error which in fact serves the same function in rpc context. 
Affects: #VSTOR-100586 https://virtuozzo.atlassian.net/browse/VSTOR-100586 Signed-off-by: Alexey Kuznetsov Feature: vStorage --- fs/fuse/kio/pcs/pcs_cs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c index 07075759a658..ad398acb03ef 100644 --- a/fs/fuse/kio/pcs/pcs_cs.c +++ b/fs/fuse/kio/pcs/pcs_cs.c @@ -551,6 +551,10 @@ static void cs_sent(struct pcs_msg *msg) { msg->done = cs_response_done; if (pcs_if_error(&msg->error)) { + if (msg->rpc && !msg->error.remote && msg->error.value != PCS_ERR_NOMEM) { + msg->error.remote = 1; + msg->error.offender = msg->rpc->peer_id; + } msg->done(msg); return; } From khorenko at virtuozzo.com Fri Mar 28 23:50:59 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:50:59 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse/kio: some pages were not unlocked while revoke In-Reply-To: Message-ID: <202503282050.52SKoxeC2711674@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit 06d704548a63f5afc73fe4915f90674f98c77656 Author: Alexey Kuznetsov Date: Fri Mar 28 20:00:47 2025 +0800 fs/fuse/kio: some pages were not unlocked while revoke Request migrates from fiq to fpq for awhile it is out of revocation lists. So, we must recheck revoke status. The bug is old. 
Affects: #VSTOR-100953 https://virtuozzo.atlassian.net/browse/VSTOR-100953 Signed-off-by: Alexey Kuznetsov Feature: vStorage --- fs/fuse/dev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 118613f17b10..b437b8de19bb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1421,6 +1421,15 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto out_end; } + if (args->page_cache && args->inode) { + struct fuse_inode *fi = get_fuse_inode(args->inode); + + if (test_bit(FUSE_I_INVAL_FILES, &fi->state) || args->killed) { + req->out.h.error = -EIO; + err = -EAGAIN; + goto out_end; + } + } list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; From khorenko at virtuozzo.com Fri Mar 28 23:51:00 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:51:00 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse: keep req page refcnt sane In-Reply-To: Message-ID: <202503282051.52SKp0hJ2711745@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit b191d0a510fbfcdf9b5a8c9d792a5b924ad6b48d Author: Alexey Kuznetsov Date: Fri Mar 28 20:00:53 2025 +0800 fs/fuse: keep req page refcnt sane It was _our_ ancient patch, fb7ae3cf4ca1e052335b94b86b9f43f09b9740f0, subj: "fuse: fuse_prepare_write() cannot handle page from killed request" The patch was entirely crazy, I would veto it a moment I have seen it, but obviously I was distracted and this abomination entered the kernel from behind. :-) So, do not mangle original queued request, only the requestor is allowed do this. Do not put pages hold by original request, only the requestor may do this. Yet, keep unlocking pages with proper error status, this is dirty and requires attention from the requestor to avoid double unlock, yet we do not have any alternative solution. 
Also, an ancient page leakage has been found, which was difficult to detect, since invalidate_inode_pages2 forcably detached pages from page cache and they live in dormant state. Fixes: fb7ae3cf4ca1 ("fuse: fuse_prepare_write() cannot handle page from killed request") Affects: #VSTOR-100953 https://virtuozzo.atlassian.net/browse/VSTOR-100953 Signed-off-by: Alexey Kuznetsov Feature: vStorage --- fs/fuse/file.c | 31 +++++++++++++++---------------- fs/fuse/fuse_i.h | 5 +---- fs/fuse/inode.c | 5 ----- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4b701af5a205..cfc5da819438 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1115,7 +1115,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, } static int fuse_do_readpage(struct file *file, struct page *page, - bool page_needs_release, bool *killed_p) + bool *killed_p) { struct inode *inode = page->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); @@ -1128,7 +1128,6 @@ static int fuse_do_readpage(struct file *file, struct page *page, .ap.pages = &page, .ap.descs = &desc, .ap.args.page_cache = 1, - .ap.args.page_needs_release = page_needs_release, }; ssize_t res; u64 attr_ver; @@ -1179,7 +1178,7 @@ static int fuse_read_folio(struct file *file, struct folio *folio) if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page, false, &killed); + err = fuse_do_readpage(file, page, &killed); fuse_invalidate_atime(inode); out: if (!killed) @@ -1207,14 +1206,15 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; + int killed = args->killed; struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; struct inode *inode = args->inode; - if (args->killed) - goto killed; + if (unlikely(killed)) + err = -EIO; /* * Short read means EOF. 
If file size is larger, truncate it @@ -1225,14 +1225,15 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, for (i = 0; i < ap->num_pages; i++) { struct page *page = ap->pages[i]; - if (!err) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); + if (likely(!killed)) { + if (!err) + SetPageUptodate(page); + else + SetPageError(page); + unlock_page(page); + } put_page(page); } -killed: fuse_invalidate_atime(inode); if (ia->ff) @@ -1255,7 +1256,6 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) ap->args.page_zeroing = true; ap->args.page_replace = true; ap->args.page_cache = 1; - ap->args.page_needs_release = false; /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { @@ -2732,7 +2732,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, zero_user_segment(page, 0, off); goto success; } - err = fuse_do_readpage(file, page, true, &killed); + err = fuse_do_readpage(file, page, &killed); if (err) goto cleanup; success: @@ -2740,10 +2740,9 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; cleanup: - if (!killed) { + if (!killed) unlock_page(page); - put_page(page); - } + put_page(page); error: return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 35488283cea3..28c495cb2a46 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -324,10 +324,7 @@ struct fuse_args { /** Request contains pages from page-cache */ unsigned page_cache:1; - /** Request pages need page_cache_release() */ - unsigned page_needs_release:1; - - /** Request was killed -- pages were released */ + /** Request was killed -- pages were unlocked */ unsigned killed:1; struct inode *io_inode; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index a22e0ffb3a8f..82baa1d0b800 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -502,12 +502,7 @@ void fuse_kill_requests(struct fuse_conn *fc, struct inode *inode, struct page *page = 
ia->ap.pages[i]; SetPageError(page); unlock_page(page); - if (req->args->page_needs_release) - put_page(page); - ia->ap.pages[i] = NULL; } - - ia->ap.num_pages = 0; } } EXPORT_SYMBOL_GPL(fuse_kill_requests); From khorenko at virtuozzo.com Fri Mar 28 23:51:00 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:51:00 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse/kio: keep kio requests on revoke list at all times In-Reply-To: Message-ID: <202503282051.52SKp02E2711816@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit dd6421ab35d0cf479033061b28f055404d7a16c7 Author: Alexey Kuznetsov Date: Fri Mar 28 20:00:58 2025 +0800 fs/fuse/kio: keep kio requests on revoke list at all times Before kio request is submitted downstream is it off revoke list. This was a bug, but it was directly required by bogus revoke logic which stole page cache pages from requests. So, we processed pages while the req was isolated from revocation, which would result in deadlock. Now we keep pages accounted as they should, so we can move adding to revocation list as early as possible. Note: also this means requests queued for expand/truncate were invisible to pending request list, now we fix this. Note2: we should consider adding request to revocation list immediately when it is created and becomes parsable. This would make fuse_invalidate_files() a lot simpler and less bug prone, yet there is danger to clash with place where abort is skipped intentionally due to being of the lists. This work is already done, but it is still not in a state appropriate for release. 
Affects: #VSTOR-100953 https://virtuozzo.atlassian.net/browse/VSTOR-100953 Signed-off-by: Alexey Kuznetsov Feature: vStorage --- fs/fuse/kio/pcs/pcs_fuse_kdirect.c | 44 +++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c index e3049ddaa091..1b3bc8f45563 100644 --- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c +++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c @@ -849,14 +849,18 @@ static bool kqueue_insert(struct pcs_dentry_info *di, struct fuse_req *req) return true; } +static void kqueue_remove(struct pcs_dentry_info *di, struct fuse_req *req) +{ + spin_lock(&di->kq_lock); + list_del_init(&req->list); + spin_unlock(&di->kq_lock); +} + static inline int req_wait_grow_queue(struct pcs_fuse_req *r, off_t offset, size_t size) { struct pcs_dentry_info *di = get_pcs_inode(r->req.args->io_inode); struct fuse_inode *fi = get_fuse_inode(r->req.args->io_inode); - if (!kqueue_insert(di, &r->req)) - return -EIO; - BUG_ON(r->req.in.h.opcode != FUSE_WRITE && r->req.in.h.opcode != FUSE_FALLOCATE); fuse_write_dio_begin(fi); @@ -980,9 +984,10 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r) BUG(); } - if (!kqueue_insert(di, req)) + if (req->args->ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->args->ff->ff_state)) ret = -EIO; - else if (req->in.h.opcode == FUSE_READ || req->in.h.opcode == FUSE_FSYNC || req->in.h.opcode == FUSE_FLUSH) + else if (req->in.h.opcode == FUSE_READ || req->in.h.opcode == FUSE_FSYNC || + req->in.h.opcode == FUSE_FLUSH) fuse_read_dio_begin(fi); else fuse_write_dio_begin(fi); @@ -992,14 +997,13 @@ static int pcs_fuse_prep_rw(struct pcs_fuse_req *r) return ret; } -static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) +static int pcs_fuse_submit_prepare(struct pcs_fuse_cluster *pfc, struct fuse_req *req) { struct pcs_fuse_req *r = pcs_req_from_fuse(req); struct fuse_args *args = req->args; struct fuse_inode *fi = 
get_fuse_inode(args->io_inode); struct pcs_dentry_info *di = pcs_inode_from_fuse(fi); - struct pcs_int_request* ireq; - int ret; + int ret = 0; BUG_ON(!di); BUG_ON(req->cache != pcs_fuse_req_cachep); @@ -1007,8 +1011,21 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) /* Init pcs_fuse_req */ memset(&r->exec, 0, sizeof(r->exec)); /* Use inline request structure */ - ireq = &r->exec.ireq; - ireq_init(di, ireq); + ireq_init(di, &r->exec.ireq); + + spin_lock(&di->lock); + if (!kqueue_insert(di, req)) + ret = -EIO; + spin_unlock(&di->lock); + return ret; +} + +static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) +{ + struct pcs_fuse_req *r = pcs_req_from_fuse(req); + struct pcs_int_request *ireq = &r->exec.ireq; + struct pcs_dentry_info *di = ireq->dentry; + int ret; switch (req->in.h.opcode) { case FUSE_WRITE: @@ -1026,7 +1043,7 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) } break; case FUSE_FALLOCATE: { - struct fuse_fallocate_in *inarg = (void*) args->in_args[0].value; + struct fuse_fallocate_in *inarg = (void *)req->args->in_args[0].value; size_t sz = READ_ONCE(di->fileinfo.attr.size); if (pfc->fc->no_fallocate) { @@ -1062,7 +1079,7 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) * and as i_size is still not advanced all the following ones are. 
*/ WARN_ON_ONCE(inarg->offset + inarg->length > sz && - !inode_is_locked(&fi->inode)); + !inode_is_locked(req->args->io_inode)); } ret = pcs_fuse_prep_rw(r); @@ -1097,6 +1114,7 @@ static void pcs_fuse_submit(struct pcs_fuse_cluster *pfc, struct fuse_req *req) error: DTRACE("do fuse_request_end req:%p op:%d err:%d\n", req, req->in.h.opcode, req->out.h.error); + kqueue_remove(di, req); __fuse_request_end(req, false); return; @@ -1320,6 +1338,8 @@ static void kpcs_req_send(struct fuse_req *req, bool bg) refcount_inc(&req->count); __clear_bit(FR_PENDING, &req->flags); + pcs_fuse_submit_prepare(pfc, req); + pcs_fuse_submit(pfc, req); if (!bg) wait_event(req->waitq, From khorenko at virtuozzo.com Fri Mar 28 23:51:01 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:51:01 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse/kio: tidy up RPC_AFFINITY_RSS In-Reply-To: Message-ID: <202503282051.52SKp1vR2711887@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit 44b11631cff26667d0d2d63f1e3dd8cb3394d43a Author: Alexey Kuznetsov Date: Fri Mar 28 20:01:03 2025 +0800 fs/fuse/kio: tidy up RPC_AFFINITY_RSS Also, enable it for rdma (tested with mellanox) and unix sockets. Now it provides essentially perfect affinity when socket contexts never hit lock contention and cache bouncing provided RSS and XPS are configured correctly. Change fallback when rx_cpu is not available from RPC_AFFINITY_RETENT to RPC_AFFINITY_FAIR_SPREAD. Unfortunatley, we cannot enable it by default, since enabling RSS/XPS is an advanced performance tuning. 
Also, change fallback when rx_cpu is unknown from RPC_AFFINITY_RETENT to RPC_AFFINITY_FAIR_SPREAD Signed-off-by: Alexey Kuznetsov Feature: vStorage --- fs/fuse/kio/pcs/pcs_rdma_io.c | 1 + fs/fuse/kio/pcs/pcs_rpc.c | 25 ++++++++++++++++--------- fs/fuse/kio/pcs/pcs_rpc.h | 1 + fs/fuse/kio/pcs/pcs_sock_io.c | 5 ++--- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_rdma_io.c b/fs/fuse/kio/pcs/pcs_rdma_io.c index 2755b13fb8a5..d50f2c1e97e3 100644 --- a/fs/fuse/kio/pcs/pcs_rdma_io.c +++ b/fs/fuse/kio/pcs/pcs_rdma_io.c @@ -1096,6 +1096,7 @@ static void pcs_rdma_cq_comp_handler(struct ib_cq *cq, void *private) set_bit(PCS_RDMA_IO_CQE, &rio->io_flags); wake_up(&rio->waitq); + ep->rx_cpu = smp_processor_id(); pcs_rpc_kick_queue(ep); } diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c index b9774ce1ab34..71c2a3b54da7 100644 --- a/fs/fuse/kio/pcs/pcs_rpc.c +++ b/fs/fuse/kio/pcs/pcs_rpc.c @@ -339,6 +339,7 @@ void pcs_rpc_attach_new_ep(struct pcs_rpc * ep, struct pcs_rpc_engine * eng) atomic_set(&ep->netlat_cnt, 0); atomic64_set(&ep->netlat_avg, 0); ep->cpu = WORK_CPU_UNBOUND; + ep->rx_cpu = WORK_CPU_UNBOUND; ep->gc = NULL; if (eng->max_gc_index) @@ -863,27 +864,33 @@ static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle) ep->cpu = WORK_CPU_UNBOUND; } break; - case RPC_AFFINITY_RSS: - if (!(ep->flags & PCS_RPC_F_LOCAL) && ep->addr.type != PCS_ADDRTYPE_RDMA) - break; + case RPC_AFFINITY_RSS: { + int rx_cpu = READ_ONCE(ep->rx_cpu); + + if (rx_cpu != WORK_CPU_UNBOUND && ep->cpu != rx_cpu) + ep->cpu = rx_cpu; fallthrough; + } + case RPC_AFFINITY_FAIR_SPREAD: + if (ep->cpu == WORK_CPU_UNBOUND || + (time_is_before_jiffies(ep->cpu_stamp) && was_idle)) + pcs_rpc_cpu_select(ep); + break; case RPC_AFFINITY_RETENT: /* Naive socket-to-cpu binding approach */ - if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) { + if (ep->cpu == WORK_CPU_UNBOUND || + (time_is_before_jiffies(ep->cpu_stamp) && was_idle)) { ep->cpu_stamp = 
jiffies + rpc_cpu_time_slice; ep->cpu = smp_processor_id(); } break; case RPC_AFFINITY_SPREAD: - if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) { + if (ep->cpu == WORK_CPU_UNBOUND || + (time_is_before_jiffies(ep->cpu_stamp) && was_idle)) { ep->cpu_stamp = jiffies + rpc_cpu_time_slice; ep->cpu = pcs_rpc_cpu_next(); } break; - case RPC_AFFINITY_FAIR_SPREAD: - if (time_is_before_jiffies(ep->cpu_stamp) && was_idle) - pcs_rpc_cpu_select(ep); - break; default: pr_err("Unknown affinity mode: %u\n", rpc_affinity_mode); } diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h index cb18557a3da5..0bafc8a74263 100644 --- a/fs/fuse/kio/pcs/pcs_rpc.h +++ b/fs/fuse/kio/pcs/pcs_rpc.h @@ -142,6 +142,7 @@ struct pcs_rpc int cpu; unsigned long cpu_stamp; struct delayed_work cpu_timer_work; /* reset cpu affinity after being idle */ + int rx_cpu; struct mutex mutex; u64 accounted; diff --git a/fs/fuse/kio/pcs/pcs_sock_io.c b/fs/fuse/kio/pcs/pcs_sock_io.c index 7c62f483ea45..805b8f1e56b0 100644 --- a/fs/fuse/kio/pcs/pcs_sock_io.c +++ b/fs/fuse/kio/pcs/pcs_sock_io.c @@ -561,9 +561,8 @@ static void pcs_sk_kick_queue(struct sock *sk) sio = rcu_dereference_sk_user_data(sk); if (sio) { struct pcs_rpc *ep = sio->netio.parent; - TRACE(PEER_FMT" queue cpu=%d\n", PEER_ARGS(ep), smp_processor_id()); - if (rpc_affinity_mode == RPC_AFFINITY_RSS && !(ep->flags & PCS_RPC_F_LOCAL)) - ep->cpu = smp_processor_id(); + DTRACE(PEER_FMT" queue cpu=%d\n", PEER_ARGS(ep), smp_processor_id()); + ep->rx_cpu = smp_processor_id(); pcs_rpc_kick_queue(ep); } rcu_read_unlock(); From khorenko at virtuozzo.com Fri Mar 28 23:51:02 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:51:02 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse/kio: create krpc request in special thread In-Reply-To: Message-ID: <202503282051.52SKp2rs2711958@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at 
bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit 55d45425282dd8e1b1a126fe5ebcc2b7c0177c5d Author: Alexey Kuznetsov Date: Fri Mar 28 20:01:08 2025 +0800 fs/fuse/kio: create krpc request in special thread Overhead of mapping rpc request is pretty high. And in cases when vstorage-mount even loop is saturated it makes sense to create shadow kernel thread which mm/files shared with user space. This is one series of patches (others are in user space), which increase raid read iops more than twice. It is en/disabled with module parameter "pcs_krpc_use_thread", which can be tuned at run time. Additionally, the patch fixes some old bugs of various fatality found during development and testing: buffer overflow and wrong error code returned. This part is the reason we push the patch to release, it is too intertangled with new request processing to be considered standalone. The patch is combo of two patches, merged together because the second one moves chunks of code around and it does not make sense to know about these bowel movent. Comment from the second patch: fs/fuse kio: properly return errors from sendmsg over kRPC Unifies the return of errors in sendmsg by passing the error as a result of krpc request completion. The error will then by returned to userpsace in recvmsg. However a linux error will be returned as return value of ioctl call, whereas a pcs error will be returned as the result of recvmsg. 
Signed-off-by: Alexey Kuznetsov Signed-off-by: Liu Kui Feature: vStorage --- fs/fuse/kio/pcs/pcs_krpc.c | 145 ++++++++++++++++++++++++++++++++++++++------- fs/fuse/kio/pcs/pcs_krpc.h | 7 ++- 2 files changed, 130 insertions(+), 22 deletions(-) diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c index 323205a3e2df..58a9ceebfee2 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.c +++ b/fs/fuse/kio/pcs/pcs_krpc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "pcs_types.h" @@ -24,6 +25,10 @@ unsigned int pcs_krpc_csaccel; module_param(pcs_krpc_csaccel, uint, 0644); MODULE_PARM_DESC(pcs_krpc_csaccel, "Enable krpc local cs bypass"); +unsigned int pcs_krpc_use_thread = 1; +module_param(pcs_krpc_use_thread, uint, 0644); +MODULE_PARM_DESC(pcs_krpc_use_thread, "Offload creating the request to a thread"); + extern unsigned int pcs_krpc_version; struct kmem_cache *krpc_req_cachep; @@ -82,7 +87,8 @@ static void krpc_req_complete(struct krpc_req *kreq, int error) comp->result = error; kreq_release_data_chunks(kreq); - pcs_mr_put(kreq->hdr_chunk.mr); + if (kreq->hdr_chunk.mr) + pcs_mr_put(kreq->hdr_chunk.mr); spin_lock(&krpc->lock); list_del(&kreq->link); @@ -292,7 +298,10 @@ static int pcs_krpc_ioctl_recv_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_re if (copy_to_user((void __user *)iocmsg->buf.addr, comp->_data_buf, comp->data_len)) res = -EFAULT; } else { /* response */ - iocmsg->result = comp->result; + if (comp->result >= 0) + iocmsg->result = comp->result; + else + res = comp->result; /* internal error */ } krpc_completion_free(comp); @@ -449,25 +458,23 @@ static int try_local_bypass(struct pcs_krpc *krpc, struct krpc_req *kreq) return 0; } -static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_sendmsg *iocmsg) +static int kreq_make_sendmsg(struct krpc_req *kreq) { - struct krpc_req *kreq; + struct pcs_krpc *krpc = kreq->krpc; struct pcs_msg *msg; struct pcs_krpc_buf_desc *chunk_bd; struct krpc_chunk *chunk; int res, 
i; struct bio_vec *bvec; + struct pcs_krpc_ioc_sendmsg *iocmsg; - kreq = krpc_req_alloc(); - if (!kreq) - return -ENOMEM; + iocmsg = &kreq->iocmsg; if (iocmsg->nr_data_chunks > NR_KRPC_DATA_CHUNKS_INLINE) { - kreq->data_chunks = kzalloc(iocmsg->nr_data_chunks, GFP_NOIO); - if (!kreq->data_chunks) { - res = -ENOMEM; - goto err_free_kreq; - } + kreq->data_chunks = kcalloc(iocmsg->nr_data_chunks, sizeof(struct krpc_chunk), + GFP_NOIO); + if (!kreq->data_chunks) + return -ENOMEM; } else { kreq->data_chunks = &kreq->inline_data_chunks[0]; } @@ -536,7 +543,7 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se chunk->addr = chunk_bdzc->offset; chunk->req = fuse_dev_find_request(chunk_bdzc->devfd, chunk_bdzc->unique); if (!chunk->req || chunk->req->args->killed) { - res = PCS_ERR_NET; + res = PCS_ERR_INV_PARAMS; goto err_free_data_chunk; } break; @@ -580,7 +587,6 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se goto err_free_data_chunk; } atomic_inc(&krpc->iocount); - kreq->krpc = pcs_krpc_get(krpc); list_add_tail(&kreq->link, &krpc->pending_queue); spin_unlock(&krpc->lock); @@ -600,12 +606,93 @@ static int pcs_krpc_ioctl_send_msg(struct pcs_krpc *krpc, struct pcs_krpc_ioc_se kreq_release_data_chunks(kreq); if (kreq->hdr_chunk.mr) pcs_mr_put(kreq->hdr_chunk.mr); - -err_free_kreq: - krpc_req_free(kreq); return res; } +static void kreq_submit(struct krpc_req *kreq) +{ + int res; + + res = kreq_make_sendmsg(kreq); + if (res) { + kreq->data_chunks = &kreq->inline_data_chunks[0]; + kreq->data_len = 0; + kreq->nr_data_chunks = 0; + kreq->nr_data_bvecs = 0; + kreq->hdr_chunk.mr = NULL; + kreq->completion.xid = kreq->iocmsg.xid; + kreq->completion.private = kreq; + INIT_LIST_HEAD(&kreq->link); + + krpc_req_complete(kreq, res); + } +} + +static int krpc_threadfn(void *data) +{ + struct pcs_krpc_set *krpcs = data; + + for (;;) { + struct llist_node *ll; + + set_current_state(TASK_INTERRUPTIBLE); + + ll = 
llist_del_all(&krpcs->req_llist); + + if (ll == NULL) { + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + schedule(); + continue; + } + + __set_current_state(TASK_RUNNING); + + while (ll) { + struct llist_node *next = ll->next; + struct krpc_req *kreq = container_of(ll, struct krpc_req, llist_link); + + kreq_submit(kreq); + + ll = next; + } + } +} + +static int pcs_krpc_ioctl_send_msg(struct krpc_req *kreq) +{ + struct task_struct *tsk; + struct pcs_cluster_core *cc; + + if (pcs_krpc_use_thread) { + cc = container_of(kreq->krpc->krpcs, struct pcs_cluster_core, krpcs); + tsk = cc->krpcs.krpc_task; + if (unlikely(tsk == NULL)) { + tsk = kthread_create(krpc_threadfn, &cc->krpcs, "krpc_send"); + if (tsk && !IS_ERR(tsk)) { + cc->krpcs.krpc_task = get_task_struct(tsk); + mmget(current->mm); + tsk->mm = current->mm; + tsk->active_mm = current->mm; + atomic_inc(¤t->files->count); + tsk->files = current->files; + } + } + + if (likely(tsk)) { + llist_add(&kreq->llist_link, &kreq->krpc->krpcs->req_llist); + wake_up_process(tsk); + return 0; + } + } + + kreq_submit(kreq); + + return 0; +} + static int pcs_krpc_abort(struct pcs_krpc *krpc) { struct krpc_req *kreq, *tmp; @@ -733,12 +820,23 @@ static long pcs_krpc_ioctl(struct file *file, unsigned int cmd, unsigned long ar switch (cmd) { case PCS_KRPC_IOC_SEND_MSG: { - struct pcs_krpc_ioc_sendmsg req; + struct krpc_req *kreq; - if (copy_from_user(&req, (void __user *)arg, sizeof(req))) + kreq = krpc_req_alloc(); + if (!kreq) + return -ENOMEM; + + if (copy_from_user(&kreq->iocmsg, (void __user *)arg, sizeof(kreq->iocmsg))) { + krpc_req_free(kreq); return -EFAULT; + } - res = pcs_krpc_ioctl_send_msg(krpc, &req); + kreq->krpc = pcs_krpc_get(krpc); + res = pcs_krpc_ioctl_send_msg(kreq); + if (res) { + pcs_krpc_put(krpc); + krpc_req_free(kreq); + } break; } case PCS_KRPC_IOC_RECV_MSG: { @@ -1068,7 +1166,8 @@ void pcs_krpcset_init(struct pcs_krpc_set *krpcs) INIT_LIST_HEAD(&krpcs->list); krpcs->nkrpc 
= 0; - + krpcs->krpc_task = NULL; + init_llist_head(&krpcs->req_llist); spin_lock_init(&krpcs->lock); } @@ -1094,6 +1193,10 @@ void pcs_krpcset_fini(struct pcs_krpc_set *krpcs) } spin_unlock(&krpcs->lock); + if (krpcs->krpc_task) { + kthread_stop(krpcs->krpc_task); + put_task_struct(krpcs->krpc_task); + } BUG_ON(!list_empty(&krpcs->list)); BUG_ON(krpcs->nkrpc != 0); } diff --git a/fs/fuse/kio/pcs/pcs_krpc.h b/fs/fuse/kio/pcs/pcs_krpc.h index c6b867b5fa75..8021b0262560 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.h +++ b/fs/fuse/kio/pcs/pcs_krpc.h @@ -36,7 +36,9 @@ struct pcs_krpc_set { struct list_head list; unsigned int nkrpc; - spinlock_t lock; + spinlock_t lock; + struct task_struct *krpc_task; + struct llist_head req_llist; }; enum { @@ -127,6 +129,9 @@ struct krpc_req { struct bio_vec data_bvecs[KRPC_MAX_DATA_PAGES]; struct krpc_completion completion; + + struct llist_node llist_link; + struct pcs_krpc_ioc_sendmsg iocmsg; }; static inline u32 pcs_krpc_msg_size(u32 size, u8 flags) From khorenko at virtuozzo.com Fri Mar 28 23:51:02 2025 From: khorenko at virtuozzo.com (Konstantin Khorenko) Date: Fri, 28 Mar 2025 21:51:02 +0100 Subject: [Devel] [PATCH RHEL9 COMMIT] fs/fuse/krpc: prevent krpc request from crossing reconnect In-Reply-To: Message-ID: <202503282051.52SKp2Lh2712029@f0.sw.ru> The commit is pushed to "branch-rh9-5.14.0-427.44.1.vz9.80.x-ovz" and will appear at git at bitbucket.org:openvz/vzkernel.git after rh9-5.14.0-427.44.1.vz9.80.23 ------> commit f1b3519cb9cb2dcac93a92d9ae362cfdd740f4d2 Author: Alexey Kuznetsov Date: Fri Mar 28 20:01:13 2025 +0800 fs/fuse/krpc: prevent krpc request from crossing reconnect Do it with already existing generation id. kreq stores genid of connection at time of enqueue, and if it races with reconnect, it is rejected. 
Signed-off-by: Alexey Kuznetsov Feature: fuse: kRPC - single RPC for kernel and userspace --- fs/fuse/kio/pcs/pcs_krpc.c | 5 ++++- fs/fuse/kio/pcs/pcs_krpc.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/fuse/kio/pcs/pcs_krpc.c b/fs/fuse/kio/pcs/pcs_krpc.c index 58a9ceebfee2..28c6104d7dc3 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.c +++ b/fs/fuse/kio/pcs/pcs_krpc.c @@ -581,7 +581,8 @@ static int kreq_make_sendmsg(struct krpc_req *kreq) msg->get_iter = krpc_msg_get_data; spin_lock(&krpc->lock); - if (krpc->state != PCS_KRPC_STATE_CONNECTED) { + if (krpc->state != PCS_KRPC_STATE_CONNECTED || + krpc->gen != kreq->gen) { spin_unlock(&krpc->lock); res = -ECONNABORTED; goto err_free_data_chunk; @@ -826,6 +827,8 @@ static long pcs_krpc_ioctl(struct file *file, unsigned int cmd, unsigned long ar if (!kreq) return -ENOMEM; + kreq->gen = ctx->gen; + if (copy_from_user(&kreq->iocmsg, (void __user *)arg, sizeof(kreq->iocmsg))) { krpc_req_free(kreq); return -EFAULT; diff --git a/fs/fuse/kio/pcs/pcs_krpc.h b/fs/fuse/kio/pcs/pcs_krpc.h index 8021b0262560..15d9f77aa401 100644 --- a/fs/fuse/kio/pcs/pcs_krpc.h +++ b/fs/fuse/kio/pcs/pcs_krpc.h @@ -130,6 +130,7 @@ struct krpc_req { struct krpc_completion completion; + u32 gen; struct llist_node llist_link; struct pcs_krpc_ioc_sendmsg iocmsg; };