[Devel] [PATCH vz9/vz10] dm-ploop: fallback to kvmalloc for large bvec allocations
Vasileios Almpanis
vasileios.ampanis at virtuozzo.com
Wed Oct 22 16:35:44 MSK 2025
Yes, it's exactly as you put it.
On 10/22/25 3:34 PM, Pavel Tikhomirov wrote:
>
>
> On 10/22/25 20:38, Alexey Kuznetsov wrote:
>> Hello!
>>
>> Beware, it used GFP_ATOMIC. Doesn't this mean this code can be
>> executed in interrupt context?
>> If so, then kvmalloc is a strict no.
>
> The idea was if we require high order allocation (when we create bvec
> from rq) in interrupt (I guess it is ploop_clone_and_map() path)
> instead of just failing, as it was before this patch, we put pio to a
> "to be handled later" list (see ploop_prepare_one_embedded_pio). And
> we handle allocation for this pio in ploop kernel threads, which
> already run in non-atomic context and can use kvmalloc safely.
>
>>
>> On Wed, Oct 22, 2025 at 8:11 PM Vasileios Almpanis
>> <vasileios.almpanis at virtuozzo.com> wrote:
>>>
>>> When handling multiple concurrent dm-ploop requests, large bio_vec
>>> arrays
>>> can be allocated during request processing. These allocations are
>>> currently
>>> done with kmalloc_array(GFP_ATOMIC), which can fail under memory
>>> pressure
>>> for higher orders (order >= 6, ~256KB). Such failures result in
>>> partial or
>>> corrupted I/O, leading to EXT4 directory checksum errors and read-only
>>> remounts under heavy parallel workloads.
>>>
>>> This patch adds a fallback mechanism to use kvmalloc_array for
>>> large or failed allocations: if the estimated allocation order is >=
>>> 6, or
>>> if the kmalloc_array allocation fails, the allocation is retried
>>> later with kvmalloc_array. This avoids high-order
>>> GFP_ATOMIC
>>> allocations from interrupt context and ensures more reliable memory
>>> allocation
>>> behavior.
>>>
>>> https://virtuozzo.atlassian.net/browse/VSTOR-109595
>>> Signed-off-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>
>>> Feature: dm-ploop: ploop target driver
>>> ---
>>> drivers/md/dm-ploop-map.c | 46
>>> ++++++++++++++++++++++++++++++---------
>>> drivers/md/dm-ploop.h | 1 +
>>> 2 files changed, 37 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
>>> index 3fb841f8bcea..899b9bf088b3 100644
>>> --- a/drivers/md/dm-ploop-map.c
>>> +++ b/drivers/md/dm-ploop-map.c
>>> @@ -16,6 +16,7 @@
>>> #include <linux/error-injection.h>
>>> #include <linux/uio.h>
>>> #include <linux/blk-mq.h>
>>> +#include <linux/mm.h>
>>> #include <uapi/linux/falloc.h>
>>> #include "dm-ploop.h"
>>> #include "dm-rq.h"
>>> @@ -89,6 +90,7 @@ void ploop_init_pio(struct ploop *ploop, unsigned
>>> int bi_op, struct pio *pio)
>>> pio->ref_index = PLOOP_REF_INDEX_INVALID;
>>> pio->queue_list_id = PLOOP_LIST_DEFERRED;
>>> pio->bi_status = BLK_STS_OK;
>>> + pio->use_kvmalloc = false;
>>> atomic_set(&pio->remaining, 1);
>>> pio->piwb = NULL;
>>> INIT_LIST_HEAD(&pio->list);
>>> @@ -193,8 +195,12 @@ static void ploop_prq_endio(struct pio *pio,
>>> void *prq_ptr,
>>> struct ploop_rq *prq = prq_ptr;
>>> struct request *rq = prq->rq;
>>>
>>> - if (prq->bvec)
>>> - kfree(prq->bvec);
>>> + if (prq->bvec) {
>>> + if (pio->use_kvmalloc)
>>> + kvfree(prq->bvec);
>>> + else
>>> + kfree(prq->bvec);
>>> + }
>>> if (prq->css)
>>> css_put(prq->css);
>>> /*
>>> @@ -1963,26 +1969,40 @@ void ploop_index_wb_submit(struct ploop
>>> *ploop, struct ploop_index_wb *piwb)
>>> ploop_runners_add_work(ploop, pio);
>>> }
>>>
>>> -static struct bio_vec *ploop_create_bvec_from_rq(struct request *rq)
>>> +static struct bio_vec *ploop_create_bvec_from_rq(struct request
>>> *rq, bool use_kvmalloc)
>>> {
>>> struct bio_vec bv, *bvec, *tmp;
>>> struct req_iterator rq_iter;
>>> unsigned int nr_bvec = 0;
>>> + unsigned int order = 0;
>>>
>>> rq_for_each_bvec(bv, rq, rq_iter)
>>> nr_bvec++;
>>>
>>> - bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>>> - GFP_ATOMIC);
>>> - if (!bvec)
>>> - goto out;
>>> + if (use_kvmalloc) {
>>> + bvec = kvmalloc_array(nr_bvec, sizeof(struct bio_vec),
>>> + GFP_NOIO);
>>> + if (!bvec)
>>> + return ERR_PTR(-ENOMEM);
>>> + } else {
>>> + order = get_order(nr_bvec * sizeof(struct bio_vec));
>>> + /*
>>> + * order 6 is 262144 bytes. Lets defer such big
>>> + * allocations to workqueue.
>>> + */
>>> + if (order >= 6)
>>> + return ERR_PTR(-EAGAIN);
>>> + bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>>> + GFP_ATOMIC | __GFP_NOWARN);
>>> + if (!bvec)
>>> + return ERR_PTR(-EAGAIN);
>>> + }
>>>
>>> tmp = bvec;
>>> rq_for_each_bvec(bv, rq, rq_iter) {
>>> *tmp = bv;
>>> tmp++;
>>> }
>>> -out:
>>> return bvec;
>>> }
>>> ALLOW_ERROR_INJECTION(ploop_create_bvec_from_rq, NULL);
>>> @@ -2003,9 +2023,15 @@ static void
>>> ploop_prepare_one_embedded_pio(struct ploop *ploop,
>>> * Transform a set of bvec arrays related to bios
>>> * into a single bvec array (which we can iterate).
>>> */
>>> - bvec = ploop_create_bvec_from_rq(rq);
>>> - if (!bvec)
>>> + bvec = ploop_create_bvec_from_rq(rq,
>>> pio->use_kvmalloc);
>>> + if (IS_ERR(bvec)) {
>>> + if (PTR_ERR(bvec) == -EAGAIN) {
>>> + pio->use_kvmalloc = true;
>>> + llist_add((struct llist_node
>>> *)(&pio->list), &ploop->pios[PLOOP_LIST_PREPARE]);
>>> + return;
>>> + }
>>> goto err_nomem;
>>> + }
>>> prq->bvec = bvec;
>>> skip_bvec:
>>> pio->bi_iter.bi_size = blk_rq_bytes(rq);
>>> diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
>>> index fc12efeb0cd9..53e8d12064bd 100644
>>> --- a/drivers/md/dm-ploop.h
>>> +++ b/drivers/md/dm-ploop.h
>>> @@ -316,6 +316,7 @@ struct pio {
>>> unsigned int ref_index:2;
>>>
>>> u8 queue_list_id; /* id in ploop->pios */
>>> + bool use_kvmalloc;
>>>
>>> struct ploop_index_wb *piwb;
>>>
>>> --
>>> 2.43.0
>>>
>>> _______________________________________________
>>> Devel mailing list
>>> Devel at openvz.org
>>> https://lists.openvz.org/mailman/listinfo/devel
>
More information about the Devel
mailing list