[Devel] [PATCH vz9/vz10] dm-ploop: fallback to kvmalloc for large bvec allocations
    Vasileios Almpanis 
    vasileios.almpanis at virtuozzo.com
       
    Wed Oct 22 18:25:50 MSK 2025
    
    
  
On 10/22/25 4:46 PM, Alexey Kuznetsov wrote:
> Hello!
>
> If it really does what you described, it is all right.
>
> But it does not look like it does. From what I see it really calls
> kvmalloc with GFP_ATOMIC.  No?
> GFP_ATOMIC of high order is death to the system, literally.
We want to keep performance and avoid rescheduling when the node has
plenty of RAM, so I refrained from putting a hard limit on allocations
done in atomic context.
>
> Why not to check order when on interrupt to go straight to workqueue
> instead of all the trickery?
>
> Moreover, IMHO block layer does _not_ use GFP_ATOMIC at all, unless it
> is backed by mempool,
> where it is not actually GFP_ATOMIC, but the thing which I described before.
> As I see dm ploop/qcow is the only device which abuses GFP_ATOMIC. Does not
> this smell fishy? :-)
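For reference, a rough sketch of the mempool pattern mentioned above (all
names here are illustrative, not ploop code): the pool is preallocated at
setup time, so the I/O path never needs a high-order GFP_ATOMIC allocation
to make forward progress.

#include <linux/mempool.h>
#include <linux/slab.h>

struct example_pio { char pad[128]; };          /* placeholder payload */

static struct kmem_cache *example_cache;
static mempool_t *example_pool;

static int example_pool_init(void)
{
        example_cache = KMEM_CACHE(example_pio, 0);
        if (!example_cache)
                return -ENOMEM;

        /* keep 16 elements in reserve so allocation can always make progress */
        example_pool = mempool_create_slab_pool(16, example_cache);
        if (!example_pool) {
                kmem_cache_destroy(example_cache);
                return -ENOMEM;
        }
        return 0;
}

static struct example_pio *example_pio_alloc(void)
{
        /*
         * GFP_NOIO instead of GFP_ATOMIC: mempool_alloc() may wait for an
         * element to come back to the pool, but it never needs a high-order
         * atomic allocation to succeed.
         */
        return mempool_alloc(example_pool, GFP_NOIO);
}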
>
> Also, the last version added a formal technical error:
>
> if (flags & GFP_ATOMIC)
>
> GFP_ATOMIC is not a flag, it is combo of many flags, which can in theory
> intersect wth everything
I replaced the flag check with gfpflags_allow_blocking.
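For illustration, a minimal sketch of the corrected check (the helper and
its name are made up for the example, not the exact patch code):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/bvec.h>

/*
 * GFP_ATOMIC is (__GFP_HIGH | __GFP_KSWAPD_RECLAIM), so "flags & GFP_ATOMIC"
 * also matches GFP_KERNEL/GFP_NOIO, which contain __GFP_KSWAPD_RECLAIM.
 * gfpflags_allow_blocking() tests __GFP_DIRECT_RECLAIM instead, which is
 * only set in masks that are allowed to sleep.
 */
static struct bio_vec *alloc_bvec_array(unsigned int nr_bvec, gfp_t flags)
{
        if (gfpflags_allow_blocking(flags))
                /* process context: kvmalloc may sleep and fall back to vmalloc */
                return kvmalloc_array(nr_bvec, sizeof(struct bio_vec), flags);

        /* atomic context: plain kmalloc only, no vmalloc fallback */
        return kmalloc_array(nr_bvec, sizeof(struct bio_vec), flags);
}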
>
> On Wed, Oct 22, 2025 at 9:41 PM Pavel Tikhomirov
> <ptikhomirov at virtuozzo.com> wrote:
>>
>>
>> On 10/22/25 20:38, Alexey Kuznetsov wrote:
>>> Hello!
>>>
>>> Beware, it used GFP_ATOMIC. Does not this mean this code can be
>>> executed in interrupt context?
>>> If so, then kvmalloc is a strict no.
>> The idea was if we require high order allocation (when we create bvec
>> from rq) in interrupt (I guess it is ploop_clone_and_map() path) instead
>> of just failing, as it was before this patch, we put pio to a "to be
>> handled later" list (see ploop_prepare_one_embedded_pio). And we handle
>> allocation for this pio in ploop kernel threads, which already run in
>> non-atomic context and can use kvmalloc safely.
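In generic terms, the pattern Pavel describes looks roughly like this (a
workqueue is used here for brevity instead of the ploop runner threads;
struct and function names are illustrative only):

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/bvec.h>

struct deferred_pio {
        struct work_struct work;
        unsigned int nr_bvec;
        struct bio_vec *bvec;
};

static void deferred_pio_worker(struct work_struct *work)
{
        struct deferred_pio *dp = container_of(work, struct deferred_pio, work);

        /* process context: kvmalloc_array() may sleep, vmalloc fallback is fine */
        dp->bvec = kvmalloc_array(dp->nr_bvec, sizeof(*dp->bvec), GFP_NOIO);
        /* ... continue preparing/submitting the pio, or fail it cleanly ... */
}

static void defer_large_bvec_alloc(struct deferred_pio *dp)
{
        /* called from atomic context: hand the allocation off instead of doing it here */
        INIT_WORK(&dp->work, deferred_pio_worker);
        queue_work(system_unbound_wq, &dp->work);
}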
>>
>>> On Wed, Oct 22, 2025 at 8:11 PM Vasileios Almpanis
>>> <vasileios.almpanis at virtuozzo.com> wrote:
>>>> When handling multiple concurrent dm-ploop requests, large bio_vec arrays
>>>> can be allocated during request processing. These allocations are currently
>>>> done with kmalloc_array(GFP_ATOMIC), which can fail under memory pressure
>>>> for higher orders (order >= 6, ~256KB). Such failures result in partial or
>>>> corrupted I/O, leading to EXT4 directory checksum errors and read-only
>>>> remounts under heavy parallel workloads.
>>>>
>>>> This patch adds a fallback to kvmalloc_array for large or failed
>>>> allocations: it is used when the estimated allocation order is >= 6 or
>>>> when the kmalloc_array allocation fails. This avoids high-order GFP_ATOMIC
>>>> allocations from interrupt context and makes memory allocation behavior
>>>> more reliable.
>>>>
>>>> https://virtuozzo.atlassian.net/browse/VSTOR-109595
>>>> Signed-off-by: Vasileios Almpanis <vasileios.almpanis at virtuozzo.com>
>>>> Feature: dm-ploop: ploop target driver
>>>> ---
>>>>    drivers/md/dm-ploop-map.c | 46 ++++++++++++++++++++++++++++++---------
>>>>    drivers/md/dm-ploop.h     |  1 +
>>>>    2 files changed, 37 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
>>>> index 3fb841f8bcea..899b9bf088b3 100644
>>>> --- a/drivers/md/dm-ploop-map.c
>>>> +++ b/drivers/md/dm-ploop-map.c
>>>> @@ -16,6 +16,7 @@
>>>>    #include <linux/error-injection.h>
>>>>    #include <linux/uio.h>
>>>>    #include <linux/blk-mq.h>
>>>> +#include <linux/mm.h>
>>>>    #include <uapi/linux/falloc.h>
>>>>    #include "dm-ploop.h"
>>>>    #include "dm-rq.h"
>>>> @@ -89,6 +90,7 @@ void ploop_init_pio(struct ploop *ploop, unsigned int bi_op, struct pio *pio)
>>>>           pio->ref_index = PLOOP_REF_INDEX_INVALID;
>>>>           pio->queue_list_id = PLOOP_LIST_DEFERRED;
>>>>           pio->bi_status = BLK_STS_OK;
>>>> +       pio->use_kvmalloc = false;
>>>>           atomic_set(&pio->remaining, 1);
>>>>           pio->piwb = NULL;
>>>>           INIT_LIST_HEAD(&pio->list);
>>>> @@ -193,8 +195,12 @@ static void ploop_prq_endio(struct pio *pio, void *prq_ptr,
>>>>           struct ploop_rq *prq = prq_ptr;
>>>>           struct request *rq = prq->rq;
>>>>
>>>> -       if (prq->bvec)
>>>> -               kfree(prq->bvec);
>>>> +       if (prq->bvec) {
>>>> +               if (pio->use_kvmalloc)
>>>> +                       kvfree(prq->bvec);
>>>> +               else
>>>> +                       kfree(prq->bvec);
>>>> +       }
>>>>           if (prq->css)
>>>>                   css_put(prq->css);
>>>>           /*
>>>> @@ -1963,26 +1969,40 @@ void ploop_index_wb_submit(struct ploop *ploop, struct ploop_index_wb *piwb)
>>>>           ploop_runners_add_work(ploop, pio);
>>>>    }
>>>>
>>>> -static struct bio_vec *ploop_create_bvec_from_rq(struct request *rq)
>>>> +static struct bio_vec *ploop_create_bvec_from_rq(struct request *rq, bool use_kvmalloc)
>>>>    {
>>>>           struct bio_vec bv, *bvec, *tmp;
>>>>           struct req_iterator rq_iter;
>>>>           unsigned int nr_bvec = 0;
>>>> +       unsigned int order = 0;
>>>>
>>>>           rq_for_each_bvec(bv, rq, rq_iter)
>>>>                   nr_bvec++;
>>>>
>>>> -       bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>>>> -                            GFP_ATOMIC);
>>>> -       if (!bvec)
>>>> -               goto out;
>>>> +       if (use_kvmalloc) {
>>>> +               bvec = kvmalloc_array(nr_bvec, sizeof(struct bio_vec),
>>>> +                                     GFP_NOIO);
>>>> +               if (!bvec)
>>>> +                       return ERR_PTR(-ENOMEM);
>>>> +       } else {
>>>> +               order = get_order(nr_bvec * sizeof(struct bio_vec));
>>>> +               /*
>>>> +                * order 6 is 262144 bytes. Let's defer such big
>>>> +                * allocations to workqueue.
>>>> +                */
>>>> +               if (order >= 6)
>>>> +                       return ERR_PTR(-EAGAIN);
>>>> +               bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>>>> +                                    GFP_ATOMIC | __GFP_NOWARN);
>>>> +               if (!bvec)
>>>> +                       return ERR_PTR(-EAGAIN);
>>>> +       }
>>>>
>>>>           tmp = bvec;
>>>>           rq_for_each_bvec(bv, rq, rq_iter) {
>>>>                   *tmp = bv;
>>>>                   tmp++;
>>>>           }
>>>> -out:
>>>>           return bvec;
>>>>    }
>>>>    ALLOW_ERROR_INJECTION(ploop_create_bvec_from_rq, NULL);
>>>> @@ -2003,9 +2023,15 @@ static void ploop_prepare_one_embedded_pio(struct ploop *ploop,
>>>>                    * Transform a set of bvec arrays related to bios
>>>>                    * into a single bvec array (which we can iterate).
>>>>                    */
>>>> -               bvec = ploop_create_bvec_from_rq(rq);
>>>> -               if (!bvec)
>>>> +               bvec = ploop_create_bvec_from_rq(rq, pio->use_kvmalloc);
>>>> +               if (IS_ERR(bvec)) {
>>>> +                       if (PTR_ERR(bvec) == -EAGAIN) {
>>>> +                               pio->use_kvmalloc = true;
>>>> +                               llist_add((struct llist_node *)(&pio->list), &ploop->pios[PLOOP_LIST_PREPARE]);
>>>> +                               return;
>>>> +                       }
>>>>                           goto err_nomem;
>>>> +               }
>>>>                   prq->bvec = bvec;
>>>>    skip_bvec:
>>>>                   pio->bi_iter.bi_size = blk_rq_bytes(rq);
>>>> diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
>>>> index fc12efeb0cd9..53e8d12064bd 100644
>>>> --- a/drivers/md/dm-ploop.h
>>>> +++ b/drivers/md/dm-ploop.h
>>>> @@ -316,6 +316,7 @@ struct pio {
>>>>           unsigned int ref_index:2;
>>>>
>>>>           u8 queue_list_id; /* id in ploop->pios */
>>>> +       bool use_kvmalloc;
>>>>
>>>>           struct ploop_index_wb *piwb;
>>>>
>>>> --
>>>> 2.43.0
>>>>
>>>> _______________________________________________
>>>> Devel mailing list
>>>> Devel at openvz.org
>>>> https://lists.openvz.org/mailman/listinfo/devel
>> --
>> Best regards, Pavel Tikhomirov
>> Senior Software Developer, Virtuozzo.
    
    