[Devel] [PATCH rh7] ploop: io_direct: delay f_op->fsync() until index_update for reloc requests

Maxim Patlasov mpatlasov at virtuozzo.com
Tue Jul 19 10:45:21 PDT 2016


Dima,


I have not heard from you since 07/06/2016. Do you agree with that 
reasoning I provided in last email? What's your objection against the 
patch now?


Thanks,

Maxim


On 07/06/2016 11:10 AM, Maxim Patlasov wrote:
> Dima,
>
> On 07/06/2016 04:58 AM, Dmitry Monakhov wrote:
>
>> Maxim Patlasov <mpatlasov at virtuozzo.com> writes:
>>
>>> Commit 9f860e606 introduced an engine to delay fsync: doing
>>> fallocate(FALLOC_FL_CONVERT_UNWRITTEN) dio_post_submit marks
>>> io as PLOOP_IO_FSYNC_DELAYED to ensure that fsync happens
>>> later, when incoming FLUSH|FUA comes.
>>>
>>> That was deemed as important because (PSBM-47026):
>>>
>>>> This optimization becomes more important due to the fact that 
>>>> customers tend to use pcompact heavily => ploop images grow each day.
>>> Now, we can easily re-use the engine to delay fsync for reloc
>>> requests as well. As explained in the description of commit
>>> 5aa3fe09:
>>>
>>>>      1->read_data_from_old_post
>>>>      2->write_to_new_pos
>>>>        ->sumbit_alloc
>>>>           ->submit_pad
>>>>       ->post_submit->convert_unwritten
>>>>      3->update_index ->write_page with FLUSH|FUA
>>>>      4->nullify_old_pos
>>>>     5->issue_flush
>>> by the time of step 3 extent coversion is not yet stable because
>>> belongs to uncommitted transaction. But instead of doing fsync
>>> inside ->post_submit, we can fsync later, as the very first step
>>> of write_page for index_update.
>> NAK from me. What is advantage of this patch?
>
> The advantage is the following: in case of BAT multi-updates, instead 
> of doing many fsync-s (one per dio_post_submit), we'll do only one 
> (when final ->write_page is called).
>
>> Does it makes code more optimal? No
>
> Yes, it does. In the same sense as 9f860e606: saving some fsync-s.
>
>> Does it makes main ploop more asynchronous? No.
>
> Correct, the patch optimizes ploop in the other way. It's not about 
> making ploop more asynchronous.
>
>
>>
>> If you want to make optimization then it is reasonable to
>> queue preq with PLOOP_IO_FSYNC_DELAYED to top_io->fsync_queue
>> before processing PLOOP_E_DATA_WBI  state for  preq with FUA
>> So sequence will looks like follows:
>> ->sumbit_alloc
>>    ->submit_pad
>>    ->post_submit->convert_unwritten-> tag PLOOP_IO_FSYNC_DELAYED
>> ->ploop_req_state_process
>>    case PLOOP_E_DATA_WBI:
>>    if (preq->start & PLOOP_IO_FSYNC_DELAYED_FL) {
>>        preq->start &= ~PLOOP_IO_FSYNC_DELAYED_FL
>>        list_add_tail(&preq->list, &top_io->fsync_queue)
>>        return;
>>     }
>> ##Let fsync_thread do it's work
>> ->ploop_req_state_process
>>     case LOOP_E_DATA_WBI:
>>     update_index->write_page with FUA (FLUSH is not required because 
>> we  already done fsync)
>
> That's another type of optimization: making ploop more asynchronous. I 
> thought about it, but didn't come to conclusion whether it's worthy 
> w.r.t. adding more complexity to ploop-state-machine and possible bugs 
> introduced with that.
>
> Thanks,
> Maxim
>
>>
>>> https://jira.sw.ru/browse/PSBM-47026
>>>
>>> Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
>>> ---
>>>   drivers/block/ploop/dev.c       |    4 ++--
>>>   drivers/block/ploop/io_direct.c |   25 ++++++++++++++++++++++++-
>>>   drivers/block/ploop/io_kaio.c   |    3 ++-
>>>   drivers/block/ploop/map.c       |   17 ++++++++++++-----
>>>   include/linux/ploop/ploop.h     |    3 ++-
>>>   5 files changed, 42 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
>>> index e5f010b..40768b6 100644
>>> --- a/drivers/block/ploop/dev.c
>>> +++ b/drivers/block/ploop/dev.c
>>> @@ -4097,7 +4097,7 @@ static void ploop_relocate(struct ploop_device 
>>> * plo)
>>>       preq->bl.tail = preq->bl.head = NULL;
>>>       preq->req_cluster = 0;
>>>       preq->req_size = 0;
>>> -    preq->req_rw = WRITE_SYNC|REQ_FUA;
>>> +    preq->req_rw = WRITE_SYNC;
>>>       preq->eng_state = PLOOP_E_ENTRY;
>>>       preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A);
>>>       preq->error = 0;
>>> @@ -4401,7 +4401,7 @@ static void ploop_relocblks_process(struct 
>>> ploop_device *plo)
>>>           preq->bl.tail = preq->bl.head = NULL;
>>>           preq->req_cluster = ~0U; /* uninitialized */
>>>           preq->req_size = 0;
>>> -        preq->req_rw = WRITE_SYNC|REQ_FUA;
>>> +        preq->req_rw = WRITE_SYNC;
>>>           preq->eng_state = PLOOP_E_ENTRY;
>>>           preq->state = (1 << PLOOP_REQ_SYNC) | (1 << 
>>> PLOOP_REQ_RELOC_S);
>>>           preq->error = 0;
>>> diff --git a/drivers/block/ploop/io_direct.c 
>>> b/drivers/block/ploop/io_direct.c
>>> index 1086850..0a5fb15 100644
>>> --- a/drivers/block/ploop/io_direct.c
>>> +++ b/drivers/block/ploop/io_direct.c
>>> @@ -1494,13 +1494,36 @@ dio_read_page(struct ploop_io * io, struct 
>>> ploop_request * preq,
>>>     static void
>>>   dio_write_page(struct ploop_io * io, struct ploop_request * preq,
>>> -           struct page * page, sector_t sec, unsigned long rw)
>>> +           struct page * page, sector_t sec, unsigned long rw,
>>> +           int do_fsync_if_delayed)
>>>   {
>>>       if (!(io->files.file->f_mode & FMODE_WRITE)) {
>>>           PLOOP_FAIL_REQUEST(preq, -EBADF);
>>>           return;
>>>       }
>>>   +    if (do_fsync_if_delayed &&
>>> +        test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)) {
>>> +        struct ploop_device * plo = io->plo;
>>> +        u64 io_count;
>>> +        int err;
>>> +
>>> +        spin_lock_irq(&plo->lock);
>>> +        io_count = io->io_count;
>>> +        spin_unlock_irq(&plo->lock);
>>> +
>>> +        err = io->ops->sync(io);
>>> +        if (err) {
>>> +            PLOOP_FAIL_REQUEST(preq, -EBADF);
>>> +            return;
>>> +        }
>>> +
>>> +        spin_lock_irq(&plo->lock);
>>> +        if (io_count == io->io_count && !(io_count & 1))
>>> +            clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
>>> +        spin_unlock_irq(&plo->lock);
>>> +    }
>>> +
>>>       dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
>>>   }
>>>   diff --git a/drivers/block/ploop/io_kaio.c 
>>> b/drivers/block/ploop/io_kaio.c
>>> index 85863df..0d731ef 100644
>>> --- a/drivers/block/ploop/io_kaio.c
>>> +++ b/drivers/block/ploop/io_kaio.c
>>> @@ -614,7 +614,8 @@ kaio_read_page(struct ploop_io * io, struct 
>>> ploop_request * preq,
>>>     static void
>>>   kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
>>> -         struct page * page, sector_t sec, unsigned long rw)
>>> +        struct page * page, sector_t sec, unsigned long rw,
>>> +        int do_fsync_if_delayed)
>>>   {
>>>       ploop_prepare_tracker(preq, sec);
>>>   diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
>>> index 1883674..96e428b 100644
>>> --- a/drivers/block/ploop/map.c
>>> +++ b/drivers/block/ploop/map.c
>>> @@ -910,6 +910,7 @@ void ploop_index_update(struct ploop_request * 
>>> preq)
>>>       sector_t sec;
>>>       unsigned long rw;
>>>       unsigned long state = READ_ONCE(preq->state);
>>> +    int do_fsync_if_delayed = 0;
>>>         /* No way back, we are going to initiate index write. */
>>>   @@ -970,10 +971,13 @@ void ploop_index_update(struct ploop_request 
>>> * preq)
>>>       preq->req_rw &= ~REQ_FLUSH;
>>>         /* Relocate requires consistent index update */
>>> -    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>>> +    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
>>>           rw |= (REQ_FLUSH | REQ_FUA);
>>> +        do_fsync_if_delayed = 1;
>>> +    }
>>>   - top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
>>> + top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw,
>>> +                      do_fsync_if_delayed);
>>>         put_page(page);
>>>       return;
>>> @@ -1096,6 +1100,7 @@ static void map_wb_complete(struct map_node * 
>>> m, int err)
>>>       unsigned int idx;
>>>       sector_t sec;
>>>       unsigned long rw;
>>> +    int do_fsync_if_delayed = 0;
>>>         /* First, complete processing of written back indices,
>>>        * finally instantiate indices in mapping cache.
>>> @@ -1193,8 +1198,10 @@ static void map_wb_complete(struct map_node * 
>>> m, int err)
>>>                 state = READ_ONCE(preq->state);
>>>               /* Relocate requires consistent index update */
>>> -            if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>>> +            if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
>>>                   rw |= (REQ_FLUSH | REQ_FUA);
>>> +                do_fsync_if_delayed = 1;
>>> +            }
>>>                 preq->eng_state = PLOOP_E_INDEX_WB;
>>>               get_page(page);
>>> @@ -1221,8 +1228,8 @@ static void map_wb_complete(struct map_node * 
>>> m, int err)
>>>       plo->st.map_multi_writes++;
>>>       top_delta->ops->map_index(top_delta, m->mn_start, &sec);
>>>   - top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec,
>>> -                      rw);
>>> + top_delta->io.ops->write_page(&top_delta->io, main_preq, page, 
>>> sec, rw,
>>> +                      do_fsync_if_delayed);
>>>       put_page(page);
>>>   }
>>>   diff --git a/include/linux/ploop/ploop.h 
>>> b/include/linux/ploop/ploop.h
>>> index deee8a7..b03565b 100644
>>> --- a/include/linux/ploop/ploop.h
>>> +++ b/include/linux/ploop/ploop.h
>>> @@ -164,7 +164,8 @@ struct ploop_io_ops
>>>       void    (*read_page)(struct ploop_io * io, struct 
>>> ploop_request * preq,
>>>                    struct page * page, sector_t sec);
>>>       void    (*write_page)(struct ploop_io * io, struct 
>>> ploop_request * preq,
>>> -                  struct page * page, sector_t sec, unsigned long rw);
>>> +                  struct page * page, sector_t sec, unsigned long rw,
>>> +                  int do_fsync_if_delayed);
>>>           int    (*sync_read)(struct ploop_io * io, struct page * page,
>



More information about the Devel mailing list