[Devel] [PATCH rh7] ploop: io_direct: delay f_op->fsync() until index_update for reloc requests
Maxim Patlasov
mpatlasov at virtuozzo.com
Tue Jul 19 10:45:21 PDT 2016
Dima,
I have not heard from you since 07/06/2016. Do you agree with that
reasoning I provided in last email? What's your objection against the
patch now?
Thanks,
Maxim
On 07/06/2016 11:10 AM, Maxim Patlasov wrote:
> Dima,
>
> On 07/06/2016 04:58 AM, Dmitry Monakhov wrote:
>
>> Maxim Patlasov <mpatlasov at virtuozzo.com> writes:
>>
>>> Commit 9f860e606 introduced an engine to delay fsync: doing
>>> fallocate(FALLOC_FL_CONVERT_UNWRITTEN) dio_post_submit marks
>>> io as PLOOP_IO_FSYNC_DELAYED to ensure that fsync happens
>>> later, when incoming FLUSH|FUA comes.
>>>
>>> That was deemed as important because (PSBM-47026):
>>>
>>>> This optimization becomes more important due to the fact that
>>>> customers tend to use pcompact heavily => ploop images grow each day.
>>> Now, we can easily re-use the engine to delay fsync for reloc
>>> requests as well. As explained in the description of commit
>>> 5aa3fe09:
>>>
>>>> 1->read_data_from_old_post
>>>> 2->write_to_new_pos
>>>> ->sumbit_alloc
>>>> ->submit_pad
>>>> ->post_submit->convert_unwritten
>>>> 3->update_index ->write_page with FLUSH|FUA
>>>> 4->nullify_old_pos
>>>> 5->issue_flush
>>> by the time of step 3 extent coversion is not yet stable because
>>> belongs to uncommitted transaction. But instead of doing fsync
>>> inside ->post_submit, we can fsync later, as the very first step
>>> of write_page for index_update.
>> NAK from me. What is advantage of this patch?
>
> The advantage is the following: in case of BAT multi-updates, instead
> of doing many fsync-s (one per dio_post_submit), we'll do only one
> (when final ->write_page is called).
>
>> Does it makes code more optimal? No
>
> Yes, it does. In the same sense as 9f860e606: saving some fsync-s.
>
>> Does it makes main ploop more asynchronous? No.
>
> Correct, the patch optimizes ploop in the other way. It's not about
> making ploop more asynchronous.
>
>
>>
>> If you want to make optimization then it is reasonable to
>> queue preq with PLOOP_IO_FSYNC_DELAYED to top_io->fsync_queue
>> before processing PLOOP_E_DATA_WBI state for preq with FUA
>> So sequence will looks like follows:
>> ->sumbit_alloc
>> ->submit_pad
>> ->post_submit->convert_unwritten-> tag PLOOP_IO_FSYNC_DELAYED
>> ->ploop_req_state_process
>> case PLOOP_E_DATA_WBI:
>> if (preq->start & PLOOP_IO_FSYNC_DELAYED_FL) {
>> preq->start &= ~PLOOP_IO_FSYNC_DELAYED_FL
>> list_add_tail(&preq->list, &top_io->fsync_queue)
>> return;
>> }
>> ##Let fsync_thread do it's work
>> ->ploop_req_state_process
>> case LOOP_E_DATA_WBI:
>> update_index->write_page with FUA (FLUSH is not required because
>> we already done fsync)
>
> That's another type of optimization: making ploop more asynchronous. I
> thought about it, but didn't come to conclusion whether it's worthy
> w.r.t. adding more complexity to ploop-state-machine and possible bugs
> introduced with that.
>
> Thanks,
> Maxim
>
>>
>>> https://jira.sw.ru/browse/PSBM-47026
>>>
>>> Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
>>> ---
>>> drivers/block/ploop/dev.c | 4 ++--
>>> drivers/block/ploop/io_direct.c | 25 ++++++++++++++++++++++++-
>>> drivers/block/ploop/io_kaio.c | 3 ++-
>>> drivers/block/ploop/map.c | 17 ++++++++++++-----
>>> include/linux/ploop/ploop.h | 3 ++-
>>> 5 files changed, 42 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
>>> index e5f010b..40768b6 100644
>>> --- a/drivers/block/ploop/dev.c
>>> +++ b/drivers/block/ploop/dev.c
>>> @@ -4097,7 +4097,7 @@ static void ploop_relocate(struct ploop_device
>>> * plo)
>>> preq->bl.tail = preq->bl.head = NULL;
>>> preq->req_cluster = 0;
>>> preq->req_size = 0;
>>> - preq->req_rw = WRITE_SYNC|REQ_FUA;
>>> + preq->req_rw = WRITE_SYNC;
>>> preq->eng_state = PLOOP_E_ENTRY;
>>> preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A);
>>> preq->error = 0;
>>> @@ -4401,7 +4401,7 @@ static void ploop_relocblks_process(struct
>>> ploop_device *plo)
>>> preq->bl.tail = preq->bl.head = NULL;
>>> preq->req_cluster = ~0U; /* uninitialized */
>>> preq->req_size = 0;
>>> - preq->req_rw = WRITE_SYNC|REQ_FUA;
>>> + preq->req_rw = WRITE_SYNC;
>>> preq->eng_state = PLOOP_E_ENTRY;
>>> preq->state = (1 << PLOOP_REQ_SYNC) | (1 <<
>>> PLOOP_REQ_RELOC_S);
>>> preq->error = 0;
>>> diff --git a/drivers/block/ploop/io_direct.c
>>> b/drivers/block/ploop/io_direct.c
>>> index 1086850..0a5fb15 100644
>>> --- a/drivers/block/ploop/io_direct.c
>>> +++ b/drivers/block/ploop/io_direct.c
>>> @@ -1494,13 +1494,36 @@ dio_read_page(struct ploop_io * io, struct
>>> ploop_request * preq,
>>> static void
>>> dio_write_page(struct ploop_io * io, struct ploop_request * preq,
>>> - struct page * page, sector_t sec, unsigned long rw)
>>> + struct page * page, sector_t sec, unsigned long rw,
>>> + int do_fsync_if_delayed)
>>> {
>>> if (!(io->files.file->f_mode & FMODE_WRITE)) {
>>> PLOOP_FAIL_REQUEST(preq, -EBADF);
>>> return;
>>> }
>>> + if (do_fsync_if_delayed &&
>>> + test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)) {
>>> + struct ploop_device * plo = io->plo;
>>> + u64 io_count;
>>> + int err;
>>> +
>>> + spin_lock_irq(&plo->lock);
>>> + io_count = io->io_count;
>>> + spin_unlock_irq(&plo->lock);
>>> +
>>> + err = io->ops->sync(io);
>>> + if (err) {
>>> + PLOOP_FAIL_REQUEST(preq, -EBADF);
>>> + return;
>>> + }
>>> +
>>> + spin_lock_irq(&plo->lock);
>>> + if (io_count == io->io_count && !(io_count & 1))
>>> + clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
>>> + spin_unlock_irq(&plo->lock);
>>> + }
>>> +
>>> dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
>>> }
>>> diff --git a/drivers/block/ploop/io_kaio.c
>>> b/drivers/block/ploop/io_kaio.c
>>> index 85863df..0d731ef 100644
>>> --- a/drivers/block/ploop/io_kaio.c
>>> +++ b/drivers/block/ploop/io_kaio.c
>>> @@ -614,7 +614,8 @@ kaio_read_page(struct ploop_io * io, struct
>>> ploop_request * preq,
>>> static void
>>> kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
>>> - struct page * page, sector_t sec, unsigned long rw)
>>> + struct page * page, sector_t sec, unsigned long rw,
>>> + int do_fsync_if_delayed)
>>> {
>>> ploop_prepare_tracker(preq, sec);
>>> diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
>>> index 1883674..96e428b 100644
>>> --- a/drivers/block/ploop/map.c
>>> +++ b/drivers/block/ploop/map.c
>>> @@ -910,6 +910,7 @@ void ploop_index_update(struct ploop_request *
>>> preq)
>>> sector_t sec;
>>> unsigned long rw;
>>> unsigned long state = READ_ONCE(preq->state);
>>> + int do_fsync_if_delayed = 0;
>>> /* No way back, we are going to initiate index write. */
>>> @@ -970,10 +971,13 @@ void ploop_index_update(struct ploop_request
>>> * preq)
>>> preq->req_rw &= ~REQ_FLUSH;
>>> /* Relocate requires consistent index update */
>>> - if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>>> + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
>>> rw |= (REQ_FLUSH | REQ_FUA);
>>> + do_fsync_if_delayed = 1;
>>> + }
>>> - top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
>>> + top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw,
>>> + do_fsync_if_delayed);
>>> put_page(page);
>>> return;
>>> @@ -1096,6 +1100,7 @@ static void map_wb_complete(struct map_node *
>>> m, int err)
>>> unsigned int idx;
>>> sector_t sec;
>>> unsigned long rw;
>>> + int do_fsync_if_delayed = 0;
>>> /* First, complete processing of written back indices,
>>> * finally instantiate indices in mapping cache.
>>> @@ -1193,8 +1198,10 @@ static void map_wb_complete(struct map_node *
>>> m, int err)
>>> state = READ_ONCE(preq->state);
>>> /* Relocate requires consistent index update */
>>> - if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>>> + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
>>> rw |= (REQ_FLUSH | REQ_FUA);
>>> + do_fsync_if_delayed = 1;
>>> + }
>>> preq->eng_state = PLOOP_E_INDEX_WB;
>>> get_page(page);
>>> @@ -1221,8 +1228,8 @@ static void map_wb_complete(struct map_node *
>>> m, int err)
>>> plo->st.map_multi_writes++;
>>> top_delta->ops->map_index(top_delta, m->mn_start, &sec);
>>> - top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec,
>>> - rw);
>>> + top_delta->io.ops->write_page(&top_delta->io, main_preq, page,
>>> sec, rw,
>>> + do_fsync_if_delayed);
>>> put_page(page);
>>> }
>>> diff --git a/include/linux/ploop/ploop.h
>>> b/include/linux/ploop/ploop.h
>>> index deee8a7..b03565b 100644
>>> --- a/include/linux/ploop/ploop.h
>>> +++ b/include/linux/ploop/ploop.h
>>> @@ -164,7 +164,8 @@ struct ploop_io_ops
>>> void (*read_page)(struct ploop_io * io, struct
>>> ploop_request * preq,
>>> struct page * page, sector_t sec);
>>> void (*write_page)(struct ploop_io * io, struct
>>> ploop_request * preq,
>>> - struct page * page, sector_t sec, unsigned long rw);
>>> + struct page * page, sector_t sec, unsigned long rw,
>>> + int do_fsync_if_delayed);
>>> int (*sync_read)(struct ploop_io * io, struct page * page,
>
More information about the Devel
mailing list