[Devel] [PATCH rh7] ploop: io_direct: delay f_op->fsync() until index_update for reloc requests

Maxim Patlasov mpatlasov at virtuozzo.com
Wed Jul 6 11:10:51 PDT 2016


Dima,

On 07/06/2016 04:58 AM, Dmitry Monakhov wrote:

> Maxim Patlasov <mpatlasov at virtuozzo.com> writes:
>
>> Commit 9f860e606 introduced an engine to delay fsync: doing
>> fallocate(FALLOC_FL_CONVERT_UNWRITTEN) dio_post_submit marks
>> io as PLOOP_IO_FSYNC_DELAYED to ensure that fsync happens
>> later, when incoming FLUSH|FUA comes.
>>
>> That was deemed as important because (PSBM-47026):
>>
>>> This optimization becomes more important due to the fact that customers tend to use pcompact heavily => ploop images grow each day.
>> Now, we can easily re-use the engine to delay fsync for reloc
>> requests as well. As explained in the description of commit
>> 5aa3fe09:
>>
>>>      1->read_data_from_old_post
>>>      2->write_to_new_pos
>>>        ->sumbit_alloc
>>>           ->submit_pad
>>>       ->post_submit->convert_unwritten
>>>      3->update_index ->write_page with FLUSH|FUA
>>>      4->nullify_old_pos
>>>     5->issue_flush
>> by the time of step 3 extent coversion is not yet stable because
>> belongs to uncommitted transaction. But instead of doing fsync
>> inside ->post_submit, we can fsync later, as the very first step
>> of write_page for index_update.
> NAK from me. What is advantage of this patch?

The advantage is the following: in case of BAT multi-updates, instead of 
doing many fsync-s (one per dio_post_submit), we'll do only one (when 
final ->write_page is called).

> Does it makes code more optimal? No

Yes, it does. In the same sense as 9f860e606: saving some fsync-s.

> Does it makes main ploop more asynchronous? No.

Correct, the patch optimizes ploop in the other way. It's not about 
making ploop more asynchronous.


>
> If you want to make optimization then it is reasonable to
> queue preq with PLOOP_IO_FSYNC_DELAYED to top_io->fsync_queue
> before processing PLOOP_E_DATA_WBI  state for  preq with FUA
> So sequence will looks like follows:
> ->sumbit_alloc
>    ->submit_pad
>    ->post_submit->convert_unwritten-> tag PLOOP_IO_FSYNC_DELAYED
> ->ploop_req_state_process
>    case PLOOP_E_DATA_WBI:
>    if (preq->start & PLOOP_IO_FSYNC_DELAYED_FL) {
>        preq->start &= ~PLOOP_IO_FSYNC_DELAYED_FL
>        list_add_tail(&preq->list, &top_io->fsync_queue)
>        return;
>     }
> ##Let fsync_thread do it's work
> ->ploop_req_state_process
>     case LOOP_E_DATA_WBI:
>     update_index->write_page with FUA (FLUSH is not required because we  already done fsync)

That's another type of optimization: making ploop more asynchronous. I 
thought about it, but didn't come to conclusion whether it's worthy 
w.r.t. adding more complexity to ploop-state-machine and possible bugs 
introduced with that.

Thanks,
Maxim

>
>> https://jira.sw.ru/browse/PSBM-47026
>>
>> Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
>> ---
>>   drivers/block/ploop/dev.c       |    4 ++--
>>   drivers/block/ploop/io_direct.c |   25 ++++++++++++++++++++++++-
>>   drivers/block/ploop/io_kaio.c   |    3 ++-
>>   drivers/block/ploop/map.c       |   17 ++++++++++++-----
>>   include/linux/ploop/ploop.h     |    3 ++-
>>   5 files changed, 42 insertions(+), 10 deletions(-)
>>
>> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
>> index e5f010b..40768b6 100644
>> --- a/drivers/block/ploop/dev.c
>> +++ b/drivers/block/ploop/dev.c
>> @@ -4097,7 +4097,7 @@ static void ploop_relocate(struct ploop_device * plo)
>>   	preq->bl.tail = preq->bl.head = NULL;
>>   	preq->req_cluster = 0;
>>   	preq->req_size = 0;
>> -	preq->req_rw = WRITE_SYNC|REQ_FUA;
>> +	preq->req_rw = WRITE_SYNC;
>>   	preq->eng_state = PLOOP_E_ENTRY;
>>   	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A);
>>   	preq->error = 0;
>> @@ -4401,7 +4401,7 @@ static void ploop_relocblks_process(struct ploop_device *plo)
>>   		preq->bl.tail = preq->bl.head = NULL;
>>   		preq->req_cluster = ~0U; /* uninitialized */
>>   		preq->req_size = 0;
>> -		preq->req_rw = WRITE_SYNC|REQ_FUA;
>> +		preq->req_rw = WRITE_SYNC;
>>   		preq->eng_state = PLOOP_E_ENTRY;
>>   		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S);
>>   		preq->error = 0;
>> diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
>> index 1086850..0a5fb15 100644
>> --- a/drivers/block/ploop/io_direct.c
>> +++ b/drivers/block/ploop/io_direct.c
>> @@ -1494,13 +1494,36 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq,
>>   
>>   static void
>>   dio_write_page(struct ploop_io * io, struct ploop_request * preq,
>> -	       struct page * page, sector_t sec, unsigned long rw)
>> +	       struct page * page, sector_t sec, unsigned long rw,
>> +	       int do_fsync_if_delayed)
>>   {
>>   	if (!(io->files.file->f_mode & FMODE_WRITE)) {
>>   		PLOOP_FAIL_REQUEST(preq, -EBADF);
>>   		return;
>>   	}
>>   
>> +	if (do_fsync_if_delayed &&
>> +	    test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)) {
>> +		struct ploop_device * plo = io->plo;
>> +		u64 io_count;
>> +		int err;
>> +
>> +		spin_lock_irq(&plo->lock);
>> +		io_count = io->io_count;
>> +		spin_unlock_irq(&plo->lock);
>> +
>> +		err = io->ops->sync(io);
>> +		if (err) {
>> +			PLOOP_FAIL_REQUEST(preq, -EBADF);
>> +			return;
>> +		}
>> +
>> +		spin_lock_irq(&plo->lock);
>> +		if (io_count == io->io_count && !(io_count & 1))
>> +			clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
>> +		spin_unlock_irq(&plo->lock);
>> +	}
>> +
>>   	dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
>>   }
>>   
>> diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
>> index 85863df..0d731ef 100644
>> --- a/drivers/block/ploop/io_kaio.c
>> +++ b/drivers/block/ploop/io_kaio.c
>> @@ -614,7 +614,8 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
>>   
>>   static void
>>   kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
>> -		 struct page * page, sector_t sec, unsigned long rw)
>> +		struct page * page, sector_t sec, unsigned long rw,
>> +		int do_fsync_if_delayed)
>>   {
>>   	ploop_prepare_tracker(preq, sec);
>>   
>> diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
>> index 1883674..96e428b 100644
>> --- a/drivers/block/ploop/map.c
>> +++ b/drivers/block/ploop/map.c
>> @@ -910,6 +910,7 @@ void ploop_index_update(struct ploop_request * preq)
>>   	sector_t sec;
>>   	unsigned long rw;
>>   	unsigned long state = READ_ONCE(preq->state);
>> +	int do_fsync_if_delayed = 0;
>>   
>>   	/* No way back, we are going to initiate index write. */
>>   
>> @@ -970,10 +971,13 @@ void ploop_index_update(struct ploop_request * preq)
>>   	preq->req_rw &= ~REQ_FLUSH;
>>   
>>   	/* Relocate requires consistent index update */
>> -	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>> +	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
>>   		rw |= (REQ_FLUSH | REQ_FUA);
>> +		do_fsync_if_delayed = 1;
>> +	}
>>   
>> -	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
>> +	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw,
>> +				      do_fsync_if_delayed);
>>   
>>   	put_page(page);
>>   	return;
>> @@ -1096,6 +1100,7 @@ static void map_wb_complete(struct map_node * m, int err)
>>   	unsigned int idx;
>>   	sector_t sec;
>>   	unsigned long rw;
>> +	int do_fsync_if_delayed = 0;
>>   
>>   	/* First, complete processing of written back indices,
>>   	 * finally instantiate indices in mapping cache.
>> @@ -1193,8 +1198,10 @@ static void map_wb_complete(struct map_node * m, int err)
>>   
>>   			state = READ_ONCE(preq->state);
>>   			/* Relocate requires consistent index update */
>> -			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
>> +			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
>>   				rw |= (REQ_FLUSH | REQ_FUA);
>> +				do_fsync_if_delayed = 1;
>> +			}
>>   
>>   			preq->eng_state = PLOOP_E_INDEX_WB;
>>   			get_page(page);
>> @@ -1221,8 +1228,8 @@ static void map_wb_complete(struct map_node * m, int err)
>>   	plo->st.map_multi_writes++;
>>   	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
>>   
>> -	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec,
>> -				      rw);
>> +	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw,
>> +				      do_fsync_if_delayed);
>>   	put_page(page);
>>   }
>>   
>> diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
>> index deee8a7..b03565b 100644
>> --- a/include/linux/ploop/ploop.h
>> +++ b/include/linux/ploop/ploop.h
>> @@ -164,7 +164,8 @@ struct ploop_io_ops
>>   	void	(*read_page)(struct ploop_io * io, struct ploop_request * preq,
>>   			     struct page * page, sector_t sec);
>>   	void	(*write_page)(struct ploop_io * io, struct ploop_request * preq,
>> -			      struct page * page, sector_t sec, unsigned long rw);
>> +			      struct page * page, sector_t sec, unsigned long rw,
>> +			      int do_fsync_if_delayed);
>>   
>>   
>>   	int	(*sync_read)(struct ploop_io * io, struct page * page,



More information about the Devel mailing list