[Devel] [PATCH rh7] ploop: io_direct: delay f_op->fsync() until FLUSH|FUA

Maxim Patlasov mpatlasov at virtuozzo.com
Thu May 26 14:08:01 PDT 2016


On 05/26/2016 07:58 AM, Dmitry Monakhov wrote:
> Maxim Patlasov <mpatlasov at virtuozzo.com> writes:
>
>> Once we have converted an extent to initialized, it can be part of an
>> uncompleted journal transaction, so we have to force a transaction commit at some point.
>>
>> Instead of forcing transaction commit immediately, the patch delays it
>> until an incoming bio with FLUSH|FUA arrives. Then, as the very first
>> step of processing such a bio, we send the corresponding preq to fsync_thread
>> to perform f_op->fsync().
>>
>> As a very unlikely case, it is also possible that processing a FLUSH|FUA
>> bio itself results in converting extents. Then, the patch calls f_op->fsync()
>> immediately after conversion to preserve FUA semantics.
> ACK. With minor comments. See below:

Thank you for thoughtful review, Dima!

>> https://jira.sw.ru/browse/PSBM-47026
>>
>> Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
>> ---
>>   drivers/block/ploop/dev.c       |   70 ++++++++++++++++++++++++---------------
>>   drivers/block/ploop/io_direct.c |   28 +++++++++++++++-
>>   include/linux/ploop/ploop.h     |    6 +++
>>   3 files changed, 76 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
>> index 654b60b..03fc289 100644
>> --- a/drivers/block/ploop/dev.c
>> +++ b/drivers/block/ploop/dev.c
>> @@ -1942,46 +1942,62 @@ err:
>>   
>>   /* Main preq state machine */
>>   
>> +static inline bool preq_is_special(struct ploop_request * preq)
>> +{
>> +	return test_bit(PLOOP_REQ_MERGE, &preq->state) ||
>> +		test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
>> +		test_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
>> +		test_bit(PLOOP_REQ_DISCARD, &preq->state) ||
>> +		test_bit(PLOOP_REQ_ZERO, &preq->state);
> Oh. It looks awful. Please use one atomic read here and other places.
> #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
> #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
>
> unsigned long state = READ_ONCE(preq->state);
> ...
>      if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL) ...

OK. I'll fix this place in v2 of the patch and other places later in a 
separate "clean-up" patch.

>> +}
>> +
>>   static void
>>   ploop_entry_request(struct ploop_request * preq)
>>   {
>>   	struct ploop_device * plo       = preq->plo;
>>   	struct ploop_delta  * top_delta = ploop_top_delta(plo);
>> +	struct ploop_io     * top_io    = &top_delta->io;
>>   	struct ploop_delta  * delta;
>>   	int level;
>>   	int err;
>>   	iblock_t iblk;
>>   
>> -	/* Control request. */
>> -	if (unlikely(preq->bl.head == NULL &&
>> -		     !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_DISCARD, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
>> -		complete(plo->quiesce_comp);
>> -		wait_for_completion(&plo->relax_comp);
>> -		ploop_complete_request(preq);
>> -		complete(&plo->relaxed_comp);
>> -		return;
>> -	}
>> +	if (!preq_is_special(preq)) {
>> +		/* Control request */
>> +		if (unlikely(preq->bl.head == NULL)) {
>> +			complete(plo->quiesce_comp);
>> +			wait_for_completion(&plo->relax_comp);
>> +			ploop_complete_request(preq);
>> +			complete(&plo->relaxed_comp);
>> +			return;
>> +		}
>>   
>> -	/* Empty flush. */
>> -	if (unlikely(preq->req_size == 0 &&
>> -		     !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
>> -		     !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
>> -		if (preq->req_rw & REQ_FLUSH) {
>> -			if (top_delta->io.ops->issue_flush) {
>> -				top_delta->io.ops->issue_flush(&top_delta->io, preq);
>> -				return;
>> -			}
>> +		/* Need to fsync before start handling FLUSH */
>> +		if ((preq->req_rw & REQ_FLUSH) &&
>> +		    test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state) &&
>> +		    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
>> +			spin_lock_irq(&plo->lock);
>> +			list_add_tail(&preq->list, &top_io->fsync_queue);
>> +			if (waitqueue_active(&top_io->fsync_waitq))
>> +				wake_up_interruptible(&top_io->fsync_waitq);
>> +			spin_unlock_irq(&plo->lock);
>> +			return;
>>   		}
>>   
>> -		preq->eng_state = PLOOP_E_COMPLETE;
>> -		ploop_complete_request(preq);
>> -		return;
>> +		/* Empty flush or unknown zero-size request */
> Do you know any zero size requests instead of FLUSH?

No. The comment only states explicitly what has always been ploop's behavior: 
such requests are simply ignored (if they ever come to us).

>> +		if (preq->req_size == 0) {
>> +			if (preq->req_rw & REQ_FLUSH &&
>> +			    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
>> +				if (top_io->ops->issue_flush) {
>> +					top_io->ops->issue_flush(top_io, preq);
>> +					return;
>> +				}
>> +			}
>> +
>> +			preq->eng_state = PLOOP_E_COMPLETE;
>> +			ploop_complete_request(preq);
>> +			return;
>> +		}
>>   	}
>>   
>>   	if (unlikely(test_bit(PLOOP_REQ_SYNC, &preq->state) &&
>> diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
>> index 8096110..a0b7a4e 100644
>> --- a/drivers/block/ploop/io_direct.c
>> +++ b/drivers/block/ploop/io_direct.c
>> @@ -514,17 +514,32 @@ end_write:
>>   static void
>>   dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
>>   {
>> +	struct ploop_device *plo = preq->plo;
>>   	sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
>>   	loff_t clu_siz = 1 << (preq->plo->cluster_log + 9);
>>   	int err;
>>   
>>   	file_start_write(io->files.file);
>> +
>> +	/* Here io->io_count is even ... */
>> +	spin_lock_irq(&plo->lock);
>> +	io->io_count++;
>> +	set_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
>> +	spin_unlock_irq(&plo->lock);
>> +
>>   	err = io->files.file->f_op->fallocate(io->files.file,
>>   					      FALLOC_FL_CONVERT_UNWRITTEN,
>>   					      (loff_t)sec << 9, clu_siz);
>> -	if (!err)
>> +
>> +	/* highly unlikely case: FUA coming to a block not provisioned yet */
>> +	if (!err && (preq->req_rw & REQ_FUA))
>>   		err = io->ops->sync(io);
>>   
>> +	spin_lock_irq(&plo->lock);
>> +	io->io_count++;
>> +	spin_unlock_irq(&plo->lock);
>          Double entrance is not possible because ->post_submit is called
>          synchronously from ploop_thread(). Am I correct?

Yes. ploop_thread is single-threaded and lockless, and the functions it calls 
are never double-entered.

>> +	/* and here io->io_count is even (+2) again. */
>> +
>>   	file_end_write(io->files.file);
>>   	if (err) {
>>   		PLOOP_REQ_SET_ERROR(preq, err);
>> @@ -782,6 +797,7 @@ static int dio_fsync_thread(void * data)
>>   {
>>   	struct ploop_io * io = data;
>>   	struct ploop_device * plo = io->plo;
>> +	u64 io_count;
>>   
>>   	set_user_nice(current, -20);
>>   
>> @@ -808,6 +824,7 @@ static int dio_fsync_thread(void * data)
>>   
>>   		INIT_LIST_HEAD(&list);
>>   		list_splice_init(&io->fsync_queue, &list);
>> +		io_count = io->io_count;
>>   		spin_unlock_irq(&plo->lock);
>>   
>>   		/* filemap_fdatawrite() has been made already */
>> @@ -824,12 +841,17 @@ static int dio_fsync_thread(void * data)
>>   
>>   		spin_lock_irq(&plo->lock);
>>   
>> +		if (io_count == io->io_count && !(io_count & 1))
>> +			clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
>> +
>>   		while (!list_empty(&list)) {
>>   			struct ploop_request * preq;
>>   			preq = list_entry(list.next, struct ploop_request, list);
>>   			list_del(&preq->list);
>>   			if (err)
>>   				PLOOP_REQ_SET_ERROR(preq, err);
>> +
>> +			__set_bit(PLOOP_REQ_FSYNC_DONE, &preq->state);
>>   			list_add_tail(&preq->list, &plo->ready_queue);
>>   			io->fsync_qlen--;
>>   		}
>> @@ -1522,6 +1544,10 @@ dio_fastmap(struct ploop_io * io, struct bio * orig_bio,
>>   	struct extent_map * em;
>>   	int i;
>>   
>> +	if (unlikely((orig_bio->bi_rw & REQ_FLUSH) &&
> blk layer implicitly treats REQ_FUA as REQ_FUA + later FLUSH. Let's do the same:
> +	if (unlikely((orig_bio->bi_rw & (REQ_FUA|REQ_FLUSH)) &&

OK. Actually, when we see FUA while FSYNC_DELAYED is set, we don't know exactly 
what sits inside that not-yet-committed transaction on the host fs. So, it's a 
good idea to fsync there anyway.

>> +		     test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)))
>> +		return 1;
>> +
>>   	if (orig_bio->bi_size == 0) {
>>   		bio->bi_vcnt   = 0;
>>   		bio->bi_sector = 0;
>> diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
>> index ad36a91..49f3cda 100644
>> --- a/include/linux/ploop/ploop.h
>> +++ b/include/linux/ploop/ploop.h
>> @@ -87,6 +87,9 @@ struct ploop_file
>>    * This struct describes how we do real IO on particular backing file.
>>    */
>>   
>> +enum {
>> +	PLOOP_IO_FSYNC_DELAYED,  /* Must f_op->fsync before FLUSH|FUA */
>> +};
>>   
>>   struct ploop_io
>>   {
>> @@ -108,6 +111,8 @@ struct ploop_io
>>   	struct timer_list	fsync_timer;
>>   
>>   	struct ploop_io_ops	*ops;
>> +	unsigned long		io_state;
>> +	u64                     io_count;
>>   };
>>   
>>   struct ploop_io_ops
>> @@ -466,6 +471,7 @@ enum
>>   	PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
>>   	PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
>>   	PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
>> +	PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
>>   };
>>   
>>   enum



More information about the Devel mailing list