[Devel] [PATCH rh7] ploop: io_direct: delay f_op->fsync() until FLUSH|FUA

Dmitry Monakhov dmonakhov at openvz.org
Thu May 26 07:58:21 PDT 2016


Maxim Patlasov <mpatlasov at virtuozzo.com> writes:

> Once we converted extent to initialized it can be part of uncompleted
> journal transaction, so we have to force transaction commit at some point.
>
> Instead of forcing transaction commit immediately, the patch delays it
> until an incoming bio with FLUSH|FUA arrives. Then, as the very first
> step of processing such a bio, we sends corresponding preq to fsync_thread
> to perform f_op->fsync().
>
> As a very unlikely case, it is also possible that processing a FLUSH|FUA
> bio itself results in converting extents. Then, the patch calls f_op->fsync()
> immediately after conversion to preserve FUA semantics.
ACK. With minor comments. See below:
>
> https://jira.sw.ru/browse/PSBM-47026
>
> Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
> ---
>  drivers/block/ploop/dev.c       |   70 ++++++++++++++++++++++++---------------
>  drivers/block/ploop/io_direct.c |   28 +++++++++++++++-
>  include/linux/ploop/ploop.h     |    6 +++
>  3 files changed, 76 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
> index 654b60b..03fc289 100644
> --- a/drivers/block/ploop/dev.c
> +++ b/drivers/block/ploop/dev.c
> @@ -1942,46 +1942,62 @@ err:
>  
>  /* Main preq state machine */
>  
> +static inline bool preq_is_special(struct ploop_request * preq)
> +{
> +	return test_bit(PLOOP_REQ_MERGE, &preq->state) ||
> +		test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
> +		test_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
> +		test_bit(PLOOP_REQ_DISCARD, &preq->state) ||
> +		test_bit(PLOOP_REQ_ZERO, &preq->state);
Oh. It looks awful. Please use one atomic read here and other places.
#define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
#define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)

unsigned long state = READ_ONCE(preq->state);
...
    if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_B_FL) ...
> +}
> +
>  static void
>  ploop_entry_request(struct ploop_request * preq)
>  {
>  	struct ploop_device * plo       = preq->plo;
>  	struct ploop_delta  * top_delta = ploop_top_delta(plo);
> +	struct ploop_io     * top_io    = &top_delta->io;
>  	struct ploop_delta  * delta;
>  	int level;
>  	int err;
>  	iblock_t iblk;
>  
> -	/* Control request. */
> -	if (unlikely(preq->bl.head == NULL &&
> -		     !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_DISCARD, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
> -		complete(plo->quiesce_comp);
> -		wait_for_completion(&plo->relax_comp);
> -		ploop_complete_request(preq);
> -		complete(&plo->relaxed_comp);
> -		return;
> -	}
> +	if (!preq_is_special(preq)) {
> +		/* Control request */
> +		if (unlikely(preq->bl.head == NULL)) {
> +			complete(plo->quiesce_comp);
> +			wait_for_completion(&plo->relax_comp);
> +			ploop_complete_request(preq);
> +			complete(&plo->relaxed_comp);
> +			return;
> +		}
>  
> -	/* Empty flush. */
> -	if (unlikely(preq->req_size == 0 &&
> -		     !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
> -		     !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
> -		if (preq->req_rw & REQ_FLUSH) {
> -			if (top_delta->io.ops->issue_flush) {
> -				top_delta->io.ops->issue_flush(&top_delta->io, preq);
> -				return;
> -			}
> +		/* Need to fsync before start handling FLUSH */
> +		if ((preq->req_rw & REQ_FLUSH) &&
> +		    test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state) &&
> +		    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
> +			spin_lock_irq(&plo->lock);
> +			list_add_tail(&preq->list, &top_io->fsync_queue);
> +			if (waitqueue_active(&top_io->fsync_waitq))
> +				wake_up_interruptible(&top_io->fsync_waitq);
> +			spin_unlock_irq(&plo->lock);
> +			return;
>  		}
>  
> -		preq->eng_state = PLOOP_E_COMPLETE;
> -		ploop_complete_request(preq);
> -		return;
> +		/* Empty flush or unknown zero-size request */
Do you know any zero size requests instead of FLUSH?
> +		if (preq->req_size == 0) {
> +			if (preq->req_rw & REQ_FLUSH &&
> +			    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {

> +				if (top_io->ops->issue_flush) {
> +					top_io->ops->issue_flush(top_io, preq);
> +					return;
> +				}
> +			}
> +
> +			preq->eng_state = PLOOP_E_COMPLETE;
> +			ploop_complete_request(preq);
> +			return;
> +		}
>  	}
>  
>  	if (unlikely(test_bit(PLOOP_REQ_SYNC, &preq->state) &&
> diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
> index 8096110..a0b7a4e 100644
> --- a/drivers/block/ploop/io_direct.c
> +++ b/drivers/block/ploop/io_direct.c
> @@ -514,17 +514,32 @@ end_write:
>  static void
>  dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
>  {
> +	struct ploop_device *plo = preq->plo;
>  	sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
>  	loff_t clu_siz = 1 << (preq->plo->cluster_log + 9);
>  	int err;
>  
>  	file_start_write(io->files.file);
> +
> +	/* Here io->io_count is even ... */
> +	spin_lock_irq(&plo->lock);
> +	io->io_count++;
> +	set_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
> +	spin_unlock_irq(&plo->lock);
> +
>  	err = io->files.file->f_op->fallocate(io->files.file,
>  					      FALLOC_FL_CONVERT_UNWRITTEN,
>  					      (loff_t)sec << 9, clu_siz);
> -	if (!err)
> +
> +	/* highly unlikely case: FUA coming to a block not provisioned yet */
> +	if (!err && (preq->req_rw & REQ_FUA))
>  		err = io->ops->sync(io);
>  
> +	spin_lock_irq(&plo->lock);
> +	io->io_count++;
> +	spin_unlock_irq(&plo->lock);
        Double entrance is not possible because ->post_submit is called
        synchronously from ploop_thread(). Am I correct?
> +	/* and here io->io_count is even (+2) again. */
> +
>  	file_end_write(io->files.file);
>  	if (err) {
>  		PLOOP_REQ_SET_ERROR(preq, err);
> @@ -782,6 +797,7 @@ static int dio_fsync_thread(void * data)
>  {
>  	struct ploop_io * io = data;
>  	struct ploop_device * plo = io->plo;
> +	u64 io_count;
>  
>  	set_user_nice(current, -20);
>  
> @@ -808,6 +824,7 @@ static int dio_fsync_thread(void * data)
>  
>  		INIT_LIST_HEAD(&list);
>  		list_splice_init(&io->fsync_queue, &list);
> +		io_count = io->io_count;
>  		spin_unlock_irq(&plo->lock);
>  
>  		/* filemap_fdatawrite() has been made already */
> @@ -824,12 +841,17 @@ static int dio_fsync_thread(void * data)
>  
>  		spin_lock_irq(&plo->lock);
>  
> +		if (io_count == io->io_count && !(io_count & 1))
> +			clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
> +
>  		while (!list_empty(&list)) {
>  			struct ploop_request * preq;
>  			preq = list_entry(list.next, struct ploop_request, list);
>  			list_del(&preq->list);
>  			if (err)
>  				PLOOP_REQ_SET_ERROR(preq, err);
> +
> +			__set_bit(PLOOP_REQ_FSYNC_DONE, &preq->state);
>  			list_add_tail(&preq->list, &plo->ready_queue);
>  			io->fsync_qlen--;
>  		}
> @@ -1522,6 +1544,10 @@ dio_fastmap(struct ploop_io * io, struct bio * orig_bio,
>  	struct extent_map * em;
>  	int i;
>  
> +	if (unlikely((orig_bio->bi_rw & REQ_FLUSH) &&
blk layer implicitly treats REQ_FUA as REQ_FUA + later FLUSH. Let's do the same:
+	if (unlikely((orig_bio->bi_rw & (REQ_FUA|REQ_FLUSH)) &&
> +		     test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)))
> +		return 1;
> +
>  	if (orig_bio->bi_size == 0) {
>  		bio->bi_vcnt   = 0;
>  		bio->bi_sector = 0;
> diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
> index ad36a91..49f3cda 100644
> --- a/include/linux/ploop/ploop.h
> +++ b/include/linux/ploop/ploop.h
> @@ -87,6 +87,9 @@ struct ploop_file
>   * This struct describes how we do real IO on particular backing file.
>   */
>  
> +enum {
> +	PLOOP_IO_FSYNC_DELAYED,  /* Must f_op->fsync before FLUSH|FUA */
> +};
>  
>  struct ploop_io
>  {
> @@ -108,6 +111,8 @@ struct ploop_io
>  	struct timer_list	fsync_timer;
>  
>  	struct ploop_io_ops	*ops;
> +	unsigned long		io_state;
> +	u64                     io_count;
>  };
>  
>  struct ploop_io_ops
> @@ -466,6 +471,7 @@ enum
>  	PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
>  	PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
>  	PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
> +	PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
>  };
>  
>  enum
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 472 bytes
Desc: not available
URL: <http://lists.openvz.org/pipermail/devel/attachments/20160526/7abeebe8/attachment-0001.sig>


More information about the Devel mailing list