[Devel] [PATCH rh7] ploop: io_direct: delay f_op->fsync() until FLUSH|FUA
Dmitry Monakhov
dmonakhov at openvz.org
Thu May 26 07:58:21 PDT 2016
Maxim Patlasov <mpatlasov at virtuozzo.com> writes:
> Once we converted extent to initialized it can be part of uncompleted
> journal transaction, so we have to force transaction commit at some point.
>
> Instead of forcing transaction commit immediately, the patch delays it
> until an incoming bio with FLUSH|FUA arrives. Then, as the very first
> step of processing such a bio, we sends corresponding preq to fsync_thread
> to perform f_op->fsync().
>
> As a very unlikely case, it is also possible that processing a FLUSH|FUA
> bio itself results in converting extents. Then, the patch calls f_op->fsync()
> immediately after conversion to preserve FUA semantics.
ACK. With minor comments. See below:
>
> https://jira.sw.ru/browse/PSBM-47026
>
> Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
> ---
> drivers/block/ploop/dev.c | 70 ++++++++++++++++++++++++---------------
> drivers/block/ploop/io_direct.c | 28 +++++++++++++++-
> include/linux/ploop/ploop.h | 6 +++
> 3 files changed, 76 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
> index 654b60b..03fc289 100644
> --- a/drivers/block/ploop/dev.c
> +++ b/drivers/block/ploop/dev.c
> @@ -1942,46 +1942,62 @@ err:
>
> /* Main preq state machine */
>
> +static inline bool preq_is_special(struct ploop_request * preq)
> +{
> + return test_bit(PLOOP_REQ_MERGE, &preq->state) ||
> + test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
> + test_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
> + test_bit(PLOOP_REQ_DISCARD, &preq->state) ||
> + test_bit(PLOOP_REQ_ZERO, &preq->state);
Oh. It looks awful. Please use one atomic read here and other places.
#define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
#define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
unsigned long state = READ_ONCE(preq->state);
...
if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_B_FL) ...
> +}
> +
> static void
> ploop_entry_request(struct ploop_request * preq)
> {
> struct ploop_device * plo = preq->plo;
> struct ploop_delta * top_delta = ploop_top_delta(plo);
> + struct ploop_io * top_io = &top_delta->io;
> struct ploop_delta * delta;
> int level;
> int err;
> iblock_t iblk;
>
> - /* Control request. */
> - if (unlikely(preq->bl.head == NULL &&
> - !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
> - !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
> - !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
> - !test_bit(PLOOP_REQ_DISCARD, &preq->state) &&
> - !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
> - complete(plo->quiesce_comp);
> - wait_for_completion(&plo->relax_comp);
> - ploop_complete_request(preq);
> - complete(&plo->relaxed_comp);
> - return;
> - }
> + if (!preq_is_special(preq)) {
> + /* Control request */
> + if (unlikely(preq->bl.head == NULL)) {
> + complete(plo->quiesce_comp);
> + wait_for_completion(&plo->relax_comp);
> + ploop_complete_request(preq);
> + complete(&plo->relaxed_comp);
> + return;
> + }
>
> - /* Empty flush. */
> - if (unlikely(preq->req_size == 0 &&
> - !test_bit(PLOOP_REQ_MERGE, &preq->state) &&
> - !test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
> - !test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
> - !test_bit(PLOOP_REQ_ZERO, &preq->state))) {
> - if (preq->req_rw & REQ_FLUSH) {
> - if (top_delta->io.ops->issue_flush) {
> - top_delta->io.ops->issue_flush(&top_delta->io, preq);
> - return;
> - }
> + /* Need to fsync before start handling FLUSH */
> + if ((preq->req_rw & REQ_FLUSH) &&
> + test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state) &&
> + !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
> + spin_lock_irq(&plo->lock);
> + list_add_tail(&preq->list, &top_io->fsync_queue);
> + if (waitqueue_active(&top_io->fsync_waitq))
> + wake_up_interruptible(&top_io->fsync_waitq);
> + spin_unlock_irq(&plo->lock);
> + return;
> }
>
> - preq->eng_state = PLOOP_E_COMPLETE;
> - ploop_complete_request(preq);
> - return;
> + /* Empty flush or unknown zero-size request */
Do you know any zero size requests instead of FLUSH?
> + if (preq->req_size == 0) {
> + if (preq->req_rw & REQ_FLUSH &&
> + !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
> + if (top_io->ops->issue_flush) {
> + top_io->ops->issue_flush(top_io, preq);
> + return;
> + }
> + }
> +
> + preq->eng_state = PLOOP_E_COMPLETE;
> + ploop_complete_request(preq);
> + return;
> + }
> }
>
> if (unlikely(test_bit(PLOOP_REQ_SYNC, &preq->state) &&
> diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
> index 8096110..a0b7a4e 100644
> --- a/drivers/block/ploop/io_direct.c
> +++ b/drivers/block/ploop/io_direct.c
> @@ -514,17 +514,32 @@ end_write:
> static void
> dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
> {
> + struct ploop_device *plo = preq->plo;
> sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
> loff_t clu_siz = 1 << (preq->plo->cluster_log + 9);
> int err;
>
> file_start_write(io->files.file);
> +
> + /* Here io->io_count is even ... */
> + spin_lock_irq(&plo->lock);
> + io->io_count++;
> + set_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
> + spin_unlock_irq(&plo->lock);
> +
> err = io->files.file->f_op->fallocate(io->files.file,
> FALLOC_FL_CONVERT_UNWRITTEN,
> (loff_t)sec << 9, clu_siz);
> - if (!err)
> +
> + /* highly unlikely case: FUA coming to a block not provisioned yet */
> + if (!err && (preq->req_rw & REQ_FUA))
> err = io->ops->sync(io);
>
> + spin_lock_irq(&plo->lock);
> + io->io_count++;
> + spin_unlock_irq(&plo->lock);
Double entrance is not possible because ->post_submit is called
synchronously from ploop_thread(). Am I correct?
> + /* and here io->io_count is even (+2) again. */
> +
> file_end_write(io->files.file);
> if (err) {
> PLOOP_REQ_SET_ERROR(preq, err);
> @@ -782,6 +797,7 @@ static int dio_fsync_thread(void * data)
> {
> struct ploop_io * io = data;
> struct ploop_device * plo = io->plo;
> + u64 io_count;
>
> set_user_nice(current, -20);
>
> @@ -808,6 +824,7 @@ static int dio_fsync_thread(void * data)
>
> INIT_LIST_HEAD(&list);
> list_splice_init(&io->fsync_queue, &list);
> + io_count = io->io_count;
> spin_unlock_irq(&plo->lock);
>
> /* filemap_fdatawrite() has been made already */
> @@ -824,12 +841,17 @@ static int dio_fsync_thread(void * data)
>
> spin_lock_irq(&plo->lock);
>
> + if (io_count == io->io_count && !(io_count & 1))
> + clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
> +
> while (!list_empty(&list)) {
> struct ploop_request * preq;
> preq = list_entry(list.next, struct ploop_request, list);
> list_del(&preq->list);
> if (err)
> PLOOP_REQ_SET_ERROR(preq, err);
> +
> + __set_bit(PLOOP_REQ_FSYNC_DONE, &preq->state);
> list_add_tail(&preq->list, &plo->ready_queue);
> io->fsync_qlen--;
> }
> @@ -1522,6 +1544,10 @@ dio_fastmap(struct ploop_io * io, struct bio * orig_bio,
> struct extent_map * em;
> int i;
>
> + if (unlikely((orig_bio->bi_rw & REQ_FLUSH) &&
blk layer implicitly treats REQ_FUA as REQ_FUA + later FLUSH. Let's do the same:
+ if (unlikely((orig_bio->bi_rw & (REQ_FUA|REQ_FLUSH)) &&
> + test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)))
> + return 1;
> +
> if (orig_bio->bi_size == 0) {
> bio->bi_vcnt = 0;
> bio->bi_sector = 0;
> diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
> index ad36a91..49f3cda 100644
> --- a/include/linux/ploop/ploop.h
> +++ b/include/linux/ploop/ploop.h
> @@ -87,6 +87,9 @@ struct ploop_file
> * This struct describes how we do real IO on particular backing file.
> */
>
> +enum {
> + PLOOP_IO_FSYNC_DELAYED, /* Must f_op->fsync before FLUSH|FUA */
> +};
>
> struct ploop_io
> {
> @@ -108,6 +111,8 @@ struct ploop_io
> struct timer_list fsync_timer;
>
> struct ploop_io_ops *ops;
> + unsigned long io_state;
> + u64 io_count;
> };
>
> struct ploop_io_ops
> @@ -466,6 +471,7 @@ enum
> PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
> PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
> PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
> + PLOOP_REQ_FSYNC_DONE, /* fsync_thread() performed f_op->fsync() */
> };
>
> enum
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 472 bytes
Desc: not available
URL: <http://lists.openvz.org/pipermail/devel/attachments/20160526/7abeebe8/attachment-0001.sig>
More information about the Devel
mailing list