[Devel] [PATCH RHEL7 COMMIT] ploop: io_direct: delay f_op->fsync() until index_update for reloc requests (v3)

Konstantin Khorenko khorenko at virtuozzo.com
Tue Aug 9 01:40:41 PDT 2016


The commit is pushed to "branch-rh7-3.10.0-327.22.2.vz7.16.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.22.2.vz7.16.3
------>
commit 6a68a1d4155d50e1a4b1ec726818673bff40b0d8
Author: Maxim Patlasov <mpatlasov at virtuozzo.com>
Date:   Tue Aug 9 12:40:41 2016 +0400

    ploop: io_direct: delay f_op->fsync() until index_update for reloc requests (v3)
    
    Commit 9f860e606 introduced an engine to delay fsync: doing
    fallocate(FALLOC_FL_CONVERT_UNWRITTEN) dio_post_submit marks
    io as PLOOP_IO_FSYNC_DELAYED to ensure that fsync happens
    later, when incoming FLUSH|FUA comes.
    
    That was deemed as important because (PSBM-47026):
    
    > This optimization becomes more important due to the fact that customers tend to use pcompact heavily => ploop images grow each day.
    
    Now, we can easily re-use the engine to delay fsync for reloc
    requests as well. As explained in the description of commit
    5aa3fe09:
    
    >     1->read_data_from_old_post
    >     2->write_to_new_pos
    >       ->sumbit_alloc
    >          ->submit_pad
    >      ->post_submit->convert_unwritten
    >     3->update_index ->write_page with FLUSH|FUA
    >     4->nullify_old_pos
    >    5->issue_flush
    
    by the time of step 3 extent coversion is not yet stable because
    belongs to uncommitted transaction. But instead of doing fsync
    inside ->post_submit, we can fsync later, as the very first step
    of write_page for index_update.
    
    Changed in v2:
     - process delayed fsync asynchronously, via PLOOP_E_FSYNC_PENDED eng_state
    
    Changed in v3:
     - use extra arg for ploop_index_wb_proceed_or_delay() instead of ad-hoc PLOOP_REQ_FSYNC_IF_DELAYED
    
    https://jira.sw.ru/browse/PSBM-47026
    
    Signed-off-by: Maxim Patlasov <mpatlasov at virtuozzo.com>
    Acked-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
 drivers/block/ploop/dev.c   |  9 +++++++--
 drivers/block/ploop/map.c   | 32 ++++++++++++++++++++++++++++----
 include/linux/ploop/ploop.h |  1 +
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 5775c16..83b0e32 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2776,6 +2776,11 @@ restart:
 		ploop_index_wb_complete(preq);
 		break;
 
+	case PLOOP_E_FSYNC_PENDED:
+		/* fsync done */
+		ploop_index_wb_proceed(preq);
+		break;
+
 	default:
 		BUG();
 	}
@@ -4190,7 +4195,7 @@ static void ploop_relocate(struct ploop_device * plo)
 	preq->bl.tail = preq->bl.head = NULL;
 	preq->req_cluster = 0;
 	preq->req_size = 0;
-	preq->req_rw = WRITE_SYNC|REQ_FUA;
+	preq->req_rw = WRITE_SYNC;
 	preq->eng_state = PLOOP_E_ENTRY;
 	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A);
 	preq->error = 0;
@@ -4494,7 +4499,7 @@ static void ploop_relocblks_process(struct ploop_device *plo)
 		preq->bl.tail = preq->bl.head = NULL;
 		preq->req_cluster = ~0U; /* uninitialized */
 		preq->req_size = 0;
-		preq->req_rw = WRITE_SYNC|REQ_FUA;
+		preq->req_rw = WRITE_SYNC;
 		preq->eng_state = PLOOP_E_ENTRY;
 		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S);
 		preq->error = 0;
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 5f7fd66..715dc15 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -915,6 +915,24 @@ void ploop_index_wb_proceed(struct ploop_request * preq)
 	put_page(page);
 }
 
+static void ploop_index_wb_proceed_or_delay(struct ploop_request * preq,
+					    int do_fsync_if_delayed)
+{
+	if (do_fsync_if_delayed) {
+		struct map_node * m = preq->map;
+		struct ploop_delta * top_delta = map_top_delta(m->parent);
+		struct ploop_io * top_io = &top_delta->io;
+
+		if (test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state)) {
+			preq->eng_state = PLOOP_E_FSYNC_PENDED;
+			ploop_add_req_to_fsync_queue(preq);
+			return;
+		}
+	}
+
+	ploop_index_wb_proceed(preq);
+}
+
 /* Data write is commited. Now we need to update index. */
 
 void ploop_index_update(struct ploop_request * preq)
@@ -927,6 +945,7 @@ void ploop_index_update(struct ploop_request * preq)
 	int old_level;
 	struct page * page;
 	unsigned long state = READ_ONCE(preq->state);
+	int do_fsync_if_delayed = 0;
 
 	/* No way back, we are going to initiate index write. */
 
@@ -985,10 +1004,12 @@ void ploop_index_update(struct ploop_request * preq)
 	preq->req_rw &= ~REQ_FLUSH;
 
 	/* Relocate requires consistent index update */
-	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
 		preq->req_index_update_rw |= (REQ_FLUSH | REQ_FUA);
+		do_fsync_if_delayed = 1;
+	}
 
-	ploop_index_wb_proceed(preq);
+	ploop_index_wb_proceed_or_delay(preq, do_fsync_if_delayed);
 	return;
 
 enomem:
@@ -1109,6 +1130,7 @@ static void map_wb_complete(struct map_node * m, int err)
 	int delayed = 0;
 	unsigned int idx;
 	unsigned long rw;
+	int do_fsync_if_delayed = 0;
 
 	/* First, complete processing of written back indices,
 	 * finally instantiate indices in mapping cache.
@@ -1206,8 +1228,10 @@ static void map_wb_complete(struct map_node * m, int err)
 
 			state = READ_ONCE(preq->state);
 			/* Relocate requires consistent index update */
-			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
 				rw |= (REQ_FLUSH | REQ_FUA);
+				do_fsync_if_delayed = 1;
+			}
 
 			preq->eng_state = PLOOP_E_INDEX_WB;
 			get_page(page);
@@ -1234,7 +1258,7 @@ static void map_wb_complete(struct map_node * m, int err)
 	plo->st.map_multi_writes++;
 
 	main_preq->req_index_update_rw = rw;
-	ploop_index_wb_proceed(main_preq);
+	ploop_index_wb_proceed_or_delay(main_preq, do_fsync_if_delayed);
 }
 
 void
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index be7d9f7..b2ef6bd 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -524,6 +524,7 @@ enum
 	PLOOP_E_ZERO_INDEX,	/* Zeroing index of free block; original request
 				   can use .submit on completion */
 	PLOOP_E_DELTA_ZERO_INDEX,/* the same but for PLOOP_E_DELTA_READ */
+	PLOOP_E_FSYNC_PENDED,   /* INDEX_WB needs io->ops->sync() to proceed */
 };
 
 #define BIO_BDEV_REUSED	14	/* io_context is stored in bi_bdev */


More information about the Devel mailing list