[Devel] [PATCH 3/3] ploop: fixup FORCE_{FLUSH,FUA} handling v2

Dmitry Monakhov dmonakhov at openvz.org
Mon Jun 20 11:58:39 PDT 2016


The barrier code is broken in many ways:
BUG#1: Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly,
but a request can also go through ->dio_submit_alloc()->dio_submit_pad() and
write_page() (for indices). So in the case of grow_dev we get the following sequence:

E_RELOC_DATA_READ:
             ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
              ->delta->allocate
                 ->io->submit_alloc: dio_submit_alloc
                   ->dio_submit_pad
E_DATA_WBI : data written, time to update index
              ->delta->allocate_complete:ploop_index_update
                ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
                ->write_page
                ->ploop_map_wb_complete
                  ->ploop_wb_complete_post_process
                    ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:

               ->submit()

BUG#2: currently kaio's write_page() silently ignores REQ_FUA.
BUG#3: in io_direct:dio_submit(), if FUA delay is not possible we MUST tag all bios
       with REQ_FUA, not just the last one (see the sketch below).
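
For illustration, a minimal sketch of the intended bio tagging when FUA cannot be
delayed (simplified from the dio_submit() hunk below; bio-list bookkeeping and
error paths are omitted):

	/* Submit all data bios of the request: the first one carries the
	 * preflush, and when FUA cannot be delayed every bio (not only the
	 * last one) carries REQ_FUA. */
	while (bl.head) {
		struct bio *b = bl.head;
		unsigned long rw2 = rw;

		bl.head = b->bi_next;
		b->bi_next = NULL;

		if (unlikely(preflush)) {
			rw2 |= REQ_FLUSH;
			preflush = 0;
		}
		if (unlikely(postfua))
			rw2 |= REQ_FUA;

		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
		submit_bio(rw2, b);
	}
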
This patch unifies barrier handling as follows:
- Get rid of FORCE_{FLUSH,FUA}
- Introduce DELAYED_FLUSH; currently it is supported only by io_direct
  (see the sketch after this list)
- Fix up FUA handling in dio_submit()
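
Roughly, the DELAYED_FLUSH flow is as follows (a simplified sketch of the
dio_submit() and ploop_index_update() hunks below; the real code reads
preq->state once and clears the bit separately):

	/* io_direct, dio_submit(): the data write may postpone its FUA */
	if (ploop_req_delay_fua_possible(rw, preq)) {
		set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
		postfua = 0;
	}

	/* ploop_index_update(): pay the postponed barrier together with
	 * the BAT page write */
	if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state))
		rw |= REQ_FLUSH;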

This makes the reloc sequences optimal:
io_direct
RELOC_S: R1, W2, WBI:FLUSH|FUA
RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA
io_kaio
RELOC_S: R1, W2:FUA, WBI:FUA
RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA
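
For reference, after this patch the FUA-delay check reduces to the following
(reconstructed from the ploop.h hunk below): only a request that still has an
index write-back ahead of it may postpone its FUA there.

	static inline int ploop_req_delay_fua_possible(unsigned long rw,
						       struct ploop_request *preq)
	{
		int delay_fua = 0;

		/* Delay FUA only if an index update will follow and can
		 * carry the barrier for us. */
		if (rw & REQ_FUA) {
			if (preq->eng_state == PLOOP_E_DATA_WBI)
				delay_fua = 1;
		}
		return delay_fua;
	}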

https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
 drivers/block/ploop/dev.c       |  8 +++++---
 drivers/block/ploop/io_direct.c | 29 +++++++++-----------------
 drivers/block/ploop/io_kaio.c   | 17 ++++++++++------
 drivers/block/ploop/map.c       | 45 ++++++++++++++++++++++-------------------
 include/linux/ploop/ploop.h     |  8 ++++----
 5 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 96f7850..fbc5f2f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * preq)
 
 	__TRACE("Z %p %u\n", preq, preq->req_cluster);
 
+	if (!preq->error) {
+		WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state));
+	}
 	while (preq->bl.head) {
 		struct bio * bio = preq->bl.head;
 		preq->bl.head = bio->bi_next;
@@ -2530,9 +2533,8 @@ restart:
 		top_delta = ploop_top_delta(plo);
 		sbl.head = sbl.tail = preq->aux_bio;
 
-		/* Relocated data write required sync before BAT updatee */
-		set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
+		/* Relocated data write requires sync before BAT update;
+		 * this will happen inside index_update */
 		if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
 			preq->eng_state = PLOOP_E_DATA_WBI;
 			plo->st.bio_out++;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index a6d83fe..d7ecd4a 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -90,21 +90,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
 	trace_submit(preq);
 
 	preflush = !!(rw & REQ_FLUSH);
-
-	if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-		preflush = 1;
-
-	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-		postfua = 1;
-
-	if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
+	postfua = !!(rw & REQ_FUA);
+	if (ploop_req_delay_fua_possible(rw, preq)) {
 		/* Mark req that delayed flush required */
-		set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-	} else if (rw & REQ_FUA) {
-		postfua = 1;
+		set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+		postfua = 0;
 	}
-
 	rw &= ~(REQ_FLUSH | REQ_FUA);
 
 
@@ -238,14 +229,15 @@ flush_bio:
 			rw2 |= REQ_FLUSH;
 			preflush = 0;
 		}
-		if (unlikely(postfua && !bl.head))
-			rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
+		/* Very unlikely, but if FUA cannot be delayed, every bio must
+		 * carry REQ_FUA (TODO: optimize via DELAYED_FLUSH) */
+		if (unlikely(postfua))
+			rw2 |= REQ_FUA;
 
 		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
 		submit_bio(rw2, b);
 		bio_num++;
 	}
-
 	ploop_complete_io_request(preq);
 	return;
 
@@ -1520,15 +1512,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq,
 
 static void
 dio_write_page(struct ploop_io * io, struct ploop_request * preq,
-	       struct page * page, sector_t sec, int fua)
+	       struct page * page, sector_t sec, unsigned long rw)
 {
 	if (!(io->files.file->f_mode & FMODE_WRITE)) {
 		PLOOP_FAIL_REQUEST(preq, -EBADF);
 		return;
 	}
 
-	dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
-		    preq, page, sec);
+	dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
 }
 
 static int
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index de26319..0b93e13 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -72,13 +72,18 @@ static void kaio_complete_io_state(struct ploop_request * preq)
 	}
 
 	/* Convert requested fua to fsync */
-	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
+	if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) ||
 	    test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
 		post_fsync = 1;
 
 	if (!post_fsync &&
-	    !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
-	    (preq->req_rw & REQ_FUA))
+	    (preq->req_rw & REQ_FUA) &&
+	    !ploop_req_delay_fua_possible(preq->req_rw, preq))
+		post_fsync = 1;
+	/* Reloc requires FLUSH|FUA from ->index_update for delayed fua,
+	 * which is not supported by ->kaio_write_page */
+	if (test_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
+	    test_bit(PLOOP_REQ_RELOC_A, &preq->state))
 		post_fsync = 1;
 
 	preq->req_rw &= ~REQ_FUA;
@@ -608,12 +612,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
 
 static void
 kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
-		 struct page * page, sector_t sec, int fua)
+		 struct page * page, sector_t sec, unsigned long rw)
 {
 	ploop_prepare_tracker(preq, sec);
-
+	/* This is an async call, preflush is not supported */
+	BUG_ON(rw & REQ_FLUSH);
 	/* No FUA in kaio, convert it to fsync */
-	if (fua)
+	if (rw & REQ_FUA)
 		set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
 
 	kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3a6365d..de9bb32 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq)
 	struct ploop_device * plo = preq->plo;
 	struct map_node * m = preq->map;
 	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	unsigned long rw = preq->req_rw & REQ_FUA;
+	unsigned long state = READ_ONCE(preq->state);
 	u32 idx;
 	map_index_t blk;
 	int old_level;
@@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq)
 	__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
 	plo->st.map_single_writes++;
 	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
-	/* Relocate requires consistent writes, mark such reqs appropriately */
-	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-	    test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-		set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
-	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
-				      !!(preq->req_rw & REQ_FUA));
+	/* Relocate requires consistent index update */
+	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+		rw |= REQ_FUA;
+	if (state & PLOOP_REQ_DELAYED_FLUSH_FL)
+		rw |= REQ_FLUSH;
+	if (rw)
+		clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | WRITE);
 	put_page(page);
 	return;
 
@@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map *map,
 	 * (see dio_submit()). So fsync of EXT4 image doesnt help us.
 	 * We need to force sync of nullified blocks.
 	 */
-	set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+	preq->req_rw |= REQ_FUA;
 	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
 				  &sbl, preq->iblock, 1<<plo->cluster_log);
 }
@@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int err)
 	struct ploop_delta * top_delta = map_top_delta(m->parent);
 	struct list_head * cursor, * tmp;
 	struct ploop_request * main_preq;
+	unsigned long rw = 0;
 	struct page * page = NULL;
 	int delayed = 0;
 	unsigned int idx;
 	sector_t sec;
-	int fua, force_fua;
+	int fua;
 
 	/* First, complete processing of written back indices,
 	 * finally instantiate indices in mapping cache.
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int err)
 
 	main_preq = NULL;
 	fua = 0;
-	force_fua = 0;
 
 	list_for_each_safe(cursor, tmp, &m->io_queue) {
 		struct ploop_request * preq;
+		unsigned long state;
 
 		preq = list_entry(cursor, struct ploop_request, list);
 
@@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int err)
 				spin_unlock_irq(&plo->lock);
 				break;
 			}
-
-			if (preq->req_rw & REQ_FUA)
-				fua = 1;
-
-			if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-			    test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-				force_fua = 1;
+			state = READ_ONCE(preq->state);
+			/* Relocate requires consistent index update */
+			rw |= preq->req_rw & REQ_FUA;
+			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+				rw |= REQ_FUA;
+			if (state & PLOOP_REQ_DELAYED_FLUSH_FL)
+				rw |= REQ_FLUSH;
+			if (rw)
+				clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
 
 			preq->eng_state = PLOOP_E_INDEX_WB;
 			get_page(page);
@@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err)
 	plo->st.map_multi_writes++;
 	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
 
-	if (force_fua)
-		set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
-
-	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua);
+	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw | WRITE);
 	put_page(page);
 }
 
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..2f26824 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -157,7 +157,7 @@ struct ploop_io_ops
 	void	(*read_page)(struct ploop_io * io, struct ploop_request * preq,
 			     struct page * page, sector_t sec);
 	void	(*write_page)(struct ploop_io * io, struct ploop_request * preq,
-			      struct page * page, sector_t sec, int fua);
+			      struct page * page, sector_t sec, unsigned long rw);
 
 
 	int	(*sync_read)(struct ploop_io * io, struct page * page,
@@ -465,8 +465,7 @@ enum
 	PLOOP_REQ_ZERO,
 	PLOOP_REQ_DISCARD,
 	PLOOP_REQ_RSYNC,
-	PLOOP_REQ_FORCE_FUA,	/*force fua of req write I/O by engine */
-	PLOOP_REQ_FORCE_FLUSH,	/*force flush by engine */
+	PLOOP_REQ_DELAYED_FLUSH,	/* REQ_FLUSH is delayed */
 	PLOOP_REQ_KAIO_FSYNC,	/*force image fsync by KAIO module */
 	PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
 	PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
@@ -477,6 +476,7 @@ enum
 #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
 #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
 #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH)
 #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
 #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
 
@@ -614,7 +614,7 @@ static inline int ploop_req_delay_fua_possible(unsigned long rw,
 	 * fua.
 	 */
 	if (rw & REQ_FUA) {
-		if (preq->eng_state != PLOOP_E_COMPLETE)
+		if (preq->eng_state == PLOOP_E_DATA_WBI)
 			delay_fua = 1;
 	}
 	return delay_fua;
-- 
1.8.3.1


