[Devel] [PATCH 3/3] ploop: fixup FORCE_{FLUSH,FUA} handling v3

Dmitry Monakhov dmonakhov at openvz.org
Tue Jun 21 06:55:30 PDT 2016


barrier code is broken in many ways:
Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
But a request can also go through ->dio_submit_alloc()->dio_submit_pad() and write_page() (for indexes)
So in the case of grow_dev we have the following sequence:

E_RELOC_DATA_READ:
             ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
              ->delta->allocate
                 ->io->submit_alloc: dio_submit_alloc
                   ->dio_submit_pad
E_DATA_WBI : data written, time to update index
              ->delta->allocate_complete:ploop_index_update
                ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
                ->write_page
                ->ploop_map_wb_complete
                  ->ploop_wb_complete_post_process
                    ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:

               ->submit()

BUG#2: currently kaio write_page silently ignores REQ_FLUSH
BUG#3: io_direct:dio_submit: if delaying FUA is not possible we MUST tag all bios with REQ_FUA,
       not just the latest one.
This patch unifies barrier handling as follows:
- Get rid of FORCE_{FLUSH,FUA}
- Introduce DELAYED_FLUSH
- fix fua handling for dio_submit
- BUG_ON for REQ_FLUSH in kaio_page_write

This makes the reloc sequence optimal:
io_direct
RELOC_S: R1, W2, WBI:FLUSH|FUA
RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA
io_kaio
RELOC_S: R1, W2:FUA, WBI:FUA
RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA

https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
 drivers/block/ploop/dev.c       |  8 +++++---
 drivers/block/ploop/io_direct.c | 30 ++++++++++-----------------
 drivers/block/ploop/io_kaio.c   | 23 +++++++++++++--------
 drivers/block/ploop/map.c       | 45 ++++++++++++++++++++++-------------------
 include/linux/ploop/ploop.h     | 19 +++++------------
 5 files changed, 60 insertions(+), 65 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 96f7850..fbc5f2f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * preq)
 
 	__TRACE("Z %p %u\n", preq, preq->req_cluster);
 
+	if (!preq->error) {
+		WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state));
+	}
 	while (preq->bl.head) {
 		struct bio * bio = preq->bl.head;
 		preq->bl.head = bio->bi_next;
@@ -2530,9 +2533,8 @@ restart:
 		top_delta = ploop_top_delta(plo);
 		sbl.head = sbl.tail = preq->aux_bio;
 
-		/* Relocated data write required sync before BAT updatee */
-		set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
+	/* Relocated data write requires a sync before the BAT update;
+	 * this will happen inside index_update */
 		if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
 			preq->eng_state = PLOOP_E_DATA_WBI;
 			plo->st.bio_out++;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index a6d83fe..303eb70 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -83,28 +83,19 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
 	int err;
 	struct bio_list_walk bw;
 	int preflush;
-	int postfua = 0;
+	int fua = 0;
 	int write = !!(rw & REQ_WRITE);
 	int bio_num;
 
 	trace_submit(preq);
 
 	preflush = !!(rw & REQ_FLUSH);
-
-	if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-		preflush = 1;
-
-	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-		postfua = 1;
-
-	if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
+	fua = !!(rw & REQ_FUA);
+	if (fua && ploop_req_delay_fua_possible(rw, preq)) {
 		/* Mark req that delayed flush required */
-		set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-	} else if (rw & REQ_FUA) {
-		postfua = 1;
+		set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+		fua = 0;
 	}
-
 	rw &= ~(REQ_FLUSH | REQ_FUA);
 
 
@@ -238,8 +229,10 @@ flush_bio:
 			rw2 |= REQ_FLUSH;
 			preflush = 0;
 		}
-		if (unlikely(postfua && !bl.head))
-			rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
+		/* Very unlikely, but correct.
+		 * TODO: Optimize postfua via DELAYED_FLUSH for any req state */
+		if (unlikely(fua))
+			rw2 |= REQ_FUA;
 
 		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
 		submit_bio(rw2, b);
@@ -1520,15 +1513,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq,
 
 static void
 dio_write_page(struct ploop_io * io, struct ploop_request * preq,
-	       struct page * page, sector_t sec, int fua)
+	       struct page * page, sector_t sec, unsigned long rw)
 {
 	if (!(io->files.file->f_mode & FMODE_WRITE)) {
 		PLOOP_FAIL_REQUEST(preq, -EBADF);
 		return;
 	}
 
-	dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
-		    preq, page, sec);
+	dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
 }
 
 static int
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index de26319..c3bdd20 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -61,6 +61,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
 	struct ploop_device * plo   = preq->plo;
 	unsigned long flags;
 	int post_fsync = 0;
+	unsigned long state = READ_ONCE(preq->state);
 
 	if (preq->error || !(preq->req_rw & REQ_FUA) ||
 	    preq->eng_state == PLOOP_E_INDEX_READ ||
@@ -72,18 +73,23 @@ static void kaio_complete_io_state(struct ploop_request * preq)
 	}
 
 	/* Convert requested fua to fsync */
-	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
-	    test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
+	if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
 		post_fsync = 1;
 
-	if (!post_fsync &&
-	    !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
-	    (preq->req_rw & REQ_FUA))
+	if ((preq->req_rw & REQ_FUA) &&
+	    !ploop_req_delay_fua_possible(preq->req_rw, preq))
+		post_fsync = 1;
+	/* Usually we may delay fua for regular requests, but relocation
+	 * requires REQ_FLUSH_FUA for index_update which is not supported
+	 * by kaio_write_page(). Flush it now.
+	 */
+	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
 		post_fsync = 1;
 
 	preq->req_rw &= ~REQ_FUA;
 
 	if (post_fsync) {
+		test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
 		spin_lock_irqsave(&plo->lock, flags);
 		kaio_queue_fsync_req(preq);
 		plo->st.bio_syncwait++;
@@ -608,12 +614,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
 
 static void
 kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
-		 struct page * page, sector_t sec, int fua)
+		 struct page * page, sector_t sec, unsigned long rw)
 {
 	ploop_prepare_tracker(preq, sec);
-
+	/* This is an async call; preflush is not supported */
+	BUG_ON(rw & REQ_FLUSH);
 	/* No FUA in kaio, convert it to fsync */
-	if (fua)
+	if (rw & REQ_FUA)
 		set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
 
 	kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3a6365d..62f7d79 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq)
 	struct ploop_device * plo = preq->plo;
 	struct map_node * m = preq->map;
 	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	unsigned long rw = preq->req_rw & REQ_FUA;
+	unsigned long state = READ_ONCE(preq->state);
 	u32 idx;
 	map_index_t blk;
 	int old_level;
@@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq)
 	__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
 	plo->st.map_single_writes++;
 	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
-	/* Relocate requires consistent writes, mark such reqs appropriately */
-	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-	    test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-		set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
-	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
-				      !!(preq->req_rw & REQ_FUA));
+	/* Relocate requires consistent index update */
+	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+		rw |= REQ_FUA;
+	if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
+		rw |= REQ_FLUSH;
+		clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+	}
+	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | WRITE);
 	put_page(page);
 	return;
 
@@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map *map,
 	 * (see dio_submit()). So fsync of EXT4 image doesnt help us.
 	 * We need to force sync of nullified blocks.
 	 */
-	set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+	preq->req_rw |= REQ_FUA;
 	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
 				  &sbl, preq->iblock, 1<<plo->cluster_log);
 }
@@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int err)
 	struct ploop_delta * top_delta = map_top_delta(m->parent);
 	struct list_head * cursor, * tmp;
 	struct ploop_request * main_preq;
+	unsigned long rw =  0;
 	struct page * page = NULL;
 	int delayed = 0;
 	unsigned int idx;
 	sector_t sec;
-	int fua, force_fua;
+	int fua;
 
 	/* First, complete processing of written back indices,
 	 * finally instantiate indices in mapping cache.
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int err)
 
 	main_preq = NULL;
 	fua = 0;
-	force_fua = 0;
 
 	list_for_each_safe(cursor, tmp, &m->io_queue) {
 		struct ploop_request * preq;
+		unsigned long state;
 
 		preq = list_entry(cursor, struct ploop_request, list);
 
@@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int err)
 				spin_unlock_irq(&plo->lock);
 				break;
 			}
-
-			if (preq->req_rw & REQ_FUA)
-				fua = 1;
-
-			if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-			    test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-				force_fua = 1;
+			state = READ_ONCE(preq->state);
+			/* Relocate requires consistent index update */
+			rw |= preq->req_rw & REQ_FUA;
+			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+				rw |= REQ_FUA;
+			if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
+				rw |= REQ_FLUSH;
+				clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+			}
 
 			preq->eng_state = PLOOP_E_INDEX_WB;
 			get_page(page);
@@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err)
 	plo->st.map_multi_writes++;
 	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
 
-	if (force_fua)
-		set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
-
-	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua);
+	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw | WRITE);
 	put_page(page);
 }
 
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..5af32ba 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -157,7 +157,7 @@ struct ploop_io_ops
 	void	(*read_page)(struct ploop_io * io, struct ploop_request * preq,
 			     struct page * page, sector_t sec);
 	void	(*write_page)(struct ploop_io * io, struct ploop_request * preq,
-			      struct page * page, sector_t sec, int fua);
+			      struct page * page, sector_t sec, unsigned long rw);
 
 
 	int	(*sync_read)(struct ploop_io * io, struct page * page,
@@ -465,8 +465,7 @@ enum
 	PLOOP_REQ_ZERO,
 	PLOOP_REQ_DISCARD,
 	PLOOP_REQ_RSYNC,
-	PLOOP_REQ_FORCE_FUA,	/*force fua of req write I/O by engine */
-	PLOOP_REQ_FORCE_FLUSH,	/*force flush by engine */
+	PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */
 	PLOOP_REQ_KAIO_FSYNC,	/*force image fsync by KAIO module */
 	PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
 	PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
@@ -477,6 +476,7 @@ enum
 #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
 #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
 #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH)
 #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
 #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
 
@@ -607,17 +607,8 @@ void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
 static inline int ploop_req_delay_fua_possible(unsigned long rw,
        struct ploop_request *preq)
 {
-	int delay_fua = 0;
-
-	/* In case of eng_state != COMPLETE, we'll do FUA in
-	 * ploop_index_update(). Otherwise, we should post
-	 * fua.
-	 */
-	if (rw & REQ_FUA) {
-		if (preq->eng_state != PLOOP_E_COMPLETE)
-			delay_fua = 1;
-	}
-	return delay_fua;
+	/* data may be flushed during ploop_index_update() */
+	return  (preq->eng_state == PLOOP_E_DATA_WBI);
 }
 
 static inline void ploop_req_set_error(struct ploop_request * preq, int err)
-- 
1.8.3.1



More information about the Devel mailing list