[Devel] [PATCH 3/3] ploop: fixup FORCE_{FLUSH,FUA} handling v3
Dmitry Monakhov
dmonakhov at openvz.org
Tue Jun 21 06:55:30 PDT 2016
barrier code is broken in many ways:
Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
But a request can also go through ->dio_submit_alloc()->dio_submit_pad and write_page (for indexes)
So in the case of grow_dev we have the following sequence:
E_RELOC_DATA_READ:
->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
->delta->allocate
->io->submit_alloc: dio_submit_alloc
->dio_submit_pad
E_DATA_WBI : data written, time to update index
->delta->allocate_complete:ploop_index_update
->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
->write_page
->ploop_map_wb_complete
->ploop_wb_complete_post_process
->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:
->submit()
BUG#2: currently kaio write_page silently ignores REQ_FLUSH
BUG#3: io_direct:dio_submit if fua_delay is not possible we MUST tag all bios via REQ_FUA
not just latest one.
This patch unifies barrier handling as follows:
- Get rid of FORCE_{FLUSH,FUA}
- Introduce DELAYED_FLUSH
- fix fua handling for dio_submit
- BUG_ON for REQ_FLUSH in kaio_page_write
This makes reloc sequence optimal:
io_direct
RELOC_S: R1, W2, WBI:FLUSH|FUA
RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA
io_kaio
RELOC_S: R1, W2:FUA, WBI:FUA
RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA
https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
drivers/block/ploop/dev.c | 8 +++++---
drivers/block/ploop/io_direct.c | 30 ++++++++++-----------------
drivers/block/ploop/io_kaio.c | 23 +++++++++++++--------
drivers/block/ploop/map.c | 45 ++++++++++++++++++++++-------------------
include/linux/ploop/ploop.h | 19 +++++------------
5 files changed, 60 insertions(+), 65 deletions(-)
diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 96f7850..fbc5f2f 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * preq)
__TRACE("Z %p %u\n", preq, preq->req_cluster);
+ if (!preq->error) {
+ WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state));
+ }
while (preq->bl.head) {
struct bio * bio = preq->bl.head;
preq->bl.head = bio->bi_next;
@@ -2530,9 +2533,8 @@ restart:
top_delta = ploop_top_delta(plo);
sbl.head = sbl.tail = preq->aux_bio;
- /* Relocated data write required sync before BAT updatee */
- set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
+ /* Relocated data write required sync before BAT updatee
+ * this will happen inside index_update */
if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
preq->eng_state = PLOOP_E_DATA_WBI;
plo->st.bio_out++;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index a6d83fe..303eb70 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -83,28 +83,19 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
int err;
struct bio_list_walk bw;
int preflush;
- int postfua = 0;
+ int fua = 0;
int write = !!(rw & REQ_WRITE);
int bio_num;
trace_submit(preq);
preflush = !!(rw & REQ_FLUSH);
-
- if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
- preflush = 1;
-
- if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
- postfua = 1;
-
- if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
+ fua = !!(rw & REQ_FUA);
+ if (fua && ploop_req_delay_fua_possible(rw, preq)) {
/* Mark req that delayed flush required */
- set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
- } else if (rw & REQ_FUA) {
- postfua = 1;
+ set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+ fua = 0;
}
-
rw &= ~(REQ_FLUSH | REQ_FUA);
@@ -238,8 +229,10 @@ flush_bio:
rw2 |= REQ_FLUSH;
preflush = 0;
}
- if (unlikely(postfua && !bl.head))
- rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0));
+ /* Very unlikely, but correct.
+ * TODO: Optimize postfua via DELAY_FLUSH for any req state */
+ if (unlikely(fua))
+ rw2 |= REQ_FUA;
ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
submit_bio(rw2, b);
@@ -1520,15 +1513,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq,
static void
dio_write_page(struct ploop_io * io, struct ploop_request * preq,
- struct page * page, sector_t sec, int fua)
+ struct page * page, sector_t sec, unsigned long rw)
{
if (!(io->files.file->f_mode & FMODE_WRITE)) {
PLOOP_FAIL_REQUEST(preq, -EBADF);
return;
}
- dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
- preq, page, sec);
+ dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
}
static int
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index de26319..c3bdd20 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -61,6 +61,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
struct ploop_device * plo = preq->plo;
unsigned long flags;
int post_fsync = 0;
+ unsigned long state = READ_ONCE(preq->state);
if (preq->error || !(preq->req_rw & REQ_FUA) ||
preq->eng_state == PLOOP_E_INDEX_READ ||
@@ -72,18 +73,23 @@ static void kaio_complete_io_state(struct ploop_request * preq)
}
/* Convert requested fua to fsync */
- if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
- test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
+ if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
post_fsync = 1;
- if (!post_fsync &&
- !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
- (preq->req_rw & REQ_FUA))
+ if ((preq->req_rw & REQ_FUA) &&
+ !ploop_req_delay_fua_possible(preq->req_rw, preq))
+ post_fsync = 1;
+ /* Usually we may delay fua for regular requets, but relocation
+ * requires REQ_FLUSH_FUA for index_update which is not supported
+ * by kaio_write_page(). Flush it now.
+ */
+ if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
post_fsync = 1;
preq->req_rw &= ~REQ_FUA;
if (post_fsync) {
+ test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
spin_lock_irqsave(&plo->lock, flags);
kaio_queue_fsync_req(preq);
plo->st.bio_syncwait++;
@@ -608,12 +614,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
static void
kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
- struct page * page, sector_t sec, int fua)
+ struct page * page, sector_t sec, unsigned long rw)
{
ploop_prepare_tracker(preq, sec);
-
+ /* This is async call , preflush is not supported */
+ BUG_ON(rw & REQ_FLUSH);
/* No FUA in kaio, convert it to fsync */
- if (fua)
+ if (rw & REQ_FUA)
set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3a6365d..62f7d79 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq)
struct ploop_device * plo = preq->plo;
struct map_node * m = preq->map;
struct ploop_delta * top_delta = map_top_delta(m->parent);
+ unsigned long rw = preq->req_rw & REQ_FUA;
+ unsigned long state = READ_ONCE(preq->state);
u32 idx;
map_index_t blk;
int old_level;
@@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq)
__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
plo->st.map_single_writes++;
top_delta->ops->map_index(top_delta, m->mn_start, &sec);
- /* Relocate requires consistent writes, mark such reqs appropriately */
- if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
- test_bit(PLOOP_REQ_RELOC_S, &preq->state))
- set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
- top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
- !!(preq->req_rw & REQ_FUA));
+ /* Relocate requires consistent index update */
+ if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+ rw |= REQ_FUA;
+ if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
+ rw |= REQ_FLUSH;
+ clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+ }
+ top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | WRITE);
put_page(page);
return;
@@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map *map,
* (see dio_submit()). So fsync of EXT4 image doesnt help us.
* We need to force sync of nullified blocks.
*/
- set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+ preq->req_rw |= REQ_FUA;
top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
&sbl, preq->iblock, 1<<plo->cluster_log);
}
@@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int err)
struct ploop_delta * top_delta = map_top_delta(m->parent);
struct list_head * cursor, * tmp;
struct ploop_request * main_preq;
+ unsigned long rw = 0;
struct page * page = NULL;
int delayed = 0;
unsigned int idx;
sector_t sec;
- int fua, force_fua;
+ int fua;
/* First, complete processing of written back indices,
* finally instantiate indices in mapping cache.
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int err)
main_preq = NULL;
fua = 0;
- force_fua = 0;
list_for_each_safe(cursor, tmp, &m->io_queue) {
struct ploop_request * preq;
+ unsigned long state;
preq = list_entry(cursor, struct ploop_request, list);
@@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int err)
spin_unlock_irq(&plo->lock);
break;
}
-
- if (preq->req_rw & REQ_FUA)
- fua = 1;
-
- if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
- test_bit(PLOOP_REQ_RELOC_S, &preq->state))
- force_fua = 1;
+ state = READ_ONCE(preq->state);
+ /* Relocate requires consistent index update */
+ rw |= preq->req_rw & REQ_FUA;
+ if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL))
+ rw |= REQ_FUA;
+ if (state & PLOOP_REQ_DELAYED_FLUSH_FL) {
+ rw |= REQ_FLUSH;
+ clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state);
+ }
preq->eng_state = PLOOP_E_INDEX_WB;
get_page(page);
@@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err)
plo->st.map_multi_writes++;
top_delta->ops->map_index(top_delta, m->mn_start, &sec);
- if (force_fua)
- set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
-
- top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua);
+ top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw | WRITE);
put_page(page);
}
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..5af32ba 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -157,7 +157,7 @@ struct ploop_io_ops
void (*read_page)(struct ploop_io * io, struct ploop_request * preq,
struct page * page, sector_t sec);
void (*write_page)(struct ploop_io * io, struct ploop_request * preq,
- struct page * page, sector_t sec, int fua);
+ struct page * page, sector_t sec, unsigned long rw);
int (*sync_read)(struct ploop_io * io, struct page * page,
@@ -465,8 +465,7 @@ enum
PLOOP_REQ_ZERO,
PLOOP_REQ_DISCARD,
PLOOP_REQ_RSYNC,
- PLOOP_REQ_FORCE_FUA, /*force fua of req write I/O by engine */
- PLOOP_REQ_FORCE_FLUSH, /*force flush by engine */
+ PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */
PLOOP_REQ_KAIO_FSYNC, /*force image fsync by KAIO module */
PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
@@ -477,6 +476,7 @@ enum
#define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
#define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
#define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH)
#define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
#define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
@@ -607,17 +607,8 @@ void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
static inline int ploop_req_delay_fua_possible(unsigned long rw,
struct ploop_request *preq)
{
- int delay_fua = 0;
-
- /* In case of eng_state != COMPLETE, we'll do FUA in
- * ploop_index_update(). Otherwise, we should post
- * fua.
- */
- if (rw & REQ_FUA) {
- if (preq->eng_state != PLOOP_E_COMPLETE)
- delay_fua = 1;
- }
- return delay_fua;
+ /* data may be flushed during ploop_index_update() */
+ return (preq->eng_state == PLOOP_E_DATA_WBI);
}
static inline void ploop_req_set_error(struct ploop_request * preq, int err)
--
1.8.3.1
More information about the Devel
mailing list