[Devel] [PATCH RHEL8 COMMIT] ploop: Rework merge

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jun 24 14:25:07 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.49
------>
commit e71d670f5186dab7b3295cd776275a18a61bd255
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Thu Jun 24 14:25:07 2021 +0300

    ploop: Rework merge
    
    pio should be the main entity of the whole driver, and
    waiting for a locked cluster should be done via pio
    postponing. So, we rework merge to fit that.
    
    Preparation for https://jira.sw.ru/browse/PSBM-124550
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 drivers/md/dm-ploop-cmd.c    | 200 +++++++++++++------------------------------
 drivers/md/dm-ploop-map.c    |  20 +++--
 drivers/md/dm-ploop-target.c |   5 +-
 drivers/md/dm-ploop.h        |  32 +++----
 4 files changed, 92 insertions(+), 165 deletions(-)

diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index 7f2affd38ca6..fff68ecc3c7b 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -10,21 +10,11 @@
 #include <linux/uio.h>
 #include <linux/ctype.h>
 #include <linux/umh.h>
+#include <linux/sched/signal.h>
 #include "dm-ploop.h"
 
 #define DM_MSG_PREFIX "ploop"
 
-static void ploop_queue_deferred_cmd(struct ploop *ploop, struct ploop_cmd *cmd)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ploop->deferred_lock, flags);
-	BUG_ON(ploop->deferred_cmd && ploop->deferred_cmd != cmd);
-	ploop->deferred_cmd = cmd;
-	spin_unlock_irqrestore(&ploop->deferred_lock, flags);
-	queue_work(ploop->wq, &ploop->worker);
-}
-
 /*
  * Assign newly allocated memory for BAT array and holes_bitmap
  * before grow.
@@ -564,8 +554,6 @@ static int ploop_resize(struct ploop *ploop, sector_t new_sectors)
 	cmd.resize.hb_nr = hb_nr;
 	cmd.resize.new_sectors = new_sectors;
 	cmd.resize.md0 = md0;
-	cmd.retval = 0;
-	cmd.ploop = ploop;
 
 	ploop_suspend_submitting_pios(ploop);
 	ret = process_resize_cmd(ploop, &cmd);
@@ -577,106 +565,75 @@ static int ploop_resize(struct ploop *ploop, sector_t new_sectors)
 	free_md_pages_tree(&cmd.resize.md_pages_root);
 	return ret;
 }
-
-static void ploop_queue_deferred_cmd_wrapper(struct ploop *ploop,
-					     int ret, void *data)
+static void service_pio_endio(struct pio *pio, void *data, blk_status_t status)
 {
-	struct ploop_cmd *cmd = data;
-
-	if (ret) {
-		/* kwork will see this at next time it is on cpu */
-		WRITE_ONCE(cmd->retval, ret);
-	}
-	atomic_inc(&cmd->merge.nr_available);
-	ploop_queue_deferred_cmd(cmd->ploop, cmd);
-}
-
-/* Find mergeable cluster and return it in cmd->merge.cluster */
-static bool iter_delta_clusters(struct ploop *ploop, struct ploop_cmd *cmd)
-{
-	unsigned int dst_cluster, *cluster = &cmd->merge.cluster;
-	u8 level;
-	bool skip;
-
-	BUG_ON(cmd->type != PLOOP_CMD_MERGE_SNAPSHOT);
-
-	for (; *cluster < ploop->nr_bat_entries; ++*cluster) {
-		/*
-		 * Check *cluster is provided by the merged delta.
-		 * We are in kwork, so bat_rwlock is not needed
-		 * (see comment in process_one_deferred_bio()).
-		 */
-		/* FIXME: Optimize this. ploop_bat_entries() is overkill */
-		dst_cluster = ploop_bat_entries(ploop, *cluster, &level);
-		if (dst_cluster == BAT_ENTRY_NONE ||
-		    level != ploop->nr_deltas - 2)
-			continue;
-
-		spin_lock_irq(&ploop->deferred_lock);
-		skip = find_lk_of_cluster(ploop, *cluster);
-		spin_unlock_irq(&ploop->deferred_lock);
-		if (skip) {
-			/*
-			 * Cluster is locked (maybe, under COW).
-			 * Skip it and try to repeat later.
-			 */
-			cmd->merge.do_repeat = true;
-			continue;
-		}
+	struct ploop *ploop = pio->ploop;
+	blk_status_t *status_ptr = data;
+	unsigned long flags;
 
-		return true;
+	if (unlikely(status)) {
+		spin_lock_irqsave(&ploop->err_status_lock, flags);
+		*status_ptr = status;
+		spin_unlock_irqrestore(&ploop->err_status_lock, flags);
 	}
 
-	return false;
+	if (atomic_dec_return(&ploop->service_pios) < MERGE_PIOS_MAX / 2)
+		wake_up(&ploop->service_wq);
 }
 
-static void process_merge_latest_snapshot_cmd(struct ploop *ploop,
-					      struct ploop_cmd *cmd)
+static int process_merge_latest_snapshot(struct ploop *ploop)
 {
-	unsigned int dst_cluster, *cluster = &cmd->merge.cluster;
-	u8 level;
-
-	if (cmd->retval)
-		goto out;
-
-	while (iter_delta_clusters(ploop, cmd)) {
-		/*
-		 * We are in kwork, so bat_rwlock is not needed
-		 * (we can't race with changing BAT, since cmds
-		 *  are processed before bios and piwb is sync).
-		 */
-		/* FIXME: Optimize this: ploop_bat_entries() is overkill */
-		dst_cluster = ploop_bat_entries(ploop, *cluster, &level);
+	static blk_status_t service_status;
+	struct bio_vec bvec = {0};
+	struct pio *pio;
+	int ret = 0;
+	u32 clu;
 
-		/* Check we can submit one more cow in parallel */
-		if (!atomic_add_unless(&cmd->merge.nr_available, -1, 0))
-			return;
-		/*
-		 * This adds cluster lk. Further write bios to *cluster will go
-		 * from ploop_map to kwork (because bat_levels[*cluster] is not
-		 * top_level()), so they will see the lk.
-		 */
-		if (submit_cluster_cow(ploop, level, *cluster, dst_cluster,
-				    ploop_queue_deferred_cmd_wrapper, cmd)) {
-			atomic_inc(&cmd->merge.nr_available);
-			cmd->retval = -ENOMEM;
-			goto out;
+	for (clu = 0; clu < ploop->nr_bat_entries; clu++) {
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+		pio = kmalloc(sizeof(*pio), GFP_KERNEL);
+		if (!pio) {
+			ret = -ENOMEM;
+			break;
+		}
+		init_pio(ploop, REQ_OP_WRITE, pio);
+		pio->free_on_endio = true;
+		pio->bi_io_vec = &bvec;
+		pio->bi_iter.bi_sector = CLU_TO_SEC(ploop, clu);
+		pio->bi_iter.bi_size = 0;
+		pio->bi_iter.bi_idx = 0;
+		pio->bi_iter.bi_bvec_done = 0;
+		pio->endio_cb = service_pio_endio;
+		pio->endio_cb_data = &service_status;
+		pio->is_fake_merge = true;
+		WARN_ON_ONCE(!fake_merge_pio(pio));
+
+		defer_pios(ploop, pio, NULL);
+
+		if (atomic_inc_return(&ploop->service_pios) == MERGE_PIOS_MAX) {
+			wait_event(ploop->service_wq,
+					atomic_read(&ploop->service_pios) < MERGE_PIOS_MAX);
 		}
 
-		++*cluster;
+		if (unlikely(READ_ONCE(service_status)))
+			break;
 	}
-out:
-	if (atomic_read(&cmd->merge.nr_available) != NR_MERGE_BIOS) {
-		/* Wait till last COW queues us */
-		return;
+
+	wait_event(ploop->service_wq, !atomic_read(&ploop->service_pios));
+	if (!ret) {
+		spin_lock_irq(&ploop->err_status_lock);
+		ret = blk_status_to_errno(service_status);
+		spin_unlock_irq(&ploop->err_status_lock);
 	}
 
-	complete(&cmd->comp); /* Last touch of cmd memory */
+	return ret;
 }
 
 static int ploop_merge_latest_snapshot(struct ploop *ploop)
 {
-	struct ploop_cmd cmd;
 	struct file *file;
 	u8 level;
 	int ret;
@@ -687,33 +644,14 @@ static int ploop_merge_latest_snapshot(struct ploop *ploop)
 		return -EROFS;
 	if (ploop->nr_deltas < 2)
 		return -ENOENT;
-again:
-	memset(&cmd, 0, sizeof(cmd));
-	cmd.type = PLOOP_CMD_MERGE_SNAPSHOT;
-	cmd.ploop = ploop;
-	atomic_set(&cmd.merge.nr_available, NR_MERGE_BIOS);
-
-	init_completion(&cmd.comp);
-	ploop_queue_deferred_cmd(ploop, &cmd);
-	ret = wait_for_completion_interruptible(&cmd.comp);
-	if (ret) {
-		/*
-		 * process_merge_latest_snapshot_cmd() will see this
-		 * later or earlier. Take a lock if you want earlier.
-		 */
-		WRITE_ONCE(cmd.retval, -EINTR);
-		wait_for_completion(&cmd.comp);
-	}
 
-	if (cmd.retval)
+	ret = process_merge_latest_snapshot(ploop);
+	if (ret)
 		goto out;
 
-	if (cmd.merge.do_repeat)
-		goto again;
-
 	/* Delta merged. Release delta's file */
-	cmd.retval = ploop_suspend_submitting_pios(ploop);
-	if (cmd.retval)
+	ret = ploop_suspend_submitting_pios(ploop);
+	if (ret)
 		goto out;
 
 	write_lock_irq(&ploop->bat_rwlock);
@@ -726,7 +664,7 @@ static int ploop_merge_latest_snapshot(struct ploop *ploop)
 
 	ploop_resume_submitting_pios(ploop);
 out:
-	return cmd.retval;
+	return ret;
 }
 
 static void notify_delta_merged(struct ploop *ploop, u8 level,
@@ -1161,28 +1099,6 @@ static int ploop_flip_upper_deltas(struct ploop *ploop)
 	return process_flip_upper_deltas(ploop);
 }
 
-/* Handle user commands requested via "message" interface */
-void process_deferred_cmd(struct ploop *ploop)
-	__releases(&ploop->deferred_lock)
-	__acquires(&ploop->deferred_lock)
-{
-	struct ploop_cmd *cmd = ploop->deferred_cmd;
-
-	if (likely(!cmd))
-		return;
-
-	ploop->deferred_cmd = NULL;
-	spin_unlock_irq(&ploop->deferred_lock);
-
-	if (cmd->type == PLOOP_CMD_MERGE_SNAPSHOT) {
-		process_merge_latest_snapshot_cmd(ploop, cmd);
-	} else {
-		cmd->retval = -EINVAL;
-		complete(&cmd->comp);
-	}
-	spin_lock_irq(&ploop->deferred_lock);
-}
-
 static int ploop_get_event(struct ploop *ploop, char *result, unsigned int maxlen)
 {
 	unsigned int sz = 0;
diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index f519a933b915..3a2dc8696118 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -77,6 +77,7 @@ void init_pio(struct ploop *ploop, unsigned int bi_op, struct pio *pio)
 	pio->bi_op = bi_op;
 	pio->wants_discard_index_cleanup = false;
 	pio->is_data_alloc = false;
+	pio->is_fake_merge = false;
 	pio->free_on_endio = false;
 	pio->ref_index = PLOOP_REF_INDEX_INVALID;
 	pio->bi_status = BLK_STS_OK;
@@ -485,6 +486,14 @@ static bool pio_endio_if_all_zeros(struct pio *pio)
 	return true;
 }
 
+static bool pio_endio_if_merge_fake_pio(struct pio *pio)
+{
+	if (likely(!fake_merge_pio(pio)))
+		return false;
+	pio_endio(pio);
+	return true;
+}
+
 static int punch_hole(struct file *file, loff_t pos, loff_t len)
 {
 	return vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
@@ -1143,9 +1152,9 @@ static bool postpone_if_cluster_locked(struct ploop *ploop, struct pio *pio,
 	return e_h != NULL;
 }
 
-int submit_cluster_cow(struct ploop *ploop, unsigned int level,
-		       unsigned int cluster, unsigned int dst_cluster,
-		       void (*end_fn)(struct ploop *, int, void *), void *data)
+static int submit_cluster_cow(struct ploop *ploop, unsigned int level,
+			      unsigned int cluster, unsigned int dst_cluster,
+			      void (*end_fn)(struct ploop *, int, void *), void *data)
 {
 	struct ploop_cow *cow = NULL;
 	struct pio *pio = NULL;
@@ -1399,6 +1408,8 @@ static int process_one_deferred_bio(struct ploop *ploop, struct pio *pio,
 
 	if (cluster_is_in_top_delta(ploop, cluster)) {
 		/* Already mapped */
+		if (pio_endio_if_merge_fake_pio(pio))
+			goto out;
 		goto queue;
 	} else if (!op_is_write(pio->bi_op)) {
 		/*
@@ -1545,13 +1556,10 @@ void do_ploop_work(struct work_struct *ws)
 	 *
 	 * Currenly, it's impossible to submit two bat pages update
 	 * in parallel, since the update uses global ploop->bat_page.
-	 * Note, that process_deferred_cmd() expects there is no
-	 * pending index wb.
 	 */
 	ploop_index_wb_init(&piwb, ploop);
 
 	spin_lock_irq(&ploop->deferred_lock);
-	process_deferred_cmd(ploop);
 	process_delta_wb(ploop, &piwb);
 
 	list_splice_init(&ploop->deferred_pios, &deferred_pios);
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index cabe8162f416..afdf23325d3b 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -131,10 +131,9 @@ void free_md_pages_tree(struct rb_root *root)
 
 static bool ploop_has_pending_activity(struct ploop *ploop)
 {
-	bool has;
+	bool has = false;
 
 	spin_lock_irq(&ploop->deferred_lock);
-	has = ploop->deferred_cmd;
 	has |= !list_empty(&ploop->deferred_pios);
 	has |= !list_empty(&ploop->discard_pios);
 	has |= !list_empty(&ploop->delta_cow_action_list);
@@ -320,7 +319,9 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	rwlock_init(&ploop->bat_rwlock);
+	spin_lock_init(&ploop->err_status_lock);
 	init_rwsem(&ploop->ctl_rwsem);
+	init_waitqueue_head(&ploop->service_wq);
 	spin_lock_init(&ploop->inflight_lock);
 	spin_lock_init(&ploop->deferred_lock);
 
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 70c40a794231..cf2680d55ddf 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -47,12 +47,10 @@ struct ploop_delta {
 	bool is_raw;
 };
 
+#define MERGE_PIOS_MAX			64
+
 struct ploop_cmd {
-#define PLOOP_CMD_MERGE_SNAPSHOT	3
 	struct completion comp;
-	struct ploop *ploop;
-	unsigned int type;
-	int retval;
 	union {
 		struct {
 			sector_t new_sectors;
@@ -69,12 +67,6 @@ struct ploop_cmd {
 			unsigned int cluster, dst_cluster;
 			struct pio *pio;
 		} resize;
-		struct {
-#define NR_MERGE_BIOS			64
-			atomic_t nr_available;
-			unsigned int cluster; /* Currently iterated cluster */
-			bool do_repeat;
-		} merge;
 	};
 };
 
@@ -180,8 +172,11 @@ struct ploop {
 	struct list_head resubmit_pios; /* After partial IO */
 	struct list_head enospc_pios; /* Delayed after ENOSPC */
 
+	atomic_t service_pios;
+	struct wait_queue_head service_wq;
+
+	spinlock_t err_status_lock;
 	struct rw_semaphore ctl_rwsem;
-	struct ploop_cmd *deferred_cmd;
 
 	/*
 	 * List of locked clusters (no write is possible).
@@ -237,6 +232,7 @@ struct pio {
 
 	bool is_data_alloc:1;
 	bool wants_discard_index_cleanup:1;
+	bool is_fake_merge:1;
 	bool free_on_endio:1;
 	/*
 	 * 0 and 1 are related to inflight_bios_ref[],
@@ -493,6 +489,16 @@ static inline struct hlist_head *ploop_htable_slot(struct hlist_head head[], u32
 	return &head[hash_32(clu, PLOOP_HASH_TABLE_BITS)];
 }
 
+static inline bool fake_merge_pio(struct pio *pio)
+{
+	if (pio->is_fake_merge) {
+		WARN_ON_ONCE(pio->bi_iter.bi_size ||
+			     pio->bi_op != REQ_OP_WRITE);
+		return true;
+	}
+	return false;
+}
+
 extern void md_page_insert(struct ploop *ploop, struct md_page *md);
 extern void ploop_free_md_page(struct md_page *md);
 extern void free_md_pages_tree(struct rb_root *root);
@@ -506,7 +512,6 @@ extern void defer_pios(struct ploop *ploop, struct pio *pio, struct list_head *p
 extern void do_ploop_work(struct work_struct *ws);
 extern void do_ploop_fsync_work(struct work_struct *ws);
 extern void ploop_event_work(struct work_struct *work);
-extern void process_deferred_cmd(struct ploop *ploop);
 extern int ploop_clone_and_map(struct dm_target *ti, struct request *rq,
 		    union map_info *map_context, struct request **clone);
 extern struct pio *find_lk_of_cluster(struct ploop *ploop, u32 cluster);
@@ -521,9 +526,6 @@ extern void ploop_reset_bat_update(struct ploop_index_wb *);
 extern void ploop_submit_index_wb_sync(struct ploop *, struct ploop_index_wb *);
 extern int ploop_message(struct dm_target *ti, unsigned int argc, char **argv,
 			 char *result, unsigned int maxlen);
-extern int submit_cluster_cow(struct ploop *ploop, unsigned int level,
-			      unsigned int cluster, unsigned int dst_cluster,
-			      void (*end_fn)(struct ploop *, int, void *), void *data);
 
 extern struct pio * alloc_pio_with_pages(struct ploop *ploop);
 extern void free_pio_with_pages(struct ploop *ploop, struct pio *pio);


More information about the Devel mailing list