[Devel] [PATCH RHEL9 COMMIT] fuse: pcs: protection against sync seq numbers wraparound

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jul 28 14:07:07 MSK 2022


The commit is pushed to "branch-rh9-5.14.0-70.13.1.vz9.16.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-70.13.1.vz9.16.5
------>
commit fd72cdb96256c97754b400a07b88366ab9c7f371
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Mon Jul 18 19:46:35 2022 +0800

    fuse: pcs: protection against sync seq numbers wraparound
    
    There is a hole in the sync protocol. While a CS is journaling, the
    sequence number grows even when the chunk is not accessed at all,
    due to commits to other chunks. So after a long idle period the
    sequence numbers can jump arbitrarily compared to the state cached
    at the client. The result is a lockup: such a chunk cannot be synced.
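    
    For reference, the ordering helper (a sketch; pcs_sync_seq_compare()
    itself is not part of this patch, the usual implementation is an
    RFC 1982 style serial comparison) only gives meaningful results
    while the two values are less than 2^31 apart:
    
        static inline int pcs_sync_seq_compare(u32 a, u32 b)
        {
                return (int)(a - b);    /* < 0: a older, > 0: a newer */
        }
    
    The fix timestamps the cached dirty state and, once it has not been
    refreshed for longer than the timeouts introduced below, stops
    trusting the cached seq numbers and adopts the ones reported by the
    CS ("warp").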
    
    Affects: #VSTOR-55377
    https://pmc.acronis.com/browse/VSTOR-55377
    
    Signed-off-by: Alexey Kuznetsov <kuznet at acronis.com>
---
 fs/fuse/kio/pcs/pcs_map.c | 95 +++++++++++++++++++++++++++++++++++++++++++++--
 fs/fuse/kio/pcs/pcs_map.h | 16 ++++++++
 2 files changed, 107 insertions(+), 4 deletions(-)
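
In short, the new logic in commit_cs_record() reduces to the following
decision sketch (a paraphrase of the hunks below, not a literal excerpt;
"synced" stands for the pcs_sync_seq_compare(sync->sync_current,
sync->sync_dirty) > 0 test, everything else is named as in the patch):

	if (dirtying) {
		/* cheap timeout if the CS reports everything already
		 * synced, the much weaker one for really dirtying writes */
		tmo = synced ? PCS_MAX_SYNC_TIMEOUT : PCS_MAX_DIRTY_TIMEOUT;
		if (jiffies - rec->dirty_ts > tmo)
			warp_sync_seq(m, rec, sync);	/* adopt CS-reported seqs */
		else
			/* normal monotonic advance of dirty_epoch/dirty_seq */;
	} else if (jiffies - rec->dirty_ts > PCS_MAX_SYNC_TIMEOUT) {
		/* idle too long: resync sync_epoch/sync_seq from the CS and
		 * reset (clean replica) or forge (dirty replica) the dirty seqs */
	}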

diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index 27c96cbbba8b..f3bc3a18517d 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -820,6 +820,7 @@ void transfer_sync_data(struct pcs_cs_list * new_cs_list, struct pcs_cs_list * o
 		for (k = 0; k < old_cs_list->nsrv; k++) {
 			if (old_cs_list->cs[k].info.id.val == new_cs_list->cs[i].info.id.val) {
 				new_cs_list->cs[i].sync = old_cs_list->cs[k].sync;
+				new_cs_list->cs[i].dirty_ts = old_cs_list->cs[k].dirty_ts;
 				break;
 			}
 		}
@@ -872,6 +873,7 @@ static void evaluate_dirty_status(struct pcs_map_entry * m)
 				rec->sync.dirty_integrity = 0;
 				rec->sync.dirty_epoch = 0;
 				rec->sync.dirty_seq = 0;
+				rec->dirty_ts = jiffies;
 			}
 		} else
 			rec->sync.dirty_integrity = 0;
@@ -975,6 +977,7 @@ struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, struct pcs_cs_info *re
 	for (i = 0; i < cs_cnt; i++) {
 		cs_list->cs[i].info = rec[i];
 		memset(&cs_list->cs[i].sync, 0, sizeof(cs_list->cs[i].sync));
+		cs_list->cs[i].dirty_ts = jiffies;
 		RCU_INIT_POINTER(cs_list->cs[i].cslink.cs, NULL);
 		INIT_LIST_HEAD(&cs_list->cs[i].cslink.link);
 		cs_list->cs[i].cslink.index = i;
@@ -2571,11 +2574,36 @@ noinline void pcs_mapping_truncate(struct pcs_dentry_info *di, u64 new_size)
 	pcs_map_put(m);
 }
 
+static void warp_sync_seq(struct pcs_map_entry * m, struct pcs_cs_record * rec, struct pcs_cs_sync_data * sync)
+{
+	struct cs_sync_state * srec = &rec->sync;
+
+	/* The procedure can force sync seq to _decrease_. Log this event just in case.
+	 */
+	if (pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) < 0 ||
+	    (sync->sync_epoch == srec->dirty_epoch &&
+	     pcs_sync_seq_compare(sync->sync_dirty, srec->dirty_seq) < 0) ||
+	    pcs_sync_seq_compare(sync->sync_epoch, srec->sync_epoch) < 0 ||
+	    (sync->sync_epoch == srec->sync_epoch &&
+	     pcs_sync_seq_compare(sync->sync_current, srec->sync_seq) < 0))
+		FUSE_KTRACE(cc_from_maps(m->maps)->fc, "Warp [" NODE_FMT ",%u/%u,%u/%u,%u/%u] -> [%u/%u,%u/%u]",
+		       NODE_ARGS(rec->info.id), rec->info.integrity_seq, srec->dirty_integrity,
+		       srec->dirty_epoch, srec->dirty_seq, srec->sync_epoch, srec->sync_seq,
+		       sync->sync_epoch, sync->sync_dirty, sync->sync_epoch, sync->sync_current);
+
+	srec->dirty_epoch = sync->sync_epoch;
+	srec->dirty_seq = sync->sync_dirty;
+	srec->sync_epoch = sync->sync_epoch;
+	srec->sync_seq = sync->sync_current;
+}
+
 static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec,
 			     struct pcs_cs_sync_data * sync, u32 lat, int op_type)
 {
 	int dirtying;
 	struct cs_sync_state * srec = &rec->sync;
+	int was_dirty = cs_is_dirty(srec);
+
 	if (sync->ts_net > sync->ts_io)
 		lat -= sync->ts_net;
 	else
@@ -2603,6 +2631,10 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
 	dirtying = (op_type == PCS_CS_WRITE_SYNC_RESP || op_type == PCS_CS_WRITE_RESP ||
 		   op_type == PCS_CS_WRITE_AL_RESP ||
 		   op_type == PCS_CS_WRITE_HOLE_RESP || op_type == PCS_CS_WRITE_ZERO_RESP);
+
+	if (sync->sync_dirty == 0)
+		dirtying = 0;
+
 	/* The following looks scary, could be more clear.
 	 * The goal is to update sync seq numbers:
 	 *
@@ -2612,16 +2644,60 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
 	 * - sync_epoch/sync_seq advance sync_epoch/seq
 	 * - sync_epoch/sync_dirty advance dirty_epoch/seq
 	 */
-	if (dirtying && sync->sync_dirty) {
-		srec->dirty_integrity = sync->integrity_seq;
+	if (dirtying) {
+		unsigned int tmo;
 
-		if (srec->dirty_epoch == 0 ||
-		    pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) > 0) {
+		if (pcs_sync_seq_compare(sync->sync_current, sync->sync_dirty) > 0) {
+			dirtying = 0;
+			tmo = PCS_MAX_SYNC_TIMEOUT;
+		} else {
+			srec->dirty_integrity = sync->integrity_seq;
+			tmo = PCS_MAX_DIRTY_TIMEOUT;
+		}
+
+		if (jiffies - rec->dirty_ts > tmo) {
+			warp_sync_seq(m, rec, sync);
+			rec->dirty_ts = jiffies;
+		} else if (srec->dirty_epoch == 0 ||
+			   pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) > 0) {
 			srec->dirty_epoch = sync->sync_epoch;
 			srec->dirty_seq = sync->sync_dirty;
+			rec->dirty_ts = jiffies;
 		} else if (sync->sync_epoch == srec->dirty_epoch &&
 			   pcs_sync_seq_compare(sync->sync_dirty, srec->dirty_seq) > 0) {
 			srec->dirty_seq = sync->sync_dirty;
+			rec->dirty_ts = jiffies;
+		}
+	} else {
+		if (jiffies - rec->dirty_ts > PCS_MAX_SYNC_TIMEOUT) {
+			srec->sync_epoch = sync->sync_epoch;
+			srec->sync_seq = sync->sync_current;
+
+			if (was_dirty) {
+				/* This must not happen: a dirty replica cannot remain dirty
+				 * for >> PCS_SYNC_TIMEOUT unless the cluster is very sick.
+				 */
+				FUSE_KTRACE(cc_from_maps(m->maps)->fc, MAP_FMT " replica " NODE_FMT " is dirty for %ld hz; [%u/%u,%u/%u,%u/%u] -> [%u/%u]",
+				       MAP_ARGS(m), NODE_ARGS(rec->info.id), (long)(jiffies - rec->dirty_ts),
+				       rec->info.integrity_seq, srec->dirty_integrity,
+				       srec->dirty_epoch, srec->dirty_seq, srec->sync_epoch, srec->sync_seq,
+				       sync->sync_epoch, sync->sync_current);
+
+				/* The old dirty seq might be invalid, and the new one in a
+				 * non-dirtying response is undefined. But it must be set to
+				 * _something_ _dirty_. We are in trouble: we have to violate
+				 * the rule that seq numbers are opaque to the client and forge one.
+				 */
+				srec->dirty_epoch = sync->sync_epoch;
+				srec->dirty_seq = sync->sync_current;
+			} else {
+				/* Normal case, the replica is clean. Actually, we could do
+				 * this reset to 0 every time the replica becomes clean.
+				 */
+				srec->dirty_epoch = 0;
+				srec->dirty_seq = 0;
+			}
+			rec->dirty_ts = jiffies;
 		}
 	}
 
@@ -2633,6 +2709,17 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
 		   pcs_sync_seq_compare(sync->sync_current, srec->sync_seq) > 0) {
 		srec->sync_seq = sync->sync_current;
 	}
+
+	if (!was_dirty && !dirtying && cs_is_dirty(srec)) {
+		FUSE_KTRACE(cc_from_maps(m->maps)->fc, MAP_FMT " replica " NODE_FMT " is not expected to be dirty [%u/%u,%u/%u,%u/%u]",
+		       MAP_ARGS(m), NODE_ARGS(rec->info.id),
+		       rec->info.integrity_seq, srec->dirty_integrity,
+		       srec->dirty_epoch, srec->dirty_seq, srec->sync_epoch, srec->sync_seq);
+		/* "Dirty" dirty fixup */
+		srec->dirty_epoch = 0;
+		srec->dirty_seq = 0;
+		rec->dirty_ts = jiffies;
+	}
 	return 0;
 }
 
diff --git a/fs/fuse/kio/pcs/pcs_map.h b/fs/fuse/kio/pcs/pcs_map.h
index 507a92c79928..4bab867477f4 100644
--- a/fs/fuse/kio/pcs/pcs_map.h
+++ b/fs/fuse/kio/pcs/pcs_map.h
@@ -19,6 +19,21 @@ struct pcs_int_request;
 #define PCS_MAP_LIMIT		4096
 
 #define PCS_SYNC_TIMEOUT		(20 * HZ)
+/* Protection against sync seq wraparound */
+
+/* For completely synchronous writes, which we use for immediate writes
+ * (e.g. iSCSI). It holds up to INT_MAX/(PCS_MAX_SYNC_TIMEOUT/HZ)
+ * iops = ~50k iops, which should be enough: such writes are expensive
+ * and cannot be optimized.
+ */
+#define PCS_MAX_SYNC_TIMEOUT	(12*3600*HZ)
+
+/* For normal dirtying writes the limit can be a lot weaker; the only
+ * hard requirement is that PCS_MAX_DIRTY_TIMEOUT be much greater than
+ * PCS_SYNC_TIMEOUT. A factor of 100 protects up to ~1M iops.
+ */
+#define PCS_MAX_DIRTY_TIMEOUT	(100*PCS_SYNC_TIMEOUT)
+
 
 #define PCS_REPLICATION_BLACKLIST_TIMEOUT  HZ
 
@@ -83,6 +98,7 @@ struct pcs_cs_record
 {
 	struct pcs_cs_info	info;
 	struct cs_sync_state	sync;
+	abs_time_t		dirty_ts;
 	struct pcs_cs_link	cslink;
 };
 

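A quick sanity check of the new constants (worked out from the figures
in the header comments above):

	PCS_MAX_SYNC_TIMEOUT  = 12 * 3600 s = 43200 s
	INT_MAX / 43200       = 2147483647 / 43200 ~= 49710	/* ~50k iops */

	PCS_MAX_DIRTY_TIMEOUT = 100 * PCS_SYNC_TIMEOUT = 100 * 20 s = 2000 s
	INT_MAX / 2000        = 2147483647 / 2000 ~= 1073741	/* ~1M iops */

I.e. below these rates the seq numbers cannot drift by more than INT_MAX
within one timeout window, so the signed serial comparison stays valid
until the warp resets the cached state.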
