[Devel] [PATCH RHEL9 COMMIT] fuse: pcs: protection against sync seq numbers wraparound
Konstantin Khorenko
khorenko at virtuozzo.com
Thu Jul 28 14:07:07 MSK 2022
The commit is pushed to "branch-rh9-5.14.0-70.13.1.vz9.16.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-70.13.1.vz9.16.5
------>
commit fd72cdb96256c97754b400a07b88366ab9c7f371
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date: Mon Jul 18 19:46:35 2022 +0800
fuse: pcs: protection against sync seq numbers wraparound
There is a hole in the sync protocol. When a CS is journaling, its sequence
number grows even if a chunk is not accessed at all, due to commits
to other chunks. This means that after a long idle period the sequence
numbers can jump arbitrarily compared to the state cached at the client.
The result is a lockup: such a chunk cannot be synced.
Affects: #VSTOR-55377
https://pmc.acronis.com/browse/VSTOR-55377
Signed-off-by: Alexey Kuznetsov <kuznet at acronis.com>
---
fs/fuse/kio/pcs/pcs_map.c | 95 +++++++++++++++++++++++++++++++++++++++++++++--
fs/fuse/kio/pcs/pcs_map.h | 16 ++++++++
2 files changed, 107 insertions(+), 4 deletions(-)
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index 27c96cbbba8b..f3bc3a18517d 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -820,6 +820,7 @@ void transfer_sync_data(struct pcs_cs_list * new_cs_list, struct pcs_cs_list * o
for (k = 0; k < old_cs_list->nsrv; k++) {
if (old_cs_list->cs[k].info.id.val == new_cs_list->cs[i].info.id.val) {
new_cs_list->cs[i].sync = old_cs_list->cs[k].sync;
+ new_cs_list->cs[i].dirty_ts = old_cs_list->cs[k].dirty_ts; /* k, not i: take the timestamp from the matched old record */
break;
}
}
@@ -872,6 +873,7 @@ static void evaluate_dirty_status(struct pcs_map_entry * m)
rec->sync.dirty_integrity = 0;
rec->sync.dirty_epoch = 0;
rec->sync.dirty_seq = 0;
+ rec->dirty_ts = jiffies;
}
} else
rec->sync.dirty_integrity = 0;
@@ -975,6 +977,7 @@ struct pcs_cs_list* cslist_alloc( struct pcs_cs_set *css, struct pcs_cs_info *re
for (i = 0; i < cs_cnt; i++) {
cs_list->cs[i].info = rec[i];
memset(&cs_list->cs[i].sync, 0, sizeof(cs_list->cs[i].sync));
+ cs_list->cs[i].dirty_ts = jiffies;
RCU_INIT_POINTER(cs_list->cs[i].cslink.cs, NULL);
INIT_LIST_HEAD(&cs_list->cs[i].cslink.link);
cs_list->cs[i].cslink.index = i;
@@ -2571,11 +2574,36 @@ noinline void pcs_mapping_truncate(struct pcs_dentry_info *di, u64 new_size)
pcs_map_put(m);
}
+static void warp_sync_seq(struct pcs_map_entry * m, struct pcs_cs_record * rec, struct pcs_cs_sync_data * sync)
+{
+ struct cs_sync_state * srec = &rec->sync;
+
+ /* The procedure can force sync seq to _decrease_. Log this event just in case.
+ */
+ if (pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) < 0 ||
+ (sync->sync_epoch == srec->dirty_epoch &&
+ pcs_sync_seq_compare(sync->sync_dirty, srec->dirty_seq) < 0) ||
+ pcs_sync_seq_compare(sync->sync_epoch, srec->sync_epoch) < 0 ||
+ (sync->sync_epoch == srec->sync_epoch &&
+ pcs_sync_seq_compare(sync->sync_current, srec->sync_seq) < 0))
+ FUSE_KTRACE(cc_from_maps(m->maps)->fc, "Warp [" NODE_FMT ",%u/%u,%u/%u,%u/%u] -> [%u/%u,%u/%u]",
+ NODE_ARGS(rec->info.id), rec->info.integrity_seq, srec->dirty_integrity,
+ srec->dirty_epoch, srec->dirty_seq, srec->sync_epoch, srec->sync_seq,
+ sync->sync_epoch, sync->sync_dirty, sync->sync_epoch, sync->sync_current);
+
+ srec->dirty_epoch = sync->sync_epoch;
+ srec->dirty_seq = sync->sync_dirty;
+ srec->sync_epoch = sync->sync_epoch;
+ srec->sync_seq = sync->sync_current;
+}
+
static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec,
struct pcs_cs_sync_data * sync, u32 lat, int op_type)
{
int dirtying;
struct cs_sync_state * srec = &rec->sync;
+ int was_dirty = cs_is_dirty(srec);
+
if (sync->ts_net > sync->ts_io)
lat -= sync->ts_net;
else
@@ -2603,6 +2631,10 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
dirtying = (op_type == PCS_CS_WRITE_SYNC_RESP || op_type == PCS_CS_WRITE_RESP ||
op_type == PCS_CS_WRITE_AL_RESP ||
op_type == PCS_CS_WRITE_HOLE_RESP || op_type == PCS_CS_WRITE_ZERO_RESP);
+
+ if (sync->sync_dirty == 0)
+ dirtying = 0;
+
/* The following looks scary, could be more clear.
* The goal is to update sync seq numbers:
*
@@ -2612,16 +2644,60 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
* - sync_epoch/sync_seq advance sync_epoch/seq
* - sync_epoch/sync_dirty advance dirty_epoch/seq
*/
- if (dirtying && sync->sync_dirty) {
- srec->dirty_integrity = sync->integrity_seq;
+ if (dirtying) {
+ unsigned int tmo;
- if (srec->dirty_epoch == 0 ||
- pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) > 0) {
+ if (pcs_sync_seq_compare(sync->sync_current, sync->sync_dirty) > 0) {
+ dirtying = 0;
+ tmo = PCS_MAX_SYNC_TIMEOUT;
+ } else {
+ srec->dirty_integrity = sync->integrity_seq;
+ tmo = PCS_MAX_DIRTY_TIMEOUT;
+ }
+
+ if (jiffies - rec->dirty_ts > tmo) {
+ warp_sync_seq(m, rec, sync);
+ rec->dirty_ts = jiffies;
+ } else if (srec->dirty_epoch == 0 ||
+ pcs_sync_seq_compare(sync->sync_epoch, srec->dirty_epoch) > 0) {
srec->dirty_epoch = sync->sync_epoch;
srec->dirty_seq = sync->sync_dirty;
+ rec->dirty_ts = jiffies;
} else if (sync->sync_epoch == srec->dirty_epoch &&
pcs_sync_seq_compare(sync->sync_dirty, srec->dirty_seq) > 0) {
srec->dirty_seq = sync->sync_dirty;
+ rec->dirty_ts = jiffies;
+ }
+ } else {
+ if (jiffies - rec->dirty_ts > PCS_MAX_SYNC_TIMEOUT) {
+ srec->sync_epoch = sync->sync_epoch;
+ srec->sync_seq = sync->sync_current;
+
+ if (was_dirty) {
+ /* This must not happen, dirty replica cannot remain dirty
+ * for >> PCS_SYNC_TIMEOUT unless the cluster is very sick.
+ */
+ FUSE_KTRACE(cc_from_maps(m->maps)->fc, MAP_FMT " replica " NODE_FMT " is dirty for %ld hz; [%u/%u,%u/%u,%u/%u] -> [%u/%u]",
+ MAP_ARGS(m), NODE_ARGS(rec->info.id), (long)(jiffies - rec->dirty_ts),
+ rec->info.integrity_seq, srec->dirty_integrity,
+ srec->dirty_epoch, srec->dirty_seq, srec->sync_epoch, srec->sync_seq,
+ sync->sync_epoch, sync->sync_current);
+
+ /* The old dirty seq might be invalid, and the new one in a non-dirtying
+ * response is undefined. But it must be set to _something_ _dirty_.
+ * We are in trouble. We have to violate the requirement that sequence
+ * numbers are opaque to the client and forge some sequence number.
+ */
+ srec->dirty_epoch = sync->sync_epoch;
+ srec->dirty_seq = sync->sync_current;
+ } else {
+ /* Normal case, replica is clean. Actually, we could do such reset to 0
+ * every time when replica becomes clean.
+ */
+ srec->dirty_epoch = 0;
+ srec->dirty_seq = 0;
+ }
+ rec->dirty_ts = jiffies;
}
}
@@ -2633,6 +2709,17 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
pcs_sync_seq_compare(sync->sync_current, srec->sync_seq) > 0) {
srec->sync_seq = sync->sync_current;
}
+
+ if (!was_dirty && !dirtying && cs_is_dirty(srec)) {
+ FUSE_KTRACE(cc_from_maps(m->maps)->fc, MAP_FMT " replica " NODE_FMT " is not expected to be dirty [%u/%u,%u/%u,%u/%u]",
+ MAP_ARGS(m), NODE_ARGS(rec->info.id),
+ rec->info.integrity_seq, srec->dirty_integrity,
+ srec->dirty_epoch, srec->dirty_seq, srec->sync_epoch, srec->sync_seq);
+ /* "Dirty" dirty fixup */
+ srec->dirty_epoch = 0;
+ srec->dirty_seq = 0;
+ rec->dirty_ts = jiffies;
+ }
return 0;
}
diff --git a/fs/fuse/kio/pcs/pcs_map.h b/fs/fuse/kio/pcs/pcs_map.h
index 507a92c79928..4bab867477f4 100644
--- a/fs/fuse/kio/pcs/pcs_map.h
+++ b/fs/fuse/kio/pcs/pcs_map.h
@@ -19,6 +19,21 @@ struct pcs_int_request;
#define PCS_MAP_LIMIT 4096
#define PCS_SYNC_TIMEOUT (20 * HZ)
+/* Protection against sync seq wraparound */
+
+/* For completely synchronous writes, which we use with immediate writes
+ * (i.e. iscsi). It will hold INT_MAX/(PCS_MAX_SYNC_TIMEOUT/HZ)
+ * iops = ~50kiops, this should be enough: such writes are expensive and cannot
+ * be optimized.
+ */
+#define PCS_MAX_SYNC_TIMEOUT (12*3600*HZ)
+
+/* For normal dirtying writes the limit can be made a lot weaker; the only
+ * hard requirement is that PCS_MAX_DIRTY_TIMEOUT be much greater than
+ * PCS_SYNC_TIMEOUT. Factor 100 protects against ~1Miops.
+ */
+#define PCS_MAX_DIRTY_TIMEOUT (100*PCS_SYNC_TIMEOUT)
+
#define PCS_REPLICATION_BLACKLIST_TIMEOUT HZ
@@ -83,6 +98,7 @@ struct pcs_cs_record
{
struct pcs_cs_info info;
struct cs_sync_state sync;
+ abs_time_t dirty_ts;
struct pcs_cs_link cslink;
};
More information about the Devel
mailing list