[Devel] [PATCH RHEL9 COMMIT] fuse: error reporting races

Konstantin Khorenko khorenko at virtuozzo.com
Wed Nov 1 22:47:37 MSK 2023


The commit is pushed to "branch-rh9-5.14.0-284.25.1.vz9.30.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-284.25.1.vz9.30.8
------>
commit b9ce1a2648badaa04d0cee6600d2a320ad4a8680
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Fri Oct 6 18:43:42 2023 +0800

    fuse: error reporting races
    
    pcs_cs_accel added some error reporting from unserialized contexts,
    it is a bug. Not new. Actually, the same problem was present since
    day zero in pcs_sreq_complete, where error was propagated from
    children iochunk requests to parent api request.
    
    Add an atomic version of pcs_error_t, which can be set
    from arbitrary context and move some racy error updates
    to serialized context.
    
    https://pmc.acronis.work/browse/VSTOR-54040
    
    Signed-off-by: Alexey Kuznetsov <kuznet at acronis.com>
    
    Feature: vStorage
---
 fs/fuse/kio/pcs/pcs_cluster.c  |  2 +-
 fs/fuse/kio/pcs/pcs_cs_accel.c | 97 +++++++++++++++++-------------------------
 fs/fuse/kio/pcs/pcs_error.h    | 44 +++++++++++++++++--
 fs/fuse/kio/pcs/pcs_map.c      |  2 +-
 fs/fuse/kio/pcs/pcs_req.h      |  1 +
 5 files changed, 83 insertions(+), 63 deletions(-)

diff --git a/fs/fuse/kio/pcs/pcs_cluster.c b/fs/fuse/kio/pcs/pcs_cluster.c
index dbca68fe1ccb..a19a5597dea6 100644
--- a/fs/fuse/kio/pcs/pcs_cluster.c
+++ b/fs/fuse/kio/pcs/pcs_cluster.c
@@ -61,7 +61,7 @@ void pcs_sreq_complete(struct pcs_int_request *sreq)
 				ireq_delay(sreq);
 				return;
 			}
-			pcs_copy_error(&ireq->error, &sreq->error);
+			pcs_copy_error_cond_atomic(&ireq->error, &sreq->error);
 		}
 
 		if (sreq->type != PCS_IREQ_CUSTOM)
diff --git a/fs/fuse/kio/pcs/pcs_cs_accel.c b/fs/fuse/kio/pcs/pcs_cs_accel.c
index 33a60cae936b..6db4562d38bf 100644
--- a/fs/fuse/kio/pcs/pcs_cs_accel.c
+++ b/fs/fuse/kio/pcs/pcs_cs_accel.c
@@ -370,21 +370,15 @@ static void __pcs_csa_final_completion(struct pcs_aio_req *areq)
 		struct pcs_csa_context * ctx;
 		rcu_read_lock();
 		ctx = rcu_dereference(cs->csa_ctx);
-		if (!ctx || !ctx->tfm || decrypt_data(ireq, ctx->tfm)) {
-			ireq->error.remote = 1;
-			ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-			ireq->error.value = PCS_ERR_IO;
-		}
+		if (!ctx || !ctx->tfm || decrypt_data(ireq, ctx->tfm))
+			pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 		rcu_read_unlock();
 	}
 
 	if (areq->crc) {
 		if (!pcs_if_error(&ireq->error)) {
-			if (verify_crc(ireq, areq->crc)) {
-				ireq->error.remote = 1;
-				ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-				ireq->error.value = PCS_ERR_IO;
-			}
+			if (verify_crc(ireq, areq->crc))
+				pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 		}
 
 		if (areq->crc && areq->crc != areq->crcb) {
@@ -483,11 +477,7 @@ static void csa_crc_work(struct work_struct *w)
 		areq->crc = kmalloc(ncrc, GFP_KERNEL);
 		if (areq->crc == NULL) {
 out:
-			if (!ireq->error.value) {
-				ireq->error.remote = 1;
-				ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-				ireq->error.value = PCS_ERR_NORES;
-			}
+			pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_NORES, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 			fput(areq->cfile);
 			if (areq->crc && areq->crc != areq->crcb) {
 				kfree(areq->crc);
@@ -524,13 +514,8 @@ static void pcs_csa_complete(struct kiocb *iocb, long ret)
 	areq = container_of(iocb, struct pcs_aio_req, iocb);
 	ireq = container_of(areq, struct pcs_int_request, iochunk.ar);
 
-	if (ret != ireq->iochunk.size) {
-		if (!ireq->error.value) {
-			ireq->error.remote = 1;
-			ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-			ireq->error.value = PCS_ERR_IO;
-		}
-	}
+	if (ret != ireq->iochunk.size)
+		pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 
 	if (atomic_dec_and_test(&areq->iocount)) {
 		INIT_WORK(&areq->work, csa_complete_work);
@@ -585,11 +570,7 @@ static inline int csa_submit(struct file * file, struct file *cfile, int do_csum
 
 	if (unlikely(ret != -EIOCBQUEUED)) {
 		if (ret != size) {
-			if (!ireq->error.value) {
-				ireq->error.remote = 1;
-				ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-				ireq->error.value = PCS_ERR_IO;
-			}
+			pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 
 			/* Do not drop refs, we do not want to complete ireq. */
 			fput(areq->iocb.ki_filp);
@@ -696,6 +677,7 @@ static void ireq_init_acr(struct pcs_int_request * ireq)
 {
 	atomic_set(&ireq->iochunk.acr.iocount, 1);
 	ireq->iochunk.acr.num_awr = 0;
+	pcs_clear_error(&ireq->iochunk.acr.net_error);
 	ireq->iochunk.acr.num_iotimes = 0;
 }
 
@@ -745,6 +727,17 @@ static void __complete_acr_work(struct work_struct * w)
 {
 	struct pcs_int_request * ireq = container_of(w, struct pcs_int_request, iochunk.acr.work);
 
+	if (pcs_if_error(&ireq->iochunk.acr.net_error)) {
+		/* Error on N-request overrides any error on a D-request. */
+		pcs_copy_error(&ireq->error, &ireq->iochunk.acr.net_error);
+		ireq->flags |= IREQ_F_NO_ACCEL;
+		/* Clear ACCELERROR to deliver this error normally, through invalidating the map */
+		ireq->flags &= ~(IREQ_F_ACCELERROR|IREQ_F_ONCE);
+	} else if (pcs_if_error(&ireq->error)) {
+		ireq->flags |= IREQ_F_NO_ACCEL|IREQ_F_ACCELERROR;
+		ireq->flags &= ~IREQ_F_ONCE;
+	}
+
 	if (pcs_if_error(&ireq->error)) {
 		FUSE_KTRACE(ireq->cc->fc, "IO error %d %lu, ireq:%p : %llu:%u+%u",
 		      ireq->error.value,
@@ -806,7 +799,6 @@ static inline void csa_complete_acr(struct pcs_int_request * ireq)
 	}
 }
 
-
 static void __pcs_csa_write_final_completion(struct pcs_accel_write_req *areq)
 {
 	struct pcs_int_request * ireq;
@@ -842,12 +834,8 @@ static void csa_sync_work(struct work_struct *w)
 
 	res = vfs_fsync(areq->iocb.ki_filp, 1);
 
-	if (res) {
-		ireq->error.remote = 1;
-		ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-		ireq->error.value = PCS_ERR_IO;
-		ireq->flags |= IREQ_F_ACCELERROR;
-	}
+	if (res)
+		pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 
 	if (atomic_dec_and_test(&areq->iocount))
 		__pcs_csa_write_final_completion(areq);
@@ -869,15 +857,11 @@ static void csa_write_complete(struct kiocb *iocb, long ret)
 	ireq = container_of(areq-areq->index, struct pcs_int_request, iochunk.acr.awr[0]);
 
 	if (ret != ireq->iochunk.size) {
-		if (!ireq->error.value) {
-			ireq->error.remote = 1;
-			ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
-			ireq->error.value = PCS_ERR_IO;
-			ireq->flags |= IREQ_F_ACCELERROR;
-		}
+		if (!pcs_if_error(&ireq->error))
+			pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
 	}
 
-	if (!ireq->error.value) {
+	if (!pcs_if_error(&ireq->error)) {
 		if ((ireq->dentry->fileinfo.attr.attrib & PCS_FATTR_IMMEDIATE_WRITE) ||
 		    ireq->dentry->no_write_delay) {
 			INIT_WORK(&areq->work, csa_sync_work);
@@ -1044,11 +1028,7 @@ static inline int csa_submit_write(struct file * file, struct pcs_int_request *
 
 	if (unlikely(ret != -EIOCBQUEUED)) {
 		if (ret != size) {
-			if (!ireq->error.value) {
-				ireq->error.remote = 1;
-				ireq->error.offender = ireq->iochunk.csl->cs[idx].info.id;
-				ireq->error.value = PCS_ERR_IO;
-			}
+			pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[idx].info.id);
 
 			/* Do not drop refs, we do not want to complete ireq. */
 			fput(areq->iocb.ki_filp);
@@ -1115,16 +1095,12 @@ static void complete_N_request(struct pcs_int_request * sreq)
 {
 	struct pcs_int_request * ireq = sreq->iochunk.parent_N;
 
-	if (pcs_if_error(&sreq->error)) {
-		/* Error on N-request overrides any error on a D-request. */
-		ireq->error = sreq->error;
-		ireq->flags |= IREQ_F_NO_ACCEL;
-		/* Clear ACCELERROR to deliver this error normally, through invalidating the map */
-		ireq->flags &= ~IREQ_F_ACCELERROR;
-	}
+	if (pcs_if_error(&sreq->error))
+		pcs_copy_error(&ireq->iochunk.acr.net_error, &sreq->error);
 
 	/* And free all clone resources */
-	pcs_sreq_detach(sreq);
+	if (!pcs_sreq_detach(sreq))
+		BUG();
 	if (sreq->iochunk.map)
 		pcs_map_put(sreq->iochunk.map);
 	if (sreq->iochunk.csl)
@@ -1167,9 +1143,7 @@ struct pcs_int_request * pcs_csa_csl_write_submit(struct pcs_int_request * ireq)
 			/* Some D replicas are submitted. So, we have to go
 			 * through error cycle.
 			 */
-			ireq->error.remote = 1;
-			ireq->error.offender = ireq->iochunk.csl->cs[idx].info.id;
-			ireq->error.value = PCS_ERR_NORES;
+			pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_NORES, 1, ireq->iochunk.csl->cs[idx].info.id);
 			csa_complete_acr(ireq);
 			return NULL;
 		}
@@ -1193,7 +1167,6 @@ struct pcs_int_request * pcs_csa_csl_write_submit(struct pcs_int_request * ireq)
 	}
 }
 
-
 int pcs_csa_csl_write_submit_single(struct pcs_int_request * ireq, int idx)
 {
 	if (idx >= PCS_MAX_ACCEL_CS)
@@ -1360,12 +1333,20 @@ int pcs_csa_register(struct pcs_cluster_core * cc, PCS_NODE_ID_T cs_id, struct c
 
 int pcs_csa_init(void)
 {
+	struct _old_pcs_error_t
+	{
+		unsigned int     value : 31, remote: 1;
+		PCS_NODE_ID_T           offender;
+	};
+
 	pcs_csa_cachep = kmem_cache_create("pcs_csa",
 					    sizeof(struct pcs_csa_entry),
 					    0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
 	if (!pcs_csa_cachep)
 		return -ENOMEM;
 
+	BUILD_BUG_ON(sizeof(struct _old_pcs_error_t) != sizeof(struct _pcs_error_t));
+
 	return 0;
 }
 
diff --git a/fs/fuse/kio/pcs/pcs_error.h b/fs/fuse/kio/pcs/pcs_error.h
index e19e8df040b7..cbf5c1532945 100644
--- a/fs/fuse/kio/pcs/pcs_error.h
+++ b/fs/fuse/kio/pcs/pcs_error.h
@@ -208,9 +208,15 @@ int pcs_sys_strerror_r(int err, char *buf, int buflen);
 
 struct _pcs_error_t
 {
-	unsigned int	value : 31, remote: 1;
-
-	PCS_NODE_ID_T		offender;
+	union {
+		struct {
+			u32     value : 31, remote: 1;
+			u32     pad;
+		};
+		atomic64_t atomic;
+	};
+
+        PCS_NODE_ID_T           offender;
 };
 typedef struct _pcs_error_t pcs_error_t;
 
@@ -244,6 +250,38 @@ static __inline void pcs_set_local_error(pcs_error_t * status, int err)
 	status->remote = 0;
 }
 
+/* Atomicity is special here, intended only to set an error status from multiple contexts.
+ * Actual access to content is made only from serialized context and does not need atomicity.
+ * It is important that an error set once cannot be cleared or changed or even read until
+ * all the context which could modify error are finished.
+ */
+static __inline void pcs_set_error_cond_atomic(pcs_error_t * status, int err, int remote, PCS_NODE_ID_T id)
+{
+	if (!status->value) {
+		for (;;) {
+			s64 val = err | (remote ? 1ULL << 31 : 0);
+			s64 old_val = atomic64_read(&status->atomic);
+			if (old_val & 0x7FFFFFFFUL)
+				break;
+			if (old_val == atomic64_cmpxchg(&status->atomic, old_val, val)) {
+				if (remote)
+					status->offender = id;
+				break;
+			}
+		}
+	}
+}
+
+static __inline void pcs_set_local_error_cond_atomic(pcs_error_t * status, int err)
+{
+	pcs_set_error_cond_atomic(status, err, 0, (PCS_NODE_ID_T) { .val = 0 });
+}
+
+static __inline void pcs_copy_error_cond_atomic(pcs_error_t * dst, pcs_error_t * src)
+{
+	pcs_set_error_cond_atomic(dst, src->value, src->remote, src->offender);
+}
+
 int pcs_error_to_errno(pcs_error_t *);
 
 static __inline void *pcs_err_ptr(int err)
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index df33b525ba95..f0520bae8699 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -3068,7 +3068,7 @@ static void pcs_flushreq_complete(struct pcs_int_request * sreq)
 			return;
 		}
 		FUSE_KTRACE(sreq->cc->fc, "flush error %d", sreq->error.value);
-		pcs_copy_error(&ireq->error, &sreq->error);
+		pcs_copy_error_cond_atomic(&ireq->error, &sreq->error);
 		notify_error = 1;
 	}
 
diff --git a/fs/fuse/kio/pcs/pcs_req.h b/fs/fuse/kio/pcs/pcs_req.h
index c677332ead28..cdf1c9754803 100644
--- a/fs/fuse/kio/pcs/pcs_req.h
+++ b/fs/fuse/kio/pcs/pcs_req.h
@@ -80,6 +80,7 @@ struct pcs_accel_req
 	atomic_t			iocount;
 	int				num_awr;
 	struct pcs_accel_write_req	awr[PCS_MAX_ACCEL_CS];
+	pcs_error_t			net_error;
 	int				num_iotimes;
 	struct fuse_tr_iotimes_cs	io_times[PCS_MAX_ACCEL_CS];
 	struct work_struct		work;


More information about the Devel mailing list