[Devel] [PATCH RHEL9 COMMIT] fuse: error reporting races
Konstantin Khorenko
khorenko at virtuozzo.com
Wed Nov 1 22:47:37 MSK 2023
The commit is pushed to "branch-rh9-5.14.0-284.25.1.vz9.30.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-284.25.1.vz9.30.8
------>
commit b9ce1a2648badaa04d0cee6600d2a320ad4a8680
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date: Fri Oct 6 18:43:42 2023 +0800
fuse: error reporting races
pcs_cs_accel added some error reporting from unserialized contexts,
it is a bug. Not new. Actually, the same problem was present since
day zero in pcs_sreq_complete, where error was propagated from
children iochunk requests to parent api request.
Add an atomic version of pcs_error_t, which can be set
from arbitrary context and move some racy error updates
to serialized context.
https://pmc.acronis.work/browse/VSTOR-54040
Signed-off-by: Alexey Kuznetsov <kuznet at acronis.com>
Feature: vStorage
---
fs/fuse/kio/pcs/pcs_cluster.c | 2 +-
fs/fuse/kio/pcs/pcs_cs_accel.c | 97 +++++++++++++++++-------------------------
fs/fuse/kio/pcs/pcs_error.h | 44 +++++++++++++++++--
fs/fuse/kio/pcs/pcs_map.c | 2 +-
fs/fuse/kio/pcs/pcs_req.h | 1 +
5 files changed, 83 insertions(+), 63 deletions(-)
diff --git a/fs/fuse/kio/pcs/pcs_cluster.c b/fs/fuse/kio/pcs/pcs_cluster.c
index dbca68fe1ccb..a19a5597dea6 100644
--- a/fs/fuse/kio/pcs/pcs_cluster.c
+++ b/fs/fuse/kio/pcs/pcs_cluster.c
@@ -61,7 +61,7 @@ void pcs_sreq_complete(struct pcs_int_request *sreq)
ireq_delay(sreq);
return;
}
- pcs_copy_error(&ireq->error, &sreq->error);
+ pcs_copy_error_cond_atomic(&ireq->error, &sreq->error);
}
if (sreq->type != PCS_IREQ_CUSTOM)
diff --git a/fs/fuse/kio/pcs/pcs_cs_accel.c b/fs/fuse/kio/pcs/pcs_cs_accel.c
index 33a60cae936b..6db4562d38bf 100644
--- a/fs/fuse/kio/pcs/pcs_cs_accel.c
+++ b/fs/fuse/kio/pcs/pcs_cs_accel.c
@@ -370,21 +370,15 @@ static void __pcs_csa_final_completion(struct pcs_aio_req *areq)
struct pcs_csa_context * ctx;
rcu_read_lock();
ctx = rcu_dereference(cs->csa_ctx);
- if (!ctx || !ctx->tfm || decrypt_data(ireq, ctx->tfm)) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_IO;
- }
+ if (!ctx || !ctx->tfm || decrypt_data(ireq, ctx->tfm))
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
rcu_read_unlock();
}
if (areq->crc) {
if (!pcs_if_error(&ireq->error)) {
- if (verify_crc(ireq, areq->crc)) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_IO;
- }
+ if (verify_crc(ireq, areq->crc))
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
}
if (areq->crc && areq->crc != areq->crcb) {
@@ -483,11 +477,7 @@ static void csa_crc_work(struct work_struct *w)
areq->crc = kmalloc(ncrc, GFP_KERNEL);
if (areq->crc == NULL) {
out:
- if (!ireq->error.value) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_NORES;
- }
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_NORES, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
fput(areq->cfile);
if (areq->crc && areq->crc != areq->crcb) {
kfree(areq->crc);
@@ -524,13 +514,8 @@ static void pcs_csa_complete(struct kiocb *iocb, long ret)
areq = container_of(iocb, struct pcs_aio_req, iocb);
ireq = container_of(areq, struct pcs_int_request, iochunk.ar);
- if (ret != ireq->iochunk.size) {
- if (!ireq->error.value) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_IO;
- }
- }
+ if (ret != ireq->iochunk.size)
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
if (atomic_dec_and_test(&areq->iocount)) {
INIT_WORK(&areq->work, csa_complete_work);
@@ -585,11 +570,7 @@ static inline int csa_submit(struct file * file, struct file *cfile, int do_csum
if (unlikely(ret != -EIOCBQUEUED)) {
if (ret != size) {
- if (!ireq->error.value) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_IO;
- }
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
/* Do not drop refs, we do not want to complete ireq. */
fput(areq->iocb.ki_filp);
@@ -696,6 +677,7 @@ static void ireq_init_acr(struct pcs_int_request * ireq)
{
atomic_set(&ireq->iochunk.acr.iocount, 1);
ireq->iochunk.acr.num_awr = 0;
+ pcs_clear_error(&ireq->iochunk.acr.net_error);
ireq->iochunk.acr.num_iotimes = 0;
}
@@ -745,6 +727,17 @@ static void __complete_acr_work(struct work_struct * w)
{
struct pcs_int_request * ireq = container_of(w, struct pcs_int_request, iochunk.acr.work);
+ if (pcs_if_error(&ireq->iochunk.acr.net_error)) {
+ /* Error on N-request overrides any error on a D-request. */
+ pcs_copy_error(&ireq->error, &ireq->iochunk.acr.net_error);
+ ireq->flags |= IREQ_F_NO_ACCEL;
+ /* Clear ACCELERROR to deliver this error normally, through invalidating the map */
+ ireq->flags &= ~(IREQ_F_ACCELERROR|IREQ_F_ONCE);
+ } else if (pcs_if_error(&ireq->error)) {
+ ireq->flags |= IREQ_F_NO_ACCEL|IREQ_F_ACCELERROR;
+ ireq->flags &= ~IREQ_F_ONCE;
+ }
+
if (pcs_if_error(&ireq->error)) {
FUSE_KTRACE(ireq->cc->fc, "IO error %d %lu, ireq:%p : %llu:%u+%u",
ireq->error.value,
@@ -806,7 +799,6 @@ static inline void csa_complete_acr(struct pcs_int_request * ireq)
}
}
-
static void __pcs_csa_write_final_completion(struct pcs_accel_write_req *areq)
{
struct pcs_int_request * ireq;
@@ -842,12 +834,8 @@ static void csa_sync_work(struct work_struct *w)
res = vfs_fsync(areq->iocb.ki_filp, 1);
- if (res) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_IO;
- ireq->flags |= IREQ_F_ACCELERROR;
- }
+ if (res)
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
if (atomic_dec_and_test(&areq->iocount))
__pcs_csa_write_final_completion(areq);
@@ -869,15 +857,11 @@ static void csa_write_complete(struct kiocb *iocb, long ret)
ireq = container_of(areq-areq->index, struct pcs_int_request, iochunk.acr.awr[0]);
if (ret != ireq->iochunk.size) {
- if (!ireq->error.value) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id;
- ireq->error.value = PCS_ERR_IO;
- ireq->flags |= IREQ_F_ACCELERROR;
- }
+ if (!pcs_if_error(&ireq->error))
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[ireq->iochunk.cs_index].info.id);
}
- if (!ireq->error.value) {
+ if (!pcs_if_error(&ireq->error)) {
if ((ireq->dentry->fileinfo.attr.attrib & PCS_FATTR_IMMEDIATE_WRITE) ||
ireq->dentry->no_write_delay) {
INIT_WORK(&areq->work, csa_sync_work);
@@ -1044,11 +1028,7 @@ static inline int csa_submit_write(struct file * file, struct pcs_int_request *
if (unlikely(ret != -EIOCBQUEUED)) {
if (ret != size) {
- if (!ireq->error.value) {
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[idx].info.id;
- ireq->error.value = PCS_ERR_IO;
- }
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_IO, 1, ireq->iochunk.csl->cs[idx].info.id);
/* Do not drop refs, we do not want to complete ireq. */
fput(areq->iocb.ki_filp);
@@ -1115,16 +1095,12 @@ static void complete_N_request(struct pcs_int_request * sreq)
{
struct pcs_int_request * ireq = sreq->iochunk.parent_N;
- if (pcs_if_error(&sreq->error)) {
- /* Error on N-request overrides any error on a D-request. */
- ireq->error = sreq->error;
- ireq->flags |= IREQ_F_NO_ACCEL;
- /* Clear ACCELERROR to deliver this error normally, through invalidating the map */
- ireq->flags &= ~IREQ_F_ACCELERROR;
- }
+ if (pcs_if_error(&sreq->error))
+ pcs_copy_error(&ireq->iochunk.acr.net_error, &sreq->error);
/* And free all clone resources */
- pcs_sreq_detach(sreq);
+ if (!pcs_sreq_detach(sreq))
+ BUG();
if (sreq->iochunk.map)
pcs_map_put(sreq->iochunk.map);
if (sreq->iochunk.csl)
@@ -1167,9 +1143,7 @@ struct pcs_int_request * pcs_csa_csl_write_submit(struct pcs_int_request * ireq)
/* Some D replicas are submitted. So, we have to go
* through error cycle.
*/
- ireq->error.remote = 1;
- ireq->error.offender = ireq->iochunk.csl->cs[idx].info.id;
- ireq->error.value = PCS_ERR_NORES;
+ pcs_set_error_cond_atomic(&ireq->error, PCS_ERR_NORES, 1, ireq->iochunk.csl->cs[idx].info.id);
csa_complete_acr(ireq);
return NULL;
}
@@ -1193,7 +1167,6 @@ struct pcs_int_request * pcs_csa_csl_write_submit(struct pcs_int_request * ireq)
}
}
-
int pcs_csa_csl_write_submit_single(struct pcs_int_request * ireq, int idx)
{
if (idx >= PCS_MAX_ACCEL_CS)
@@ -1360,12 +1333,20 @@ int pcs_csa_register(struct pcs_cluster_core * cc, PCS_NODE_ID_T cs_id, struct c
int pcs_csa_init(void)
{
+ struct _old_pcs_error_t
+ {
+ unsigned int value : 31, remote: 1;
+ PCS_NODE_ID_T offender;
+ };
+
pcs_csa_cachep = kmem_cache_create("pcs_csa",
sizeof(struct pcs_csa_entry),
0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
if (!pcs_csa_cachep)
return -ENOMEM;
+ BUILD_BUG_ON(sizeof(struct _old_pcs_error_t) != sizeof(struct _pcs_error_t));
+
return 0;
}
diff --git a/fs/fuse/kio/pcs/pcs_error.h b/fs/fuse/kio/pcs/pcs_error.h
index e19e8df040b7..cbf5c1532945 100644
--- a/fs/fuse/kio/pcs/pcs_error.h
+++ b/fs/fuse/kio/pcs/pcs_error.h
@@ -208,9 +208,15 @@ int pcs_sys_strerror_r(int err, char *buf, int buflen);
struct _pcs_error_t
{
- unsigned int value : 31, remote: 1;
-
- PCS_NODE_ID_T offender;
+ union {
+ struct {
+ u32 value : 31, remote: 1;
+ u32 pad;
+ };
+ atomic64_t atomic;
+ };
+
+ PCS_NODE_ID_T offender;
};
typedef struct _pcs_error_t pcs_error_t;
@@ -244,6 +250,38 @@ static __inline void pcs_set_local_error(pcs_error_t * status, int err)
status->remote = 0;
}
+/* Atomicity is special here, intended only to set an error status from multiple contexts.
+ * Actual access to content is made only from serialized context and does not need atomicity.
+ * It is important that an error set once cannot be cleared or changed or even read until
+ * all the context which could modify error are finished.
+ */
+static __inline void pcs_set_error_cond_atomic(pcs_error_t * status, int err, int remote, PCS_NODE_ID_T id)
+{
+ if (!status->value) {
+ for (;;) {
+ s64 val = err | (remote ? 1ULL << 31 : 0);
+ s64 old_val = atomic64_read(&status->atomic);
+ if (old_val & 0x7FFFFFFFUL)
+ break;
+ if (old_val == atomic64_cmpxchg(&status->atomic, old_val, val)) {
+ if (remote)
+ status->offender = id;
+ break;
+ }
+ }
+ }
+}
+
+static __inline void pcs_set_local_error_cond_atomic(pcs_error_t * status, int err)
+{
+ pcs_set_error_cond_atomic(status, err, 0, (PCS_NODE_ID_T) { .val = 0 });
+}
+
+static __inline void pcs_copy_error_cond_atomic(pcs_error_t * dst, pcs_error_t * src)
+{
+ pcs_set_error_cond_atomic(dst, src->value, src->remote, src->offender);
+}
+
int pcs_error_to_errno(pcs_error_t *);
static __inline void *pcs_err_ptr(int err)
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index df33b525ba95..f0520bae8699 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -3068,7 +3068,7 @@ static void pcs_flushreq_complete(struct pcs_int_request * sreq)
return;
}
FUSE_KTRACE(sreq->cc->fc, "flush error %d", sreq->error.value);
- pcs_copy_error(&ireq->error, &sreq->error);
+ pcs_copy_error_cond_atomic(&ireq->error, &sreq->error);
notify_error = 1;
}
diff --git a/fs/fuse/kio/pcs/pcs_req.h b/fs/fuse/kio/pcs/pcs_req.h
index c677332ead28..cdf1c9754803 100644
--- a/fs/fuse/kio/pcs/pcs_req.h
+++ b/fs/fuse/kio/pcs/pcs_req.h
@@ -80,6 +80,7 @@ struct pcs_accel_req
atomic_t iocount;
int num_awr;
struct pcs_accel_write_req awr[PCS_MAX_ACCEL_CS];
+ pcs_error_t net_error;
int num_iotimes;
struct fuse_tr_iotimes_cs io_times[PCS_MAX_ACCEL_CS];
struct work_struct work;
More information about the Devel
mailing list