[Devel] [PATCH RHEL7 COMMIT] fs/fuse kio: align CS messages to 512 bytes

Konstantin Khorenko khorenko at virtuozzo.com
Mon May 25 15:56:37 MSK 2020


The commit is pushed to "branch-rh7-3.10.0-1127.8.2.vz7.161.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1127.8.2.vz7.161.1
------>
commit 94fa9d799079e071295ec87761f6df55b94f60b3
Author: Ildar Ismagilov <Ildar.Ismagilov at acronis.com>
Date:   Mon May 25 15:56:37 2020 +0300

    fs/fuse kio: align CS messages to 512 bytes
    
    CS now receives client messages in batches into large contiguous
    buffers. The csd_aio and csd_next modes require data buffers to be
    512-byte aligned so that they can be used for O_DIRECT. This means
    that every message (including the message header) must now be aligned.
    
    Message alignment is used only if the storage version is greater than
    or equal to PCS_CS_MSG_ALIGNED_VERSION.
    
    https://pmc.acronis.com/browse/VSTOR-33830
    
    Signed-off-by: Ildar Ismagilov <ildar.ismagilov at virtuozzo.com>
    Acked-by: Andrey Zaitsev <azaitsev at virtuozzo.com>
    Acked-by: Alexey Kuznetsov <kuznet at acronis.com>
---
 fs/fuse/kio/pcs/fuse_stat.c      |  1 +
 fs/fuse/kio/pcs/pcs_cs.c         | 86 +++++++++++++++++++++++++++++++++++-----
 fs/fuse/kio/pcs/pcs_cs.h         |  3 ++
 fs/fuse/kio/pcs/pcs_cs_prot.h    |  9 +++++
 fs/fuse/kio/pcs/pcs_map.c        | 15 ++++---
 fs/fuse/kio/pcs/pcs_prot_types.h |  1 +
 fs/fuse/kio/pcs/pcs_req.h        |  2 +
 7 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/fs/fuse/kio/pcs/fuse_stat.c b/fs/fuse/kio/pcs/fuse_stat.c
index 1bcffa9641a36..47e7f1c404eb5 100644
--- a/fs/fuse/kio/pcs/fuse_stat.c
+++ b/fs/fuse/kio/pcs/fuse_stat.c
@@ -725,6 +725,7 @@ struct fuse_val_stat *req_stat_entry(struct pcs_fuse_io_stat *io, u32 type)
 			return &io->read_bytes;
 		case PCS_CS_WRITE_SYNC_RESP:
 		case PCS_CS_WRITE_RESP:
+		case PCS_CS_WRITE_AL_RESP:
 			return &io->write_bytes;
 		case PCS_CS_SYNC_RESP:
 			return &io->flush_cnt;
diff --git a/fs/fuse/kio/pcs/pcs_cs.c b/fs/fuse/kio/pcs/pcs_cs.c
index 9487e614bce8b..abf714620b9c9 100644
--- a/fs/fuse/kio/pcs/pcs_cs.c
+++ b/fs/fuse/kio/pcs/pcs_cs.c
@@ -82,6 +82,31 @@ static int pcs_cs_percpu_stat_alloc(struct pcs_cs *cs)
 	return -ENOMEM;
 }
 
+u32 pcs_cs_msg_size(u32 size, u32 storage_version) /* Returns size, aligned to PCS_CS_MSG_ALIGNMENT when the storage version uses aligned I/O. */
+{
+	if (pcs_cs_use_aligned_io(storage_version)) /* true when storage_version >= PCS_CS_MSG_ALIGNED_VERSION */
+		size = ALIGN(size, PCS_CS_MSG_ALIGNMENT); /* pad up to the next multiple of the alignment (512 bytes) */
+
+	return size; /* unchanged when aligned I/O is not in use */
+}
+
+struct pcs_msg* pcs_alloc_cs_msg(u32 type, u32 size, u32 storage_version) /* Allocates a zeroed output message for a CS request of the given type; returns NULL if allocation fails. */
+{
+	struct pcs_msg* msg;
+	struct pcs_rpc_hdr* h;
+
+	msg = pcs_rpc_alloc_output_msg(pcs_cs_msg_size(size, storage_version)); /* request the size after optional alignment */
+	if (!msg)
+		return NULL; /* nothing was allocated; the caller has to check for NULL */
+
+	h = (struct pcs_rpc_hdr*)msg_inline_head(msg); /* the RPC header sits at the start of the inline buffer */
+	memset(h, 0, msg->size); /* zero msg->size bytes; NOTE(review): presumably msg->size is the size requested above, so padding is zeroed too - confirm */
+	h->len = msg->size; /* callers may overwrite len later if the final payload is shorter */
+	h->type = type;
+
+	return msg;
+}
+
 static void pcs_cs_percpu_stat_free(struct pcs_cs *cs)
 {
 	free_percpu(cs->stat.sync_ops_rate);
@@ -354,6 +379,7 @@ void pcs_cs_update_stat(struct pcs_cs *cs, u32 iolat, u32 netlat, int op_type)
 	switch (op_type) {
 	case PCS_CS_WRITE_SYNC_RESP:
 	case PCS_CS_WRITE_RESP:
+	case PCS_CS_WRITE_AL_RESP:
 		this_cpu_inc(cs->stat.write_ops_rate->total);
 		break;
 	case PCS_CS_READ_RESP:
@@ -549,6 +575,40 @@ static void cs_get_data(struct pcs_msg *msg, int offset, struct iov_iter *it)
 	}
 }
 
+static void cs_get_data_aligned(struct pcs_msg *msg, int offset, struct iov_iter *it) /* Variant of cs_get_data() for aligned messages: sets up `it` for message byte `offset`, accounting for padding after the header and after the payload. */
+{
+	struct pcs_int_request * ireq = ireq_from_msg(msg);
+	int storage_version = atomic_read(&ireq->cc->storage_version);
+	unsigned hdrsize = pcs_cs_msg_size(sizeof(struct pcs_cs_iohdr),
+					   storage_version); /* header size including its alignment padding */
+	unsigned padding;
+
+	if (offset < sizeof(struct pcs_cs_iohdr)) { /* offset is inside the real header: no fixup needed */
+		cs_get_data(msg, offset, it);
+		return;
+	}
+
+	if (offset < hdrsize) { /* offset is inside the padding that follows the header */
+		BUILD_BUG_ON(sizeof(ireq->cc->nilbuffer) < PCS_CS_MSG_ALIGNMENT); /* compile-time check that nilbuffer is at least one alignment unit long */
+		iov_iter_init_plain(it, ireq->cc->nilbuffer, hdrsize - offset, 0); /* rest of the header padding comes from nilbuffer (presumably zero-filled - confirm) */
+		return;
+	}
+
+	if (offset < hdrsize + ireq->iochunk.size) { /* offset is inside the payload */
+		/* cs_get_data() does not know about header padding, so fixup the offset */
+		offset -= hdrsize - sizeof(struct pcs_cs_iohdr);
+		cs_get_data(msg, offset, it);
+		return;
+	}
+
+	padding = pcs_cs_msg_size(ireq->iochunk.size, storage_version) -
+		  ireq->iochunk.size; /* bytes added after the payload to reach the aligned size */
+	BUG_ON(offset >= hdrsize + ireq->iochunk.size + padding); /* offset must lie before the end of the padded message */
+
+	iov_iter_init_plain(it, ireq->cc->nilbuffer,
+			    hdrsize + ireq->iochunk.size + padding - offset, 0); /* remaining tail padding, again taken from nilbuffer */
+}
+
 static void cs_sent(struct pcs_msg *msg)
 {
 	msg->done = cs_response_done;
@@ -565,6 +625,8 @@ void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq)
 	struct pcs_cs_iohdr *ioh;
 	struct pcs_cs_list *csl = ireq->iochunk.csl;
 	struct pcs_map_entry *map = ireq->iochunk.map; /* ireq keeps reference to map */
+	int storage_version = atomic_read(&ireq->cc->storage_version);
+	int aligned_msg;
 
 	msg->private = cs;
 
@@ -572,15 +634,21 @@ void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq)
 	msg->private2 = ireq;
 
 	ioh = &ireq->iochunk.hbuf;
-	ioh->hdr.len = sizeof(struct pcs_cs_iohdr);
+	ioh->hdr.len = pcs_cs_msg_size(sizeof(struct pcs_cs_iohdr),
+				       storage_version);
+	aligned_msg = pcs_cs_use_aligned_io(storage_version);
 	switch (ireq->iochunk.cmd) {
 	case PCS_REQ_T_READ:
 		ioh->hdr.type = PCS_CS_READ_REQ;
 		break;
 	case PCS_REQ_T_WRITE:
-		ioh->hdr.type = (ireq->dentry->fileinfo.attr.attrib & PCS_FATTR_IMMEDIATE_WRITE) ?
-				PCS_CS_WRITE_SYNC_REQ : PCS_CS_WRITE_REQ;
-		ioh->hdr.len += ireq->iochunk.size;
+		if (aligned_msg)
+			ioh->hdr.type = PCS_CS_WRITE_AL_REQ;
+		else
+			ioh->hdr.type = (ireq->dentry->fileinfo.attr.attrib & PCS_FATTR_IMMEDIATE_WRITE) ?
+					PCS_CS_WRITE_SYNC_REQ : PCS_CS_WRITE_REQ;
+		ioh->hdr.len = pcs_cs_msg_size(ioh->hdr.len + ireq->iochunk.size,
+					       storage_version);
 		break;
 	case PCS_REQ_T_WRITE_HOLE:
 		ioh->hdr.type = PCS_CS_WRITE_HOLE_REQ;
@@ -611,7 +679,7 @@ void pcs_cs_submit(struct pcs_cs *cs, struct pcs_int_request *ireq)
 	msg->rpc = NULL;
 	pcs_clear_error(&msg->error);
 	msg->done = cs_sent;
-	msg->get_iter = cs_get_data;
+	msg->get_iter = aligned_msg ? cs_get_data_aligned : cs_get_data;
 
 	if ((map->state & PCS_MAP_DEAD) || (map->cs_list != csl)) {
 		ireq->error.value = PCS_ERR_CSD_STALE_MAP;
@@ -1085,17 +1153,13 @@ static struct pcs_msg *cs_prep_probe(struct pcs_cs *cs)
 	struct pcs_msg *msg;
 	struct pcs_cs_map_prop *m;
 	unsigned int msg_sz = offsetof(struct pcs_cs_map_prop, nodes) + sizeof(struct pcs_cs_node_desc);
+	int storage_version = atomic_read(&cc_from_csset(cs->css)->storage_version);
 
-
-	msg = pcs_rpc_alloc_output_msg(msg_sz);
+	msg = pcs_alloc_cs_msg(PCS_CS_MAP_PROP_REQ, msg_sz, storage_version);
 	if (!msg)
 		return NULL;
 
 	m = (struct pcs_cs_map_prop *)msg_inline_head(msg);
-	memset(m, 0, msg_sz);
-
-	m->hdr.h.type = PCS_CS_MAP_PROP_REQ;
-	m->hdr.h.len = msg_sz;
 
 	m->flags = CS_MAPF_PING;
 	m->nnodes = 1;
diff --git a/fs/fuse/kio/pcs/pcs_cs.h b/fs/fuse/kio/pcs/pcs_cs.h
index 5a7bee151be8d..81743fd8a3e11 100644
--- a/fs/fuse/kio/pcs/pcs_cs.h
+++ b/fs/fuse/kio/pcs/pcs_cs.h
@@ -201,4 +201,7 @@ static inline bool cs_is_blacklisted(struct pcs_cs *cs)
 
 void pcs_cs_set_stat_up(struct pcs_cs_set *set);
 
+u32 pcs_cs_msg_size(u32 size, u32 storage_version);
+struct pcs_msg* pcs_alloc_cs_msg(u32 type, u32 size, u32 storage_version);
+
 #endif /* _PCS_CS_H_ */
diff --git a/fs/fuse/kio/pcs/pcs_cs_prot.h b/fs/fuse/kio/pcs/pcs_cs_prot.h
index 8ca6cbabf7418..12ffbf94cb2e7 100644
--- a/fs/fuse/kio/pcs/pcs_cs_prot.h
+++ b/fs/fuse/kio/pcs/pcs_cs_prot.h
@@ -6,6 +6,8 @@
 #define PCS_CS_FLUSH_WEIGHT	(128*1024)
 #define PCS_CS_HOLE_WEIGHT	(4096)
 
+#define PCS_CS_MSG_ALIGNMENT	(512ULL)
+
 struct pcs_cs_sync_data
 {
 	PCS_INTEGRITY_SEQ_T	integrity_seq;	/* Invariant. Changed only on CS host crash */
@@ -67,6 +69,10 @@ struct pcs_cs_iohdr {
 	struct pcs_cs_sync_resp sync_resp[0];	/* Used only in response to write/sync */
 } __attribute__((aligned(8)));
 
+static inline int pcs_cs_use_aligned_io(u32 storage_version) /* Returns nonzero when storage_version is at least PCS_CS_MSG_ALIGNED_VERSION. */
+{
+	return (storage_version >= PCS_CS_MSG_ALIGNED_VERSION); /* older storage versions keep the unaligned message layout */
+}
 
 /* Maximal message size. Actually, random */
 #define PCS_CS_MSG_MAX_SIZE	(1024*1024 + sizeof(struct pcs_cs_iohdr))
@@ -86,6 +92,9 @@ struct pcs_cs_iohdr {
 #define PCS_CS_WRITE_SYNC_REQ	(PCS_RPC_CS_CLIENT_BASE + 8)
 #define PCS_CS_WRITE_SYNC_RESP	(PCS_CS_WRITE_SYNC_REQ|PCS_RPC_DIRECTION)
 
+#define PCS_CS_WRITE_AL_REQ	(PCS_RPC_CS_CLIENT_BASE + 20)
+#define PCS_CS_WRITE_AL_RESP	(PCS_CS_WRITE_AL_REQ|PCS_RPC_DIRECTION)
+
 struct pcs_cs_cong_notification {
 	struct pcs_rpc_hdr	hdr;
 
diff --git a/fs/fuse/kio/pcs/pcs_map.c b/fs/fuse/kio/pcs/pcs_map.c
index d70ef8fea70e8..89caac4284a1d 100644
--- a/fs/fuse/kio/pcs/pcs_map.c
+++ b/fs/fuse/kio/pcs/pcs_map.c
@@ -2575,6 +2575,7 @@ static int commit_cs_record(struct pcs_map_entry * m, struct pcs_cs_record * rec
 	BUG_ON(srec->dirty_integrity && srec->dirty_integrity != sync->integrity_seq);
 
 	dirtify = (op_type == PCS_CS_WRITE_SYNC_RESP || op_type == PCS_CS_WRITE_RESP ||
+		   op_type == PCS_CS_WRITE_AL_RESP ||
 		   op_type == PCS_CS_WRITE_HOLE_RESP || op_type == PCS_CS_WRITE_ZERO_RESP);
 	/* The following looks scary, could be more clear.
 	 * The goal is to update sync seq numbers:
@@ -2926,17 +2927,15 @@ static void prepare_map_flush_msg(struct pcs_map_entry * m, struct pcs_int_reque
 {
 	struct pcs_cs_iohdr * ioh;
 	struct pcs_cs_sync_resp * arr;
+	unsigned varsize = 0;
 
 	assert_spin_locked(&m->lock);
 
 	ioh = (struct pcs_cs_iohdr *)msg->_inline_buffer;
 	arr = (struct pcs_cs_sync_resp *)(ioh + 1);
 
-	ioh->hdr.len = sizeof(struct pcs_cs_iohdr);
-	ioh->hdr.type = PCS_CS_SYNC_REQ;
 	memset(&ioh->sync, 0, sizeof(ioh->sync));
 	ioh->offset = 0;
-	ioh->size = 0;
 	ioh->_reserved = 0;
 	ioh->sync.misc = PCS_CS_IO_SEQ;
 
@@ -2959,7 +2958,7 @@ static void prepare_map_flush_msg(struct pcs_map_entry * m, struct pcs_int_reque
 				arr->sync.ts_io = 0;
 				arr->sync.ts_net = 0;
 				arr->sync._reserved = 0;
-				ioh->hdr.len += sizeof(struct pcs_cs_sync_resp);
+				varsize += sizeof(struct pcs_cs_sync_resp);
 				FUSE_KLOG(cc_from_maps(m->maps)->fc, LOG_DEBUG5, "fill sync "NODE_FMT" [%d,%d,%d,%d]", NODE_ARGS(arr->cs_id),
 					arr->sync.integrity_seq, arr->sync.sync_epoch,
 					arr->sync.sync_dirty, arr->sync.sync_current);
@@ -2967,6 +2966,9 @@ static void prepare_map_flush_msg(struct pcs_map_entry * m, struct pcs_int_reque
 			}
 		}
 	}
+	ioh->size = varsize;
+	ioh->hdr.len = pcs_cs_msg_size(sizeof(struct pcs_cs_iohdr) + varsize,
+				       atomic_read(&cc_from_map(m)->storage_version));
 	msg->size = ioh->hdr.len;
 	msg->private = sreq;
 	msg->done = sync_done;
@@ -3019,8 +3021,9 @@ static int prepare_map_flush_ireq(struct pcs_map_entry *m,
 	if (!sreq)
 		goto err_cslist;
 
-	msg = pcs_rpc_alloc_output_msg(sizeof(struct pcs_cs_iohdr) +
-				       cslist->nsrv * sizeof(struct pcs_cs_sync_resp));
+	msg = pcs_alloc_cs_msg(PCS_CS_SYNC_REQ, sizeof(struct pcs_cs_iohdr) +
+			       cslist->nsrv * sizeof(struct pcs_cs_sync_resp),
+			       atomic_read(&cc_from_map(m)->storage_version));
 	if (!msg)
 		goto err_ireq;
 
diff --git a/fs/fuse/kio/pcs/pcs_prot_types.h b/fs/fuse/kio/pcs/pcs_prot_types.h
index 638b076674678..d48cfc4f0470d 100644
--- a/fs/fuse/kio/pcs/pcs_prot_types.h
+++ b/fs/fuse/kio/pcs/pcs_prot_types.h
@@ -15,6 +15,7 @@
 #define PCS_VERSION_UNKNOWN 0
 
 #define PCS_VZ7_VERSION 100
+#define PCS_CS_MSG_ALIGNED_VERSION 134
 
 /* milliseconds since Jan 1970 */
 typedef u64 PCS_FILETIME_T;
diff --git a/fs/fuse/kio/pcs/pcs_req.h b/fs/fuse/kio/pcs/pcs_req.h
index 33f0fe9e7cb55..722175a1132f6 100644
--- a/fs/fuse/kio/pcs/pcs_req.h
+++ b/fs/fuse/kio/pcs/pcs_req.h
@@ -239,6 +239,8 @@ struct pcs_cluster_core
 
 	char cluster_name[NAME_MAX];
 	atomic_t storage_version;
+
+	char nilbuffer[PCS_CS_MSG_ALIGNMENT];
 };
 
 static inline struct pcs_cluster_core *cc_from_csset(struct pcs_cs_set * css)


More information about the Devel mailing list