[Devel] [PATCH RHEL9 COMMIT] fs/fuse: hashed write buckets

Konstantin Khorenko khorenko at virtuozzo.com
Mon Feb 5 22:30:10 MSK 2024


The commit is pushed to "branch-rh9-5.14.0-362.8.1.vz9.35.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-362.8.1.vz9.35.10
------>
commit 7dee4f82540e2828718e0ca5df4704621654c714
Author: Alexey Kuznetsov <kuznet at virtuozzo.com>
Date:   Tue Feb 6 01:34:44 2024 +0800

    fs/fuse: hashed write buckets
    
    Previous write record was 3.5G/sec and this ceiling
    could not be penetrated even though the eventloop had lots
    of spare cpu. The bottleneck is diagnosed as saturation
    of the thread copying requests from the kernel. So, we have
    to switch to spreading it over multiple threads,
    similar to the scheme used for reads. So, for writes we
    introduce a two-level table, keyed by request size
    and by inode hash.
    
    New record is 4.8G/sec. Further progress is not going to be
    easy, as now we simultaneously:
     - saturate the eventloop
     - saturate the ec_offload thread; increasing the number of offload
       threads does not help, we just get 2 threads, each 50% busy
     - are near the 100Gbit Ethernet limit, at 9.6G/sec of network load
    
    https://pmc.acronis.work/browse/VSTOR-79527
    
    Signed-off-by: Alexey Kuznetsov <kuznet at acronis.com>
---
 fs/fuse/file.c   | 4 ++--
 fs/fuse/fuse_i.h | 1 +
 fs/fuse/inode.c  | 9 +++++++--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 418e0475199a..978007aee323 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -799,8 +799,8 @@ struct fuse_iqueue *fuse_route_io(struct fuse_conn *fc, struct fuse_rtable *rt,
 		if (iosize == 0)
 			return NULL;
 
-		for (i = 0; i < rt->rt_size; i++) {
-			fiq = rt->iqs_table + i;
+		for (i = 0; i < rt->rt_size; i += rt->divisor) {
+			fiq = rt->iqs_table + i + (jhash_1word((u32)inode->i_ino, 0) % rt->divisor);
 			if (iosize <= fiq->size && fiq->handled_by_fud)
 				return fiq;
 		}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fa1502151193..0c1bd5209dbc 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -682,6 +682,7 @@ static inline unsigned int fuse_qhash_bucket(void)
 struct fuse_rtable {
 	int	type;
 	int	rt_size;
+	int	divisor;
 	union {
 		void				*iqs;
 		struct fuse_iqueue __percpu	*iqs_cpu;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8ea401dd46df..43e1c53d35dd 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1058,6 +1058,8 @@ static int alloc_rt_table(struct fuse_dev *fud, struct fuse_rtable *rt,
 	int res = -EINVAL;
 	int idx;
 
+	rt->divisor = 1;
+
 	switch (req->type) {
 	case FUSE_ROUTING_CPU:
 		if (req->index >= NR_CPUS || !cpu_possible(req->index))
@@ -1073,7 +1075,9 @@ static int alloc_rt_table(struct fuse_dev *fud, struct fuse_rtable *rt,
 		res = 0;
 		break;
 	case FUSE_ROUTING_SIZE:
-		if (req->key > FUSE_MAX_MAX_PAGES*PAGE_SIZE || (req->key % PAGE_SIZE))
+		rt->divisor = 1 + (req->key & (PAGE_SIZE - 1));
+		if (rt->divisor > req->table_size ||
+		   (req->key & ~(PAGE_SIZE - 1)) > FUSE_MAX_MAX_PAGES*PAGE_SIZE)
 			break;
 		fallthrough;
 	case FUSE_ROUTING_HASH:
@@ -1097,7 +1101,7 @@ static int alloc_rt_table(struct fuse_dev *fud, struct fuse_rtable *rt,
 static void adjust_rt_table(struct fuse_dev *fud, struct fuse_iqueue *fiq,
 			    struct fuse_iq_routing *req)
 {
-	u32 size = req->key;
+	u32 size = req->key & ~(PAGE_SIZE - 1);
 
 	fiq->size = size;
 
@@ -1143,6 +1147,7 @@ int fuse_install_iq_route(struct fuse_dev *fud, struct fuse_iq_routing *req)
 
 	if (rt->iqs == NULL) {
 		rt->iqs = rtl.iqs;
+		rt->divisor = rtl.divisor;
 		rt->type = rtl.type;
 		rt->rt_size = rtl.rt_size;
 	} else if (rt->iqs != rtl.iqs) {


More information about the Devel mailing list