[Devel] [PATCH VZ9] fs/fuse kio: introduce a new rpc affinity mode
Liu Kui
kui.liu at virtuozzo.com
Fri Feb 7 11:59:24 MSK 2025
In the default rpc affinity mode, rpc work is scheduled to run on the sender's
cpu. However, for certain workloads this mode has a serious problem: the
majority of rpc work ends up running on just one or two cpus even though the
rest of the cpus are idle, resulting in a significant drop in performance.
Worse, once rpc work has concentrated on a few cpus, it can no longer escape
from that state.
The newly added mode tries to prevent such concentration by capping the number
of rpc works assigned to each cpu, while still preferring affinity to the
sender's cpu. Initial tests show a significant performance improvement for
some workloads but a degradation for others, so a comprehensive test is still
needed to weigh the pros and cons.
Related to #VSTOR-99387
Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
---
fs/fuse/kio/pcs/pcs_rpc.c | 96 +++++++++++++++++++++++++++++++++++++++
fs/fuse/kio/pcs/pcs_rpc.h | 7 +++
2 files changed, 103 insertions(+)
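Note (not part of the patch): a minimal user-space sketch of the capping
policy described above, for illustration only. NR_CPUS, nr_attached[] and
select_cpu() are stand-in names rather than the kernel's per-cpu machinery,
and the NUMA-local search step is omitted; the cap mirrors the patch:
nrpcs / nr_cpu_ids + rpc_cpu_nr_base.

	#include <stdio.h>

	#define NR_CPUS         8
	#define RPC_CPU_NR_BASE 2

	static int nr_attached[NR_CPUS];	/* rpc works currently bound to each cpu */

	static int select_cpu(int sender_cpu, int total_rpcs)
	{
		int cap = total_rpcs / NR_CPUS + RPC_CPU_NR_BASE;
		int cpu;

		/* Prefer the sender's cpu while it is under the cap. */
		if (nr_attached[sender_cpu] < cap)
			return sender_cpu;

		/* Otherwise fall back to any cpu that still has room. */
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			if (nr_attached[cpu] < cap)
				return cpu;

		return -1;	/* no cpu under the cap; the patch WARNs and unbinds */
	}

	int main(void)
	{
		int total_rpcs = 40, i;

		/* Simulate 40 rpc works whose senders all run on cpu 0. */
		for (i = 0; i < total_rpcs; i++) {
			int cpu = select_cpu(0, total_rpcs);
			if (cpu >= 0)
				nr_attached[cpu]++;
		}
		for (i = 0; i < NR_CPUS; i++)
			printf("cpu%d: %d rpc works\n", i, nr_attached[i]);
		return 0;
	}

With 40 rpc works all arriving on cpu 0 and a cap of 40/8 + 2 = 7, only the
first 7 stay on cpu 0 and the rest spill over to the other cpus instead of
piling up on the sender's cpu.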
diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c
index e74448f2074a..b9774ce1ab34 100644
--- a/fs/fuse/kio/pcs/pcs_rpc.c
+++ b/fs/fuse/kio/pcs/pcs_rpc.c
@@ -44,8 +44,18 @@ static unsigned long rpc_cpu_time_slice = PCS_RPC_CPU_SLICE;
module_param(rpc_cpu_time_slice, ulong, 0644);
MODULE_PARM_DESC(rpc_cpu_time_slice, "Time slice for RPC rebinding");
+static unsigned long rpc_cpu_timeout = PCS_RPC_CPU_TIMEOUT; /* 500 ms */
+module_param(rpc_cpu_timeout, ulong, 0644);
+MODULE_PARM_DESC(rpc_cpu_timeout, "Timeout for resetting RPC cpu binding after becoming idle");
+
+static unsigned int rpc_cpu_nr_base = 2;
+module_param(rpc_cpu_nr_base, uint, 0644);
+MODULE_PARM_DESC(rpc_cpu_nr_base, "Minimum cap on the number of rpcs per cpu");
+
DECLARE_WAIT_QUEUE_HEAD(pcs_waitq);
+static DEFINE_PER_CPU(struct pcs_rpc_cpu, rpc_cpu) = { .nr_attached = ATOMIC_INIT(0) };
+
static void timer_work(struct work_struct *w);
static int rpc_gc_classify(struct pcs_rpc * ep);
@@ -360,6 +370,7 @@ static void pcs_rpc_destroy(struct pcs_rpc *ep)
cancel_delayed_work_sync(&ep->calendar_work);
flush_work(&ep->work);
+ flush_delayed_work(&ep->cpu_timer_work);
/* pcs_free(ep->sun); */
/* ep->sun = NULL; */
@@ -789,6 +800,61 @@ static int pcs_rpc_cpu_next(void)
return new;
}
+static void pcs_rpc_cpu_select(struct pcs_rpc *ep)
+{
+ struct pcs_rpc_cpu *prc;
+ int cpu, node, max_rpc_per_cpu;
+
+ if (ep->cpu != WORK_CPU_UNBOUND)
+ atomic_dec_if_positive(&per_cpu_ptr(&rpc_cpu, ep->cpu)->nr_attached);
+
+ /*
+ * lock protection for reading eng->nrpcs is unnecessary, as
+ * we just need to derive a rough value.
+ */
+ max_rpc_per_cpu = ep->eng->nrpcs / nr_cpu_ids + rpc_cpu_nr_base;
+
+ /* Check the current cpu first. */
+ cpu = smp_processor_id();
+ prc = per_cpu_ptr(&rpc_cpu, cpu);
+ if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
+ goto found;
+
+ /* Try to find one cpu from same numa node. */
+ node = cpu_to_node(cpu);
+ cpu = cpumask_first_and(cpumask_of_node(node), cpu_online_mask);
+ while (cpu < nr_cpu_ids) {
+ prc = per_cpu_ptr(&rpc_cpu, cpu);
+ if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
+ goto found;
+ cpu = cpumask_next_and(cpu, cpumask_of_node(node), cpu_online_mask);
+ }
+
+ /*
+ * Otherwise, search all online cpus. This is somewhat inefficient, but we
+ * don't expect this function to be called frequently on the performance-critical
+ * path, so simplicity is preferred.
+ */
+ for_each_online_cpu(cpu) {
+ prc = per_cpu_ptr(&rpc_cpu, cpu);
+ if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
+ goto found;
+ }
+
+ /* Should not reach here. */
+ WARN_ONCE(1, "Failed to find a cpu for pcs_rpc work");
+ ep->cpu = WORK_CPU_UNBOUND;
+
+ return;
+
+found:
+ atomic_inc(&prc->nr_attached);
+ ep->cpu = cpu;
+ ep->cpu_stamp = jiffies + rpc_cpu_time_slice;
+ if (unlikely(!timer_pending(&ep->cpu_timer_work.timer)))
+ mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, rpc_cpu_timeout);
+}
+
static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
{
switch(rpc_affinity_mode) {
@@ -814,6 +880,10 @@ static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
ep->cpu = pcs_rpc_cpu_next();
}
break;
+ case RPC_AFFINITY_FAIR_SPREAD:
+ if (time_is_before_jiffies(ep->cpu_stamp) && was_idle)
+ pcs_rpc_cpu_select(ep);
+ break;
default:
pr_err("Unknown affinity mode: %u\n", rpc_affinity_mode);
}
@@ -834,6 +904,31 @@ void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg)
pcs_rpc_kick_queue(ep);
}
+static void rpc_cpu_timer_work(struct work_struct *w)
+{
+ struct pcs_rpc *ep = container_of(w, struct pcs_rpc, cpu_timer_work.work);
+ struct pcs_rpc_cpu *prc;
+
+ if (unlikely(ep->cpu == WORK_CPU_UNBOUND))
+ return;
+
+ spin_lock(&ep->q_lock);
+ if ((ep->state == PCS_RPC_WORK) &&
+ time_is_after_jiffies(ep->cpu_stamp + rpc_cpu_timeout)) {
+ unsigned long timeout;
+
+ spin_unlock(&ep->q_lock);
+ timeout = rpc_cpu_timeout - (jiffies - ep->cpu_stamp);
+ mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, timeout);
+ return;
+ }
+
+ prc = per_cpu_ptr(&rpc_cpu, ep->cpu);
+ ep->cpu = WORK_CPU_UNBOUND;
+ atomic_dec(&prc->nr_attached);
+ spin_unlock(&ep->q_lock);
+}
+
static void calendar_work(struct work_struct *w)
{
struct pcs_rpc * ep = container_of(w, struct pcs_rpc, calendar_work.work);
@@ -1022,6 +1117,7 @@ void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm,
INIT_WORK(&ep->close_work, rpc_close_work);
INIT_DELAYED_WORK(&ep->timer_work, timer_work);
INIT_DELAYED_WORK(&ep->calendar_work, calendar_work);
+ INIT_DELAYED_WORK(&ep->cpu_timer_work, rpc_cpu_timer_work);
for (i = 0; i < RPC_MAX_CALENDAR; i++)
INIT_HLIST_HEAD(&ep->kill_calendar[i]);
diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h
index baec7f844e38..cb18557a3da5 100644
--- a/fs/fuse/kio/pcs/pcs_rpc.h
+++ b/fs/fuse/kio/pcs/pcs_rpc.h
@@ -40,6 +40,7 @@ enum {
RPC_AFFINITY_RETENT = 1,
RPC_AFFINITY_SPREAD = 2,
RPC_AFFINITY_RSS = 3,
+ RPC_AFFINITY_FAIR_SPREAD = 4,
};
extern unsigned int rpc_affinity_mode;
@@ -78,6 +79,7 @@ typedef union __pre_aligned(8) _PCS_CLUSTER_ID_T {
/////////////////////////////
#define PCS_RPC_CPU_SLICE (100 * HZ / 1000) /* 100ms */
+#define PCS_RPC_CPU_TIMEOUT (500 * HZ / 1000) /* 500ms */
struct pcs_rpc
{
struct hlist_node link; /* Link in hash table */
@@ -139,6 +141,7 @@ struct pcs_rpc
struct list_head input_queue; /* Queue of requests waiting to be handled */
int cpu;
unsigned long cpu_stamp;
+ struct delayed_work cpu_timer_work; /* reset cpu affinity after being idle */
struct mutex mutex;
u64 accounted;
@@ -160,6 +163,10 @@ struct pcs_rpc
struct work_struct close_work;
};
+struct pcs_rpc_cpu {
+ atomic_t nr_attached;
+};
+
struct pcs_rpc_engine
{
spinlock_t lock;
--
2.39.5 (Apple Git-154)