[Devel] [PATCH VZ9] fs/fuse kio: introduce a new rpc affinity mode
Alexey Kuznetsov
kuznet at virtuozzo.com
Fri Feb 7 13:22:33 MSK 2025
Ack. Let's give this a try. It might solve the problems we have with the
default affinity algo.
On Fri, Feb 7, 2025 at 5:11 PM Liu Kui <kui.liu at virtuozzo.com> wrote:
>
> Currently, with the default rpc affinity mode, rpc work is scheduled to
> run on the sender's cpu. However, this mode has a serious problem for
> certain workloads: a majority of the rpc work ends up running on just
> one or two cpus even though the rest of the cpus are idle, resulting in
> a significant drop in performance. This concentration is especially
> harmful because once it happens, the rpc work never escapes from those
> few cpus.
>
> The newly added mode tries to prevent the concentration by capping the
> number of rpc works assigned to each cpu, while still prioritizing
> affinity to the sender's cpu. Initial tests show a significant
> performance improvement for some workloads, but also a degradation for
> others; a comprehensive test is still needed to weigh the pros and
> cons.
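>
> As a rough illustration of the cap (hypothetical numbers, not part of
> the patch): if the engine currently has nrpcs = 100 and the system has
> nr_cpu_ids = 32, pcs_rpc_cpu_select() computes
>
>	max_rpc_per_cpu = nrpcs / nr_cpu_ids + rpc_cpu_nr_base
>	                = 100 / 32 + 2   (integer division)
>	                = 5
>
> so any cpu that already has 5 rpcs attached is skipped, first within
> the sender's NUMA node and then across all online cpus.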
>
> Related to #VSTOR-99387
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
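> Note (illustrative, not part of the commit message): the new mode is
> selected when rpc_affinity_mode == RPC_AFFINITY_FAIR_SPREAD (4).
> Assuming rpc_affinity_mode is exposed as a writable module parameter
> like the ones added in this patch (the module name below is a
> placeholder), it could be switched at runtime with something like:
>
>	echo 4 > /sys/module/<module>/parameters/rpc_affinity_mode
>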
> fs/fuse/kio/pcs/pcs_rpc.c | 96 +++++++++++++++++++++++++++++++++++++++
> fs/fuse/kio/pcs/pcs_rpc.h | 7 +++
> 2 files changed, 103 insertions(+)
>
> diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c
> index e74448f2074a..b9774ce1ab34 100644
> --- a/fs/fuse/kio/pcs/pcs_rpc.c
> +++ b/fs/fuse/kio/pcs/pcs_rpc.c
> @@ -44,8 +44,18 @@ static unsigned long rpc_cpu_time_slice = PCS_RPC_CPU_SLICE;
> module_param(rpc_cpu_time_slice, ulong, 0644);
> MODULE_PARM_DESC(rpc_cpu_time_slice, "Time slice for RPC rebinding");
>
> +static unsigned long rpc_cpu_timeout = PCS_RPC_CPU_TIMEOUT; /* 500ms */
> +module_param(rpc_cpu_timeout, ulong, 0644);
> +MODULE_PARM_DESC(rpc_cpu_timeout, "Timeout for RPC cpu binding after becoming idle");
> +
> +static unsigned int rpc_cpu_nr_base = 2;
> +module_param(rpc_cpu_nr_base, uint, 0644);
> +MODULE_PARM_DESC(rpc_cpu_nr_base, "Minimum cap on the number of rpcs per cpu");
> +
> DECLARE_WAIT_QUEUE_HEAD(pcs_waitq);
>
> +static DEFINE_PER_CPU(struct pcs_rpc_cpu, rpc_cpu) = { .nr_attached = ATOMIC_INIT(0) };
> +
> static void timer_work(struct work_struct *w);
> static int rpc_gc_classify(struct pcs_rpc * ep);
>
> @@ -360,6 +370,7 @@ static void pcs_rpc_destroy(struct pcs_rpc *ep)
>
> cancel_delayed_work_sync(&ep->calendar_work);
> flush_work(&ep->work);
> + flush_delayed_work(&ep->cpu_timer_work);
>
> /* pcs_free(ep->sun); */
> /* ep->sun = NULL; */
> @@ -789,6 +800,61 @@ static int pcs_rpc_cpu_next(void)
> return new;
> }
>
> +static void pcs_rpc_cpu_select(struct pcs_rpc *ep)
> +{
> + struct pcs_rpc_cpu *prc;
> + int cpu, node, max_rpc_per_cpu;
> +
> + if (ep->cpu != WORK_CPU_UNBOUND)
> + atomic_dec_if_positive(&per_cpu_ptr(&rpc_cpu, ep->cpu)->nr_attached);
> +
> + /*
> + * lock protection for reading eng->nrpcs is unnecessary, as
> + * we just need to derive a rough value.
> + */
> + max_rpc_per_cpu = ep->eng->nrpcs / nr_cpu_ids + rpc_cpu_nr_base;
> +
> +	/* Check the current cpu first. */
> + cpu = smp_processor_id();
> + prc = per_cpu_ptr(&rpc_cpu, cpu);
> + if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
> + goto found;
> +
> + /* Try to find one cpu from same numa node. */
> + node = cpu_to_node(cpu);
> + cpu = cpumask_first_and(cpumask_of_node(node), cpu_online_mask);
> + while (cpu < nr_cpu_ids) {
> + prc = per_cpu_ptr(&rpc_cpu, cpu);
> + if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
> + goto found;
> + cpu = cpumask_next_and(cpu, cpumask_of_node(node), cpu_online_mask);
> + }
> +
> + /*
> +	 * Otherwise, search all cpus to find one. This is a bit inefficient,
> +	 * but we don't expect this function to be called frequently on the
> +	 * performance-critical path, so simplicity is preferred.
> + */
> + for_each_online_cpu(cpu) {
> + prc = per_cpu_ptr(&rpc_cpu, cpu);
> + if (atomic_read(&prc->nr_attached) < max_rpc_per_cpu)
> + goto found;
> + }
> +
> +	/* Should not reach here */
> + WARN_ONCE(1, "Failed to find a cpu for pcs_rpc work");
> + ep->cpu = WORK_CPU_UNBOUND;
> +
> + return;
> +
> +found:
> + atomic_inc(&prc->nr_attached);
> + ep->cpu = cpu;
> + ep->cpu_stamp = jiffies + rpc_cpu_time_slice;
> + if (unlikely(!timer_pending(&ep->cpu_timer_work.timer)))
> + mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, rpc_cpu_timeout);
> +}
> +
> static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
> {
> switch(rpc_affinity_mode) {
> @@ -814,6 +880,10 @@ static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
> ep->cpu = pcs_rpc_cpu_next();
> }
> break;
> + case RPC_AFFINITY_FAIR_SPREAD:
> + if (time_is_before_jiffies(ep->cpu_stamp) && was_idle)
> + pcs_rpc_cpu_select(ep);
> + break;
> default:
> pr_err("Unknown affinity mode: %u\n", rpc_affinity_mode);
> }
> @@ -834,6 +904,31 @@ void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg)
> pcs_rpc_kick_queue(ep);
> }
>
> +static void rpc_cpu_timer_work(struct work_struct *w)
> +{
> + struct pcs_rpc *ep = container_of(w, struct pcs_rpc, cpu_timer_work.work);
> + struct pcs_rpc_cpu *prc;
> +
> + if (unlikely(ep->cpu == WORK_CPU_UNBOUND))
> + return;
> +
> + spin_lock(&ep->q_lock);
> + if ((ep->state == PCS_RPC_WORK) &&
> + time_is_after_jiffies(ep->cpu_stamp + rpc_cpu_timeout)) {
> + unsigned long timeout;
> +
> + spin_unlock(&ep->q_lock);
> + timeout = rpc_cpu_timeout - (jiffies - ep->cpu_stamp);
> + mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, timeout);
> + return;
> + }
> +
> + prc = per_cpu_ptr(&rpc_cpu, ep->cpu);
> + ep->cpu = WORK_CPU_UNBOUND;
> + atomic_dec(&prc->nr_attached);
> + spin_unlock(&ep->q_lock);
> +}
> +
> static void calendar_work(struct work_struct *w)
> {
> struct pcs_rpc * ep = container_of(w, struct pcs_rpc, calendar_work.work);
> @@ -1022,6 +1117,7 @@ void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm,
> INIT_WORK(&ep->close_work, rpc_close_work);
> INIT_DELAYED_WORK(&ep->timer_work, timer_work);
> INIT_DELAYED_WORK(&ep->calendar_work, calendar_work);
> + INIT_DELAYED_WORK(&ep->cpu_timer_work, rpc_cpu_timer_work);
>
> for (i = 0; i < RPC_MAX_CALENDAR; i++)
> INIT_HLIST_HEAD(&ep->kill_calendar[i]);
> diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h
> index baec7f844e38..cb18557a3da5 100644
> --- a/fs/fuse/kio/pcs/pcs_rpc.h
> +++ b/fs/fuse/kio/pcs/pcs_rpc.h
> @@ -40,6 +40,7 @@ enum {
> RPC_AFFINITY_RETENT = 1,
> RPC_AFFINITY_SPREAD = 2,
> RPC_AFFINITY_RSS = 3,
> + RPC_AFFINITY_FAIR_SPREAD = 4,
> };
>
> extern unsigned int rpc_affinity_mode;
> @@ -78,6 +79,7 @@ typedef union __pre_aligned(8) _PCS_CLUSTER_ID_T {
> /////////////////////////////
>
> #define PCS_RPC_CPU_SLICE (100 * HZ / 1000) /* 100ms */
> +#define PCS_RPC_CPU_TIMEOUT (500 * HZ / 1000) /* 500ms */
> struct pcs_rpc
> {
> struct hlist_node link; /* Link in hash table */
> @@ -139,6 +141,7 @@ struct pcs_rpc
> struct list_head input_queue; /* Queue of requests waiting to be handled */
> int cpu;
> unsigned long cpu_stamp;
> + struct delayed_work cpu_timer_work; /* reset cpu affinity after being idle */
>
> struct mutex mutex;
> u64 accounted;
> @@ -160,6 +163,10 @@ struct pcs_rpc
> struct work_struct close_work;
> };
>
> +struct pcs_rpc_cpu {
> + atomic_t nr_attached;
> +};
> +
> struct pcs_rpc_engine
> {
> spinlock_t lock;
> --
> 2.39.5 (Apple Git-154)