[Devel] [PATCH VZ9] fs/fuse kio: introduce a new rpc affinity mode

Alexey Kuznetsov kuznet at virtuozzo.com
Fri Feb 7 13:22:33 MSK 2025


Ack. Let's give this a try. It might solve the problems we have with
the default affinity algo.

On Fri, Feb 7, 2025 at 5:11 PM Liu Kui <kui.liu at virtuozzo.com> wrote:
>
> With the default rpc affinity mode, rpc work is scheduled to run on
> the sender's cpu. However, this mode has a serious problem with
> certain workloads: the majority of rpc work ends up running on just
> one or two cpus even though the rest of the cpus are idle, resulting
> in a significant drop in performance. This concentration of rpc work
> on a few cpus is fatal in that, once it happens, the system can no
> longer escape from it.
>
> The newly added mode tries to prevent this concentration by capping
> the number of rpc works assigned to each cpu, while still preferring
> affinity to the sender's cpu. Initial tests show a quite significant
> performance improvement for some workloads, but also degradation for
> some others; a comprehensive test is still needed to compare the pros
> and cons.
>
> Related to #VSTOR-99387
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
> ---
>  fs/fuse/kio/pcs/pcs_rpc.c | 96 +++++++++++++++++++++++++++++++++++++++
>  fs/fuse/kio/pcs/pcs_rpc.h |  7 +++
>  2 files changed, 103 insertions(+)
>
> diff --git a/fs/fuse/kio/pcs/pcs_rpc.c b/fs/fuse/kio/pcs/pcs_rpc.c
> index e74448f2074a..b9774ce1ab34 100644
> --- a/fs/fuse/kio/pcs/pcs_rpc.c
> +++ b/fs/fuse/kio/pcs/pcs_rpc.c
> @@ -44,8 +44,18 @@ static unsigned long rpc_cpu_time_slice = PCS_RPC_CPU_SLICE;
>  module_param(rpc_cpu_time_slice, ulong, 0644);
>  MODULE_PARM_DESC(rpc_cpu_time_slice, "Time slice for RPC rebinding");
>
> +static unsigned long rpc_cpu_timeout = PCS_RPC_CPU_TIMEOUT; /* 500ms */
> +module_param(rpc_cpu_timeout, ulong, 0644);
> +MODULE_PARM_DESC(rpc_cpu_timeout, "Timeout for releasing RPC cpu binding after becoming idle");
> +
> +static unsigned int rpc_cpu_nr_base = 2;
> +module_param(rpc_cpu_nr_base, uint, 0644);
> +MODULE_PARM_DESC(rpc_cpu_nr_base, "Minimum cap on the number of rpcs per cpu");
> +
>  DECLARE_WAIT_QUEUE_HEAD(pcs_waitq);
>
> +static DEFINE_PER_CPU(struct pcs_rpc_cpu, rpc_cpu) = { .nr_attached = ATOMIC_INIT(0) };
> +
>  static void timer_work(struct work_struct *w);
>  static int rpc_gc_classify(struct pcs_rpc * ep);
>
> @@ -360,6 +370,7 @@ static void pcs_rpc_destroy(struct pcs_rpc *ep)
>
>         cancel_delayed_work_sync(&ep->calendar_work);
>         flush_work(&ep->work);
> +       flush_delayed_work(&ep->cpu_timer_work);
>
>         /* pcs_free(ep->sun); */
>         /* ep->sun = NULL; */
> @@ -789,6 +800,61 @@ static int pcs_rpc_cpu_next(void)
>         return new;
>  }
>
> +static void pcs_rpc_cpu_select(struct pcs_rpc *ep)
> +{
> +       struct pcs_rpc_cpu *prc;
> +       int cpu, node, max_rpc_per_cpu;
> +
> +       if (ep->cpu != WORK_CPU_UNBOUND)
> +               atomic_dec_if_positive(&per_cpu_ptr(&rpc_cpu, ep->cpu)->nr_attached);
> +
> +       /*
> +        * lock protection for reading eng->nrpcs is unnecessary, as
> +        * we just need to derive a rough value.
> +        */
> +       max_rpc_per_cpu = ep->eng->nrpcs / nr_cpu_ids + rpc_cpu_nr_base;
> +
> +       /* Check current cpu first. */
> +       cpu = smp_processor_id();
> +       prc = per_cpu_ptr(&rpc_cpu, cpu);
> +       if (atomic_read(&prc->nr_attached) <  max_rpc_per_cpu)
> +               goto found;
> +
> +       /* Try to find one cpu from same numa node. */
> +       node = cpu_to_node(cpu);
> +       cpu = cpumask_first_and(cpumask_of_node(node), cpu_online_mask);
> +       while (cpu < nr_cpu_ids) {
> +               prc = per_cpu_ptr(&rpc_cpu, cpu);
> +               if (atomic_read(&prc->nr_attached) <  max_rpc_per_cpu)
> +                       goto found;
> +               cpu = cpumask_next_and(cpu, cpumask_of_node(node), cpu_online_mask);
> +       }
> +
> +       /*
> +        * Otherwise, search all cpus to find one. It is a bit inefficient here,
> +        * however we don't expect this function to be called frequently in performance
> +        * critical path. So simplicity is preferred.
> +        */
> +       for_each_online_cpu(cpu) {
> +               prc = per_cpu_ptr(&rpc_cpu, cpu);
> +               if (atomic_read(&prc->nr_attached) <  max_rpc_per_cpu)
> +                       goto found;
> +       }
> +
> +       /* Should not reach here. */
> +       WARN_ONCE(1, "Failed to find a cpu for pcs_rpc work");
> +       ep->cpu = WORK_CPU_UNBOUND;
> +
> +       return;
> +
> +found:
> +       atomic_inc(&prc->nr_attached);
> +       ep->cpu = cpu;
> +       ep->cpu_stamp = jiffies + rpc_cpu_time_slice;
> +       if (unlikely(!timer_pending(&ep->cpu_timer_work.timer)))
> +               mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, rpc_cpu_timeout);
> +}
> +
>  static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
>  {
>         switch(rpc_affinity_mode) {
> @@ -814,6 +880,10 @@ static void pcs_rpc_affinity(struct pcs_rpc *ep, bool was_idle)
>                                 ep->cpu = pcs_rpc_cpu_next();
>                         }
>                         break;
> +               case RPC_AFFINITY_FAIR_SPREAD:
> +                       if (time_is_before_jiffies(ep->cpu_stamp) && was_idle)
> +                               pcs_rpc_cpu_select(ep);
> +                       break;
>                 default:
>                         pr_err("Unknown affinity mode: %u\n", rpc_affinity_mode);
>         }
> @@ -834,6 +904,31 @@ void pcs_rpc_queue(struct pcs_rpc * ep, struct pcs_msg * msg)
>                 pcs_rpc_kick_queue(ep);
>  }
>
> +static void rpc_cpu_timer_work(struct work_struct *w)
> +{
> +       struct pcs_rpc *ep = container_of(w, struct pcs_rpc, cpu_timer_work.work);
> +       struct pcs_rpc_cpu *prc;
> +
> +       if (unlikely(ep->cpu == WORK_CPU_UNBOUND))
> +               return;
> +
> +       spin_lock(&ep->q_lock);
> +       if ((ep->state == PCS_RPC_WORK) &&
> +               time_is_after_jiffies(ep->cpu_stamp + rpc_cpu_timeout)) {
> +               unsigned long timeout;
> +
> +               spin_unlock(&ep->q_lock);
> +               timeout = rpc_cpu_timeout - (jiffies - ep->cpu_stamp);
> +               mod_delayed_work(cc_from_rpc(ep->eng)->wq, &ep->cpu_timer_work, timeout);
> +               return;
> +       }
> +
> +       prc = per_cpu_ptr(&rpc_cpu, ep->cpu);
> +       ep->cpu = WORK_CPU_UNBOUND;
> +       atomic_dec(&prc->nr_attached);
> +       spin_unlock(&ep->q_lock);
> +}
> +
>  static void calendar_work(struct work_struct *w)
>  {
>         struct pcs_rpc * ep = container_of(w, struct pcs_rpc, calendar_work.work);
> @@ -1022,6 +1117,7 @@ void pcs_rpc_configure_new_ep(struct pcs_rpc * ep, struct pcs_rpc_params *parm,
>         INIT_WORK(&ep->close_work, rpc_close_work);
>         INIT_DELAYED_WORK(&ep->timer_work, timer_work);
>         INIT_DELAYED_WORK(&ep->calendar_work, calendar_work);
> +       INIT_DELAYED_WORK(&ep->cpu_timer_work, rpc_cpu_timer_work);
>
>         for (i = 0; i < RPC_MAX_CALENDAR; i++)
>                 INIT_HLIST_HEAD(&ep->kill_calendar[i]);
> diff --git a/fs/fuse/kio/pcs/pcs_rpc.h b/fs/fuse/kio/pcs/pcs_rpc.h
> index baec7f844e38..cb18557a3da5 100644
> --- a/fs/fuse/kio/pcs/pcs_rpc.h
> +++ b/fs/fuse/kio/pcs/pcs_rpc.h
> @@ -40,6 +40,7 @@ enum {
>         RPC_AFFINITY_RETENT = 1,
>         RPC_AFFINITY_SPREAD = 2,
>         RPC_AFFINITY_RSS    = 3,
> +       RPC_AFFINITY_FAIR_SPREAD = 4,
>  };
>
>  extern unsigned int rpc_affinity_mode;
> @@ -78,6 +79,7 @@ typedef union __pre_aligned(8) _PCS_CLUSTER_ID_T {
>  /////////////////////////////
>
>  #define PCS_RPC_CPU_SLICE (100 * HZ / 1000) /* 100ms */
> +#define PCS_RPC_CPU_TIMEOUT (500 * HZ / 1000) /* 500ms */
>  struct pcs_rpc
>  {
>         struct hlist_node       link;           /* Link in hash table */
> @@ -139,6 +141,7 @@ struct pcs_rpc
>         struct list_head        input_queue;    /* Queue of requests waiting to be handled */
>         int                     cpu;
>         unsigned long           cpu_stamp;
> +       struct delayed_work     cpu_timer_work; /* reset cpu affinity after being idle */
>
>         struct mutex            mutex;
>         u64                     accounted;
> @@ -160,6 +163,10 @@ struct pcs_rpc
>         struct work_struct  close_work;
>  };
>
> +struct pcs_rpc_cpu {
> +       atomic_t        nr_attached;
> +};
> +
>  struct pcs_rpc_engine
>  {
>         spinlock_t              lock;
> --
> 2.39.5 (Apple Git-154)
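
The capping logic described in the commit message (and implemented in
pcs_rpc_cpu_select() above) can be illustrated with a minimal user-space
sketch. This is not the kernel code: NR_CPUS, nr_attached[] and pick_cpu()
below are illustrative stand-ins for the per-cpu pcs_rpc_cpu counters and
the real selection routine, and the NUMA-node preference, online-cpu mask
and locking are omitted.

    /*
     * Minimal user-space sketch of the per-cpu cap: at most
     * total_rpcs / NR_CPUS + RPC_CPU_NR_BASE rpcs per cpu.
     */
    #include <stdio.h>

    #define NR_CPUS         8   /* pretend machine size */
    #define RPC_CPU_NR_BASE 2   /* mirrors the rpc_cpu_nr_base default */

    static int nr_attached[NR_CPUS];    /* per-cpu attached-rpc counters */

    /* Prefer the sender's cpu, otherwise the first cpu still under the cap. */
    static int pick_cpu(int sender_cpu, int total_rpcs)
    {
        int cap = total_rpcs / NR_CPUS + RPC_CPU_NR_BASE;
        int cpu;

        if (nr_attached[sender_cpu] < cap) {
            nr_attached[sender_cpu]++;
            return sender_cpu;
        }
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
            if (nr_attached[cpu] < cap) {
                nr_attached[cpu]++;
                return cpu;
            }
        }
        return -1;  /* every cpu is at the cap: should not happen */
    }

    int main(void)
    {
        int total_rpcs = 40;
        int i;

        /* All 40 rpcs are "sent" from cpu 0; the cap forces them to spread. */
        for (i = 0; i < total_rpcs; i++)
            pick_cpu(0, total_rpcs);

        for (i = 0; i < NR_CPUS; i++)
            printf("cpu %d: %d rpcs attached\n", i, nr_attached[i]);
        return 0;
    }

With the default mode all 40 rpcs would pile up on cpu 0; with the cap
(40 / 8 + 2 = 7 here) they spill over to the following cpus once the
sender's cpu is full. The real routine additionally prefers cpus on the
sender's NUMA node and iterates only over online cpus using nr_cpu_ids.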

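The companion piece is the release path: roughly, once an endpoint's
cpu_stamp is more than rpc_cpu_timeout in the past, rpc_cpu_timer_work()
drops the endpoint's slot on the per-cpu counter so another rpc can take
it; otherwise the timer re-arms for the remaining time. Below is a
simplified model of that decision; should_release() is a hypothetical
helper, plain integers stand in for jiffies, and the PCS_RPC_WORK state
check, q_lock and jiffies wraparound handling are omitted.

    #include <stdio.h>
    #include <stdbool.h>

    #define RPC_CPU_TIMEOUT 500     /* ticks, mirrors the 500ms default */

    /*
     * Returns true when the endpoint has been idle for a full timeout and
     * should give its per-cpu slot back; otherwise stores the delay after
     * which the timer should fire again.
     */
    static bool should_release(unsigned long now, unsigned long cpu_stamp,
                               unsigned long *rearm_after)
    {
        if (now < cpu_stamp + RPC_CPU_TIMEOUT) {
            *rearm_after = RPC_CPU_TIMEOUT - (now - cpu_stamp);
            return false;
        }
        return true;
    }

    int main(void)
    {
        unsigned long rearm;

        if (!should_release(1300, 1000, &rearm))
            printf("still active, re-arm in %lu ticks\n", rearm);  /* 200 */
        if (should_release(1600, 1000, &rearm))
            printf("idle long enough, release the per-cpu slot\n");
        return 0;
    }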

