[Devel] [PATCH VZ9 v2 1/2] fs/fuse kio: implement memory region to support zero-copy between userspace and kernel space.
Vasily Averin
vvs at openvz.org
Mon May 27 08:06:21 MSK 2024
On 5/27/24 07:56, Liu Kui wrote:
> The memory region (MR) is very similar to an RDMA memory region, but
> much simpler. It allows userspace to register a large memory area with the
> kernel, which pins all pages of that area and returns a reference back
> to userspace. Userspace can then pass the descriptor (start address,
> length) of a buffer allocated from the registered MR, together with the
> returned reference for that MR, to the kernel to complete a data transfer
> to/from the kernel.
>
> This feature will be used for implementing pcs_krpc.
>
> Signed-off-by: Liu Kui <kui.liu at virtuozzo.com>
Reviewed-by: Vasily Averin <vvs at openvz.org>
> ---
> fs/fuse/kio/pcs/pcs_mr.c | 212 +++++++++++++++++++++++++++++++++++++++
> fs/fuse/kio/pcs/pcs_mr.h | 64 ++++++++++++
> 2 files changed, 276 insertions(+)
> create mode 100644 fs/fuse/kio/pcs/pcs_mr.c
> create mode 100644 fs/fuse/kio/pcs/pcs_mr.h
>
> diff --git a/fs/fuse/kio/pcs/pcs_mr.c b/fs/fuse/kio/pcs/pcs_mr.c
> new file mode 100644
> index 000000000000..c2a2c072ba9e
> --- /dev/null
> +++ b/fs/fuse/kio/pcs/pcs_mr.c
> @@ -0,0 +1,212 @@
> +/*
> + * Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights reserved.
> + */
> +
> +#include <linux/gfp.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/slab.h>
> +#include <linux/sched/mm.h>
> +#include <linux/resource.h>
> +
> +#include "pcs_mr.h"
> +
> +/*
> + * Undo pcs_umem_get(): unpin every page recorded in @umem, subtract them
> + * from the owning mm's pinned_vm counter, drop the mm reference and free
> + * both the page array and the descriptor itself.
> + */
> +void pcs_umem_release(struct pcs_umem *umem)
> +{
> +	struct mm_struct *owner = umem->mm;
> +	struct page **pinned = umem->pages;
> +	int count = umem->npages;
> +
> +	unpin_user_pages(pinned, count);
> +	atomic64_sub(count, &owner->pinned_vm);
> +	mmdrop(owner);
> +	kfree(pinned);
> +	kfree(umem);
> +}
> +
> +/*
> + * Pin the user pages backing [start, start + len) in the caller's mm.
> + *
> + * @start: user virtual address of the first byte; need not be page aligned
> + * @len:   length of the range in bytes; must be non-zero
> + *
> + * Returns a descriptor that must be released with pcs_umem_release(), or an
> + * ERR_PTR(): -EINVAL for an empty, wrapping or oversized range or when the
> + * caller has no mm, -ENOMEM on allocation failure, -EFAULT if pinning makes
> + * no progress, otherwise the error reported by pin_user_pages().
> + */
> +struct pcs_umem *pcs_umem_get(u64 start, u64 len)
> +{
> +	struct mm_struct *mm_s = current->mm;
> +	struct pcs_umem *umem;
> +	u64 fp_va = start & PAGE_MASK;
> +	u64 span;
> +	int npages, got = 0;
> +	int ret;
> +
> +	/* A task without an address space has nothing that could be pinned. */
> +	if (!mm_s)
> +		return ERR_PTR(-EINVAL);
> +
> +	/* Reject empty ranges and ranges that wrap around the address space. */
> +	if (!len || start + len < start)
> +		return ERR_PTR(-EINVAL);
> +
> +	/*
> +	 * The page count is kept in an int (umem->npages), so bound the span
> +	 * before converting it instead of silently truncating a huge length.
> +	 */
> +	span = start - fp_va + len;
> +	if (span > ((u64)INT_MAX << PAGE_SHIFT))
> +		return ERR_PTR(-EINVAL);
> +	npages = PAGE_ALIGN(span) >> PAGE_SHIFT;
> +
> +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +	if (!umem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	/* Allocate the page array before mmap_lock is taken. */
> +	umem->pages = kcalloc(npages, sizeof(*umem->pages), GFP_KERNEL);
> +	if (!umem->pages) {
> +		kfree(umem);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	umem->fp_addr = fp_va;
> +	umem->mm = mm_s;
> +	mmgrab(mm_s);
> +
> +	mmap_read_lock(mm_s);
> +	while (got < npages) {
> +		ret = pin_user_pages(fp_va, npages - got,
> +				     FOLL_WRITE | FOLL_LONGTERM,
> +				     &umem->pages[got], NULL);
> +		/* Zero pages pinned means no progress: fail, do not spin. */
> +		if (WARN_ON(ret == 0))
> +			ret = -EFAULT;
> +		if (ret < 0)
> +			goto out_err;
> +
> +		/* Account each batch right away so the error path can undo it. */
> +		umem->npages += ret;
> +		atomic64_add(ret, &mm_s->pinned_vm);
> +		fp_va += (u64)ret * PAGE_SIZE;
> +		got += ret;
> +	}
> +	mmap_read_unlock(mm_s);
> +
> +	return umem;
> +
> +out_err:
> +	mmap_read_unlock(mm_s);
> +	/* Unpins whatever was pinned so far and drops the mm reference. */
> +	pcs_umem_release(umem);
> +
> +	return ERR_PTR(ret);
> +}
> +
> +/*
> + * kref release callback passed to kref_put() by pcs_mr_put().
> + * Releases the pinned user memory first, then the MR itself.
> + */
> +static void pcs_mr_free(struct kref *ref)
> +{
> +	struct pcs_mr *dead = container_of(ref, struct pcs_mr, ref);
> +	struct pcs_umem *umem = dead->umem;
> +
> +	pcs_umem_release(umem);
> +	kfree(dead);
> +}
> +
> +/*
> + * Drop one reference on @mr. When the count reaches zero the MR is torn
> + * down by pcs_mr_free().
> + */
> +void pcs_mr_put(struct pcs_mr *mr)
> +{
> +	struct kref *refcount = &mr->ref;
> +
> +	kref_put(refcount, pcs_mr_free);
> +}
> +
> +/*
> + * Look up the MR stored under @idx in @mrs and take a reference on it.
> + *
> + * Returns the MR with its reference count raised (release it with
> + * pcs_mr_put()), or NULL if nothing is stored at @idx or the MR's reference
> + * count has already dropped to zero.
> + */
> +struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx)
> +{
> +	struct pcs_mr *mr;
> +
> +	rcu_read_lock();
> +	mr = xa_load(&mrs->mr_xa, idx);
> +	if (unlikely(!mr || !kref_get_unless_zero(&mr->ref)))
> +		mr = NULL;
> +	/* The read-side section must be left on the hit and the miss path alike. */
> +	rcu_read_unlock();
> +
> +	return mr;
> +}
> +
> +/*
> + * Register a MR: pin the user range [start, start + len) and publish it in
> + * @mrs under a newly allocated id.
> + *
> + * Returns the id (within 1..PCS_MAX_MR) on success, or a negative errno:
> + * -EINVAL if @len is zero, -EPERM if can_do_mlock() refuses the caller,
> + * -ENOMEM if more than PCS_MAX_MR regions would be registered or an
> + * allocation fails, otherwise the error from pcs_umem_get() or the xarray.
> + */
> +int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len)
> +{
> +	struct pcs_umem *umem;
> +	struct pcs_mr *mr;
> +	u32 id;
> +	int ret;
> +
> +	if (!len)
> +		return -EINVAL;
> +
> +	if (!can_do_mlock())
> +		return -EPERM;
> +
> +	/* Claim a slot in the per-set MR budget before doing any real work. */
> +	if (atomic_inc_return(&mrs->mr_num) > PCS_MAX_MR) {
> +		ret = -ENOMEM;
> +		goto err_dec;
> +	}
> +
> +	umem = pcs_umem_get(start, len);
> +	if (IS_ERR(umem)) {
> +		ret = PTR_ERR(umem);
> +		goto err_dec;
> +	}
> +
> +	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
> +	if (!mr) {
> +		ret = -ENOMEM;
> +		goto err_umem;
> +	}
> +
> +	/*
> +	 * Allocate the id with a NULL entry first. That only reserves the
> +	 * index: xa_load() keeps returning NULL for it, so no lookup can see
> +	 * a half-initialised MR.
> +	 */
> +	ret = xa_alloc_cyclic(&mrs->mr_xa, &id, NULL,
> +			      XA_LIMIT(1, PCS_MAX_MR), &mrs->mr_next, GFP_KERNEL);
> +	if (ret < 0)
> +		goto err_mr;
> +
> +	mr->mrs = mrs;
> +	mr->va = start;
> +	mr->len = len;
> +	mr->umem = umem;
> +	mr->id = id;
> +	mr->id_valid = 1;
> +	kref_init(&mr->ref);
> +
> +	/* Publish the MR only now that every field, including the id, is set. */
> +	ret = xa_err(xa_store(&mrs->mr_xa, id, mr, GFP_KERNEL));
> +	if (ret) {
> +		/* Give the reserved index back. */
> +		xa_erase(&mrs->mr_xa, id);
> +		goto err_mr;
> +	}
> +
> +	/*
> +	 * Once published, a concurrent pcs_dereg_mr() may already have freed
> +	 * @mr, so return the local copy of the id instead of mr->id.
> +	 */
> +	return id;
> +
> +err_mr:
> +	kfree(mr);
> +err_umem:
> +	pcs_umem_release(umem);
> +err_dec:
> +	atomic_dec(&mrs->mr_num);
> +	return ret;
> +}
> +
> +/*
> + * Deregister a MR: remove @id from @mrs and drop the reference the set held
> + * on it. The MR itself is freed once every other holder has called
> + * pcs_mr_put().
> + *
> + * Returns 0 on success, or -ENXIO if no MR is registered under @id.
> + */
> +int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id)
> +{
> +	struct pcs_mr *mr;
> +
> +	/*
> +	 * xa_erase() returns the stored entry to exactly one caller, so two
> +	 * racing deregistrations of the same id cannot both drop the set's
> +	 * reference.
> +	 */
> +	mr = xa_erase(&mrs->mr_xa, id);
> +	if (!mr)
> +		return -ENXIO;
> +
> +	mr->id_valid = 0;
> +
> +	/*
> +	 * pcs_mr_get() looks the MR up under rcu_read_lock() only. Wait for
> +	 * those lookups to either take their own reference or finish before
> +	 * the set's reference is dropped; this also orders the id_valid
> +	 * store ahead of the put.
> +	 * NOTE(review): synchronize_rcu() may sleep; this assumes process
> +	 * context, as pcs_reg_mr() does with GFP_KERNEL - confirm.
> +	 */
> +	synchronize_rcu();
> +
> +	pcs_mr_put(mr);
> +	atomic_dec(&mrs->mr_num);
> +
> +	return 0;
> +}
> +
> +/*
> + * Initialise an empty MR set: zero registered MRs, allocation cursor at 0,
> + * and an xarray set up with XA_FLAGS_ALLOC1.
> + */
> +void pcs_mrset_init(struct pcs_mr_set *mrs)
> +{
> +	atomic_set(&mrs->mr_num, 0);
> +	mrs->mr_next = 0;
> +	xa_init_flags(&mrs->mr_xa, XA_FLAGS_ALLOC1);
> +}
> +
> +/*
> + * Tear down an MR set, dropping the set's reference on every MR that is
> + * still registered. Each remaining MR must hold exactly one reference and
> + * the MR counter must end at zero; both are enforced with BUG_ON().
> + */
> +void pcs_mrset_fini(struct pcs_mr_set *mrs)
> +{
> +	unsigned long index;
> +	struct pcs_mr *entry;
> +
> +	/* Nothing registered: nothing to clean up. */
> +	if (!atomic_read(&mrs->mr_num))
> +		return;
> +
> +	xa_for_each(&mrs->mr_xa, index, entry) {
> +		/* Only a single reference may remain at this point. */
> +		BUG_ON(kref_read(&entry->ref) != 1);
> +		pcs_mr_put(entry);
> +		atomic_dec(&mrs->mr_num);
> +	}
> +
> +	BUG_ON(atomic_read(&mrs->mr_num) != 0);
> +	xa_destroy(&mrs->mr_xa);
> +}
> diff --git a/fs/fuse/kio/pcs/pcs_mr.h b/fs/fuse/kio/pcs/pcs_mr.h
> new file mode 100644
> index 000000000000..0eaa9f263090
> --- /dev/null
> +++ b/fs/fuse/kio/pcs/pcs_mr.h
> @@ -0,0 +1,64 @@
> +/*
> + * Copyright (c) 2018-2024 Virtuozzo International GmbH. All rights reserved.
> + */
> +
> +#ifndef _PCS_MR_H_
> +#define _PCS_MR_H_ 1
> +
> +#include <linux/types.h>
> +#include <linux/highmem-internal.h>
> +
> +struct pcs_umem;
> +
> +#define PCS_MAX_MR 0x10000
> +
> +/* A set of registered memory regions, looked up by MR id. */
> +struct pcs_mr_set {
> + struct xarray mr_xa; /* registered MRs, indexed by MR id */
> + u32 mr_next; /* cursor passed to xa_alloc_cyclic() for the next id */
> +
> + atomic_t mr_num; /* number of registered MRs, limited to PCS_MAX_MR */
> +};
> +
> +/* Pinned user memory backing one MR; built by pcs_umem_get(). */
> +struct pcs_umem {
> + u64 fp_addr; /* page-aligned base address of the first page */
> + int npages; /* number of pages pinned so far */
> + struct page **pages; /* array of pinned pages, in address order */
> + struct mm_struct *mm; /* mm the memory belongs to (held via mmgrab()) */
> +};
> +
> +/* One registered memory region; reference counted through @ref. */
> +struct pcs_mr {
> + struct pcs_mr_set *mrs; /* set holding this mr */
> + struct kref ref; /* freed via pcs_mr_free() when it drops to zero */
> + struct pcs_umem *umem; /* pinned user pages backing the MR */
> + u64 va; /* starting address of MR */
> + u64 len; /* length of MR */
> + u32 id; /* index in mrs->mr_xa, returned to userspace */
> + u8 id_valid; /* 1 once registered, cleared by pcs_dereg_mr() */
> + u8 rsvd[3];
> +};
> +
> +/*
> + * Get the pinned page that contains user address @addr.
> + *
> + * Returns the page, or NULL when @addr lies outside the pinned range
> + * [fp_addr, fp_addr + npages * PAGE_SIZE).
> + */
> +static inline struct page *pcs_umem_page(struct pcs_umem *umem, u64 addr)
> +{
> +	u64 idx;
> +
> +	/* An address below the first page would wrap the subtraction. */
> +	if (unlikely(addr < umem->fp_addr))
> +		return NULL;
> +
> +	/*
> +	 * Keep the index 64 bits wide: truncating it to 32 bits could make a
> +	 * far out-of-range address alias a valid page index.
> +	 */
> +	idx = (addr - umem->fp_addr) >> PAGE_SHIFT;
> +	if (likely(idx < (u64)umem->npages))
> +		return umem->pages[idx];
> +
> +	return NULL;
> +}
> +
> +struct pcs_umem *pcs_umem_get(u64 start, u64 len);
> +void pcs_umem_release(struct pcs_umem *umem);
> +
> +struct pcs_mr *pcs_mr_get(struct pcs_mr_set *mrs, int idx);
> +void pcs_mr_put(struct pcs_mr *mr);
> +
> +int pcs_reg_mr(struct pcs_mr_set *mrs, u64 start, u64 len);
> +int pcs_dereg_mr(struct pcs_mr_set *mrs, u32 id);
> +
> +void pcs_mrset_init(struct pcs_mr_set *mrs);
> +void pcs_mrset_fini(struct pcs_mr_set *mrs);
> +
> +#endif
More information about the Devel
mailing list