[Devel] [PATCH VZ9 3/3] fs/fuse: enhanced splice support

Alexey Kuznetsov kuznet at virtuozzo.com
Tue Jan 9 16:40:46 MSK 2024


Unfortunately, the existing splice support in fuse is completely useless:
it has many flaws, each of them fatal even when taken separately.

- it passes only a single splice, which requires user space
  to prepare one more splice to merge the header.
- ... and does not allow using splices coming from TCP, as they
  can be huge and do not fit into a single pipe buffer.
- it uses kvmalloc(!!!) for the temp buffer, which is very expensive

Nevertheless, using splices to transfer data from the network is our
only chance to get some reasonable throughput. For example, without
splice we cannot saturate even a single 100G ethernet link.

So, the following approach is suggested, which is the simplest and
the cleanest for FUSE_READ. Extend existing fuse reply
messages, allowing them to carry metadata instead of data. Metadata
is a list of pipe file descriptors, which can be processed one by one,
sucking pages from them.

Extensions are possible, though as it is now, only FUSE_READ
needs this. One day we probably will want to use it with FUSE_IOCTL,
other request types do not carry any significant data.

With this patch we are finally able to saturate 100G ethernet,
higher speeds are still not tested.

Signed-off-by: Alexey Kuznetsov <kuznet at acronis.com>
---
 fs/fuse/dev.c             | 155 +++++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/fuse.h |   3 +
 2 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 59b9465..ccc30d9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -31,6 +31,7 @@
 
 static struct kmem_cache *fuse_req_cachep;
 extern struct workqueue_struct *fuse_fput_wq;
+static const struct file_operations *fuse_g_splice_ops;
 
 static struct fuse_dev *fuse_get_dev(struct file *file)
 {
@@ -1997,6 +1998,144 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
 			      args->out_args, args->page_zeroing);
 }
 
+/*
+ * Handle a FUSE_OUT_SPLICES reply.  The payload after the header is an array
+ * of u32 pipe fds; the buffers queued in each pipe are copied, in order, into
+ * the request's pages and dropped from the pipe.  Returns 0 or -errno.
+ */
+static int copy_out_splices(struct fuse_copy_state *cs, struct fuse_args *args,
+			    unsigned int nbytes)
+{
+	struct fuse_args_pages *ap = container_of(args, typeof(*ap), args);
+	struct pipe_inode_info *pipe;
+	struct fd f;
+	u32 fdarr_inline[8];
+	u32 *fdarr = fdarr_inline;
+	unsigned int nsplices;
+	int err, i;
+	int didx = 0;
+	int doff = ap->descs[0].offset;
+	int dend = doff + ap->descs[0].length;
+	struct page *dpage = ap->pages[0];
+
+	nsplices = nbytes - sizeof(struct fuse_out_header);
+	if (nsplices & 3)
+		return -EINVAL;
+	/* nsplices is still a byte count here, so compare against bytes */
+	if (nsplices > sizeof(fdarr_inline)) {
+		fdarr = kmalloc(nsplices, GFP_KERNEL);
+		if (!fdarr)
+			return -ENOMEM;
+	}
+
+	err = fuse_copy_one(cs, fdarr, nsplices);
+	if (err)
+		goto out;
+
+	nsplices /= 4;
+	fuse_copy_finish(cs);
+
+	for (i = 0; i < nsplices; i++) {
+		unsigned int head, tail, mask;
+
+		f = fdget(fdarr[i]);
+		err = -EBADF;
+		if (!f.file)
+			goto out;
+
+		/* Learn the pipe file_operations from a throwaway pipe once */
+		if (unlikely(f.file->f_op != fuse_g_splice_ops)) {
+			if (fuse_g_splice_ops == NULL) {
+				struct file *probe_files[2];
+
+				if (create_pipe_files(probe_files, 0))
+					goto out_fdput;
+				fuse_g_splice_ops = probe_files[0]->f_op;
+				fput(probe_files[0]);
+				fput(probe_files[1]);
+			}
+			if (unlikely(f.file->f_op != fuse_g_splice_ops))
+				goto out_fdput;
+		}
+
+		pipe = f.file->private_data;
+		if (pipe == NULL)
+			goto out_fdput;
+
+		pipe_lock(pipe);
+		head = pipe->head;
+		mask = pipe->ring_size - 1;
+
+		for (tail = pipe->tail; tail != head; tail++) {
+			struct page *ipage = pipe->bufs[tail & mask].page;
+			int ioff = pipe->bufs[tail & mask].offset;
+			int ilen = pipe->bufs[tail & mask].len;
+
+			while (ilen > 0) {
+				void *src, *dst;
+				int copy = ilen;
+
+				/* destination page full: take the next */
+				if (doff >= dend) {
+					if (++didx >= ap->num_pages) {
+						err = -EMSGSIZE;
+						goto out_unlock;
+					}
+					dpage = ap->pages[didx];
+					doff = ap->descs[didx].offset;
+					dend = doff + ap->descs[didx].length;
+				}
+
+				if (copy > dend - doff)
+					copy = dend - doff;
+				src = kmap_atomic(ipage);
+				dst = kmap_atomic(dpage);
+				memcpy(dst + doff, src + ioff, copy);
+				kunmap_atomic(dst);
+				kunmap_atomic(src);
+				doff += copy;
+				ioff += copy;
+				ilen -= copy;
+			}
+			/* buffer fully consumed: drop it, advance the tail */
+			put_page(ipage);
+			pipe->bufs[tail & mask].ops = NULL;
+			pipe->bufs[tail & mask].page = NULL;
+			pipe->tail = tail + 1;
+		}
+		pipe_unlock(pipe);
+		fdput(f);
+	}
+
+	/* Zero whatever part of the destination pages was left unfilled */
+	if (args->page_zeroing && didx < ap->num_pages) {
+		if (doff < dend) {
+			void *dst = kmap_atomic(dpage);
+
+			memset(dst + doff, 0, dend - doff);
+			kunmap_atomic(dst);
+		}
+		for (didx++ ; didx < ap->num_pages; didx++) {
+			void *dst = kmap_atomic(ap->pages[didx]);
+
+			memset(dst + ap->descs[didx].offset, 0, ap->descs[didx].length);
+			kunmap_atomic(dst);
+		}
+	}
+	err = 0;
+
+out:
+	if (fdarr != fdarr_inline)
+		kfree(fdarr);
+	return err;
+
+out_unlock:
+	pipe_unlock(pipe);
+out_fdput:
+	fdput(f);
+	goto out;
+}
+
 /*
  * Write a single reply to a request.  First the header is copied from
  * the write buffer.  The request is then searched on the processing
@@ -2035,7 +2174,8 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 	}
 
 	err = -EINVAL;
-	if (oh.error <= -512 || oh.error > 0)
+	if (oh.error != FUSE_OUT_SPLICES &&
+	    (oh.error <= -512 || oh.error > 0))
 		goto copy_finish;
 
 	spin_lock(&fpq->lock);
@@ -2067,6 +2207,14 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 		goto copy_finish;
 	}
 
+	if (oh.error == FUSE_OUT_SPLICES) {
+		if (req->args->out_numargs != 1 || !req->args->out_pages) {
+			spin_unlock(&fpq->lock);
+			err = -EINVAL;
+			goto copy_finish;
+		}
+	}
+
 	clear_bit(FR_SENT, &req->flags);
 	list_move(&req->list, &fpq->io);
 	req->out.h = oh;
@@ -2076,7 +2224,10 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
 	if (!req->args->page_replace)
 		cs->move_pages = 0;
 
-	if (oh.error)
+	if (oh.error == FUSE_OUT_SPLICES) {
+		req->out.h.error = 0;
+		err = copy_out_splices(cs, req->args, nbytes);
+	} else if (oh.error)
 		err = nbytes != sizeof(oh) ? -EINVAL : 0;
 	else
 		err = copy_out_args(cs, req->args, nbytes);
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 607b1c1..0529595 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -875,6 +875,9 @@ struct fuse_out_header {
 	uint64_t	unique;
 };
 
+/* Special error value for fuse_out_header */
+#define FUSE_OUT_SPLICES	0x40000000
+
 struct fuse_dirent {
 	uint64_t	ino;
 	uint64_t	off;
-- 
1.8.3.1



More information about the Devel mailing list