[Devel] [PATCH RHEL7 COMMIT] fs: kernel direct aio
Konstantin Khorenko
khorenko at virtuozzo.com
Fri Dec 25 00:45:39 PST 2015
The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.el7
------>
commit 553598cabaa1339a7096fc03cddbb91bd0193ff4
Author: Dmitry Monakhov <dmonakhov at openvz.org>
Date: Fri Dec 25 12:45:30 2015 +0400
fs: kernel direct aio
This is a port of 2f3ecb6 ("fs: kernel direct aio")
onto the rebased kernel (based on 3.10.0-327.3.1.el7).
fs: kernel direct aio
Port of 95-diff-kernel-direct-aio-combined from
https://jira.sw.ru/browse/PSBM-18169
Signed-off-by: Maxim Patlasov <MPatlasov at parallels.com>
https://jira.sw.ru/browse/PSBM-42312
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
 fs/aio.c                     | 140 +++++++++++
 fs/ceph/file.c               |  10 +-
 fs/cifs/file.c               |   7 +-
 fs/fuse/file.c               |  12 +-
 include/linux/aio.h          |  15 ++
 include/linux/blk_types.h    |   8 +
 include/linux/fs.h           | 142 ++++++++++-
 include/uapi/linux/aio_abi.h |   2 +
 mm/Makefile                  |   3 +-
 mm/filemap.c                 | 563 ++++++++++++++-----------------------------
 mm/iov-iter.c                | 473 ++++++++++++++++++++++++++++++++++++
 11 files changed, 971 insertions(+), 404 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 8427423..8ec32e2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -936,6 +936,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
atomic_set(&iocb->ki_users, 0);
wake_up_process(iocb->ki_obj.tsk);
return;
+ } else if (is_kernel_kiocb(iocb)) {
+ iocb->ki_obj.complete(iocb->ki_user_data, res);
+ aio_kernel_free(iocb);
+ return;
}
/*
@@ -1377,6 +1381,51 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
return 0;
}
+static ssize_t aio_read_iter(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ if (unlikely(!is_kernel_kiocb(iocb)))
+ return -EINVAL;
+
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ return -EBADF;
+
+ ret = security_file_permission(file, MAY_READ);
+ if (unlikely(ret))
+ return ret;
+
+ if (!file->f_op->read_iter)
+ return -EINVAL;
+
+ return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+}
+
+static ssize_t aio_write_iter(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ if (unlikely(!is_kernel_kiocb(iocb)))
+ return -EINVAL;
+
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
+ ret = security_file_permission(file, MAY_WRITE);
+ if (unlikely(ret))
+ return ret;
+
+ if (!file->f_op->write_iter)
+ return -EINVAL;
+
+ file_start_write(file);
+ ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+ file_end_write(file);
+ return ret;
+}
+
/*
* aio_setup_iocb:
* Performs the initial checks and aio retry method
@@ -1428,6 +1477,14 @@ rw_common:
ret = aio_rw_vect_retry(req, rw, rw_op);
break;
+ case IOCB_CMD_READ_ITER:
+ ret = aio_read_iter(req);
+ break;
+
+ case IOCB_CMD_WRITE_ITER:
+ ret = aio_write_iter(req);
+ break;
+
case IOCB_CMD_FDSYNC:
if (!file->f_op->aio_fsync)
return -EINVAL;
@@ -1462,6 +1519,89 @@ rw_common:
return 0;
}
+/*
+ * This allocates an iocb that will be used to submit and track completion of
+ * an IO that is issued from kernel space.
+ *
+ * The caller is expected to call the appropriate aio_kernel_init_() functions
+ * and then call aio_kernel_submit(). From that point forward progress is
+ * guaranteed by the file system aio method. Eventually the caller's
+ * completion callback will be called.
+ *
+ * These iocbs are special. They don't have a context, we don't limit the
+ * number pending, and they can't be canceled or retried. In the short term,
+ * callers need to take care not to invoke operations which might retry, by
+ * calling only the new ops which never add retry support. In the long term,
+ * retry-based AIO should be removed.
+ */
+struct kiocb *aio_kernel_alloc(gfp_t gfp)
+{
+ struct kiocb *iocb = kzalloc(sizeof(struct kiocb), gfp);
+ if (iocb)
+ iocb->ki_ctx = (void *)-1;
+ return iocb;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_alloc);
+
+void aio_kernel_free(struct kiocb *iocb)
+{
+ kfree(iocb);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_free);
+
+/*
+ * The iter count must be set before calling here. Some filesystems use
+ * iocb->ki_left as an indicator of the size of an IO.
+ */
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+ unsigned short op, struct iov_iter *iter, loff_t off)
+{
+ iocb->ki_filp = filp;
+ iocb->ki_iter = iter;
+ iocb->ki_opcode = op;
+ iocb->ki_pos = off;
+ iocb->ki_nbytes = iov_iter_count(iter);
+ iocb->ki_left = iocb->ki_nbytes;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_iter);
+
+void aio_kernel_init_callback(struct kiocb *iocb,
+ void (*complete)(u64 user_data, long res),
+ u64 user_data)
+{
+ iocb->ki_obj.complete = complete;
+ iocb->ki_user_data = user_data;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_callback);
+
+/*
+ * The iocb is our responsibility once this is called. The caller must not
+ * reference it. This comes from aio_setup_iocb() modifying the iocb.
+ *
+ * Callers must be prepared for their iocb completion callback to be called the
+ * moment they enter this function. The completion callback may be called from
+ * any context.
+ *
+ * Returns: 0: the iocb completion callback will be called with the op result
+ * negative errno: the operation was not submitted and the iocb was freed
+ */
+int aio_kernel_submit(struct kiocb *iocb)
+{
+ int ret;
+
+ BUG_ON(!is_kernel_kiocb(iocb));
+ BUG_ON(!iocb->ki_obj.complete);
+ BUG_ON(!iocb->ki_filp);
+
+ ret = aio_run_iocb(iocb, 0);
+
+ if (ret)
+ aio_kernel_free(iocb);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_submit);
+
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
struct iocb *iocb, bool compat)
{
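For reference, a minimal sketch of how an in-kernel caller is expected to
drive this API. The request structure, callback and submit helper below are
hypothetical; the call sequence follows the comment above aio_kernel_alloc(),
and bvec_length()/iov_iter_init_bvec() come from later hunks of this patch:

#include <linux/aio.h>
#include <linux/blk_types.h>
#include <linux/fs.h>
#include <linux/slab.h>

/* Per-request state: the iov_iter must stay alive until completion. */
struct my_req {
        struct iov_iter iter;
};

static void my_write_done(u64 user_data, long res)
{
        struct my_req *req = (struct my_req *)(unsigned long)user_data;

        /*
         * res is the byte count or a negative errno; aio_complete() has
         * already freed the iocb by the time this callback runs.
         */
        kfree(req);
}

static int my_submit_write(struct file *filp, struct bio_vec *bvec,
                           unsigned long nr_segs, loff_t pos)
{
        struct kiocb *iocb;
        struct my_req *req;
        int ret;

        req = kzalloc(sizeof(*req), GFP_NOIO);
        if (!req)
                return -ENOMEM;

        iov_iter_init_bvec(&req->iter, bvec, nr_segs,
                           bvec_length(bvec, nr_segs), 0);

        iocb = aio_kernel_alloc(GFP_NOIO);
        if (!iocb) {
                kfree(req);
                return -ENOMEM;
        }

        aio_kernel_init_iter(iocb, filp, IOCB_CMD_WRITE_ITER, &req->iter, pos);
        aio_kernel_init_callback(iocb, my_write_done, (u64)(unsigned long)req);

        /* 0: my_write_done() will run; < 0: not submitted, iocb freed */
        ret = aio_kernel_submit(iocb);
        if (ret)
                kfree(req);
        return ret;
}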
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1655236..ccc51a4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -806,8 +806,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
zero_user_segment(inline_page, inline_len, end);
while (left) {
- void __user *udata = i->iov->iov_base + i->iov_offset;
- size_t n = min(i->iov->iov_len - i->iov_offset, left);
+ struct iovec *iov = iov_iter_iovec(i);
+ void __user *udata = iov->iov_base + i->iov_offset;
+ size_t n = min(iov->iov_len - i->iov_offset, left);
if (__copy_to_user(udata, kdata, n)) {
ret = -EFAULT;
@@ -824,8 +825,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
size_t left = min_t(loff_t, iocb->ki_pos + len, i_size) - pos;
while (left) {
- void __user *udata = i->iov->iov_base + i->iov_offset;
- size_t n = min(i->iov->iov_len - i->iov_offset, left);
+ struct iovec *iov = iov_iter_iovec(i);
+ void __user *udata = iov->iov_base + i->iov_offset;
+ size_t n = min(iov->iov_len - i->iov_offset, left);
if (__clear_user(udata, n)) {
ret = -EFAULT;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0bc0fad..401fa67 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2435,8 +2435,9 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
save_len = cur_len;
for (i = 0; i < nr_pages; i++) {
bytes = min_t(const size_t, cur_len, PAGE_SIZE);
- copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+ copied = iov_iter_copy_from_user(wdata->pages[i], from, 0, bytes);
cur_len -= copied;
+ iov_iter_advance(from, copied);
/*
* If we didn't copy as much as we expected, then that
* may mean we trod into an unmapped area. Stop copying
@@ -2865,8 +2866,10 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
for (i = 0; i < rdata->nr_pages; i++) {
struct page *page = rdata->pages[i];
size_t copy = min_t(size_t, remaining, PAGE_SIZE);
- size_t written = copy_page_to_iter(page, 0, copy, iter);
+ size_t written = iov_iter_copy_to_user(page, iter, 0, copy);
+
remaining -= written;
+ iov_iter_advance(iter, written);
if (written < copy && iov_iter_count(iter) > 0)
break;
}
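Note the idiom change in both cifs hunks above: the removed
copy_page_{to,from}_iter() helpers advanced the iterator internally, while
the new iov_iter_copy_{to,from}_user() helpers leave it in place, so the
caller advances by the amount actually copied. As a hypothetical,
self-contained sketch of the new idiom:

#include <linux/fs.h>
#include <linux/mm_types.h>

/* Fill a page from an iterator; advancing is now the caller's job. */
static size_t example_fill_page(struct page *page, struct iov_iter *from,
                                size_t bytes)
{
        size_t copied = iov_iter_copy_from_user(page, from, 0, bytes);

        iov_iter_advance(from, copied);
        return copied;
}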
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8f16755..f432b70 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1518,7 +1518,12 @@ static inline void fuse_page_descs_length_init(struct fuse_req *req,
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
- return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+ struct iovec *iov;
+
+ BUG_ON(!iov_iter_has_iovec(ii));
+ iov = (struct iovec *)ii->data;
+
+ return (unsigned long)iov->iov_base + ii->iov_offset;
}
static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
@@ -2637,8 +2642,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
kaddr = kmap(page);
while (todo) {
- char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
- size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+ struct iovec *iiov = (struct iovec *)ii.data;
+ char __user *uaddr = iiov->iov_base + ii.iov_offset;
+ size_t iov_len = iiov->iov_len - ii.iov_offset;
size_t copy = min(todo, iov_len);
size_t left;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 161aa0c..0aa7dd3 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -42,6 +42,7 @@ struct kiocb {
union {
void __user *user;
struct task_struct *tsk;
+ void (*complete)(u64 user_data, long res);
} ki_obj;
__u64 ki_user_data; /* user's data for completion */
@@ -66,6 +67,7 @@ struct kiocb {
* this is the underlying eventfd context to deliver events to.
*/
struct eventfd_ctx *ki_eventfd;
+ struct iov_iter *ki_iter;
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -73,6 +75,11 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
return kiocb->ki_ctx == NULL;
}
+static inline bool is_kernel_kiocb(struct kiocb *kiocb)
+{
+ return kiocb->ki_ctx == (void *)-1;
+}
+
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
@@ -93,6 +100,14 @@ extern void exit_aio(struct mm_struct *mm);
extern long do_io_submit(aio_context_t ctx_id, long nr,
struct iocb __user *__user *iocbpp, bool compat);
void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
+struct kiocb *aio_kernel_alloc(gfp_t gfp);
+void aio_kernel_free(struct kiocb *iocb);
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+ unsigned short op, struct iov_iter *iter, loff_t off);
+void aio_kernel_init_callback(struct kiocb *iocb,
+ void (*complete)(u64 user_data, long res),
+ u64 user_data);
+int aio_kernel_submit(struct kiocb *iocb);
#else
static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
static inline void aio_put_req(struct kiocb *iocb) { }
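A kiocb now comes in three flavours distinguished solely by ki_ctx. A small
hypothetical helper to make the discrimination explicit:

#include <linux/aio.h>

/*
 * ki_ctx == NULL        sync kiocb, waiter parked in ki_obj.tsk
 * ki_ctx == (void *)-1  kernel kiocb, completion via ki_obj.complete
 * anything else         ordinary userspace aio, event goes to the ring
 */
static const char *kiocb_kind(struct kiocb *iocb)
{
        if (is_sync_kiocb(iocb))
                return "sync";
        if (is_kernel_kiocb(iocb))
                return "kernel";
        return "user";
}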
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e3c8bfb..1251977 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -28,6 +28,14 @@ struct bio_vec {
unsigned int bv_offset;
};
+static inline ssize_t bvec_length(const struct bio_vec *bvec, unsigned long nr)
+{
+ ssize_t bytes = 0;
+ while (nr--)
+ bytes += (bvec++)->bv_len;
+ return bytes;
+}
+
/*
* RHEL7 auxillary shadow structure used to extend 'struct bio' without
* breaking RHEL kABI -- bio_init_aux() must be used to set bio->bio_aux
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2f26ee8..9e6f777 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -318,35 +318,138 @@ struct address_space;
struct writeback_control;
struct iov_iter {
- const struct iovec *iov;
+ struct iov_iter_ops *ops;
+ unsigned long data;
unsigned long nr_segs;
size_t iov_offset;
size_t count;
};
-size_t iov_iter_copy_from_user_atomic(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes);
-size_t iov_iter_copy_from_user(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes);
-void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
-size_t iov_iter_single_seg_count(const struct iov_iter *i);
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i);
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i);
+struct iov_iter_ops {
+ size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *,
+ unsigned long, size_t);
+ size_t (*ii_copy_to_user)(struct page *, struct iov_iter *,
+ unsigned long, size_t);
+ size_t (*ii_copy_from_user_atomic)(struct page *, struct iov_iter *,
+ unsigned long, size_t);
+ size_t (*ii_copy_from_user)(struct page *, struct iov_iter *,
+ unsigned long, size_t);
+ void (*ii_advance)(struct iov_iter *, size_t);
+ int (*ii_fault_in_readable)(struct iov_iter *, size_t);
+ size_t (*ii_single_seg_count)(const struct iov_iter *);
+ int (*ii_shorten)(struct iov_iter *, size_t);
+};
+
+static inline size_t iov_iter_copy_to_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_to_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ return i->ops->ii_copy_to_user(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ return i->ops->ii_copy_from_user(page, i, offset, bytes);
+}
+static inline void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+ return i->ops->ii_advance(i, bytes);
+}
+static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ return i->ops->ii_fault_in_readable(i, bytes);
+}
+static inline size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+ return i->ops->ii_single_seg_count(i);
+}
+static inline int iov_iter_shorten(struct iov_iter *i, size_t count)
+{
+ return i->ops->ii_shorten(i, count);
+}
+
+extern struct iov_iter_ops ii_bvec_ops;
+
+struct bio_vec;
+static inline void iov_iter_init_bvec(struct iov_iter *i,
+ struct bio_vec *bvec,
+ unsigned long nr_segs,
+ size_t count, size_t written)
+{
+ i->ops = &ii_bvec_ops;
+ i->data = (unsigned long)bvec;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count + written;
+
+ iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_bvec(struct iov_iter *i)
+{
+ return i->ops == &ii_bvec_ops;
+}
+static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i)
+{
+ BUG_ON(!iov_iter_has_bvec(i));
+ return (struct bio_vec *)i->data;
+}
+
+extern struct iov_iter_ops ii_page_ops;
+
+static inline void iov_iter_init_page(struct iov_iter *i,
+ struct page *page,
+ size_t count, size_t written)
+{
+ i->ops = &ii_page_ops;
+ i->data = (unsigned long)page;
+ i->nr_segs = 1;
+ i->iov_offset = 0;
+ i->count = count + written;
+
+ iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_page(struct iov_iter *i)
+{
+ return i->ops == &ii_page_ops;
+}
+static inline struct page *iov_iter_page(struct iov_iter *i)
+{
+ BUG_ON(!iov_iter_has_page(i));
+ return (struct page *)i->data;
+}
+
+extern struct iov_iter_ops ii_iovec_ops;
static inline void iov_iter_init(struct iov_iter *i,
const struct iovec *iov, unsigned long nr_segs,
size_t count, size_t written)
{
- i->iov = iov;
+ i->ops = &ii_iovec_ops;
+ i->data = (unsigned long)iov;
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count + written;
iov_iter_advance(i, written);
}
+static inline int iov_iter_has_iovec(const struct iov_iter *i)
+{
+ return i->ops == &ii_iovec_ops;
+}
+static inline struct iovec *iov_iter_iovec(struct iov_iter *i)
+{
+ BUG_ON(!iov_iter_has_iovec(i));
+ return (struct iovec *)i->data;
+}
static inline size_t iov_iter_count(struct iov_iter *i)
{
@@ -408,6 +511,10 @@ struct address_space_operations {
void (*freepage)(struct page *);
ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
loff_t offset, unsigned long nr_segs);
+ ssize_t (*direct_IO_bvec)(int, struct kiocb *, struct bio_vec *bvec,
+ loff_t offset, unsigned long bvec_len);
+ ssize_t (*direct_IO_page)(int, struct kiocb *, struct page *page,
+ loff_t offset);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
void **, unsigned long *);
/*
@@ -1665,7 +1772,9 @@ struct file_operations {
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*read_iter) (struct kiocb *, struct iov_iter *, loff_t);
ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ ssize_t (*write_iter) (struct kiocb *, struct iov_iter *, loff_t);
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
@@ -2702,13 +2811,20 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *, loff_t);
extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
loff_t *);
+extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t *);
extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t);
extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
unsigned long *, loff_t, loff_t *, size_t, size_t);
+extern ssize_t generic_file_direct_write_iter(struct kiocb *, struct iov_iter *,
+ loff_t, loff_t *, size_t);
extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
unsigned long, loff_t, loff_t *, size_t, ssize_t);
+extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter *,
+ loff_t, loff_t *, ssize_t);
extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
extern int generic_segment_checks(const struct iovec *iov,
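A sketch of constructing each of the three iterator flavours. The function
and its arguments are hypothetical; iov_length() is the existing helper from
linux/uio.h, and bvec_length() comes from the blk_types.h hunk above:

#include <linux/blk_types.h>
#include <linux/bug.h>
#include <linux/fs.h>
#include <linux/uio.h>

static void example_build_iters(const struct iovec *iov, unsigned long iovs,
                                struct bio_vec *bvec, unsigned long bvs,
                                struct page *page, size_t page_bytes)
{
        struct iov_iter ui, bi, pi;

        /* userspace iovec array: dispatches through ii_iovec_ops */
        iov_iter_init(&ui, iov, iovs, iov_length(iov, iovs), 0);

        /* kernel bio_vec array: pages are pinned, fault-in is a no-op */
        iov_iter_init_bvec(&bi, bvec, bvs, bvec_length(bvec, bvs), 0);

        /* a single kernel page: always exactly one segment */
        iov_iter_init_page(&pi, page, page_bytes, 0);

        /* every generic helper now dispatches through i->ops */
        BUG_ON(!iov_iter_has_bvec(&bi));
        BUG_ON(iov_iter_count(&bi) != bvec_length(bvec, bvs));
}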
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index bb2554f..22ce4bd 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -44,6 +44,8 @@ enum {
IOCB_CMD_NOOP = 6,
IOCB_CMD_PREADV = 7,
IOCB_CMD_PWRITEV = 8,
+ IOCB_CMD_READ_ITER = 9,
+ IOCB_CMD_WRITE_ITER = 10,
};
/*
diff --git a/mm/Makefile b/mm/Makefile
index 204a614..4c3899b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -20,7 +20,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o balloon_compaction.o \
- interval_tree.o list_lru.o workingset.o oom_group.o $(mmu-y)
+ interval_tree.o list_lru.o workingset.o oom_group.o \
+ iov-iter.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/filemap.c b/mm/filemap.c
index ad2939d..605b5d3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1439,162 +1439,6 @@ static void shrink_readahead_size_eio(struct file *filp,
ra->ra_pages /= 4;
}
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
- void *kaddr, *from;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- if (!fault_in_pages_writeable(buf, copy)) {
- kaddr = kmap_atomic(page);
- from = kaddr + offset;
-
- /* first chunk, usually the only one */
- left = __copy_to_user_inatomic(buf, from, copy);
- copy -= left;
- skip += copy;
- from += copy;
- bytes -= copy;
-
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_to_user_inatomic(buf, from, copy);
- copy -= left;
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- if (likely(!bytes)) {
- kunmap_atomic(kaddr);
- goto done;
- }
- offset = from - kaddr;
- buf += copy;
- kunmap_atomic(kaddr);
- copy = min(bytes, iov->iov_len - skip);
- }
- /* Too bad - revert to non-atomic kmap */
- kaddr = kmap(page);
- from = kaddr + offset;
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip += copy;
- from += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_to_user(buf, from, copy);
- copy -= left;
- skip = copy;
- from += copy;
- bytes -= copy;
- }
- kunmap(page);
-done:
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
-{
- size_t skip, copy, left, wanted;
- const struct iovec *iov;
- char __user *buf;
- void *kaddr, *to;
-
- if (unlikely(bytes > i->count))
- bytes = i->count;
-
- if (unlikely(!bytes))
- return 0;
-
- wanted = bytes;
- iov = i->iov;
- skip = i->iov_offset;
- buf = iov->iov_base + skip;
- copy = min(bytes, iov->iov_len - skip);
-
- if (!fault_in_pages_readable(buf, copy)) {
- kaddr = kmap_atomic(page);
- to = kaddr + offset;
-
- /* first chunk, usually the only one */
- left = __copy_from_user_inatomic(to, buf, copy);
- copy -= left;
- skip += copy;
- to += copy;
- bytes -= copy;
-
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_from_user_inatomic(to, buf, copy);
- copy -= left;
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- if (likely(!bytes)) {
- kunmap_atomic(kaddr);
- goto done;
- }
- offset = to - kaddr;
- buf += copy;
- kunmap_atomic(kaddr);
- copy = min(bytes, iov->iov_len - skip);
- }
- /* Too bad - revert to non-atomic kmap */
- kaddr = kmap(page);
- to = kaddr + offset;
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip += copy;
- to += copy;
- bytes -= copy;
- while (unlikely(!left && bytes)) {
- iov++;
- buf = iov->iov_base;
- copy = min(bytes, iov->iov_len);
- left = __copy_from_user(to, buf, copy);
- copy -= left;
- skip = copy;
- to += copy;
- bytes -= copy;
- }
- kunmap(page);
-done:
- i->count -= wanted - bytes;
- i->nr_segs -= iov - i->iov;
- i->iov = iov;
- i->iov_offset = skip;
- return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_from_iter);
-
/**
* do_generic_file_read - generic file read routine
* @filp: the file to read
@@ -1912,31 +1756,60 @@ int generic_segment_checks(const struct iovec *iov,
}
EXPORT_SYMBOL(generic_segment_checks);
+static ssize_t mapping_direct_IO(struct address_space *mapping, int rw,
+ struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos)
+{
+ if (iov_iter_has_iovec(iter))
+ return mapping->a_ops->direct_IO(rw, iocb, iov_iter_iovec(iter),
+ pos, iter->nr_segs);
+ else if (iov_iter_has_bvec(iter))
+ return mapping->a_ops->direct_IO_bvec(rw, iocb,
+ iov_iter_bvec(iter), pos,
+ iter->nr_segs);
+ else if (iov_iter_has_page(iter))
+ return mapping->a_ops->direct_IO_page(rw, iocb,
+ iov_iter_page(iter), pos);
+ else
+ BUG();
+}
+
+static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
+ unsigned long offset, unsigned long size)
+{
+ struct iov_iter *iter = desc->arg.data;
+ unsigned long copied = 0;
+
+ if (size > desc->count)
+ size = desc->count;
+
+ copied = iov_iter_copy_to_user(page, iter, offset, size);
+ if (copied < size)
+ desc->error = -EFAULT;
+
+ iov_iter_advance(iter, copied);
+ desc->count -= copied;
+ desc->written += copied;
+
+ return copied;
+}
+
+
/**
- * generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
* @iocb: kernel I/O control block
- * @iov: io vector request
- * @nr_segs: number of segments in the iovec
+ * @iter: iov_iter describing the destination
* @pos: current file position
- *
- * This is the "read()" routine for all filesystems
- * that can use the page cache directly.
*/
ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
struct file *filp = iocb->ki_filp;
- ssize_t retval;
- unsigned long seg = 0;
- size_t count;
+ read_descriptor_t desc;
+ ssize_t retval = 0;
+ size_t count = iov_iter_count(iter);
loff_t *ppos = &iocb->ki_pos;
- count = 0;
- retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
- if (retval)
- return retval;
-
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
loff_t size;
@@ -1950,10 +1823,10 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
size = i_size_read(inode);
if (pos < size) {
retval = filemap_write_and_wait_range(mapping, pos,
- pos + iov_length(iov, nr_segs) - 1);
+ pos + count - 1);
if (!retval) {
- retval = mapping->a_ops->direct_IO(READ, iocb,
- iov, pos, nr_segs);
+ retval = mapping_direct_IO(mapping, READ,
+ iocb, iter, pos);
}
if (retval > 0) {
*ppos = pos + retval;
@@ -1975,42 +1848,49 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
}
}
- count = retval;
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
- loff_t offset = 0;
+ iov_iter_advance(iter, retval);
- /*
- * If we did a short DIO read we need to skip the section of the
- * iov that we've already read data into.
- */
- if (count) {
- if (count > iov[seg].iov_len) {
- count -= iov[seg].iov_len;
- continue;
- }
- offset = count;
- count = 0;
- }
+ desc.written = 0;
+ desc.arg.data = iter;
+ desc.count = count;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_iter_actor);
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base + offset;
- desc.count = iov[seg].iov_len - offset;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_generic_file_read(filp, ppos, &desc, file_read_actor);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
- if (desc.count > 0)
- break;
- }
+ retval += desc.written;
+ if (desc.error && !retval)
+ retval = desc.error;
out:
return retval;
}
+EXPORT_SYMBOL(generic_file_read_iter);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iov: io vector request
+ * @nr_segs: number of segments in the iovec
+ * @pos: current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct iov_iter iter;
+ int ret;
+ size_t count;
+
+ count = 0;
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (ret)
+ return ret;
+
+ iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+ return generic_file_read_iter(iocb, &iter, pos);
+}
EXPORT_SYMBOL(generic_file_aio_read);
#ifdef CONFIG_MMU
@@ -2477,150 +2357,6 @@ struct page *read_cache_page(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page);
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied. If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- BUG_ON(!in_atomic());
- kaddr = kmap_atomic(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap_atomic(kaddr);
-
- return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap(page);
- return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct iovec *iov = i->iov;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !iov->iov_len)) {
- int copy;
-
- copy = min(bytes, iov->iov_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- i->iov = iov;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
- char __user *buf = i->iov->iov_base + i->iov_offset;
- bytes = min(bytes, i->iov->iov_len - i->iov_offset);
- return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
- const struct iovec *iov = i->iov;
- if (i->nr_segs == 1)
- return i->count;
- else
- return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
/*
* Performs necessary checks before doing a write
*
@@ -2726,9 +2462,8 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
EXPORT_SYMBOL(pagecache_write_end);
ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long *nr_segs, loff_t pos, loff_t *ppos,
- size_t count, size_t ocount)
+generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, loff_t *ppos, size_t count)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -2737,10 +2472,13 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
size_t write_len;
pgoff_t end;
- if (count != ocount)
- *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+ if (count != iov_iter_count(iter)) {
+ written = iov_iter_shorten(iter, count);
+ if (written)
+ goto out;
+ }
- write_len = iov_length(iov, *nr_segs);
+ write_len = count;
end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2767,7 +2505,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
}
}
- written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+ written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2793,6 +2531,23 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
out:
return written;
}
+EXPORT_SYMBOL(generic_file_direct_write_iter);
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, size_t ocount)
+{
+ struct iov_iter iter;
+ ssize_t ret;
+
+ iov_iter_init(&iter, iov, *nr_segs, ocount, 0);
+ ret = generic_file_direct_write_iter(iocb, &iter, pos, ppos, count);
+ /* generic_file_direct_write_iter() might have shortened the vec */
+ if (*nr_segs != iter.nr_segs)
+ *nr_segs = iter.nr_segs;
+ return ret;
+}
EXPORT_SYMBOL(generic_file_direct_write);
/*
@@ -2926,18 +2681,15 @@ again:
}
ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
+generic_file_buffered_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, loff_t *ppos, ssize_t written)
{
struct file *file = iocb->ki_filp;
ssize_t status;
- struct iov_iter i;
virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
- iov_iter_init(&i, iov, nr_segs, count, written);
- status = generic_perform_write(file, &i, pos);
+ status = generic_perform_write(file, iter, pos);
if (likely(status >= 0)) {
written += status;
@@ -2946,13 +2698,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
return written ? written : status;
}
+EXPORT_SYMBOL(generic_file_buffered_write_iter);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, loff_t *ppos,
+ size_t count, ssize_t written)
+{
+ struct iov_iter iter;
+ iov_iter_init(&iter, iov, nr_segs, count, written);
+ return generic_file_buffered_write_iter(iocb, &iter, pos, ppos,
+ written);
+}
EXPORT_SYMBOL(generic_file_buffered_write);
/**
* __generic_file_aio_write - write data to a file
* @iocb: IO state structure (file, offset, etc.)
- * @iov: vector with data to write
- * @nr_segs: number of segments in the vector
+ * @iter: iov_iter specifying memory to write
* @ppos: position where to write
*
* This function does all the work needed for actually writing data to a
@@ -2967,24 +2730,18 @@ EXPORT_SYMBOL(generic_file_buffered_write);
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
*/
-ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t *ppos)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
- size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
loff_t pos;
ssize_t written;
ssize_t err;
- ocount = 0;
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- return err;
-
- count = ocount;
+ count = iov_iter_count(iter);
pos = *ppos;
/* We can write back this queue in page reclaim */
@@ -3011,8 +2768,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
loff_t endbyte;
ssize_t written_buffered;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
- ppos, count, ocount);
+ written = generic_file_direct_write_iter(iocb, iter, pos,
+ ppos, count);
if (written < 0 || written == count)
goto out;
/*
@@ -3021,9 +2778,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
*/
pos += written;
count -= written;
- written_buffered = generic_file_buffered_write(iocb, iov,
- nr_segs, pos, ppos, count,
- written);
+ iov_iter_advance(iter, written);
+ written_buffered = generic_file_buffered_write_iter(iocb, iter,
+ pos, ppos, written);
/*
* If generic_file_buffered_write() retuned a synchronous error
* then we want to return the number of bytes which were
@@ -3055,13 +2812,57 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
*/
}
} else {
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, ppos, count, written);
+ iter->count = count;
+ written = generic_file_buffered_write_iter(iocb, iter,
+ pos, ppos, written);
}
out:
current->backing_dev_info = NULL;
return written ? written : err;
}
+EXPORT_SYMBOL(__generic_file_write_iter);
+
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ ssize_t ret;
+
+ mutex_lock(&inode->i_mutex);
+ ret = __generic_file_write_iter(iocb, iter, &iocb->ki_pos);
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
+
+ssize_t
+__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct iov_iter iter;
+ size_t count;
+ int ret;
+
+ count = 0;
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+ if (ret)
+ goto out;
+
+ iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+ ret = __generic_file_write_iter(iocb, &iter, ppos);
+out:
+ return ret;
+}
EXPORT_SYMBOL(__generic_file_aio_write);
/**
diff --git a/mm/iov-iter.c b/mm/iov-iter.c
new file mode 100644
index 0000000..e6fc15a
--- /dev/null
+++ b/mm/iov-iter.c
@@ -0,0 +1,473 @@
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/hardirq.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+static size_t __iovec_copy_to_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0;
+ left = __copy_to_user_inatomic(buf, vaddr, copy);
+ copied += copy;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left))
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can from the page to the user buffer and return the
+ * number of bytes which were successfully copied. If a fault is encountered
+ * then return the number of bytes which were copied.
+ */
+static size_t ii_iovec_copy_to_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ char *kaddr;
+ size_t copied;
+
+ BUG_ON(!in_atomic());
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = iov->iov_base + i->iov_offset;
+ left = __copy_to_user_inatomic(buf, kaddr + offset, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+ iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_to_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * The page must not be locked.
+ */
+static size_t ii_iovec_copy_to_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = iov->iov_base + i->iov_offset;
+ left = copy_to_user(buf, kaddr + offset, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+ iov, i->iov_offset, bytes);
+ }
+ kunmap(page);
+ return copied;
+}
+
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0;
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
+ copied += copy;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left))
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied. If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ char *kaddr;
+ size_t copied;
+
+ BUG_ON(!in_atomic());
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * The page must not be locked.
+ */
+static size_t ii_iovec_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = iov->iov_base + i->iov_offset;
+ left = __copy_from_user(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ iov, i->iov_offset, bytes);
+ }
+ kunmap(page);
+ return copied;
+}
+
+static void ii_iovec_advance(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ struct iovec *iov = (struct iovec *)i->data;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overrunning the iovec).
+ */
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
+ int copy;
+
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->data = (unsigned long)iov;
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (i.e. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+static int ii_iovec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ char __user *buf = iov->iov_base + i->iov_offset;
+ bytes = min(bytes, iov->iov_len - i->iov_offset);
+ return fault_in_pages_readable(buf, bytes);
+}
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+static size_t ii_iovec_single_seg_count(const struct iov_iter *i)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ if (i->nr_segs == 1)
+ return i->count;
+ else
+ return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+static int ii_iovec_shorten(struct iov_iter *i, size_t count)
+{
+ struct iovec *iov = (struct iovec *)i->data;
+ i->nr_segs = iov_shorten(iov, i->nr_segs, count);
+ return 0;
+}
+
+struct iov_iter_ops ii_iovec_ops = {
+ .ii_copy_to_user_atomic = ii_iovec_copy_to_user_atomic,
+ .ii_copy_to_user = ii_iovec_copy_to_user,
+ .ii_copy_from_user_atomic = ii_iovec_copy_from_user_atomic,
+ .ii_copy_from_user = ii_iovec_copy_from_user,
+ .ii_advance = ii_iovec_advance,
+ .ii_fault_in_readable = ii_iovec_fault_in_readable,
+ .ii_single_seg_count = ii_iovec_single_seg_count,
+ .ii_shorten = ii_iovec_shorten,
+};
+EXPORT_SYMBOL(ii_iovec_ops);
+
+/*
+ * As an easily verifiable first pass, we implement all the methods that
+ * copy data to and from bvec pages with one function. We implement it
+ * all with kmap_atomic().
+ */
+static size_t bvec_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+ unsigned long page_offset, size_t bytes,
+ int topage)
+{
+ struct bio_vec *bvec = (struct bio_vec *)iter->data;
+ size_t bvec_offset = iter->iov_offset;
+ size_t remaining = bytes;
+ void *bvec_map;
+ void *page_map;
+ size_t copy;
+
+ page_map = kmap_atomic(page);
+
+ BUG_ON(bytes > iter->count);
+ while (remaining) {
+ BUG_ON(bvec->bv_len == 0);
+ BUG_ON(bvec_offset >= bvec->bv_len);
+ copy = min(remaining, bvec->bv_len - bvec_offset);
+ bvec_map = kmap_atomic(bvec->bv_page);
+ if (topage)
+ memcpy(page_map + page_offset,
+ bvec_map + bvec->bv_offset + bvec_offset,
+ copy);
+ else
+ memcpy(bvec_map + bvec->bv_offset + bvec_offset,
+ page_map + page_offset,
+ copy);
+ kunmap_atomic(bvec_map);
+ remaining -= copy;
+ bvec_offset += copy;
+ page_offset += copy;
+ if (bvec_offset == bvec->bv_len) {
+ bvec_offset = 0;
+ bvec++;
+ }
+ }
+
+ kunmap_atomic(page_map);
+
+ return bytes;
+}
+
+size_t ii_bvec_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_to_user(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_bvec_copy_from_user(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+/*
+ * bio_vecs have a stricter structure than iovecs that might have
+ * come from userspace. There are no zero-length bio_vec elements.
+ */
+void ii_bvec_advance(struct iov_iter *i, size_t bytes)
+{
+ struct bio_vec *bvec = (struct bio_vec *)i->data;
+ size_t offset = i->iov_offset;
+ size_t delta;
+
+ BUG_ON(i->count < bytes);
+ while (bytes) {
+ BUG_ON(bvec->bv_len == 0);
+ BUG_ON(bvec->bv_len <= offset);
+ delta = min(bytes, bvec->bv_len - offset);
+ offset += delta;
+ i->count -= delta;
+ bytes -= delta;
+ if (offset == bvec->bv_len) {
+ bvec++;
+ offset = 0;
+ }
+ }
+
+ i->data = (unsigned long)bvec;
+ i->iov_offset = offset;
+}
+
+/*
+ * pages pointed to by bio_vecs are always pinned.
+ */
+int ii_bvec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ return 0;
+}
+
+size_t ii_bvec_single_seg_count(const struct iov_iter *i)
+{
+ const struct bio_vec *bvec = (struct bio_vec *)i->data;
+ if (i->nr_segs == 1)
+ return i->count;
+ else
+ return min(i->count, bvec->bv_len - i->iov_offset);
+}
+
+static int ii_bvec_shorten(struct iov_iter *i, size_t count)
+{
+ return -EINVAL;
+}
+
+struct iov_iter_ops ii_bvec_ops = {
+ .ii_copy_to_user_atomic = ii_bvec_copy_to_user_atomic,
+ .ii_copy_to_user = ii_bvec_copy_to_user,
+ .ii_copy_from_user_atomic = ii_bvec_copy_from_user_atomic,
+ .ii_copy_from_user = ii_bvec_copy_from_user,
+ .ii_advance = ii_bvec_advance,
+ .ii_fault_in_readable = ii_bvec_fault_in_readable,
+ .ii_single_seg_count = ii_bvec_single_seg_count,
+ .ii_shorten = ii_bvec_shorten,
+};
+EXPORT_SYMBOL(ii_bvec_ops);
+
+/* Operations for an iov_iter backed by a single page */
+
+static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+ unsigned long page_offset, size_t bytes,
+ int topage)
+{
+ struct page *ipage = (struct page *)iter->data;
+ size_t ipage_offset = iter->iov_offset;
+ void *ipage_map;
+ void *page_map;
+
+ BUG_ON(bytes > iter->count);
+ BUG_ON(bytes > PAGE_SIZE - ipage_offset);
+ BUG_ON(ipage_offset >= PAGE_SIZE);
+
+ page_map = kmap_atomic(page);
+ ipage_map = kmap_atomic(ipage);
+
+ if (topage)
+ memcpy(page_map + page_offset,
+ ipage_map + ipage_offset,
+ bytes);
+ else
+ memcpy(ipage_map + ipage_offset,
+ page_map + page_offset,
+ bytes);
+
+ kunmap_atomic(ipage_map);
+ kunmap_atomic(page_map);
+
+ return bytes;
+}
+
+size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_to_user(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_page_copy_from_user(struct page *page, struct iov_iter *i,
+ unsigned long offset, size_t bytes)
+{
+ return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+void ii_page_advance(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+ BUG_ON(i->iov_offset >= PAGE_SIZE);
+ BUG_ON(bytes > PAGE_SIZE - i->iov_offset);
+
+ i->iov_offset += bytes;
+ i->count -= bytes;
+}
+
+/*
+ * the single page backing this iterator is always pinned.
+ */
+int ii_page_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ return 0;
+}
+
+size_t ii_page_single_seg_count(const struct iov_iter *i)
+{
+ BUG_ON(i->nr_segs != 1);
+
+ return i->count;
+}
+
+static int ii_page_shorten(struct iov_iter *i, size_t count)
+{
+ return -EINVAL;
+}
+
+struct iov_iter_ops ii_page_ops = {
+ .ii_copy_to_user_atomic = ii_page_copy_to_user_atomic,
+ .ii_copy_to_user = ii_page_copy_to_user,
+ .ii_copy_from_user_atomic = ii_page_copy_from_user_atomic,
+ .ii_copy_from_user = ii_page_copy_from_user,
+ .ii_advance = ii_page_advance,
+ .ii_fault_in_readable = ii_page_fault_in_readable,
+ .ii_single_seg_count = ii_page_single_seg_count,
+ .ii_shorten = ii_page_shorten,
+};
+EXPORT_SYMBOL(ii_page_ops);
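Finally, a short sketch exercising the dispatch layer on the single-page
backend. Both pages are hypothetical; the same two calls work unchanged on
iovec- or bvec-backed iterators:

#include <linux/fs.h>
#include <linux/mm_types.h>

/*
 * Copy bytes (at most PAGE_SIZE) from src into dst through the iov_iter
 * layer; for the page backend the "copy from user" op is a plain memcpy.
 */
static size_t example_page_copy(struct page *dst, struct page *src,
                                size_t bytes)
{
        struct iov_iter iter;
        size_t copied;

        iov_iter_init_page(&iter, src, bytes, 0);
        copied = iov_iter_copy_from_user(dst, &iter, 0, bytes);
        iov_iter_advance(&iter, copied);
        return copied;
}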