[Devel] [PATCH RHEL7 COMMIT] ext4: add mfsync support
Konstantin Khorenko
khorenko at virtuozzo.com
Mon Jun 22 03:38:33 PDT 2015
The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.15
------>
commit 4eedf21279755db123027a35b7c6bfeb193078ca
Author: Dmitry Monakhov <dmonakhov at openvz.org>
Date: Mon Jun 22 14:38:33 2015 +0400
ext4: add mfsync support
Add the EXT4_IOC_MFSYNC ioctl, which allows syncing a given set of files
in an optimized way (in the best case only one barrier is required).
https://jira.sw.ru/browse/PSBM-18567
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
---
fs/ext4/ext4.h | 7 +++
fs/ext4/fsync.c | 122 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/ioctl.c | 53 +++++++++++++++++++
include/trace/events/ext4.h | 54 ++++++++++++++++++++
4 files changed, 236 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 806bbef..a4c98cc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -523,6 +523,11 @@ struct compat_ext4_new_group_input {
};
#endif
+struct ext4_ioc_mfsync_info { /* argument block read by the EXT4_IOC_MFSYNC ioctl handler */
+ __u32 size; /* number of entries in fd[]; the handler iterates 0..size-1 */
+ __u32 fd[0]; /* file descriptors to sync; bit 31 set selects fdatasync semantics for that fd */
+};
+
/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
struct ext4_new_group_data {
__u32 group;
@@ -615,6 +620,7 @@ enum {
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_OPEN_BALLOON _IO('f', 42)
+#define EXT4_IOC_MFSYNC _IO('f', 43)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -2032,6 +2038,7 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
extern int ext4_flush_unwritten_io(struct inode *);
/* hash.c */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 427b228..519fe54 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -182,3 +182,125 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext4_sync_file_exit(inode, ret);
return ret;
}
+
+/*
+ * ext4_sync_files - sync a set of files that share one super block.
+ * @files:    array of @nr_files open files. Every entry must belong to the
+ *            same super block as files[0]; this is asserted with BUG_ON(),
+ *            so callers have to validate untrusted input beforehand.
+ * @flags:    per-file flag; a non-zero flags[j] selects the inode's
+ *            i_datasync_tid instead of i_sync_tid for files[j].
+ * @nr_files: number of entries in @files and @flags.
+ *
+ * Writeback is started on all mappings first and only then waited for, so
+ * the I/O of different files can overlap. Afterwards a single journal
+ * commit (plus at most one explicit cache flush) covers the whole set.
+ *
+ * Returns 0 on success or a negative errno. When several steps fail the
+ * first error is reported, except that -EIO always takes precedence.
+ */
+int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files)
+{
+	struct super_block *sb;
+	journal_t *journal;
+	int err = 0, err2 = 0, i = 0, j = 0;
+	int force_commit = 0, datawriteback = 0;
+	tid_t commit_tid = 0;
+	int need_barrier = 0;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+	if (!nr_files)
+		return 0;
+
+	sb = files[0]->f_mapping->host->i_sb;
+	journal = EXT4_SB(sb)->s_journal;
+	if (sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
+		return 0;
+	}
+
+	/* Phase 1: kick off writeback for every file that has cached pages. */
+	for (i = 0; i < nr_files; i++) {
+		struct address_space *mapping = files[i]->f_mapping;
+		struct inode *inode = mapping->host;
+
+		BUG_ON(sb != inode->i_sb);
+		if (!mapping->nrpages)
+			continue;
+
+		err = filemap_fdatawrite(mapping);
+		if (err)
+			break;
+	}
+	/*
+	 * Even if the above returned error, the pages may be
+	 * written partially (e.g. -ENOSPC), so we wait for it.
+	 * But the -EIO is special case, it may indicate the worst
+	 * thing (e.g. bug) happened, so we avoid waiting for it.
+	 */
+	if (err == -EIO)
+		goto out;
+
+	/*
+	 * Phase 2: wait for the writeback started above (files[0..i-1]) and
+	 * collect, per inode, the journalling mode and the transaction id
+	 * that has to be committed.
+	 */
+	for (j = 0; j < i; j++) {
+		struct address_space *mapping = files[j]->f_mapping;
+		struct inode *inode = mapping->host;
+		struct ext4_inode_info *ei = EXT4_I(inode);
+		unsigned int datasync = flags[j];
+		tid_t tid;
+
+		if (mapping->nrpages) {
+			err2 = filemap_fdatawait(mapping);
+			if (!err || err2 == -EIO)
+				err = err2;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		err2 = ext4_flush_unwritten_io(inode);
+		if (!err || err2 == -EIO)
+			err = err2;
+		force_commit |= ext4_should_journal_data(inode);
+		datawriteback |= ext4_should_writeback_data(inode);
+		tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+		mutex_unlock(&inode->i_mutex);
+		trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid, datasync);
+		/* Track the tid that the current commit_tid does not yet cover. */
+		if (j == 0 || !tid_geq(commit_tid, tid))
+			commit_tid = tid;
+	}
+
+	/* Ext4 specific stuff starts here */
+	if (!journal) {
+		for (j = 0; j < i; j++) {
+			/* Implementation is suboptimal because issue barrier for each
+			 * inode */
+			struct address_space *mapping = files[j]->f_mapping;
+			struct inode *inode = mapping->host;
+
+			err2 = __sync_inode(inode, flags[j]);
+			if (!err2 && !hlist_empty(&inode->i_dentry))
+				err2 = ext4_sync_parent(inode);
+			if (!err)
+				err = err2;
+		}
+	} else if (force_commit) {
+		/* data=journal:
+		 * filemap_fdatawrite won't do anything (the buffers are clean).
+		 * ext4_force_commit will write the file data into the journal and
+		 * will wait on that.
+		 * filemap_fdatawait() will encounter a ton of newly-dirtied pages
+		 * (they were dirtied by commit). But that's OK - the blocks are
+		 * safe in-journal, which is all fsync() needs to ensure.
+		 */
+		err2 = ext4_force_commit(sb);
+	} else {
+		/*
+		 * data=writeback,ordered:
+		 * The caller's filemap_fdatawrite()/wait will sync the data.
+		 * Metadata is in the journal, we wait for proper transaction to
+		 * commit here.
+		 */
+		if (journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+			need_barrier = 1;
+
+		err2 = jbd2_complete_transaction(journal, commit_tid);
+		/* Even if we had to wait for commit completion, it does not
+		 * mean a flush has been issued after data demanded by this
+		 * fsync were written back. Commit could be in state after
+		 * it is already done, but not yet in state where we should
+		 * not wait.
+		 */
+		if (need_barrier) {
+			int flush_err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+
+			/*
+			 * Do not let a successful flush hide a failed commit:
+			 * keep the commit error unless the flush reports -EIO.
+			 */
+			if (!err2 || flush_err == -EIO)
+				err2 = flush_err;
+		}
+	}
+out:
+	trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid, need_barrier);
+	if (!err || err2 == -EIO)
+		err = err2;
+	return err;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1de39c3..597ab44 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -716,7 +716,60 @@ resizefs_out:
return ext4_dump_pfcache(inode->i_sb,
(struct pfcache_dump_request __user *) arg);
+	case EXT4_IOC_MFSYNC:
+	{
+		/*
+		 * Sync a user supplied set of file descriptors in one go.
+		 * arg points to a struct ext4_ioc_mfsync_info in user memory:
+		 * a header with the entry count followed by the fd array.
+		 * Returns 0 on success or a negative errno.
+		 */
+		struct ext4_ioc_mfsync_info __user *umfsync =
+			(struct ext4_ioc_mfsync_info __user *)arg;
+		struct ext4_ioc_mfsync_info mfsync;
+		struct file **filpp;
+		unsigned int *flags;
+		unsigned int i;
+		int err;
+
+		if (copy_from_user(&mfsync, umfsync, sizeof(mfsync))) {
+			printk("%s:%d", __FUNCTION__, __LINE__);
+			return -EFAULT;
+		}
+		if (mfsync.size == 0)
+			return 0;
+		/*
+		 * kcalloc() zeroes the arrays and rejects a count whose byte
+		 * size would overflow; the element is a pointer, not a
+		 * struct file.
+		 */
+		filpp = kcalloc(mfsync.size, sizeof(*filpp), GFP_KERNEL);
+		if (!filpp)
+			return -ENOMEM;
+		flags = kcalloc(mfsync.size, sizeof(*flags), GFP_KERNEL);
+		if (!flags) {
+			kfree(filpp);
+			return -ENOMEM;
+		}
+		for (i = 0; i < mfsync.size; i++) {
+			__u32 ufd;
+			int ret;
+
+			err = -EFAULT;
+			/*
+			 * The fd array lives in user memory right behind the
+			 * header, so it must be addressed through the user
+			 * pointer and not through the on-stack header copy.
+			 */
+			ret = get_user(ufd, umfsync->fd + i);
+			if (ret) {
+				printk("%s:%d i:%d p:%p", __FUNCTION__, __LINE__,
+				       i, umfsync->fd + i);
+				goto mfsync_fput;
+			}
+			/* negative fd means fdata_sync */
+			flags[i] = (ufd & (1U << 31)) != 0;
+			ufd &= ~(1U << 31);
+
+			err = -EBADF;
+			filpp[i] = fget(ufd);
+			if (!filpp[i]) {
+				printk("%s:%d", __FUNCTION__, __LINE__);
+				goto mfsync_fput;
+			}
+			/*
+			 * ext4_sync_files() asserts that all files share one
+			 * super block; refuse foreign files here instead of
+			 * letting userspace trigger that assertion.
+			 */
+			err = -EINVAL;
+			if (filpp[i]->f_mapping->host->i_sb != inode->i_sb)
+				goto mfsync_fput;
+		}
+		err = ext4_sync_files(filpp, flags, mfsync.size);
+mfsync_fput:
+		/* Entries that were never filled in are still NULL. */
+		for (i = 0; i < mfsync.size; i++)
+			if (filpp[i])
+				fput(filpp[i]);
+		kfree(filpp);
+		kfree(flags);
+		return err;
+	}
default:
return -ENOTTY;
}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 43d9977..6195a9e 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -856,6 +856,60 @@ TRACE_EVENT(ext4_sync_file_exit,
__entry->ret)
);
+/*
+ * ext4_sync_files_iterate - emitted once per file while ext4_sync_files()
+ * collects transaction ids. Records the device, the inode number of the
+ * file and of its parent directory, the datasync flag and the tid chosen
+ * for that file.
+ */
+TRACE_EVENT(ext4_sync_files_iterate,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int datasync),
+
+	TP_ARGS(dentry, tid, datasync),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	ino_t,		ino		)
+		__field(	ino_t,		parent		)
+		__field(	int,		datasync	)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->datasync	= datasync;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+	),
+
+	/* Inode numbers are cast to unsigned long, so print them with %lu. */
+	TP_printk("dev %d,%d ino %lu parent %lu datasync %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->datasync,
+		  __entry->tid)
+);
+
+
+/*
+ * ext4_sync_files_exit - emitted when ext4_sync_files() finishes. Records
+ * the device, the inode number of the passed dentry and of its parent
+ * directory, the transaction id that was waited for and whether an
+ * explicit barrier was requested.
+ */
+TRACE_EVENT(ext4_sync_files_exit,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int barrier),
+
+	TP_ARGS(dentry, tid, barrier),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,		dev		)
+		__field(	ino_t,		ino		)
+		__field(	ino_t,		parent		)
+		__field(	int,		barrier		)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+		__entry->barrier	= barrier;
+	),
+
+	/* Inode numbers are cast to unsigned long, so print them with %lu. */
+	TP_printk("dev %d,%d ino %lu parent %lu explicit_barrier %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->barrier,
+		  __entry->tid)
+);
+
+
TRACE_EVENT(ext4_sync_fs,
TP_PROTO(struct super_block *sb, int wait),
More information about the Devel
mailing list