[Devel] [PATCH RH9 09/12] ext4: add mfsync support

Kirill Tkhai ktkhai at virtuozzo.com
Thu Oct 7 13:20:43 MSK 2021


From: Dmitry Monakhov <dmonakhov at openvz.org>

Add the EXT4_IOC_MFSYNC ioctl, which allows syncing a given set of files
in an optimized way (only one barrier is required in the best case).

https://jira.sw.ru/browse/PSBM-18567

Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>

+++
Comment on rebasing to rh7 kernel-3.10.0-229.7.2.el7:

1) compile fix for ext4-add-mfsync-support

   ext4_flush_unwritten_io was removed in rh7-3.10.0-229.7.2

   https://jira.sw.ru/browse/PSBM-34909

2) compile fix for ext4-add-mfsync-support part2

   __sync_inode was removed in rh7-3.10.0-229.7.2
   It is more honest to simply disable mfsync in nojournal mode, since we
   do not test nojournal mode at all.

   https://jira.sw.ru/browse/PSBM-34910

Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>

Rebase to vz8 kernel note:
  mutex_{lock,unlock}(&inode->i_mutex) -> inode_{lock,unlock}_shared(inode)

Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 fs/ext4/ext4.h              |    7 +++
 fs/ext4/fsync.c             |  108 +++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/ioctl.c             |   60 ++++++++++++++++++++++++
 include/trace/events/ext4.h |   54 ++++++++++++++++++++++
 4 files changed, 229 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index df46d5586ca1..5f6fdd5514b2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -615,6 +615,11 @@ struct compat_ext4_new_group_input {
 };
 #endif
 
+struct ext4_ioc_mfsync_info {
+	__u32 size;	/* number of entries in fd[] */
+	__u32 fd[];	/* file descriptors; bit 31 set requests datasync */
+};
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -722,6 +727,7 @@ enum {
 #define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
 #define EXT4_IOC_OPEN_BALLOON		_IO('f', 42)
 #define EXT4_IOC_CHECKPOINT		_IOW('f', 43, __u32)
+#define EXT4_IOC_MFSYNC			_IO('f', 43)
 
 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
 
@@ -2814,6 +2820,7 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 027a7d7037a0..8179066765bd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -185,3 +185,111 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
+
+/* Sync @nr_files files of one ext4 sb, ideally with a single barrier.
+ * flags[i] != 0: use the datasync tid for files[i]. Returns 0 or -errno.
+ */
+int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files)
+{
+	struct super_block *sb;
+	journal_t *journal;
+	int err = 0, err2 = 0, i = 0, j = 0;
+	int force_commit = 0, need_barrier = 0;
+	tid_t commit_tid = 0;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+	if (!nr_files)
+		return 0;
+
+	sb = files[0]->f_mapping->host->i_sb;
+	journal = EXT4_SB(sb)->s_journal;
+	if (sb->s_flags & SB_RDONLY) {
+		/* Make sure that we read updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
+		return 0;
+	}
+	for (i = 0; i < nr_files; i++) {
+		struct address_space *mapping = files[i]->f_mapping;
+		struct inode *inode = mapping->host;
+
+		BUG_ON(sb != inode->i_sb);
+		if (!mapping->nrpages)
+			continue;
+
+		err = filemap_fdatawrite(mapping);
+		if (err)
+			break;
+	}
+	/*
+	 * Even if the above returned error, the pages may be
+	 * written partially (e.g. -ENOSPC), so we wait for it.
+	 * But the -EIO is special case, it may indicate the worst
+	 * thing (e.g. bug) happened, so we avoid waiting for it.
+	 */
+	if (err == -EIO)
+		goto out;
+
+	for (j = 0; j < i; j++) {
+		struct address_space *mapping = files[j]->f_mapping;
+		struct inode *inode = mapping->host;
+		struct ext4_inode_info *ei = EXT4_I(inode);
+		unsigned int datasync = flags[j];
+		tid_t tid;
+
+		if (mapping->nrpages) {
+			err2 = filemap_fdatawait(mapping);
+			if (!err || err2 == -EIO)
+				err = err2;
+		}
+
+		inode_lock_shared(inode);
+		force_commit |= ext4_should_journal_data(inode);
+		tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+		inode_unlock_shared(inode);
+		trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid, datasync);
+		if (j == 0 || !tid_geq(commit_tid, tid))
+			commit_tid = tid;	/* remember the newest tid seen */
+	}
+
+	/* Ext4 specific stuff starts here */
+	if (!journal) {
+		return -ENOTSUPP;	/* nojournal mode is not supported */
+	} else if (force_commit) {
+		/* data=journal:
+		 *  filemap_fdatawrite won't do anything (the buffers are clean).
+		 *  ext4_force_commit will write the file data into the journal and
+		 *  will wait on that.
+		 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+		 *  (they were dirtied by commit).  But that's OK - the blocks are
+		 *  safe in-journal, which is all fsync() needs to ensure.
+		 */
+		err2 = ext4_force_commit(sb);
+	} else {
+		/*
+		 * data=writeback,ordered:
+		 * The caller's filemap_fdatawrite()/wait will sync the data.
+		 * Metadata is in the journal, we wait for proper transaction to
+		 * commit here.
+		 */
+		if (journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+			need_barrier = true;
+
+		err2 = jbd2_complete_transaction(journal, commit_tid);
+		/* Even if we had to wait for commit completion, it does not
+		 * mean a flush has been issued after data demanded by this
+		 * fsync were written back (the commit may already be past
+		 * its own flush), so flush here. A failed flush is reported;
+		 * otherwise the commit status already in err2 is kept.
+		 */
+		if (need_barrier)
+			err2 = blkdev_issue_flush(sb->s_bdev) ?: err2;
+	}
+out:
+	trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid, need_barrier);
+	if (!err || err2 == -EIO)
+		err = err2;
+	return err;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6e2be4859571..0bac68174793 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -903,6 +903,63 @@ static int ext4_open_balloon(struct super_block *sb, struct vfsmount *mnt)
 	return err;
 }
 
+/* EXT4_IOC_MFSYNC handler: sync all listed fds via one ext4_sync_files(). */
+static int ext4_mfsync(unsigned long arg)
+{
+	struct ext4_ioc_mfsync_info __user *uarg = (void __user *)arg;
+	struct ext4_ioc_mfsync_info mfsync;
+	struct file **filpp;
+	unsigned int *flags;
+	int i, err;
+
+	if (!ve_is_super(get_exec_env()))
+		return -ENOTSUPP;
+	if (copy_from_user(&mfsync, uarg, sizeof(mfsync)))
+		return -EFAULT;
+
+	if (mfsync.size == 0)
+		return 0;
+	if (mfsync.size > NR_FILE)
+		return -ENFILE;
+
+	filpp = kcalloc(mfsync.size, sizeof(*filpp), GFP_KERNEL);
+	if (!filpp)
+		return -ENOMEM;
+	flags = kcalloc(mfsync.size, sizeof(*flags), GFP_KERNEL);
+	if (!flags) {
+		kfree(filpp);
+		return -ENOMEM;
+	}
+	for (i = 0; i < mfsync.size; i++) {
+		__u32 fd;
+
+		err = -EFAULT;
+		if (get_user(fd, &uarg->fd[i]))
+			goto mfsync_fput;
+
+		/* bit 31 set ("negative" fd) means fdata_sync */
+		flags[i] = (fd & (1U << 31)) != 0;
+		fd &= ~(1U << 31);
+
+		err = -EBADF;
+		filpp[i] = fget(fd);
+		if (!filpp[i])
+			goto mfsync_fput;
+		/* ext4_sync_files() BUG()s on files from different sbs */
+		err = -EINVAL;
+		if (filpp[i]->f_mapping->host->i_sb != filpp[0]->f_mapping->host->i_sb)
+			goto mfsync_fput;
+	}
+	err = ext4_sync_files(filpp, flags, mfsync.size);
+mfsync_fput:
+	for (i = 0; i < mfsync.size; i++)
+		if (filpp[i])
+			fput(filpp[i]);
+	kfree(filpp);
+	kfree(flags);
+	return err;
+}
+
 static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -1298,6 +1355,9 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_SHUTDOWN:
 		return ext4_shutdown(sb, arg);
 
+	case EXT4_IOC_MFSYNC:
+		return ext4_mfsync(arg);
+
 	case FS_IOC_ENABLE_VERITY:
 		if (!ext4_has_feature_verity(sb))
 			return -EOPNOTSUPP;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 0ea36b2b0662..d441a01335df 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -966,6 +966,60 @@ TRACE_EVENT(ext4_sync_file_exit,
 		  __entry->ret)
 );
 
+/* Emitted by ext4_sync_files() once per waited file, with the tid picked for it */
+TRACE_EVENT(ext4_sync_files_iterate,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int datasync),
+	TP_ARGS(dentry, tid, datasync),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	datasync		)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->datasync	= datasync;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+	),
+
+	TP_printk("dev %d,%d ino %lu parent %lu datasync %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->datasync,
+		  __entry->tid)
+);
+
+/* Emitted at the "out" label of ext4_sync_files(); dentry is that of files[0] */
+TRACE_EVENT(ext4_sync_files_exit,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int barrier),
+	TP_ARGS(dentry, tid, barrier),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	barrier			)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+		__entry->barrier	= barrier;
+	),
+
+	TP_printk("dev %d,%d ino %lu parent %lu explicit_barrier %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->barrier,
+		  __entry->tid)
+);
+
 TRACE_EVENT(ext4_sync_fs,
 	TP_PROTO(struct super_block *sb, int wait),
 




More information about the Devel mailing list