[Devel] [PATCH RHEL9 COMMIT] ve/fs: support per-CT disable of filesystem sync operations

Konstantin Khorenko khorenko at virtuozzo.com
Thu Dec 9 11:23:21 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.34
------>
commit 5e0e63e3b426f44d261c76058855f9f740b3859c
Author: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
Date:   Thu Dec 9 11:23:21 2021 +0300

    ve/fs: support per-CT disable of filesystem sync operations
    
    This patch adds fs.fsync-enable sysctl (virtualized, can be changed from
    Containers).
    
    In ve0, the value of this sysctl is ignored.
    
    In veX, setting this to zero will result in:
     - sync(2), syncfs(2), fsync(2), fdatasync(2), sync_file_range(2),
       msync(2) silently do nothing,
     - open(2) and fcntl(2) silently clear O_SYNC flag if it is passed.
    
    Operation of AIO is not affected.
    
    Changing this sysctl does not affect O_SYNC flag of already opened
    file descriptors.
    
    Default sysctl value: 1 (syncs are enabled in Container).
    
    https://jira.sw.ru/browse/PSBM-44684
    Feature: fs: per-CT sync behavior management
    
    Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/fcntl.c          |  2 ++
 fs/open.c           |  3 +++
 fs/sync.c           | 26 ++++++++++++++++++++++++--
 include/linux/fs.h  | 12 ++++++++++++
 include/linux/ve.h  |  2 ++
 kernel/ve/ve.c      |  2 ++
 kernel/ve/veowner.c |  8 ++++++++
 mm/msync.c          |  2 ++
 8 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2e0c8515bd1a..8af146ea9231 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -68,6 +68,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	if (!may_use_odirect())
 		arg &= ~O_DIRECT;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		arg &= ~O_SYNC;
 	/*
 	 * O_APPEND cannot be cleared if the file is marked as append-only
 	 * and the file is open for write.
diff --git a/fs/open.c b/fs/open.c
index 040df8bc6e76..65e60aa661a8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -785,6 +785,9 @@ static int do_dentry_open(struct file *f,
 	if (!may_use_odirect())
 		f->f_flags &= ~O_DIRECT;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		f->f_flags &= ~O_SYNC;
+
 	if (unlikely(f->f_flags & O_PATH)) {
 		f->f_mode = FMODE_PATH | FMODE_OPENED;
 		f->f_op = &empty_fops;
diff --git a/fs/sync.c b/fs/sync.c
index 1373a610dc78..cca10ec7a90f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -16,6 +16,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
+#include <linux/ve.h>
 #include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -96,6 +97,17 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
+int ve_fsync_behavior(void)	/* returns FSYNC_ALWAYS or the caller's per-VE setting */
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (ve_is_super(ve))	/* the sysctl value is ignored on this path */
+		return FSYNC_ALWAYS;
+	else
+		return ve->fsync_enable;	/* raw stored value, not clamped here */
+}
+
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -110,6 +122,9 @@ void ksys_sync(void)
 {
 	int nowait = 0, wait = 1;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		return;
+
 	wakeup_flusher_threads(WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
 	iterate_supers(sync_fs_one_sb, &nowait);
@@ -162,18 +177,22 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct fd f = fdget(fd);
 	struct super_block *sb;
-	int ret, ret2;
+	int ret = 0, ret2 = 0;
 
 	if (!f.file)
 		return -EBADF;
 	sb = f.file->f_path.dentry->d_sb;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto fdput;
+
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
 	up_read(&sb->s_umount);
 
 	ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
 
+fdput:
 	fdput(f);
 	return ret ? ret : ret2;
 }
@@ -221,7 +240,10 @@ static int do_fsync(unsigned int fd, int datasync)
 	int ret = -EBADF;
 
 	if (f.file) {
-		ret = vfs_fsync(f.file, datasync);
+		if (ve_fsync_behavior() != FSYNC_NEVER)
+			ret = vfs_fsync(f.file, datasync);
+		else
+			ret = 0;
 		fdput(f);
 	}
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 01419dbd864b..fb21d1a32cdb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3146,6 +3146,18 @@ extern bool path_is_under(const struct path *, const struct path *);
 
 extern char *file_path(struct file *, char *, int);
 
+#define FSYNC_NEVER	0	/* ve syncs are ignored    */
+#define FSYNC_ALWAYS	1	/* ve syncs work as usual  */
+
+#ifdef CONFIG_VE
+int ve_fsync_behavior(void);
+#else
+static inline int ve_fsync_behavior(void)
+{
+	return FSYNC_ALWAYS;
+}
+#endif
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 1a66063d9ba8..4c8f7d308829 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -61,6 +61,8 @@ struct ve_struct {
 	struct kstat_lat_pcpu_struct    sched_lat_ve;
 	int			odirect_enable;
 
+	int			fsync_enable;
+
 #if IS_ENABLED(CONFIG_BINFMT_MISC)
 	struct binfmt_misc	*binfmt_misc;
 #endif
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 48be49337a87..e94aa90aff25 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -70,6 +70,7 @@ struct ve_struct ve0 = {
 	.sched_lat_ve.cur	= &ve0_lat_stats,
 	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
 	.netns_max_nr		= INT_MAX,
+	.fsync_enable		= FSYNC_ALWAYS,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
 					1,
@@ -930,6 +931,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
 
 	ve->odirect_enable = 2;
+	ve->fsync_enable = FSYNC_ALWAYS;
 
 	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
 	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
index b0aba35b6be9..e255fe57d447 100644
--- a/kernel/ve/veowner.c
+++ b/kernel/ve/veowner.c
@@ -7,6 +7,7 @@
  *
  */
 
+#include <linux/ve.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
@@ -66,6 +67,13 @@ static struct ctl_table vz_fs_table[] = {
 		.extra1		= &ve_mount_nr_min,
 		.extra2		= &ve_mount_nr_max,
 	},
+	{
+		.procname	= "fsync-enable",
+		.data		= &ve0.fsync_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec_virtual,
+	},
 	{ }
 };
 
diff --git a/mm/msync.c b/mm/msync.c
index 137d1c104f3e..20737eb4b76b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -51,6 +51,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 	if (end < start)
 		goto out;
 	error = 0;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto out;
 	if (end == start)
 		goto out;
 	/*


More information about the Devel mailing list