[Devel] [PATCH RHEL8 COMMIT] ve/fs/sync: Per containter sync and syncfs and fs.fsync-enable sysctl

Konstantin Khorenko khorenko at virtuozzo.com
Thu Apr 15 16:55:59 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.14
------>
commit a3d65258a0a116eb5d766f6d63db9dfaaa15cde2
Author: Konstantin Khorenko <khorenko at virtuozzo.com>
Date:   Wed Apr 14 10:57:51 2021 +0300

    ve/fs/sync: Per containter sync and syncfs and fs.fsync-enable sysctl
    
    "sync/fsync" called from inside a Container might have different behavior.
    
    Affects sys_sync, sys_fsync, sys_fdatasync, sys_sync_file_range
    syscalls.
    aio_fsync (sys_io_submit) not affected.
    
    syncs cannot be disabled for ve0.
    All values described below (even if set on ve0) affect veX behavior only.
    
    Possible values for the Hardware Node:
    ======================================
    0 (FSYNC_NEVER)         CT fsync and syncs are ignored
    1 (FSYNC_ALWAYS)        CT fsync and syncs work as usual, all inodes
                            for all filesystems will be synced
    2 (FSYNC_FILTERED)      CT fsync as usual, syncs only its file data
                            (only CT-related files and filesystems will be flushed)
    
    Possible values inside a Container:
    ======================================
    0                       CT fsync and syncs are ignored
    2                       Use HN global value
    any other value         CT fsync as usual, CT syncs are always filtered
                            (behaves like HN value 2, FSYNC_FILTERED)
    
    Default kernel value (for both HN and CT): 2 (FSYNC_FILTERED).
    
    =====================================================
    ve/fs: Port fs.fsync-enable and fs.odirect_enable sysctls
    
    This is a part of 74-diff-ve-mix-combined.
    
    https://jira.sw.ru/browse/PSBM-17903
    Signed-off-by: Kirill Tkhai <ktkhai at parallels.com>
    
    =====================================================
    ve/fs: check container odirect and fsync settings in __dentry_open
    
    sys_open for conventional filesystems doesn't call dentry_open,
    it calls __dentry_open (in nameidata_to_filp), so we have to move
    checks for odirect and fsync behaviour to __dentry_open
    to make them working on ploop containers.
    
    https://jira.sw.ru/browse/PSBM-17157
    
    Signed-off-by: Dmitry Guryanov <dguryanov at parallels.com>
    Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>
    
    ================================================
    ve: initialize fsync_enable also for non ve0 environment
    
    Patchset description:
    
    ve: fix initialization and remove sysctl_fsync_enable
    
    v2:
    - initialize only on ve cgroup creation, remove get_ve_features
    - rename setup_iptables_mask into ve_setup_iptables_mask
    
    https://jira.sw.ru/browse/PSBM-34286
    https://jira.sw.ru/browse/PSBM-34285
    
    Pavel Tikhomirov (4):
      ve: remove sysctl_fsync_enable and use ve_fsync_behavior instead
      ve: initialize fsync_enable also for non ve0 environment
      ve: iptables: fix mask initialization and changing
      ve: cgroup: initialize odirect_enable, features and _randomize_va_space
    
    =====================================================================
    Combined several vz7 patches into one:
     d35caf1 ("ve/fs/sync: per containter sync and syncfs")
     3016bac ("ve: remove sync_mutex")
     4cc281e ("ve: remove sysctl_fsync_enable and use ve_fsync_behavior instead")
     c3e4103 ("ve/fs: introduce "fs.fsync-enable" and "fs.odirect_enable" sysctls")
     fdbb570 ("fs: Restrict ve sync methods")
    
    VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127782
    Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
    
    khorenko@ changes:
     - "2" -> "FSYNC_FILTERED" in a couple of places
     - -               if (!sb_rdonly(sb) && sb->s_root && sb->s_bdi)
       +               if (!sb_rdonly(sb) && sb->s_root && (sb->s_flags & SB_BORN))
---
 fs/fcntl.c          |   2 +
 fs/mount.h          |   2 +
 fs/namespace.c      |   2 +-
 fs/open.c           |   3 +
 fs/super.c          |   2 +-
 fs/sync.c           | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h  |   8 +++
 include/linux/ve.h  |   2 +
 kernel/ve/ve.c      |   3 +
 kernel/ve/veowner.c |   8 +++
 mm/msync.c          |   2 +
 11 files changed, 216 insertions(+), 5 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index e00cfb700cd8..299670a9d89b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -67,6 +67,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	if (!may_use_odirect())
 		arg &= ~O_DIRECT;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		arg &= ~O_SYNC;
 	/*
 	 * O_APPEND cannot be cleared if the file is marked as append-only
 	 * and the file is open for write.
diff --git a/fs/mount.h b/fs/mount.h
index 6250de544760..b077957509ca 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -90,6 +90,8 @@ static inline int is_mounted(struct vfsmount *mnt)
 	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
 }
 
+extern struct rw_semaphore namespace_sem;
+
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namespace.c b/fs/namespace.c
index ecf2909a8cd0..07c63c485940 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,7 +68,7 @@ static DEFINE_IDA(mnt_group_ida);
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static DECLARE_RWSEM(namespace_sem);
+DECLARE_RWSEM(namespace_sem);
 
 /* /sys/fs */
 struct kobject *fs_kobj;
diff --git a/fs/open.c b/fs/open.c
index cc107929ea4c..dc9603f07dbc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -730,6 +730,9 @@ static int do_dentry_open(struct file *f,
 	if (!may_use_odirect())
 		f->f_flags &= ~O_DIRECT;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		f->f_flags &= ~O_SYNC;
+
 	if (unlikely(f->f_flags & O_PATH)) {
 		f->f_mode = FMODE_PATH | FMODE_OPENED;
 		f->f_op = &empty_fops;
diff --git a/fs/super.c b/fs/super.c
index 903d7a2c379e..651593446a58 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -327,7 +327,7 @@ static void __put_super(struct super_block *s)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 01e82170545a..ef4b1d17fe5a 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -16,7 +17,9 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
+#include <linux/ve.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -95,6 +98,133 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
+struct sync_sb {
+	struct list_head list;
+	struct super_block *sb;
+};
+
+static void sync_release_filesystems(struct list_head *sync_list)
+{
+	struct sync_sb *ss, *tmp;
+
+	list_for_each_entry_safe(ss, tmp, sync_list, list) {
+		list_del(&ss->list);
+		put_super(ss->sb);
+		kfree(ss);
+	}
+}
+
+static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
+{
+	struct sync_sb *ss;
+
+	list_for_each_entry(ss, sync_list, list)
+		if (ss->sb == sb)
+			return 1;
+	return 0;
+}
+
+static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
+{
+	struct mount *mnt;
+	struct mnt_namespace *mnt_ns = ve->ve_ns->mnt_ns;
+	struct sync_sb *ss;
+	int ret = 0;
+
+	BUG_ON(!list_empty(sync_list));
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
+			continue;
+
+		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
+		if (ss == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+		ss->sb = mnt->mnt.mnt_sb;
+		/*
+		 * We hold mount point and thus can be sure that the superblock
+		 * is alive. That means we can safely increase its usage
+		 * counter.
+		 */
+		spin_lock(&sb_lock);
+		ss->sb->s_count++;
+		spin_unlock(&sb_lock);
+		list_add_tail(&ss->list, sync_list);
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static void sync_filesystems_ve(struct ve_struct *ve, int wait)
+{
+	struct super_block *sb;
+	LIST_HEAD(sync_list);
+	struct sync_sb *ss;
+
+	/*
+	 * We don't need to care about allocation failure here. At least we
+	 * don't need to skip the sync on such an error.
+	 * Let's sync what we have already collected instead.
+	 */
+	sync_collect_filesystems(ve, &sync_list);
+
+	list_for_each_entry(ss, &sync_list, list) {
+		sb = ss->sb;
+		down_read(&sb->s_umount);
+		if (!sb_rdonly(sb) && sb->s_root && (sb->s_flags & SB_BORN))
+			__sync_filesystem(sb, wait);
+		up_read(&sb->s_umount);
+	}
+
+	sync_release_filesystems(&sync_list);
+}
+
+static int is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
+{
+	struct mount *mnt;
+	struct mnt_namespace *mnt_ns = ve->ve_ns->mnt_ns;
+	int ret = 0;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (mnt->mnt.mnt_sb == sb) {
+			ret = 1;
+			break;
+		}
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static int __ve_fsync_behavior(struct ve_struct *ve)
+{
+	/*
+	 * - __ve_fsync_behavior() is not called for ve0
+	 * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
+	 * - FSYNC_FILTERED for veX means "get value from ve0"
+	 */
+	if (ve->fsync_enable == FSYNC_FILTERED)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
+}
+
+int ve_fsync_behavior(void)
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (ve_is_super(ve))
+		return FSYNC_ALWAYS;
+	else
+		return __ve_fsync_behavior(ve);
+}
+
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -107,8 +237,32 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 void ksys_sync(void)
 {
+	struct ve_struct *ve = get_exec_env();
 	int nowait = 0, wait = 1;
 
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			return;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			return;
+
+		if (fsb == FSYNC_FILTERED) {
+			sync_filesystems_ve(ve, nowait);
+			sync_filesystems_ve(ve, wait);
+			return;
+		}
+	}
+
 	wakeup_flusher_threads(WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
 	iterate_supers(sync_fs_one_sb, &nowait);
@@ -161,16 +315,39 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct fd f = fdget(fd);
 	struct super_block *sb;
-	int ret;
+	struct ve_struct *ve;
+	int ret = 0;
 
 	if (!f.file)
 		return -EBADF;
 	sb = f.file->f_path.dentry->d_sb;
 
+	ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
+			goto fdput;
+	}
+
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
 	up_read(&sb->s_umount);
-
+fdput:
 	fdput(f);
 	return ret;
 }
@@ -214,9 +391,13 @@ EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
 {
-	struct fd f = fdget(fd);
+	struct fd f;
 	int ret = -EBADF;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		return 0;
+
+	f = fdget(fd);
 	if (f.file) {
 		ret = vfs_fsync(f.file, datasync);
 		fdput(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 48fb585a3c18..654b73c520a8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2283,6 +2283,7 @@ void kill_anon_super(struct super_block *sb);
 void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
+void put_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
@@ -3009,6 +3010,13 @@ extern bool path_is_under(const struct path *, const struct path *);
 
 extern char *file_path(struct file *, char *, int);
 
+int ve_fsync_behavior(void);
+
+#define FSYNC_NEVER	0	/* ve syncs are ignored    */
+#define FSYNC_ALWAYS	1	/* ve syncs work as usual  */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 9a8d02abd328..21edd432206f 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -62,6 +62,8 @@ struct ve_struct {
 	int			_randomize_va_space;
 	int			odirect_enable;
 
+	int			fsync_enable;
+
 	u64			_uevent_seqnum;
 
 	struct kstat_lat_pcpu_struct	sched_lat_ve;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 41e81aec3822..aa4b568a1391 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -61,6 +61,7 @@ struct ve_struct ve0 = {
 	.init_cred		= &init_cred,
 	.features		= -1,
 	.sched_lat_ve.cur	= &ve0_lat_stats,
+	.fsync_enable		= FSYNC_FILTERED,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
 					1,
@@ -836,6 +837,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->odirect_enable = 2;
+	/* for veX FSYNC_FILTERED means "get value from ve0" */
+	ve->fsync_enable = FSYNC_FILTERED;
 
 	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
 	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
index 4a90464ef97e..c13a90956fad 100644
--- a/kernel/ve/veowner.c
+++ b/kernel/ve/veowner.c
@@ -5,6 +5,7 @@
  *
  */
 
+#include <linux/ve.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
@@ -45,6 +46,13 @@ static void prepare_proc(void)
  */
 
 static struct ctl_table vz_fs_table[] = {
+	{
+		.procname	= "fsync-enable",
+		.data		= &ve0.fsync_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec_virtual,
+	},
 	{ }
 };
 
diff --git a/mm/msync.c b/mm/msync.c
index ef30a429623a..3a79d8f2be4d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -48,6 +48,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 	end = start + len;
 	if (end < start)
 		goto out;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto out;
 	error = 0;
 	if (end == start)
 		goto out;


More information about the Devel mailing list