[Devel] [PATCH 04/13] ve/fs/sync: per container sync and syncfs and fs.fsync-enable sysctl

Wed Apr 14 10:57:51 MSK 2021

From: Konstantin Khorenko <khorenko at virtuozzo.com>

ve/vfs: introduce "odirect_enable" sysctl and disable it by default

khorenko@: we want to disable direct access from inside a Container
	because there is a limited number of direct requests available
	on the system (128), and in case they are all busy the next request
	is served only after some request is completed.
	There is no scheduler at this level => DDoS is possible
	from inside a CT: just run _many_ processes writing with O_DIRECT.

diff-vfs-odirect-enable && diff-vfs-odirect-enable-location-fix

Signed-off-by: Kirill Tkhai <ktkhai at parallels.com>

+++
ve/fs: Port fs.fsync-enable and fs.odirect_enable sysctls

This is a part of 74-diff-ve-mix-combined.

https://jira.sw.ru/browse/PSBM-17903

Signed-off-by: Kirill Tkhai <ktkhai at parallels.com>

=====================================================

ve/fs: check container odirect and fsync settings in __dentry_open

sys_open for conventional filesystems doesn't call dentry_open,
it calls __dentry_open (in nameidata_to_filp), so we have to move
the checks for odirect and fsync behaviour to __dentry_open
to make them work on ploop containers.

https://jira.sw.ru/browse/PSBM-17157

Signed-off-by: Dmitry Guryanov <dguryanov at parallels.com>

Acked-by: Dmitry Monakhov <dmonakhov at openvz.org>
Signed-off-by: Dmitry Monakhov <dmonakhov at openvz.org>

================================================

ve: initialize fsync_enable also for non ve0 environment

Patchset description:

ve: fix initialization and remove sysctl_fsync_enable

v2:
- initialize only on ve cgroup creation, remove get_ve_features
- rename setup_iptables_mask into ve_setup_iptables_mask

https://jira.sw.ru/browse/PSBM-34286
https://jira.sw.ru/browse/PSBM-34285

Pavel Tikhomirov (4):
  ve: remove sysctl_fsync_enable and use ve_fsync_behavior instead
  ve: initialize fsync_enable also for non ve0 environment
  ve: iptables: fix mask initialization and changing
  ve: cgroup: initialize odirect_enable, features and _randomize_va_space

=====================================================================
This patch description:

v2: only on ve cgroup creation

https://jira.sw.ru/browse/PSBM-34286
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Acked-by: Dmitry Monakhov <dmonakhov at openvz.org>

=====================================================================

Combined several patches into one:
	d35caf1 ("ve/fs/sync: per containter sync and syncfs")
	3016bac ("ve: remove sync_mutex")
	4cc281e ("ve: remove sysctl_fsync_enable and use ve_fsync_behavior instead")
	c3e4103 ("ve/fs: introduce "fs.fsync-enable" and "fs.odirect_enable" sysctls")
	fdbb570 ("fs: Restrict ve sync methods")

VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127782

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
---
 fs/fcntl.c          |   2 +
 fs/mount.h          |   2 +
 fs/namespace.c      |   2 +-
 fs/open.c           |   3 +
 fs/super.c          |   2 +-
 fs/sync.c           | 182 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h  |   7 ++
 include/linux/ve.h  |   2 +
 kernel/ve/ve.c      |   2 +
 kernel/ve/veowner.c |   8 ++
 mm/msync.c          |   2 +
 11 files changed, 209 insertions(+), 5 deletions(-)

diff --git a/fs/fcntl.c b/fs/fcntl.c
index e00cfb700cd8..299670a9d89b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -67,6 +67,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 	if (!may_use_odirect())
 		arg &= ~O_DIRECT;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		arg &= ~O_SYNC;
 	/*
 	 * O_APPEND cannot be cleared if the file is marked as append-only
 	 * and the file is open for write.
diff --git a/fs/mount.h b/fs/mount.h
index 6250de544760..b077957509ca 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -90,6 +90,8 @@ static inline int is_mounted(struct vfsmount *mnt)
 	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
 }
 
+extern struct rw_semaphore namespace_sem;
+
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namespace.c b/fs/namespace.c
index ecf2909a8cd0..07c63c485940 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -68,7 +68,7 @@ static DEFINE_IDA(mnt_group_ida);
 static struct hlist_head *mount_hashtable __read_mostly;
 static struct hlist_head *mountpoint_hashtable __read_mostly;
 static struct kmem_cache *mnt_cache __read_mostly;
-static DECLARE_RWSEM(namespace_sem);
+DECLARE_RWSEM(namespace_sem);
 
 /* /sys/fs */
 struct kobject *fs_kobj;
diff --git a/fs/open.c b/fs/open.c
index cc107929ea4c..dc9603f07dbc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -730,6 +730,9 @@ static int do_dentry_open(struct file *f,
 	if (!may_use_odirect())
 		f->f_flags &= ~O_DIRECT;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		f->f_flags &= ~O_SYNC;
+
 	if (unlikely(f->f_flags & O_PATH)) {
 		f->f_mode = FMODE_PATH | FMODE_OPENED;
 		f->f_op = &empty_fops;
diff --git a/fs/super.c b/fs/super.c
index 903d7a2c379e..651593446a58 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -327,7 +327,7 @@ static void __put_super(struct super_block *s)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 01e82170545a..ff22a9298281 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -8,6 +8,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
 #include <linux/writeback.h>
@@ -16,7 +17,9 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
+#include <linux/ve.h>
 #include "internal.h"
+#include "mount.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -95,6 +98,128 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
+struct sync_sb {
+	struct list_head list;
+	struct super_block *sb;
+};
+
+static void sync_release_filesystems(struct list_head *sync_list)
+{
+	struct sync_sb *ss, *tmp;
+
+	list_for_each_entry_safe(ss, tmp, sync_list, list) {
+		list_del(&ss->list);
+		put_super(ss->sb);
+		kfree(ss);
+	}
+}
+
+static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
+{
+	struct sync_sb *ss;
+
+	list_for_each_entry(ss, sync_list, list)
+		if (ss->sb == sb)
+			return 1;
+	return 0;
+}
+
+static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
+{
+	struct mount *mnt;
+	struct mnt_namespace *mnt_ns = ve->ve_ns->mnt_ns;
+	struct sync_sb *ss;
+	int ret = 0;
+
+	BUG_ON(!list_empty(sync_list));
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
+			continue;
+
+		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
+		if (ss == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+		ss->sb = mnt->mnt.mnt_sb;
+		/*
+		 * We hold the mount point and thus can be sure that the
+		 * superblock is alive, so we can safely increase its usage
+		 * counter.
+		 */
+		spin_lock(&sb_lock);
+		ss->sb->s_count++;
+		spin_unlock(&sb_lock);
+		list_add_tail(&ss->list, sync_list);
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static void sync_filesystems_ve(struct ve_struct *ve, int wait)
+{
+	struct super_block *sb;
+	LIST_HEAD(sync_list);
+	struct sync_sb *ss;
+
+	/*
+	 * We don't need to care about allocation failure here. At least we
+	 * don't need to skip sync on such an error.
+	 * Let's sync what we collected already instead.
+	 */
+	sync_collect_filesystems(ve, &sync_list);
+
+	list_for_each_entry(ss, &sync_list, list) {
+		sb = ss->sb;
+		down_read(&sb->s_umount);
+		if (!sb_rdonly(sb) && sb->s_root && sb->s_bdi)
+			__sync_filesystem(sb, wait);
+		up_read(&sb->s_umount);
+	}
+
+	sync_release_filesystems(&sync_list);
+}
+
+static int is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
+{
+	struct mount *mnt;
+	struct mnt_namespace *mnt_ns = ve->ve_ns->mnt_ns;
+	int ret = 0;
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (mnt->mnt.mnt_sb == sb) {
+			ret = 1;
+			break;
+		}
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static int __ve_fsync_behavior(struct ve_struct *ve)
+{
+	if (ve->fsync_enable == 2)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
+}
+
+int ve_fsync_behavior(void)
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (ve_is_super(ve))
+		return FSYNC_ALWAYS;
+	else
+		return __ve_fsync_behavior(ve);
+}
+
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -107,8 +232,32 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 void ksys_sync(void)
 {
+	struct ve_struct *ve = get_exec_env();
 	int nowait = 0, wait = 1;
 
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			return;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			return;
+
+		if (fsb == FSYNC_FILTERED) {
+			sync_filesystems_ve(ve, nowait);
+			sync_filesystems_ve(ve, wait);
+			return;
+		}
+	}
+
 	wakeup_flusher_threads(WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
 	iterate_supers(sync_fs_one_sb, &nowait);
@@ -161,16 +310,39 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct fd f = fdget(fd);
 	struct super_block *sb;
-	int ret;
+	struct ve_struct *ve;
+	int ret = 0;
 
 	if (!f.file)
 		return -EBADF;
 	sb = f.file->f_path.dentry->d_sb;
 
+	ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
+			goto fdput;
+	}
+
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
 	up_read(&sb->s_umount);
-
+fdput:
 	fdput(f);
 	return ret;
 }
@@ -214,9 +386,13 @@ EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
 {
-	struct fd f = fdget(fd);
+	struct fd f;
 	int ret = -EBADF;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		return 0;
+
+	f = fdget(fd);
 	if (f.file) {
 		ret = vfs_fsync(f.file, datasync);
 		fdput(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 48fb585a3c18..196900daa889 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2283,6 +2283,7 @@ void kill_anon_super(struct super_block *sb);
 void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
+void put_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
@@ -3009,6 +3010,12 @@ extern bool path_is_under(const struct path *, const struct path *);
 
 extern char *file_path(struct file *, char *, int);
 
+int ve_fsync_behavior(void);
+
+#define FSYNC_NEVER	0	/* ve syncs are ignored    */
+#define FSYNC_ALWAYS	1	/* ve syncs work as usual  */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 3b487f8a4a50..0f769a96e805 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -66,6 +66,8 @@ struct ve_struct {
 #endif
 	int			odirect_enable;
 
+	int			fsync_enable;
+
 	u64			_uevent_seqnum;
 
 	struct kstat_lat_pcpu_struct	sched_lat_ve;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 031b104075c8..ecc02cd3c8f3 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -62,6 +62,7 @@ struct ve_struct ve0 = {
 	.init_cred		= &init_cred,
 	.features		= -1,
 	.sched_lat_ve.cur	= &ve0_lat_stats,
+	.fsync_enable		= FSYNC_FILTERED,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
 					1,
@@ -866,6 +867,7 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	ve->_randomize_va_space = ve0._randomize_va_space;
 
 	ve->odirect_enable = 2;
+	ve->fsync_enable = 2;
 
 #ifdef CONFIG_VE_IPTABLES
 	ve->ipt_mask = ve_setup_iptables_mask(VE_IP_DEFAULT);
diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
index da9fd60e5282..2dfa9920da40 100644
--- a/kernel/ve/veowner.c
+++ b/kernel/ve/veowner.c
@@ -5,6 +5,7 @@
  *
  */
 
+#include <linux/ve.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/proc_fs.h>
@@ -65,6 +66,13 @@ static void prepare_proc(void)
  */
 
 static struct ctl_table vz_fs_table[] = {
+	{
+		.procname	= "fsync-enable",
+		.data		= &ve0.fsync_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec_virtual,
+	},
 	{ }
 };
 
diff --git a/mm/msync.c b/mm/msync.c
index ef30a429623a..3a79d8f2be4d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -48,6 +48,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 	end = start + len;
 	if (end < start)
 		goto out;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto out;
 	error = 0;
 	if (end == start)
 		goto out;
-- 
2.28.0