[Devel] [PATCH vz9 5/5] fs: per-VE sync

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Mon Nov 22 09:20:32 MSK 2021


This contains part of vz7/vz8 per-VE sync code, updated to support
non-root mount namespaces within VE.

https://jira.sw.ru/browse/PSBM-44684
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/sync.c          | 98 ++++++++++++++++++++++++++++++++++++++++------
 include/linux/fs.h |  2 +
 kernel/ve/ve.c     |  5 ++-
 3 files changed, 90 insertions(+), 15 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index 31e6f0c6402d..9ec0a8073300 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -70,11 +70,17 @@ int sync_filesystem(struct super_block *sb)
 EXPORT_SYMBOL(sync_filesystem);
 
 struct sync_arg {
+	struct ve_struct *ve;
 	int wait;
 };
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
+	struct sync_arg *sarg = arg;
+
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
+
 	if (!sb_rdonly(sb))
 		sync_inodes_sb(sb);
 }
@@ -83,9 +89,19 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
 {
 	struct sync_arg *sarg = arg;
 
-	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
-	    sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, sarg->wait);
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
+
+	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC)) {
+		if (sb->s_op->sync_fs)
+			sb->s_op->sync_fs(sb, sarg->wait);
+
+		/* For ve-local sync, process bdev here, since there is no easy
+		 * equivalent of is_sb_ve_accessible() for bdevs
+		 */
+		if (sarg->ve)
+			__sync_blockdev(sb->s_bdev, sarg->wait);
+	}
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -103,6 +119,21 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
+static int __ve_fsync_behavior(struct ve_struct *ve)
+{
+	/*
+	 * - __ve_fsync_behavior() is not called for ve0
+	 * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
+	 * - FSYNC_FILTERED for veX means "get value from ve0"
+	 */
+	if (ve->fsync_enable == FSYNC_FILTERED)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
+}
+
 int ve_fsync_behavior(void)
 {
 	struct ve_struct *ve;
@@ -111,7 +142,7 @@ int ve_fsync_behavior(void)
 	if (ve_is_super(ve))
 		return FSYNC_ALWAYS;
 	else
-		return ve->fsync_enable;
+		return __ve_fsync_behavior(ve);
 }
 
 /*
@@ -126,21 +157,41 @@ int ve_fsync_behavior(void)
  */
 void ksys_sync(void)
 {
+	struct ve_struct *ve = get_exec_env();
 	struct sync_arg sarg;
 
-	if (ve_fsync_behavior() == FSYNC_NEVER)
-		return;
+	sarg.ve = NULL;
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			return;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			return;
+		if (fsb == FSYNC_FILTERED)
+			sarg.ve = ve;
+	}
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_inodes_one_sb, &sarg);
 	sarg.wait = 0;
 	iterate_supers(sync_fs_one_sb, &sarg);
 	sarg.wait = 1;
 	iterate_supers(sync_fs_one_sb, &sarg);
-	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_bdevs(fdatawait_one_bdev, NULL);
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
+	if (!sarg.ve) {
+		iterate_bdevs(fdatawrite_one_bdev, NULL);
+		iterate_bdevs(fdatawait_one_bdev, NULL);
+		if (unlikely(laptop_mode))
+			laptop_sync_completion();
+	}
 }
 
 SYSCALL_DEFINE0(sync)
@@ -153,6 +204,7 @@ static void do_sync_work(struct work_struct *work)
 {
 	struct sync_arg sarg;
 
+	sarg.ve = NULL;
 	sarg.wait = 0;
 
 	/*
@@ -188,13 +240,33 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 	struct fd f = fdget(fd);
 	struct super_block *sb;
 	int ret = 0, ret2 = 0;
+	struct ve_struct *ve;
 
 	if (!f.file)
 		return -EBADF;
 	sb = f.file->f_path.dentry->d_sb;
 
-	if (ve_fsync_behavior() == FSYNC_NEVER)
-		goto fdput;
+	ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
+			goto fdput;
+	}
 
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb21d1a32cdb..9f34e9384f88 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3148,6 +3148,8 @@ extern char *file_path(struct file *, char *, int);
 
 #define FSYNC_NEVER	0	/* ve syncs are ignored    */
 #define FSYNC_ALWAYS	1	/* ve syncs work as ususal */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
 
 #ifdef CONFIG_VE
 int ve_fsync_behavior(void);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e94aa90aff25..557a14f216c4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -70,7 +70,7 @@ struct ve_struct ve0 = {
 	.sched_lat_ve.cur	= &ve0_lat_stats,
 	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
 	.netns_max_nr		= INT_MAX,
-	.fsync_enable		= FSYNC_ALWAYS,
+	.fsync_enable		= FSYNC_FILTERED,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
 					1,
@@ -931,7 +931,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
 
 	ve->odirect_enable = 2;
-	ve->fsync_enable = FSYNC_ALWAYS;
+	/* for veX FSYNC_FILTERED means "get value from ve0" */
+	ve->fsync_enable = FSYNC_FILTERED;
 
 	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
 	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
-- 
2.30.2



More information about the Devel mailing list