[Devel] [PATCH vz9 v2 5/5] fs: per-VE sync
    Nikita Yushchenko 
    nikita.yushchenko at virtuozzo.com
       
    Mon Nov 22 11:58:13 MSK 2021
    
    
  
This contains part of vz7/vz8 per-VE sync code, updated to support
non-root mount namespaces within VE.
https://jira.sw.ru/browse/PSBM-44684
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
v2: add comments on syncing bdevs in VE-local operation, make the
    action exactly the same as with global sync, just moved.
 fs/sync.c          | 130 +++++++++++++++++++++++++++++++++++++--------
 include/linux/fs.h |   2 +
 kernel/ve/ve.c     |   5 +-
 3 files changed, 113 insertions(+), 24 deletions(-)
diff --git a/fs/sync.c b/fs/sync.c
index 31e6f0c6402d..6ec1b66d004d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -69,12 +69,33 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
+static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
+{
+	filemap_fdatawrite(bdev->bd_inode->i_mapping);
+}
+
+static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+{
+	/*
+	 * We keep the error status of individual mapping so that
+	 * applications can catch the writeback error using fsync(2).
+	 * See filemap_fdatawait_keep_errors() for details.
+	 */
+	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
+}
+
 struct sync_arg {
+	struct ve_struct *ve;
 	int wait;
 };
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
+	struct sync_arg *sarg = arg;
+
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
+
 	if (!sb_rdonly(sb))
 		sync_inodes_sb(sb);
 }
@@ -83,24 +104,34 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
 {
 	struct sync_arg *sarg = arg;
 
-	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
-	    sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, sarg->wait);
-}
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
 
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
-	filemap_fdatawrite(bdev->bd_inode->i_mapping);
+	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC)) {
+		if (sb->s_op->sync_fs)
+			sb->s_op->sync_fs(sb, sarg->wait);
+
+		/* VE-local bdev flush, wait pass only; see ksys_sync() */
+		if (sarg->ve && sarg->wait && sb->s_bdev) {
+			fdatawrite_one_bdev(sb->s_bdev, NULL);
+			fdatawait_one_bdev(sb->s_bdev, NULL);
+		}
+	}
 }
 
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+static int __ve_fsync_behavior(struct ve_struct *ve)
 {
 	/*
-	 * We keep the error status of individual mapping so that
-	 * applications can catch the writeback error using fsync(2).
-	 * See filemap_fdatawait_keep_errors() for details.
+	 * - __ve_fsync_behavior() is not called for ve0
+	 * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
+	 * - FSYNC_FILTERED for veX means "get value from ve0"
 	 */
-	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
+	if (ve->fsync_enable == FSYNC_FILTERED)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
 }
 
 int ve_fsync_behavior(void)
@@ -111,7 +142,7 @@ int ve_fsync_behavior(void)
 	if (ve_is_super(ve))
 		return FSYNC_ALWAYS;
 	else
-		return ve->fsync_enable;
+		return __ve_fsync_behavior(ve);
 }
 
 /*
@@ -126,21 +157,55 @@ int ve_fsync_behavior(void)
  */
 void ksys_sync(void)
 {
+	struct ve_struct *ve = get_exec_env();
 	struct sync_arg sarg;
 
-	if (ve_fsync_behavior() == FSYNC_NEVER)
-		return;
+	sarg.ve = NULL;
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			return;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			return;
+		if (fsb == FSYNC_FILTERED)
+			sarg.ve = ve;
+	}
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_inodes_one_sb, &sarg);
 	sarg.wait = 0;
 	iterate_supers(sync_fs_one_sb, &sarg);
 	sarg.wait = 1;
 	iterate_supers(sync_fs_one_sb, &sarg);
-	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_bdevs(fdatawait_one_bdev, NULL);
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
+
+	/*
+	 * Currently there is no access to raw bdevs from VE, which implies
+	 * that a VE-local sync needs to flush a bdev only if it backs a
+	 * VE-visible mount. Searching for such a mount is linear in the number
+	 * of superblocks on the host, and doing that for each bdev results in
+	 * quadratic complexity in the number of mounted bdevs on the host.
+	 *
+	 * Avoid that quadratic complexity by moving the bdev sync into the
+	 * second call to sync_fs_one_sb() above.
+	 *
+	 * If ever implementing access to raw bdevs from VE, this approach will
+	 * be no longer valid.
+	 */
+	if (!sarg.ve) {
+		iterate_bdevs(fdatawrite_one_bdev, NULL);
+		iterate_bdevs(fdatawait_one_bdev, NULL);
+		if (unlikely(laptop_mode))
+			laptop_sync_completion();
+	}
 }
 
 SYSCALL_DEFINE0(sync)
@@ -153,6 +218,7 @@ static void do_sync_work(struct work_struct *work)
 {
 	struct sync_arg sarg;
 
+	sarg.ve = NULL;
 	sarg.wait = 0;
 
 	/*
@@ -188,13 +254,33 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 	struct fd f = fdget(fd);
 	struct super_block *sb;
 	int ret = 0, ret2 = 0;
+	struct ve_struct *ve;
 
 	if (!f.file)
 		return -EBADF;
 	sb = f.file->f_path.dentry->d_sb;
 
-	if (ve_fsync_behavior() == FSYNC_NEVER)
-		goto fdput;
+	ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
+			goto fdput;
+	}
 
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb21d1a32cdb..9f34e9384f88 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3148,6 +3148,8 @@ extern char *file_path(struct file *, char *, int);
 
 #define FSYNC_NEVER	0	/* ve syncs are ignored    */
 #define FSYNC_ALWAYS	1	/* ve syncs work as ususal */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
 
 #ifdef CONFIG_VE
 int ve_fsync_behavior(void);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e94aa90aff25..557a14f216c4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -70,7 +70,7 @@ struct ve_struct ve0 = {
 	.sched_lat_ve.cur	= &ve0_lat_stats,
 	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
 	.netns_max_nr		= INT_MAX,
-	.fsync_enable		= FSYNC_ALWAYS,
+	.fsync_enable		= FSYNC_FILTERED,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
 					1,
@@ -931,7 +931,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
 
 	ve->odirect_enable = 2;
-	ve->fsync_enable = FSYNC_ALWAYS;
+	/* for veX FSYNC_FILTERED means "get value from ve0" */
+	ve->fsync_enable = FSYNC_FILTERED;
 
 	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
 	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
-- 
2.30.2
    
    
More information about the Devel
mailing list