[Devel] [PATCH RHEL9 COMMIT] ve/fs: per-CT sync behavior management

Konstantin Khorenko khorenko at virtuozzo.com
Thu Dec 9 11:23:23 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh9-5.14.0-4.vz9.10.34
------>
commit 60d27e3285bf79b04704d3d0199c5de1966018d5
Author: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
Date:   Thu Dec 9 11:23:22 2021 +0300

    ve/fs: per-CT sync behavior management
    
    This commit brings support for per-container sync. It makes it possible
    to limit effect of sync operations executed by a container to filesystems
    mounted inside that container, either in root or non-root mount namespace.
    
    The control is done over fs.fsync-enable sysctl.
    
    The setting of this sysctl on the host controls all containers (but not
    the host itself). Possible values are:
    
    0 (FSYNC_DISABLED)
        - sync operations inside containers are no-ops
    1 (FSYNC_ALWAYS)
        - sync operations inside containers are fully executed, as if given
          on the host itself
    2 (FSYNC_FILTERED), the default
        - sync operations inside containers cause only writing dirty data
          to filesystems mounted anywhere inside container, and don't
          affect any other dirty data existing on the host.
    
    The setting inside the container itself can further limit the operation:
    
    0 (FSYNC_DISABLED)
        - sync operations inside this container are no-ops, regardless of
          the setting on the host
    2 (FSYNC_FILTERED), the default
        - sync settings are taken from host's sysctl value
    1 (FSYNC_ALWAYS) and other values
        - sync operations cause only writing dirty data
          to filesystems mounted anywhere inside container, and don't
          affect any other dirty data existing on the host.
    
    |======================================================|
    |HN sysctl|CT sysctl|sync behavior inside CT           |
    |======================================================|
    |    0    |    0    | noop                             |
    |    1    |    0    | noop                             |
    |    2    |    0    | noop                             |
    |    0    |    1    | "filtered"                       |
    |    1    |    1    | "filtered"                       |
    |    2    |    1    | "filtered"                       |
    |    0    |    2    | noop                             |
    |    1    |    2    | syncs behave like executed on HN |
    |    2    |    2    | "filtered" (default)             |
    |======================================================|
    
    Sync operations executed on the host are always fully executed, regardless
    of the fs.fsync-enable value.
    
    https://jira.sw.ru/browse/PSBM-44684
    Feature: fs: per-CT sync behavior management
    
    Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/sync.c          | 130 ++++++++++++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |   2 +
 kernel/ve/ve.c     |   5 ++-
 3 files changed, 113 insertions(+), 24 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index 91a764988f41..ffe244370948 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -69,12 +69,33 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
+static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
+{
+	filemap_fdatawrite(bdev->bd_inode->i_mapping);
+}
+
+static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+{
+	/*
+	 * We keep the error status of individual mapping so that
+	 * applications can catch the writeback error using fsync(2).
+	 * See filemap_fdatawait_keep_errors() for details.
+	 */
+	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
+}
+
 struct sync_arg {
+	struct ve_struct *ve;
 	int wait;
 };
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
+	struct sync_arg *sarg = arg;
+
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
+
 	if (!sb_rdonly(sb))
 		sync_inodes_sb(sb);
 }
@@ -83,24 +104,34 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
 {
 	struct sync_arg *sarg = arg;
 
-	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
-	    sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, sarg->wait);
-}
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
 
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
-	filemap_fdatawrite(bdev->bd_inode->i_mapping);
+	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC)) {
+		if (sb->s_op->sync_fs)
+			sb->s_op->sync_fs(sb, sarg->wait);
+
+		/* See comment in ksys_sync() bellow */
+		if (sarg->ve) {
+			fdatawrite_one_bdev(sb->s_bdev, NULL);
+			fdatawait_one_bdev(sb->s_bdev, NULL);
+		}
+	}
 }
 
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+static int __ve_fsync_behavior(struct ve_struct *ve)
 {
 	/*
-	 * We keep the error status of individual mapping so that
-	 * applications can catch the writeback error using fsync(2).
-	 * See filemap_fdatawait_keep_errors() for details.
+	 * - __ve_fsync_behavior() is not called for ve0
+	 * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
+	 * - FSYNC_FILTERED for veX means "get value from ve0"
 	 */
-	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
+	if (ve->fsync_enable == FSYNC_FILTERED)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
 }
 
 int ve_fsync_behavior(void)
@@ -111,7 +142,7 @@ int ve_fsync_behavior(void)
 	if (ve_is_super(ve))
 		return FSYNC_ALWAYS;
 	else
-		return ve->fsync_enable;
+		return __ve_fsync_behavior(ve);
 }
 
 /*
@@ -126,21 +157,55 @@ int ve_fsync_behavior(void)
  */
 void ksys_sync(void)
 {
+	struct ve_struct *ve = get_exec_env();
 	struct sync_arg sarg;
 
-	if (ve_fsync_behavior() == FSYNC_NEVER)
-		return;
+	sarg.ve = NULL;
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    by his own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			return;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			return;
+		if (fsb == FSYNC_FILTERED)
+			sarg.ve = ve;
+	}
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
+	iterate_supers(sync_inodes_one_sb, &sarg);
 	sarg.wait = 0;
 	iterate_supers(sync_fs_one_sb, &sarg);
 	sarg.wait = 1;
 	iterate_supers(sync_fs_one_sb, &sarg);
-	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_bdevs(fdatawait_one_bdev, NULL);
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
+
+	/*
+	 * Currently there is no access to raw bdevs from VE, which implies
+	 * that in VE-local sync, need to flush bdev only if it contains
+	 * VE-visible mount. Searching for such mount is linear against number
+	 * of superblocks on the host, and doing that for each bdev turns into
+	 * square complexity on number of mounted bdevs on the host.
+	 *
+	 * Avoid that square complexity by moving bdev sync into the second
+	 * second call to sync_fs_one_sb() above.
+	 *
+	 * If ever implementing access to raw bdevs from VE, this approach will
+	 * be no longer valid.
+	 */
+	if (!sarg.ve) {
+		iterate_bdevs(fdatawrite_one_bdev, NULL);
+		iterate_bdevs(fdatawait_one_bdev, NULL);
+		if (unlikely(laptop_mode))
+			laptop_sync_completion();
+	}
 }
 
 SYSCALL_DEFINE0(sync)
@@ -153,6 +218,7 @@ static void do_sync_work(struct work_struct *work)
 {
 	struct sync_arg sarg;
 
+	sarg.ve = NULL;
 	sarg.wait = 0;
 
 	/*
@@ -188,13 +254,33 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 	struct fd f = fdget(fd);
 	struct super_block *sb;
 	int ret = 0, ret2 = 0;
+	struct ve_struct *ve;
 
 	if (!f.file)
 		return -EBADF;
 	sb = f.file->f_path.dentry->d_sb;
 
-	if (ve_fsync_behavior() == FSYNC_NEVER)
-		goto fdput;
+	ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    by his own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
+			goto fdput;
+	}
 
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb21d1a32cdb..9f34e9384f88 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3148,6 +3148,8 @@ extern char *file_path(struct file *, char *, int);
 
 #define FSYNC_NEVER	0	/* ve syncs are ignored    */
 #define FSYNC_ALWAYS	1	/* ve syncs work as ususal */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
 
 #ifdef CONFIG_VE
 int ve_fsync_behavior(void);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e94aa90aff25..557a14f216c4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -70,7 +70,7 @@ struct ve_struct ve0 = {
 	.sched_lat_ve.cur	= &ve0_lat_stats,
 	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
 	.netns_max_nr		= INT_MAX,
-	.fsync_enable		= FSYNC_ALWAYS,
+	.fsync_enable		= FSYNC_FILTERED,
 	._randomize_va_space	=
 #ifdef CONFIG_COMPAT_BRK
 					1,
@@ -931,7 +931,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	ve->meminfo_val = VE_MEMINFO_DEFAULT;
 
 	ve->odirect_enable = 2;
-	ve->fsync_enable = FSYNC_ALWAYS;
+	/* for veX FSYNC_FILTERED means "get value from ve0 */
+	ve->fsync_enable = FSYNC_FILTERED;
 
 	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
 	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;


More information about the Devel mailing list