[Devel] [PATCH vz9 v2 5/5] fs: per-VE sync
Nikita Yushchenko
nikita.yushchenko at virtuozzo.com
Mon Nov 22 11:58:13 MSK 2021
This contains part of vz7/vz8 per-VE sync code, updated to support
non-root mount namespaces within VE.
https://jira.sw.ru/browse/PSBM-44684
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
v2: add comments on syncing bdevs in VE-local operation, make the
action exactly the same as with global sync, just moved.
fs/sync.c | 130 +++++++++++++++++++++++++++++++++++++--------
include/linux/fs.h | 2 +
kernel/ve/ve.c | 5 +-
3 files changed, 113 insertions(+), 24 deletions(-)
diff --git a/fs/sync.c b/fs/sync.c
index 31e6f0c6402d..6ec1b66d004d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -69,12 +69,33 @@ int sync_filesystem(struct super_block *sb)
}
EXPORT_SYMBOL(sync_filesystem);
+static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
+{
+ filemap_fdatawrite(bdev->bd_inode->i_mapping);
+}
+
+static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+{
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
+}
+
struct sync_arg {
+ struct ve_struct *ve;
int wait;
};
static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
+ struct sync_arg *sarg = arg;
+
+ if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+ return;
+
if (!sb_rdonly(sb))
sync_inodes_sb(sb);
}
@@ -83,24 +104,34 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
struct sync_arg *sarg = arg;
- if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
- sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, sarg->wait);
-}
+ if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+ return;
-static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
-{
- filemap_fdatawrite(bdev->bd_inode->i_mapping);
+ if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC)) {
+ if (sb->s_op->sync_fs)
+ sb->s_op->sync_fs(sb, sarg->wait);
+
+ /* See comment in ksys_sync() below */
+ if (sarg->ve) {
+ fdatawrite_one_bdev(sb->s_bdev, NULL);
+ fdatawait_one_bdev(sb->s_bdev, NULL);
+ }
+ }
}
-static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
+static int __ve_fsync_behavior(struct ve_struct *ve)
{
/*
- * We keep the error status of individual mapping so that
- * applications can catch the writeback error using fsync(2).
- * See filemap_fdatawait_keep_errors() for details.
+ * - __ve_fsync_behavior() is not called for ve0
+ * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
+ * - FSYNC_FILTERED for veX means "get value from ve0"
*/
- filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
+ if (ve->fsync_enable == FSYNC_FILTERED)
+ return get_ve0()->fsync_enable;
+ else if (ve->fsync_enable)
+ return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+ else
+ return 0;
}
int ve_fsync_behavior(void)
@@ -111,7 +142,7 @@ int ve_fsync_behavior(void)
if (ve_is_super(ve))
return FSYNC_ALWAYS;
else
- return ve->fsync_enable;
+ return __ve_fsync_behavior(ve);
}
/*
@@ -126,21 +157,55 @@ int ve_fsync_behavior(void)
*/
void ksys_sync(void)
{
+ struct ve_struct *ve = get_exec_env();
struct sync_arg sarg;
- if (ve_fsync_behavior() == FSYNC_NEVER)
- return;
+ sarg.ve = NULL;
+ if (!ve_is_super(ve)) {
+ int fsb;
+ /*
+ * init can't sync during VE stop. Rationale:
+ * - NFS with -o hard will block forever as network is down
+ * - no useful work is done, as VE0 will call umount/sync
+ * on its own later
+ * Den
+ */
+ if (is_child_reaper(task_pid(current)))
+ return;
+
+ fsb = __ve_fsync_behavior(ve);
+ if (fsb == FSYNC_NEVER)
+ return;
+ if (fsb == FSYNC_FILTERED)
+ sarg.ve = ve;
+ }
wakeup_flusher_threads(WB_REASON_SYNC);
- iterate_supers(sync_inodes_one_sb, NULL);
+ iterate_supers(sync_inodes_one_sb, &sarg);
sarg.wait = 0;
iterate_supers(sync_fs_one_sb, &sarg);
sarg.wait = 1;
iterate_supers(sync_fs_one_sb, &sarg);
- iterate_bdevs(fdatawrite_one_bdev, NULL);
- iterate_bdevs(fdatawait_one_bdev, NULL);
- if (unlikely(laptop_mode))
- laptop_sync_completion();
+
+ /*
+ * Currently there is no access to raw bdevs from a VE, which implies
+ * that a VE-local sync needs to flush a bdev only if it contains a
+ * VE-visible mount. Searching for such a mount is linear in the number
+ * of superblocks on the host, and doing that for each bdev results in
+ * quadratic complexity in the number of mounted bdevs on the host.
+ *
+ * Avoid that quadratic complexity by moving the bdev sync into the
+ * second call to sync_fs_one_sb() above.
+ *
+ * If ever implementing access to raw bdevs from VE, this approach will
+ * be no longer valid.
+ */
+ if (!sarg.ve) {
+ iterate_bdevs(fdatawrite_one_bdev, NULL);
+ iterate_bdevs(fdatawait_one_bdev, NULL);
+ if (unlikely(laptop_mode))
+ laptop_sync_completion();
+ }
}
SYSCALL_DEFINE0(sync)
@@ -153,6 +218,7 @@ static void do_sync_work(struct work_struct *work)
{
struct sync_arg sarg;
+ sarg.ve = NULL;
sarg.wait = 0;
/*
@@ -188,13 +254,33 @@ SYSCALL_DEFINE1(syncfs, int, fd)
struct fd f = fdget(fd);
struct super_block *sb;
int ret = 0, ret2 = 0;
+ struct ve_struct *ve;
if (!f.file)
return -EBADF;
sb = f.file->f_path.dentry->d_sb;
- if (ve_fsync_behavior() == FSYNC_NEVER)
- goto fdput;
+ ve = get_exec_env();
+
+ if (!ve_is_super(ve)) {
+ int fsb;
+ /*
+ * init can't sync during VE stop. Rationale:
+ * - NFS with -o hard will block forever as network is down
+ * - no useful work is done, as VE0 will call umount/sync
+ * on its own later
+ * Den
+ */
+ if (is_child_reaper(task_pid(current)))
+ goto fdput;
+
+ fsb = __ve_fsync_behavior(ve);
+ if (fsb == FSYNC_NEVER)
+ goto fdput;
+
+ if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
+ goto fdput;
+ }
down_read(&sb->s_umount);
ret = sync_filesystem(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb21d1a32cdb..9f34e9384f88 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3148,6 +3148,8 @@ extern char *file_path(struct file *, char *, int);
#define FSYNC_NEVER 0 /* ve syncs are ignored */
#define FSYNC_ALWAYS 1 /* ve syncs work as ususal */
+#define FSYNC_FILTERED 2 /* ve syncs only its files */
+/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
#ifdef CONFIG_VE
int ve_fsync_behavior(void);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e94aa90aff25..557a14f216c4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -70,7 +70,7 @@ struct ve_struct ve0 = {
.sched_lat_ve.cur = &ve0_lat_stats,
.netns_avail_nr = ATOMIC_INIT(INT_MAX),
.netns_max_nr = INT_MAX,
- .fsync_enable = FSYNC_ALWAYS,
+ .fsync_enable = FSYNC_FILTERED,
._randomize_va_space =
#ifdef CONFIG_COMPAT_BRK
1,
@@ -931,7 +931,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
ve->meminfo_val = VE_MEMINFO_DEFAULT;
ve->odirect_enable = 2;
- ve->fsync_enable = FSYNC_ALWAYS;
+ /* for veX FSYNC_FILTERED means "get value from ve0" */
+ ve->fsync_enable = FSYNC_FILTERED;
atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
--
2.30.2
More information about the Devel
mailing list