[Devel] [PATCH vz9 v2 5/5] fs: per-VE sync
Kirill Tkhai
ktkhai at virtuozzo.com
Mon Nov 22 12:18:04 MSK 2021
I'd moved sync_inodes_one_sb() and sync_fs_one_sb() move to separate patch.
I'd suggested to add the following strings in your ~/.gitconfig:
[diff "default"]
xfuncname = "^[[:alpha:]$_].*[^:]$"
algorithm = patience
[diff]
algorithm = patience
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
On 22.11.2021 11:58, Nikita Yushchenko wrote:
> This contains part of vz7/vz8 per-VE sync code, updated to support
> non-root mount namespaces within VE.
>
> https://jira.sw.ru/browse/PSBM-44684
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
> ---
> v2: add comments on syncing bdevs in VE-local operation, make the
> action exactly the same as with global sync, just moved.
>
> fs/sync.c | 130 +++++++++++++++++++++++++++++++++++++--------
> include/linux/fs.h | 2 +
> kernel/ve/ve.c | 5 +-
> 3 files changed, 113 insertions(+), 24 deletions(-)
>
> diff --git a/fs/sync.c b/fs/sync.c
> index 31e6f0c6402d..6ec1b66d004d 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -69,12 +69,33 @@ int sync_filesystem(struct super_block *sb)
> }
> EXPORT_SYMBOL(sync_filesystem);
>
> +static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> +{
> + filemap_fdatawrite(bdev->bd_inode->i_mapping);
> +}
> +
> +static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> +{
> + /*
> + * We keep the error status of individual mapping so that
> + * applications can catch the writeback error using fsync(2).
> + * See filemap_fdatawait_keep_errors() for details.
> + */
> + filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
> +}
> +
> struct sync_arg {
> + struct ve_struct *ve;
> int wait;
> };
>
> static void sync_inodes_one_sb(struct super_block *sb, void *arg)
> {
> + struct sync_arg *sarg = arg;
> +
> + if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
> + return;
> +
> if (!sb_rdonly(sb))
> sync_inodes_sb(sb);
> }
> @@ -83,24 +104,34 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
> {
> struct sync_arg *sarg = arg;
>
> - if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
> - sb->s_op->sync_fs)
> - sb->s_op->sync_fs(sb, sarg->wait);
> -}
> + if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
> + return;
>
> -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> -{
> - filemap_fdatawrite(bdev->bd_inode->i_mapping);
> + if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC)) {
> + if (sb->s_op->sync_fs)
> + sb->s_op->sync_fs(sb, sarg->wait);
> +
> + /* See comment in ksys_sync() bellow */
> + if (sarg->ve) {
> + fdatawrite_one_bdev(sb->s_bdev, NULL);
> + fdatawait_one_bdev(sb->s_bdev, NULL);
> + }
> + }
> }
>
> -static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> +static int __ve_fsync_behavior(struct ve_struct *ve)
> {
> /*
> - * We keep the error status of individual mapping so that
> - * applications can catch the writeback error using fsync(2).
> - * See filemap_fdatawait_keep_errors() for details.
> + * - __ve_fsync_behavior() is not called for ve0
> + * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
> + * - FSYNC_FILTERED for veX means "get value from ve0"
> */
> - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
> + if (ve->fsync_enable == FSYNC_FILTERED)
> + return get_ve0()->fsync_enable;
> + else if (ve->fsync_enable)
> + return FSYNC_FILTERED; /* sync forced by ve is always filtered */
> + else
> + return 0;
> }
>
> int ve_fsync_behavior(void)
> @@ -111,7 +142,7 @@ int ve_fsync_behavior(void)
> if (ve_is_super(ve))
> return FSYNC_ALWAYS;
> else
> - return ve->fsync_enable;
> + return __ve_fsync_behavior(ve);
> }
>
> /*
> @@ -126,21 +157,55 @@ int ve_fsync_behavior(void)
> */
> void ksys_sync(void)
> {
> + struct ve_struct *ve = get_exec_env();
> struct sync_arg sarg;
>
> - if (ve_fsync_behavior() == FSYNC_NEVER)
> - return;
> + sarg.ve = NULL;
> + if (!ve_is_super(ve)) {
> + int fsb;
> + /*
> + * init can't sync during VE stop. Rationale:
> + * - NFS with -o hard will block forever as network is down
> + * - no useful job is performed as VE0 will call umount/sync
> + * by his own later
> + * Den
> + */
> + if (is_child_reaper(task_pid(current)))
> + return;
> +
> + fsb = __ve_fsync_behavior(ve);
> + if (fsb == FSYNC_NEVER)
> + return;
> + if (fsb == FSYNC_FILTERED)
> + sarg.ve = ve;
> + }
>
> wakeup_flusher_threads(WB_REASON_SYNC);
> - iterate_supers(sync_inodes_one_sb, NULL);
> + iterate_supers(sync_inodes_one_sb, &sarg);
> sarg.wait = 0;
> iterate_supers(sync_fs_one_sb, &sarg);
> sarg.wait = 1;
> iterate_supers(sync_fs_one_sb, &sarg);
> - iterate_bdevs(fdatawrite_one_bdev, NULL);
> - iterate_bdevs(fdatawait_one_bdev, NULL);
> - if (unlikely(laptop_mode))
> - laptop_sync_completion();
> +
> + /*
> + * Currently there is no access to raw bdevs from VE, which implies
> + * that in VE-local sync, need to flush bdev only if it contains
> + * VE-visible mount. Searching for such mount is linear against number
> + * of superblocks on the host, and doing that for each bdev turns into
> + * square complexity on number of mounted bdevs on the host.
> + *
> + * Avoid that square complexity by moving bdev sync into the second
> + * second call to sync_fs_one_sb() above.
> + *
> + * If ever implementing access to raw bdevs from VE, this approach will
> + * be no longer valid.
> + */
> + if (!sarg.ve) {
> + iterate_bdevs(fdatawrite_one_bdev, NULL);
> + iterate_bdevs(fdatawait_one_bdev, NULL);
> + if (unlikely(laptop_mode))
> + laptop_sync_completion();
> + }
> }
>
> SYSCALL_DEFINE0(sync)
> @@ -153,6 +218,7 @@ static void do_sync_work(struct work_struct *work)
> {
> struct sync_arg sarg;
>
> + sarg.ve = NULL;
> sarg.wait = 0;
>
> /*
> @@ -188,13 +254,33 @@ SYSCALL_DEFINE1(syncfs, int, fd)
> struct fd f = fdget(fd);
> struct super_block *sb;
> int ret = 0, ret2 = 0;
> + struct ve_struct *ve;
>
> if (!f.file)
> return -EBADF;
> sb = f.file->f_path.dentry->d_sb;
>
> - if (ve_fsync_behavior() == FSYNC_NEVER)
> - goto fdput;
> + ve = get_exec_env();
> +
> + if (!ve_is_super(ve)) {
> + int fsb;
> + /*
> + * init can't sync during VE stop. Rationale:
> + * - NFS with -o hard will block forever as network is down
> + * - no useful job is performed as VE0 will call umount/sync
> + * by his own later
> + * Den
> + */
> + if (is_child_reaper(task_pid(current)))
> + goto fdput;
> +
> + fsb = __ve_fsync_behavior(ve);
> + if (fsb == FSYNC_NEVER)
> + goto fdput;
> +
> + if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
> + goto fdput;
> + }
>
> down_read(&sb->s_umount);
> ret = sync_filesystem(sb);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fb21d1a32cdb..9f34e9384f88 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3148,6 +3148,8 @@ extern char *file_path(struct file *, char *, int);
>
> #define FSYNC_NEVER 0 /* ve syncs are ignored */
> #define FSYNC_ALWAYS 1 /* ve syncs work as ususal */
> +#define FSYNC_FILTERED 2 /* ve syncs only its files */
> +/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
>
> #ifdef CONFIG_VE
> int ve_fsync_behavior(void);
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index e94aa90aff25..557a14f216c4 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -70,7 +70,7 @@ struct ve_struct ve0 = {
> .sched_lat_ve.cur = &ve0_lat_stats,
> .netns_avail_nr = ATOMIC_INIT(INT_MAX),
> .netns_max_nr = INT_MAX,
> - .fsync_enable = FSYNC_ALWAYS,
> + .fsync_enable = FSYNC_FILTERED,
> ._randomize_va_space =
> #ifdef CONFIG_COMPAT_BRK
> 1,
> @@ -931,7 +931,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
> ve->meminfo_val = VE_MEMINFO_DEFAULT;
>
> ve->odirect_enable = 2;
> - ve->fsync_enable = FSYNC_ALWAYS;
> + /* for veX FSYNC_FILTERED means "get value from ve0 */
> + ve->fsync_enable = FSYNC_FILTERED;
>
> atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
> ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
>
More information about the Devel
mailing list