[Devel] [PATCH vz9 v2 5/5] fs: per-VE sync

Kirill Tkhai ktkhai at virtuozzo.com
Mon Nov 22 12:18:04 MSK 2021


I'd moved sync_inodes_one_sb() and sync_fs_one_sb() move to separate patch.
I'd suggested to add the following strings in your ~/.gitconfig:

[diff "default"]
        xfuncname = "^[[:alpha:]$_].*[^:]$"
        algorithm = patience
[diff]
        algorithm = patience

Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com> 

On 22.11.2021 11:58, Nikita Yushchenko wrote:
> This contains part of vz7/vz8 per-VE sync code, updated to support
> non-root mount namespaces within VE.
> 
> https://jira.sw.ru/browse/PSBM-44684
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
> ---
> v2: add comments on syncing bdevs in VE-local operation, make the
>     action exactly the same as with global sync, just moved.
> 
>  fs/sync.c          | 130 +++++++++++++++++++++++++++++++++++++--------
>  include/linux/fs.h |   2 +
>  kernel/ve/ve.c     |   5 +-
>  3 files changed, 113 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/sync.c b/fs/sync.c
> index 31e6f0c6402d..6ec1b66d004d 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -69,12 +69,33 @@ int sync_filesystem(struct super_block *sb)
>  }
>  EXPORT_SYMBOL(sync_filesystem);
>  
> +static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> +{
> +	filemap_fdatawrite(bdev->bd_inode->i_mapping);
> +}
> +
> +static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> +{
> +	/*
> +	 * We keep the error status of individual mapping so that
> +	 * applications can catch the writeback error using fsync(2).
> +	 * See filemap_fdatawait_keep_errors() for details.
> +	 */
> +	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
> +}
> +
>  struct sync_arg {
> +	struct ve_struct *ve;
>  	int wait;
>  };
>  
>  static void sync_inodes_one_sb(struct super_block *sb, void *arg)
>  {
> +	struct sync_arg *sarg = arg;
> +
> +	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
> +		return;
> +
>  	if (!sb_rdonly(sb))
>  		sync_inodes_sb(sb);
>  }
> @@ -83,24 +104,34 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg)
>  {
>  	struct sync_arg *sarg = arg;
>  
> -	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
> -	    sb->s_op->sync_fs)
> -		sb->s_op->sync_fs(sb, sarg->wait);
> -}
> +	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
> +		return;
>  
> -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
> -{
> -	filemap_fdatawrite(bdev->bd_inode->i_mapping);
> +	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC)) {
> +		if (sb->s_op->sync_fs)
> +			sb->s_op->sync_fs(sb, sarg->wait);
> +
> +		/* See comment in ksys_sync() bellow */
> +		if (sarg->ve) {
> +			fdatawrite_one_bdev(sb->s_bdev, NULL);
> +			fdatawait_one_bdev(sb->s_bdev, NULL);
> +		}
> +	}
>  }
>  
> -static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
> +static int __ve_fsync_behavior(struct ve_struct *ve)
>  {
>  	/*
> -	 * We keep the error status of individual mapping so that
> -	 * applications can catch the writeback error using fsync(2).
> -	 * See filemap_fdatawait_keep_errors() for details.
> +	 * - __ve_fsync_behavior() is not called for ve0
> +	 * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
> +	 * - FSYNC_FILTERED for veX means "get value from ve0"
>  	 */
> -	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
> +	if (ve->fsync_enable == FSYNC_FILTERED)
> +		return get_ve0()->fsync_enable;
> +	else if (ve->fsync_enable)
> +		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
> +	else
> +		return 0;
>  }
>  
>  int ve_fsync_behavior(void)
> @@ -111,7 +142,7 @@ int ve_fsync_behavior(void)
>  	if (ve_is_super(ve))
>  		return FSYNC_ALWAYS;
>  	else
> -		return ve->fsync_enable;
> +		return __ve_fsync_behavior(ve);
>  }
>  
>  /*
> @@ -126,21 +157,55 @@ int ve_fsync_behavior(void)
>   */
>  void ksys_sync(void)
>  {
> +	struct ve_struct *ve = get_exec_env();
>  	struct sync_arg sarg;
>  
> -	if (ve_fsync_behavior() == FSYNC_NEVER)
> -		return;
> +	sarg.ve = NULL;
> +	if (!ve_is_super(ve)) {
> +		int fsb;
> +		/*
> +		 * init can't sync during VE stop. Rationale:
> +		 *  - NFS with -o hard will block forever as network is down
> +		 *  - no useful job is performed as VE0 will call umount/sync
> +		 *    by his own later
> +		 *  Den
> +		 */
> +		if (is_child_reaper(task_pid(current)))
> +			return;
> +
> +		fsb = __ve_fsync_behavior(ve);
> +		if (fsb == FSYNC_NEVER)
> +			return;
> +		if (fsb == FSYNC_FILTERED)
> +			sarg.ve = ve;
> +	}
>  
>  	wakeup_flusher_threads(WB_REASON_SYNC);
> -	iterate_supers(sync_inodes_one_sb, NULL);
> +	iterate_supers(sync_inodes_one_sb, &sarg);
>  	sarg.wait = 0;
>  	iterate_supers(sync_fs_one_sb, &sarg);
>  	sarg.wait = 1;
>  	iterate_supers(sync_fs_one_sb, &sarg);
> -	iterate_bdevs(fdatawrite_one_bdev, NULL);
> -	iterate_bdevs(fdatawait_one_bdev, NULL);
> -	if (unlikely(laptop_mode))
> -		laptop_sync_completion();
> +
> +	/*
> +	 * Currently there is no access to raw bdevs from VE, which implies
> +	 * that in VE-local sync, need to flush bdev only if it contains
> +	 * VE-visible mount. Searching for such mount is linear against number
> +	 * of superblocks on the host, and doing that for each bdev turns into
> +	 * square complexity on number of mounted bdevs on the host.
> +	 *
> +	 * Avoid that square complexity by moving bdev sync into the second
> +	 * second call to sync_fs_one_sb() above.
> +	 *
> +	 * If ever implementing access to raw bdevs from VE, this approach will
> +	 * be no longer valid.
> +	 */
> +	if (!sarg.ve) {
> +		iterate_bdevs(fdatawrite_one_bdev, NULL);
> +		iterate_bdevs(fdatawait_one_bdev, NULL);
> +		if (unlikely(laptop_mode))
> +			laptop_sync_completion();
> +	}
>  }
>  
>  SYSCALL_DEFINE0(sync)
> @@ -153,6 +218,7 @@ static void do_sync_work(struct work_struct *work)
>  {
>  	struct sync_arg sarg;
>  
> +	sarg.ve = NULL;
>  	sarg.wait = 0;
>  
>  	/*
> @@ -188,13 +254,33 @@ SYSCALL_DEFINE1(syncfs, int, fd)
>  	struct fd f = fdget(fd);
>  	struct super_block *sb;
>  	int ret = 0, ret2 = 0;
> +	struct ve_struct *ve;
>  
>  	if (!f.file)
>  		return -EBADF;
>  	sb = f.file->f_path.dentry->d_sb;
>  
> -	if (ve_fsync_behavior() == FSYNC_NEVER)
> -		goto fdput;
> +	ve = get_exec_env();
> +
> +	if (!ve_is_super(ve)) {
> +		int fsb;
> +		/*
> +		 * init can't sync during VE stop. Rationale:
> +		 *  - NFS with -o hard will block forever as network is down
> +		 *  - no useful job is performed as VE0 will call umount/sync
> +		 *    by his own later
> +		 *  Den
> +		 */
> +		if (is_child_reaper(task_pid(current)))
> +			goto fdput;
> +
> +		fsb = __ve_fsync_behavior(ve);
> +		if (fsb == FSYNC_NEVER)
> +			goto fdput;
> +
> +		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
> +			goto fdput;
> +	}
>  
>  	down_read(&sb->s_umount);
>  	ret = sync_filesystem(sb);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index fb21d1a32cdb..9f34e9384f88 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3148,6 +3148,8 @@ extern char *file_path(struct file *, char *, int);
>  
>  #define FSYNC_NEVER	0	/* ve syncs are ignored    */
>  #define FSYNC_ALWAYS	1	/* ve syncs work as ususal */
> +#define FSYNC_FILTERED	2	/* ve syncs only its files */
> +/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
>  
>  #ifdef CONFIG_VE
>  int ve_fsync_behavior(void);
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index e94aa90aff25..557a14f216c4 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -70,7 +70,7 @@ struct ve_struct ve0 = {
>  	.sched_lat_ve.cur	= &ve0_lat_stats,
>  	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
>  	.netns_max_nr		= INT_MAX,
> -	.fsync_enable		= FSYNC_ALWAYS,
> +	.fsync_enable		= FSYNC_FILTERED,
>  	._randomize_va_space	=
>  #ifdef CONFIG_COMPAT_BRK
>  					1,
> @@ -931,7 +931,8 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
>  	ve->meminfo_val = VE_MEMINFO_DEFAULT;
>  
>  	ve->odirect_enable = 2;
> -	ve->fsync_enable = FSYNC_ALWAYS;
> +	/* for veX FSYNC_FILTERED means "get value from ve0 */
> +	ve->fsync_enable = FSYNC_FILTERED;
>  
>  	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
>  	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
> 



More information about the Devel mailing list