[Devel] [PATCH vz9 1/5] Revert "ve/fs/sync: Per containter sync and syncfs and fs.fsync-enable sysctl"

Mon Nov 22 10:58:02 MSK 2021

On 22.11.2021 09:20, Nikita Yushchenko wrote:
> This reverts commit e6acaa7d3aff56d652adc4121ca44d71e816f53a.
> 
> Different implementation will follow.
> 
> https://jira.sw.ru/browse/PSBM-44684
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>

Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>

> ---
>  fs/fcntl.c          |   2 -
>  fs/mount.h          |   2 -
>  fs/namespace.c      |   8 +-
>  fs/open.c           |   3 -
>  fs/sync.c           | 213 +-------------------------------------------
>  include/linux/fs.h  |  12 ---
>  include/linux/ve.h  |   2 -
>  kernel/ve/ve.c      |   3 -
>  kernel/ve/veowner.c |   8 --
>  mm/msync.c          |   2 -
>  10 files changed, 6 insertions(+), 249 deletions(-)
> 
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 8af146ea9231..2e0c8515bd1a 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -68,8 +68,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
>  	if (!may_use_odirect())
>  		arg &= ~O_DIRECT;
>  
> -	if (ve_fsync_behavior() == FSYNC_NEVER)
> -		arg &= ~O_SYNC;
>  	/*
>  	 * O_APPEND cannot be cleared if the file is marked as append-only
>  	 * and the file is open for write.
> diff --git a/fs/mount.h b/fs/mount.h
> index 9043614df477..ce646e66efc4 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -101,8 +101,6 @@ static inline int is_mounted(struct vfsmount *mnt)
>  	return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
>  }
>  
> -extern struct rw_semaphore namespace_sem;
> -
>  extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
>  
>  extern int __legitimize_mnt(struct vfsmount *, unsigned);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index dc73e945b746..558fa26bde50 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -72,7 +72,7 @@ static DEFINE_IDA(mnt_group_ida);
>  static struct hlist_head *mount_hashtable __read_mostly;
>  static struct hlist_head *mountpoint_hashtable __read_mostly;
>  static struct kmem_cache *mnt_cache __read_mostly;
> -DECLARE_RWSEM(namespace_sem);
> +static DECLARE_RWSEM(namespace_sem);
>  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
>  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
>  
> @@ -1325,8 +1325,9 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
>  	return &p->mnt;
>  }
>  
> -struct mount *mnt_list_next(struct mnt_namespace *ns,
> -			    struct list_head *p)
> +#ifdef CONFIG_PROC_FS
> +static struct mount *mnt_list_next(struct mnt_namespace *ns,
> +				   struct list_head *p)
>  {
>  	struct mount *mnt, *ret = NULL;
>  
> @@ -1343,7 +1344,6 @@ struct mount *mnt_list_next(struct mnt_namespace *ns,
>  	return ret;
>  }
>  
> -#ifdef CONFIG_PROC_FS
>  /* iterator; we want it to have access to namespace_sem, thus here... */
>  static void *m_start(struct seq_file *m, loff_t *pos)
>  {
> diff --git a/fs/open.c b/fs/open.c
> index 65e60aa661a8..040df8bc6e76 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -785,9 +785,6 @@ static int do_dentry_open(struct file *f,
>  	if (!may_use_odirect())
>  		f->f_flags &= ~O_DIRECT;
>  
> -	if (ve_fsync_behavior() == FSYNC_NEVER)
> -		f->f_flags &= ~O_SYNC;
> -
>  	if (unlikely(f->f_flags & O_PATH)) {
>  		f->f_mode = FMODE_PATH | FMODE_OPENED;
>  		f->f_op = &empty_fops;
> diff --git a/fs/sync.c b/fs/sync.c
> index 1c78756d4749..1373a610dc78 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -8,7 +8,6 @@
>  #include <linux/fs.h>
>  #include <linux/slab.h>
>  #include <linux/export.h>
> -#include <linux/mount.h>
>  #include <linux/namei.h>
>  #include <linux/sched.h>
>  #include <linux/writeback.h>
> @@ -17,9 +16,7 @@
>  #include <linux/pagemap.h>
>  #include <linux/quotaops.h>
>  #include <linux/backing-dev.h>
> -#include <linux/ve.h>
>  #include "internal.h"
> -#include "mount.h"
>  
>  #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
>  			SYNC_FILE_RANGE_WAIT_AFTER)
> @@ -99,160 +96,6 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
>  	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
>  }
>  
> -struct sync_sb {
> -	struct list_head list;
> -	struct super_block *sb;
> -};
> -
> -static void sync_release_filesystems(struct list_head *sync_list)
> -{
> -	struct sync_sb *ss, *tmp;
> -
> -	list_for_each_entry_safe(ss, tmp, sync_list, list) {
> -		list_del(&ss->list);
> -		put_super(ss->sb);
> -		kfree(ss);
> -	}
> -}
> -
> -static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
> -{
> -	struct sync_sb *ss;
> -
> -	list_for_each_entry(ss, sync_list, list)
> -		if (ss->sb == sb)
> -			return 1;
> -	return 0;
> -}
> -
> -static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
> -{
> -	struct mount *mnt;
> -	struct mnt_namespace *mnt_ns;
> -	struct nsproxy *ve_ns;
> -	struct sync_sb *ss;
> -	int ret = 0;
> -
> -	BUG_ON(!list_empty(sync_list));
> -
> -	down_read(&namespace_sem);
> -
> -	rcu_read_lock();
> -	ve_ns = rcu_dereference(ve->ve_ns);
> -	if (!ve_ns) {
> -		rcu_read_unlock();
> -		up_read(&namespace_sem);
> -		return 0;
> -	}
> -	mnt_ns = ve_ns->mnt_ns;
> -	rcu_read_unlock();
> -
> -	mnt = mnt_list_next(mnt_ns, &mnt_ns->list);
> -	while (mnt) {
> -		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
> -			goto next;
> -
> -		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
> -		if (ss == NULL) {
> -			ret = -ENOMEM;
> -			break;
> -		}
> -		ss->sb = mnt->mnt.mnt_sb;
> -		/*
> -		 * We hold mount point and thus can be sure, that superblock is
> -		 * alive. And it means, that we can safely increase it's usage
> -		 * counter.
> -		 */
> -		spin_lock(&sb_lock);
> -		ss->sb->s_count++;
> -		spin_unlock(&sb_lock);
> -		list_add_tail(&ss->list, sync_list);
> -next:
> -		mnt = mnt_list_next(mnt_ns, &mnt->mnt_list);
> -	}
> -	up_read(&namespace_sem);
> -	return ret;
> -}
> -
> -static void sync_filesystems_ve(struct ve_struct *ve, int wait)
> -{
> -	struct super_block *sb;
> -	LIST_HEAD(sync_list);
> -	struct sync_sb *ss;
> -
> -	/*
> -	 * We don't need to care about allocating failure here. At least we
> -	 * don't need to skip sync on such error.
> -	 * Let's sync what we collected already instead.
> -	 */
> -	sync_collect_filesystems(ve, &sync_list);
> -
> -	list_for_each_entry(ss, &sync_list, list) {
> -		sb = ss->sb;
> -		down_read(&sb->s_umount);
> -		if (!sb_rdonly(sb) && sb->s_root && (sb->s_flags & SB_BORN))
> -			__sync_filesystem(sb, wait);
> -		up_read(&sb->s_umount);
> -	}
> -
> -	sync_release_filesystems(&sync_list);
> -}
> -
> -static int is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
> -{
> -	struct mount *mnt;
> -	struct mnt_namespace *mnt_ns;
> -	struct nsproxy *ve_ns;
> -	int ret = 0;
> -
> -	down_read(&namespace_sem);
> -
> -	rcu_read_lock();
> -	ve_ns = rcu_dereference(ve->ve_ns);
> -	if (!ve_ns) {
> -		rcu_read_unlock();
> -		up_read(&namespace_sem);
> -		return 0;
> -	}
> -	mnt_ns = ve_ns->mnt_ns;
> -	rcu_read_unlock();
> -
> -	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
> -		if (mnt->mnt.mnt_sb == sb) {
> -			ret = 1;
> -			break;
> -		}
> -	}
> -	up_read(&namespace_sem);
> -	return ret;
> -}
> -
> -static int __ve_fsync_behavior(struct ve_struct *ve)
> -{
> -	/*
> -	 * - __ve_fsync_behavior() is not called for ve0
> -	 * - FSYNC_FILTERED for veX does NOT mean "filtered" behavior
> -	 * - FSYNC_FILTERED for veX means "get value from ve0"
> -	 */
> -	if (ve->fsync_enable == FSYNC_FILTERED)
> -		return get_ve0()->fsync_enable;
> -	else if (ve->fsync_enable)
> -		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
> -	else
> -		return 0;
> -}
> -
> -int ve_fsync_behavior(void)
> -{
> -	struct ve_struct *ve;
> -
> -	ve = get_exec_env();
> -	if (ve_is_super(ve))
> -		return FSYNC_ALWAYS;
> -	else
> -		return __ve_fsync_behavior(ve);
> -}
> -
>  /*
>   * Sync everything. We start by waking flusher threads so that most of
>   * writeback runs on all devices in parallel. Then we sync all inodes reliably
> @@ -265,32 +108,8 @@ int ve_fsync_behavior(void)
>   */
>  void ksys_sync(void)
>  {
> -	struct ve_struct *ve = get_exec_env();
>  	int nowait = 0, wait = 1;
>  
> -	if (!ve_is_super(ve)) {
> -		int fsb;
> -		/*
> -		 * init can't sync during VE stop. Rationale:
> -		 *  - NFS with -o hard will block forever as network is down
> -		 *  - no useful job is performed as VE0 will call umount/sync
> -		 *    by his own later
> -		 *  Den
> -		 */
> -		if (is_child_reaper(task_pid(current)))
> -			return;
> -
> -		fsb = __ve_fsync_behavior(ve);
> -		if (fsb == FSYNC_NEVER)
> -			return;
> -
> -		if (fsb == FSYNC_FILTERED) {
> -			sync_filesystems_ve(ve, nowait);
> -			sync_filesystems_ve(ve, wait);
> -			return;
> -		}
> -	}
> -
>  	wakeup_flusher_threads(WB_REASON_SYNC);
>  	iterate_supers(sync_inodes_one_sb, NULL);
>  	iterate_supers(sync_fs_one_sb, &nowait);
> @@ -343,42 +162,18 @@ SYSCALL_DEFINE1(syncfs, int, fd)
>  {
>  	struct fd f = fdget(fd);
>  	struct super_block *sb;
> -	int ret = 0, ret2 = 0;
> -	struct ve_struct *ve;
> +	int ret, ret2;
>  
>  	if (!f.file)
>  		return -EBADF;
>  	sb = f.file->f_path.dentry->d_sb;
>  
> -	ve = get_exec_env();
> -
> -	if (!ve_is_super(ve)) {
> -		int fsb;
> -		/*
> -		 * init can't sync during VE stop. Rationale:
> -		 *  - NFS with -o hard will block forever as network is down
> -		 *  - no useful job is performed as VE0 will call umount/sync
> -		 *    by his own later
> -		 *  Den
> -		 */
> -		if (is_child_reaper(task_pid(current)))
> -			goto fdput;
> -
> -		fsb = __ve_fsync_behavior(ve);
> -		if (fsb == FSYNC_NEVER)
> -			goto fdput;
> -
> -		if ((fsb == FSYNC_FILTERED) && !is_sb_ve_accessible(ve, sb))
> -			goto fdput;
> -	}
> -
>  	down_read(&sb->s_umount);
>  	ret = sync_filesystem(sb);
>  	up_read(&sb->s_umount);
>  
>  	ret2 = errseq_check_and_advance(&sb->s_wb_err, &f.file->f_sb_err);
>  
> -fdput:
>  	fdput(f);
>  	return ret ? ret : ret2;
>  }
> @@ -422,13 +217,9 @@ EXPORT_SYMBOL(vfs_fsync);
>  
>  static int do_fsync(unsigned int fd, int datasync)
>  {
> -	struct fd f;
> +	struct fd f = fdget(fd);
>  	int ret = -EBADF;
>  
> -	if (ve_fsync_behavior() == FSYNC_NEVER)
> -		return 0;
> -
> -	f = fdget(fd);
>  	if (f.file) {
>  		ret = vfs_fsync(f.file, datasync);
>  		fdput(f);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 42021f0151f5..01419dbd864b 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -71,7 +71,6 @@ struct fsverity_operations;
>  struct fs_context;
>  struct fs_parameter_spec;
>  struct fileattr;
> -struct mnt_namespace;
>  
>  extern void __init inode_init(void);
>  extern void __init inode_init_early(void);
> @@ -2522,7 +2521,6 @@ void kill_anon_super(struct super_block *sb);
>  void kill_litter_super(struct super_block *sb);
>  void deactivate_super(struct super_block *sb);
>  void deactivate_locked_super(struct super_block *sb);
> -void put_super(struct super_block *sb);
>  int set_anon_super(struct super_block *s, void *data);
>  int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
>  int get_anon_bdev(dev_t *);
> @@ -3148,13 +3146,6 @@ extern bool path_is_under(const struct path *, const struct path *);
>  
>  extern char *file_path(struct file *, char *, int);
>  
> -int ve_fsync_behavior(void);
> -
> -#define FSYNC_NEVER	0	/* ve syncs are ignored    */
> -#define FSYNC_ALWAYS	1	/* ve syncs work as ususal */
> -#define FSYNC_FILTERED	2	/* ve syncs only its files */
> -/* For non-ve0 FSYNC_FILTERED value means "get value from ve0". */
> -
>  #include <linux/err.h>
>  
>  /* needed for stackable file system support */
> @@ -3504,9 +3495,6 @@ void setattr_copy(struct user_namespace *, struct inode *inode,
>  
>  extern int file_update_time(struct file *file);
>  
> -extern struct mount *mnt_list_next(struct mnt_namespace *ns,
> -				   struct list_head *p);
> -
>  static inline bool vma_is_dax(const struct vm_area_struct *vma)
>  {
>  	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 4c8f7d308829..1a66063d9ba8 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -61,8 +61,6 @@ struct ve_struct {
>  	struct kstat_lat_pcpu_struct    sched_lat_ve;
>  	int			odirect_enable;
>  
> -	int			fsync_enable;
> -
>  #if IS_ENABLED(CONFIG_BINFMT_MISC)
>  	struct binfmt_misc	*binfmt_misc;
>  #endif
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 557a14f216c4..48be49337a87 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -70,7 +70,6 @@ struct ve_struct ve0 = {
>  	.sched_lat_ve.cur	= &ve0_lat_stats,
>  	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
>  	.netns_max_nr		= INT_MAX,
> -	.fsync_enable		= FSYNC_FILTERED,
>  	._randomize_va_space	=
>  #ifdef CONFIG_COMPAT_BRK
>  					1,
> @@ -931,8 +930,6 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
>  	ve->meminfo_val = VE_MEMINFO_DEFAULT;
>  
>  	ve->odirect_enable = 2;
> -	/* for veX FSYNC_FILTERED means "get value from ve0 */
> -	ve->fsync_enable = FSYNC_FILTERED;
>  
>  	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
>  	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
> diff --git a/kernel/ve/veowner.c b/kernel/ve/veowner.c
> index e255fe57d447..b0aba35b6be9 100644
> --- a/kernel/ve/veowner.c
> +++ b/kernel/ve/veowner.c
> @@ -7,7 +7,6 @@
>   *
>   */
>  
> -#include <linux/ve.h>
>  #include <linux/init.h>
>  #include <linux/module.h>
>  #include <linux/proc_fs.h>
> @@ -67,13 +66,6 @@ static struct ctl_table vz_fs_table[] = {
>  		.extra1		= &ve_mount_nr_min,
>  		.extra2		= &ve_mount_nr_max,
>  	},
> -	{
> -		.procname	= "fsync-enable",
> -		.data		= &ve0.fsync_enable,
> -		.maxlen		= sizeof(int),
> -		.mode		= 0644 | S_ISVTX,
> -		.proc_handler	= &proc_dointvec_virtual,
> -	},
>  	{ }
>  };
>  
> diff --git a/mm/msync.c b/mm/msync.c
> index 20737eb4b76b..137d1c104f3e 100644
> --- a/mm/msync.c
> +++ b/mm/msync.c
> @@ -51,8 +51,6 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
>  	if (end < start)
>  		goto out;
>  	error = 0;
> -	if (ve_fsync_behavior() == FSYNC_NEVER)
> -		goto out;
>  	if (end == start)
>  		goto out;
>  	/*
>