[Devel] [PATCH RH8] ve/fs/binfmt: fix EBUSY on mounting second binfmt_misc in CT

Alexander Mikhalitsyn alexander.mikhalitsyn at virtuozzo.com
Wed Aug 11 16:19:05 MSK 2021


On Wed, 11 Aug 2021 15:41:39 +0300
Pavel Tikhomirov <ptikhomirov at virtuozzo.com> wrote:

> 
> 
> On 06.08.2021 17:29, Alexander Mikhalitsyn wrote:
> > After rebase to RHEL 8.4 binfmt_misc fs uses new fscontext API,
> > our implementation is incorrect. We have to use get_tree_keyed()
> > helper instead of get_tree_single() which allows only one
> > superblock per HN.
> > 
> > Fixes: 90fb0e274 ("ve/fs/binfmt: virtualization")
> > 
> > https://jira.sw.ru/browse/PSBM-132709
> > 
> > Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
> > ---
> >   fs/binfmt_misc.c | 42 ++++++++++++++++++++++++++++++++++++------
> >   1 file changed, 36 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> > index c21d03fd3f15..ad1d9d603034 100644
> > --- a/fs/binfmt_misc.c
> > +++ b/fs/binfmt_misc.c
> > @@ -833,10 +833,8 @@ static const struct file_operations bm_status_operations = {
> >   static void bm_put_super(struct super_block *sb)
> >   {
> >   	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
> > -	struct ve_struct *ve = sb->s_fs_info;
> >   
> >   	bm_data->enabled = 0;
> > -	put_ve(ve);
> >   }
> >   
> >   static const struct super_operations s_ops = {
> > @@ -875,8 +873,6 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
> >   	}
> >   
> >   	sb->s_op = &s_ops;
> > -	sb->s_fs_info = ve;
> > -	get_ve(ve);
> >   
> >   	bm_data->enabled = 1;
> >   
> > @@ -885,10 +881,36 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
> >   
> >   static int bm_get_tree(struct fs_context *fc)
> >   {
> > -	return get_tree_single(fc, bm_fill_super);
> > +	struct ve_struct *ve = get_exec_env();
> > +
> > +	/*
> > +	 * We need one binfmt_misc superblock per VE,
> > +	 * use get_tree_keyed() helper to get vfs_tree.
> > +	 *
> > +	 * It allows us to find sb by key (in our case ve is the key),
> > +	 * and if it doesn't exists creates new.
> > +	 *
> > +	 * Important: we take ve refcnt here. It will be put
> > +	 * in one of two places:
> > +	 * 1. bm_free_fc() on error path (wrong mnt opt provided for instance)
> > +	 * 2. bm_umount() when sb refcnt becomes zero (last mount umounted)
> > +	 */
> > +	return get_tree_keyed(fc, bm_fill_super, get_ve(ve));
> 
> I can imagine that we can get here as much times as we want (N), each 
> mount syscall would get us here. So in case of no error paths we take N 
> references on same VE but ->kill_sb( would be called only once and we 
> release only one reference. This way VE would likely become stuck forever...

Let see, if you already have superblock initialized, then in sget_fc() function:
	if (test) {
		hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
			if (test(old, fc))
				goto share_extant_sb; <--- get there
		}
then:
	s->s_fs_info = fc->s_fs_info;
	err = set(s, fc);
	if (err) {
		s->s_fs_info = NULL;
		spin_unlock(&sb_lock);
		destroy_unused_super(s);
		return ERR_PTR(err);
	}
	fc->s_fs_info = NULL; <--- here we NULLify s_fs_info on SUCCESS path when NEW superblock was created
	s->s_type = fc->fs_type;
	s->s_iflags |= fc->s_iflags;
	strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
	list_add_tail(&s->s_list, &super_blocks);
	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
	spin_unlock(&sb_lock);
	get_filesystem(s->s_type);
	register_shrinker_prepared(&s->s_shrink);
	return s;

share_extant_sb: <--- here we NOT NULLify fc->s_fs_info and refcnt will be dropped
	/* PSBM-86208: we mount secondary ploop on host for
	 * resize functionality so allow mount in init userns
	 * if fs already mounted in non-init userns
	 */
	if (user_ns != old->s_user_ns &&
	    user_ns != &init_user_ns) {
		spin_unlock(&sb_lock);
		destroy_unused_super(s);
		return ERR_PTR(-EBUSY);
	}
	if (!grab_super(old))
		goto retry;
	destroy_unused_super(s);
	return old;

My comment not fully correct, because I mean that "success" is when new sb allocated, but when we
already found superblock existing is also success case. I will fix comment :)

Thanks! ;)

> 
> > +}
> > +
> > +static void bm_free_fc(struct fs_context *fc)
> > +{
> > +	/*
> > +	 * fc->s_fs_info will be NULL if *no* error was occured
> > +	 * see fs/super.c sget_fc() helper
> > +	 */
> > +	if (fc->s_fs_info)
> > +		put_ve(fc->s_fs_info);
> >   }
> >   
> > +
> >   static const struct fs_context_operations bm_context_ops = {
> > +	.free		= bm_free_fc,
> >   	.get_tree	= bm_get_tree,
> >   };
> >   
> > @@ -898,6 +920,14 @@ static int bm_init_fs_context(struct fs_context *fc)
> >   	return 0;
> >   }
> >   
> > +static void bm_umount(struct super_block *sb)
> > +{
> > +	struct ve_struct *ve = sb->s_fs_info;
> > +
> > +	kill_litter_super(sb);
> > +	put_ve(ve);
> > +}
> > +
> >   static struct linux_binfmt misc_format = {
> >   	.module = THIS_MODULE,
> >   	.load_binary = load_misc_binary,
> > @@ -907,7 +937,7 @@ static struct file_system_type bm_fs_type = {
> >   	.owner		= THIS_MODULE,
> >   	.name		= "binfmt_misc",
> >   	.init_fs_context = bm_init_fs_context,
> > -	.kill_sb	= kill_litter_super,
> > +	.kill_sb	= bm_umount,
> >   	.fs_flags	= FS_VIRTUALIZED | FS_VE_MOUNT,
> >   };
> >   MODULE_ALIAS_FS("binfmt_misc");
> > 
> 
> -- 
> Best regards, Tikhomirov Pavel
> Software Developer, Virtuozzo.

Regards,
Alex


More information about the Devel mailing list