[Devel] [PATCH RH8] ve/fs/binfmt: fix EBUSY on mounting second binfmt_misc in CT
Alexander Mikhalitsyn
alexander.mikhalitsyn at virtuozzo.com
Wed Aug 11 16:19:05 MSK 2021
On Wed, 11 Aug 2021 15:41:39 +0300
Pavel Tikhomirov <ptikhomirov at virtuozzo.com> wrote:
>
>
> On 06.08.2021 17:29, Alexander Mikhalitsyn wrote:
> > After rebase to RHEL 8.4 binfmt_misc fs uses new fscontext API,
> > our implementation is incorrect. We have to use get_tree_keyed()
> > helper instead of get_tree_single() which allows only one
> > superblock per HN.
> >
> > Fixes: 90fb0e274 ("ve/fs/binfmt: virtualization")
> >
> > https://jira.sw.ru/browse/PSBM-132709
> >
> > Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
> > ---
> > fs/binfmt_misc.c | 42 ++++++++++++++++++++++++++++++++++++------
> > 1 file changed, 36 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> > index c21d03fd3f15..ad1d9d603034 100644
> > --- a/fs/binfmt_misc.c
> > +++ b/fs/binfmt_misc.c
> > @@ -833,10 +833,8 @@ static const struct file_operations bm_status_operations = {
> > static void bm_put_super(struct super_block *sb)
> > {
> > struct binfmt_misc *bm_data = BINFMT_MISC(sb);
> > - struct ve_struct *ve = sb->s_fs_info;
> >
> > bm_data->enabled = 0;
> > - put_ve(ve);
> > }
> >
> > static const struct super_operations s_ops = {
> > @@ -875,8 +873,6 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
> > }
> >
> > sb->s_op = &s_ops;
> > - sb->s_fs_info = ve;
> > - get_ve(ve);
> >
> > bm_data->enabled = 1;
> >
> > @@ -885,10 +881,36 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
> >
> > static int bm_get_tree(struct fs_context *fc)
> > {
> > - return get_tree_single(fc, bm_fill_super);
> > + struct ve_struct *ve = get_exec_env();
> > +
> > + /*
> > + * We need one binfmt_misc superblock per VE,
> > + * use get_tree_keyed() helper to get vfs_tree.
> > + *
> > + * It allows us to find sb by key (in our case ve is the key),
> > + * and if it doesn't exists creates new.
> > + *
> > + * Important: we take ve refcnt here. It will be put
> > + * in one of two places:
> > + * 1. bm_free_fc() on error path (wrong mnt opt provided for instance)
> > + * 2. bm_umount() when sb refcnt becomes zero (last mount umounted)
> > + */
> > + return get_tree_keyed(fc, bm_fill_super, get_ve(ve));
>
> I can imagine that we can get here as much times as we want (N), each
> mount syscall would get us here. So in case of no error paths we take N
> references on same VE but ->kill_sb( would be called only once and we
> release only one reference. This way VE would likely become stuck forever...
Let see, if you already have superblock initialized, then in sget_fc() function:
if (test) {
hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
if (test(old, fc))
goto share_extant_sb; <--- get there
}
then:
s->s_fs_info = fc->s_fs_info;
err = set(s, fc);
if (err) {
s->s_fs_info = NULL;
spin_unlock(&sb_lock);
destroy_unused_super(s);
return ERR_PTR(err);
}
fc->s_fs_info = NULL; <--- here we NULLify s_fs_info on SUCCESS path when NEW superblock was created
s->s_type = fc->fs_type;
s->s_iflags |= fc->s_iflags;
strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id));
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
get_filesystem(s->s_type);
register_shrinker_prepared(&s->s_shrink);
return s;
share_extant_sb: <--- here we NOT NULLify fc->s_fs_info and refcnt will be dropped
/* PSBM-86208: we mount secondary ploop on host for
* resize functionality so allow mount in init userns
* if fs already mounted in non-init userns
*/
if (user_ns != old->s_user_ns &&
user_ns != &init_user_ns) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
My comment not fully correct, because I mean that "success" is when new sb allocated, but when we
already found superblock existing is also success case. I will fix comment :)
Thanks! ;)
>
> > +}
> > +
> > +static void bm_free_fc(struct fs_context *fc)
> > +{
> > + /*
> > + * fc->s_fs_info will be NULL if *no* error was occured
> > + * see fs/super.c sget_fc() helper
> > + */
> > + if (fc->s_fs_info)
> > + put_ve(fc->s_fs_info);
> > }
> >
> > +
> > static const struct fs_context_operations bm_context_ops = {
> > + .free = bm_free_fc,
> > .get_tree = bm_get_tree,
> > };
> >
> > @@ -898,6 +920,14 @@ static int bm_init_fs_context(struct fs_context *fc)
> > return 0;
> > }
> >
> > +static void bm_umount(struct super_block *sb)
> > +{
> > + struct ve_struct *ve = sb->s_fs_info;
> > +
> > + kill_litter_super(sb);
> > + put_ve(ve);
> > +}
> > +
> > static struct linux_binfmt misc_format = {
> > .module = THIS_MODULE,
> > .load_binary = load_misc_binary,
> > @@ -907,7 +937,7 @@ static struct file_system_type bm_fs_type = {
> > .owner = THIS_MODULE,
> > .name = "binfmt_misc",
> > .init_fs_context = bm_init_fs_context,
> > - .kill_sb = kill_litter_super,
> > + .kill_sb = bm_umount,
> > .fs_flags = FS_VIRTUALIZED | FS_VE_MOUNT,
> > };
> > MODULE_ALIAS_FS("binfmt_misc");
> >
>
> --
> Best regards, Tikhomirov Pavel
> Software Developer, Virtuozzo.
Regards,
Alex
More information about the Devel
mailing list