[Devel] [PATCH vz8 v2] binfmt_misc: fix mount after umount in CT
Nikita Yushchenko
nikita.yushchenko at virtuozzo.com
Tue Oct 19 17:40:00 MSK 2021
The assumption that bm_fill_super() is never called a second time
for a CT is wrong: the umount operation clears sb->s_root, which causes
vfs_get_super() to call fill_super again on the next mount.
Make bm_fill_super() handle multiple calls correctly:
- initialize bm_data and set ve->binfmt_misc only if this has not been
done before,
- delay its destruction until CT destruction.
https://jira.sw.ru/browse/PSBM-133968
Fixes: 8250ff41d190 ("ve/fs/binfmt: clean bm_data reference from ve on err path")
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
Changes from v1:
- fix double-free of bm_data
- make sure it compiles for !CONFIG_VE case
fs/binfmt_misc.c | 58 ++++++++++++++++++++++++++----------------------
1 file changed, 32 insertions(+), 26 deletions(-)
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 0946e7e6caa5..5a3de9fea265 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -852,39 +852,32 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
/* last one */ {""}
};
+#ifdef CONFIG_VE
struct ve_struct *ve = get_exec_env();
- struct binfmt_misc *bm_data;
+ struct binfmt_misc *bm_data = ve->binfmt_misc;
+#else
+ static struct binfmt_misc *bm_data = NULL;
+#endif
- /*
- * bm_get_tree()
- * get_tree_keyed(fc, bm_fill_super, get_ve(ve))
- * fc->s_fs_info = current VE
- * vfs_get_super(fc, vfs_get_keyed_super, bm_fill_super)
- * sb = sget_fc(fc, test, set_anon_super_fc)
- * if (!sb->s_root) {
- * err = bm_fill_super(sb, fc);
- *
- * => we should never get here with initialized ve->binfmt_misc.
- */
- if (WARN_ON_ONCE(ve->binfmt_misc))
- return -EEXIST;
+ if (!bm_data) {
+ bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+ if (!bm_data)
+ return -ENOMEM;
- bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
- if (!bm_data)
- return -ENOMEM;
+ INIT_LIST_HEAD(&bm_data->entries);
+ rwlock_init(&bm_data->entries_lock);
- INIT_LIST_HEAD(&bm_data->entries);
- rwlock_init(&bm_data->entries_lock);
+#ifdef CONFIG_VE
+ ve->binfmt_misc = bm_data;
+ /* this will be cleared by ve_destroy() */
+#endif
+ }
err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
- if (err) {
- kfree(bm_data);
+ if (err)
return err;
- }
sb->s_op = &s_ops;
-
- ve->binfmt_misc = bm_data;
bm_data->enabled = 1;
return 0;
@@ -956,6 +949,7 @@ static struct file_system_type bm_fs_type = {
};
MODULE_ALIAS_FS("binfmt_misc");
+#ifdef CONFIG_VE
static void ve_binfmt_fini(void *data)
{
struct ve_struct *ve = data;
@@ -965,8 +959,17 @@ static void ve_binfmt_fini(void *data)
return;
/*
- * XXX: Note we don't take any locks here. This is safe as long as
- * nobody uses binfmt_misc outside the owner ve.
+ * This is called when VE is being destructed, no more processes are
+ * in VE and thus use of bm_data is unexpected.
+ *
+ * Still, there is a possibility for a race, if a host process
+ * explicitly enters VE's mount namespace and accesses files on
+ * binfmt_misc mount, while VE is being destructed.
+ *
+ * This is extremely unlikely so ignore it for now.
+ *
+ * To fix, need to move this to ve_destroy() path that is executed when
+ * no more references to VE are left.
*/
while (!list_empty(&bm_data->entries))
kill_node(bm_data, list_first_entry(
@@ -978,6 +981,9 @@ static struct ve_hook ve_binfmt_hook = {
.priority = HOOK_PRIO_DEFAULT,
.owner = THIS_MODULE,
};
+#else
+#define ve_binfmt_hook 0
+#endif
static int __init init_misc_binfmt(void)
{
--
2.30.2
More information about the Devel
mailing list