[Devel] [PATCH vz8 v2] binfmt_misc: fix mount after umount in CT

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Tue Oct 19 17:40:00 MSK 2021


The assumption that bm_fill_super() is never called a second time for
a CT is wrong: the umount operation clears sb->s_root, which causes
vfs_get_super() to call fill_super again on the next mount.

Make bm_fill_super() handle multiple calls correctly:
- initialize bm_data and set ve->binfmt_misc only if it is not done
  before,
- delay its destruction until CT destruction.

https://jira.sw.ru/browse/PSBM-133968
Fixes: 8250ff41d190 ("ve/fs/binfmt: clean bm_data reference from ve on err path")
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
Changes from v1:
- fix double-free of bm_data
- make sure it compiles for !CONFIG_VE case

 fs/binfmt_misc.c | 58 ++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 0946e7e6caa5..5a3de9fea265 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -852,39 +852,32 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 		/* last one */ {""}
 	};
 
+#ifdef CONFIG_VE
 	struct ve_struct *ve = get_exec_env();
-	struct binfmt_misc *bm_data;
+	struct binfmt_misc *bm_data = ve->binfmt_misc;
+#else
+	static struct binfmt_misc *bm_data = NULL;
+#endif
 
-	/*
-	 * bm_get_tree()
-	 *  get_tree_keyed(fc, bm_fill_super, get_ve(ve))
-	 *   fc->s_fs_info = current VE
-	 *   vfs_get_super(fc, vfs_get_keyed_super, bm_fill_super)
-	 *    sb = sget_fc(fc, test, set_anon_super_fc)
-	 *    if (!sb->s_root) {
-	 *		err = bm_fill_super(sb, fc);
-	 *
-	 * => we should never get here with initialized ve->binfmt_misc.
-	 */
-	if (WARN_ON_ONCE(ve->binfmt_misc))
-		return -EEXIST;
+	if (!bm_data) {
+		bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!bm_data)
+			return -ENOMEM;
 
-	bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
-	if (!bm_data)
-		return -ENOMEM;
+		INIT_LIST_HEAD(&bm_data->entries);
+		rwlock_init(&bm_data->entries_lock);
 
-	INIT_LIST_HEAD(&bm_data->entries);
-	rwlock_init(&bm_data->entries_lock);
+#ifdef CONFIG_VE
+		ve->binfmt_misc = bm_data;
+		/* this will be cleared by ve_destroy() */
+#endif
+	}
 
 	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
-	if (err) {
-		kfree(bm_data);
+	if (err)
 		return err;
-	}
 
 	sb->s_op = &s_ops;
-
-	ve->binfmt_misc = bm_data;
 	bm_data->enabled = 1;
 
 	return 0;
@@ -956,6 +949,7 @@ static struct file_system_type bm_fs_type = {
 };
 MODULE_ALIAS_FS("binfmt_misc");
 
+#ifdef CONFIG_VE
 static void ve_binfmt_fini(void *data)
 {
 	struct ve_struct *ve = data;
@@ -965,8 +959,17 @@ static void ve_binfmt_fini(void *data)
 		return;
 
 	/*
-	 * XXX: Note we don't take any locks here. This is safe as long as
-	 * nobody uses binfmt_misc outside the owner ve.
+	 * This is called when VE is being destructed, no more processes are
+	 * in VE and thus use of bm_data is unexpected.
+	 *
+	 * Still, there is a possibility for a race, if a host process
+	 * explicitly enters VE's mount namespace and accesses files on
+	 * binfmt_misc mount, while VE is being destructed.
+	 *
+	 * This is extremely unlikely so ignore it for now.
+	 *
+	 * To fix, need to move this to ve_destroy() path that is executed when
+	 * no more references to VE are left.
 	 */
 	while (!list_empty(&bm_data->entries))
 		kill_node(bm_data, list_first_entry(
@@ -978,6 +981,9 @@ static struct ve_hook ve_binfmt_hook = {
 	.priority	= HOOK_PRIO_DEFAULT,
 	.owner		= THIS_MODULE,
 };
+#else
+#define ve_binfmt_hook 0
+#endif
 
 static int __init init_misc_binfmt(void)
 {
-- 
2.30.2



More information about the Devel mailing list