[Devel] [PATCH RHEL8 COMMIT] binfmt_misc: fix mount after umount in CT

Konstantin Khorenko khorenko at virtuozzo.com
Tue Oct 26 19:50:23 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-305.3.1.vz8.7.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-305.3.1.vz8.7.16
------>
commit 98fc9664aceb4fd8bc9a192e3c3a9bd0d488867f
Author: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
Date:   Tue Oct 26 19:50:22 2021 +0300

    binfmt_misc: fix mount after umount in CT
    
    The assumption that bm_fill_super() is not called for the second time
    for CT is wrong: umount operation clears sb->s_root, which causes
    vfs_get_super() to call fill_super again on the next mount.
    
    Make bm_fill_super() handle multiple calls correctly:
    - initialize bm_data and set ve->binfmt_misc only if it is not done
      before,
    - delay its destruction until CT destruction.
    
    https://jira.sw.ru/browse/PSBM-133968
    Fixes: 8250ff41d190 ("ve/fs/binfmt: clean bm_data reference from ve on err path")
    
    Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/binfmt_misc.c | 58 +++++++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 0946e7e6caa5..5a3de9fea265 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -852,39 +852,32 @@ static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
 		/* last one */ {""}
 	};
 
+#ifdef CONFIG_VE
 	struct ve_struct *ve = get_exec_env();
-	struct binfmt_misc *bm_data;
+	struct binfmt_misc *bm_data = ve->binfmt_misc;
+#else
+	static struct binfmt_misc *bm_data = NULL;
+#endif
 
-	/*
-	 * bm_get_tree()
-	 *  get_tree_keyed(fc, bm_fill_super, get_ve(ve))
-	 *   fc->s_fs_info = current VE
-	 *   vfs_get_super(fc, vfs_get_keyed_super, bm_fill_super)
-	 *    sb = sget_fc(fc, test, set_anon_super_fc)
-	 *    if (!sb->s_root) {
-	 *		err = bm_fill_super(sb, fc);
-	 *
-	 * => we should never get here with initialized ve->binfmt_misc.
-	 */
-	if (WARN_ON_ONCE(ve->binfmt_misc))
-		return -EEXIST;
+	if (!bm_data) {
+		bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!bm_data)
+			return -ENOMEM;
 
-	bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
-	if (!bm_data)
-		return -ENOMEM;
+		INIT_LIST_HEAD(&bm_data->entries);
+		rwlock_init(&bm_data->entries_lock);
 
-	INIT_LIST_HEAD(&bm_data->entries);
-	rwlock_init(&bm_data->entries_lock);
+#ifdef CONFIG_VE
+		ve->binfmt_misc = bm_data;
+		/* this will be cleared by ve_destroy() */
+#endif
+	}
 
 	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
-	if (err) {
-		kfree(bm_data);
+	if (err)
 		return err;
-	}
 
 	sb->s_op = &s_ops;
-
-	ve->binfmt_misc = bm_data;
 	bm_data->enabled = 1;
 
 	return 0;
@@ -956,6 +949,7 @@ static struct file_system_type bm_fs_type = {
 };
 MODULE_ALIAS_FS("binfmt_misc");
 
+#ifdef CONFIG_VE
 static void ve_binfmt_fini(void *data)
 {
 	struct ve_struct *ve = data;
@@ -965,8 +959,17 @@ static void ve_binfmt_fini(void *data)
 		return;
 
 	/*
-	 * XXX: Note we don't take any locks here. This is safe as long as
-	 * nobody uses binfmt_misc outside the owner ve.
+	 * This is called when VE is being destructed, no more processes are
+	 * in VE and thus use of bm_data is unexpected.
+	 *
+	 * Still, there is a possibility for a race, if a host process
+	 * explicitly enters VE's mount namespace and accesses files on
+	 * binfmt_misc mount, while VE is being destructed.
+	 *
+	 * This is extremely unlikely so ignore it for now.
+	 *
+	 * To fix, need to move this to ve_destroy() path that is executed when
+	 * no more references to VE are left.
 	 */
 	while (!list_empty(&bm_data->entries))
 		kill_node(bm_data, list_first_entry(
@@ -978,6 +981,9 @@ static struct ve_hook ve_binfmt_hook = {
 	.priority	= HOOK_PRIO_DEFAULT,
 	.owner		= THIS_MODULE,
 };
+#else
+#define ve_binfmt_hook 0
+#endif
 
 static int __init init_misc_binfmt(void)
 {


More information about the Devel mailing list