[Devel] [PATCH RHEL7 COMMIT] ms/move_mount: allow to add a mount into an existing group

Konstantin Khorenko khorenko at virtuozzo.com
Thu Apr 20 21:00:48 MSK 2023


The commit is pushed to "branch-rh7-3.10.0-1160.88.1.vz7.195.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.88.1.vz7.195.2
------>
commit fbd91291b0e7c1999ee0218ad7dc24326c1a0027
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Thu Apr 13 18:47:12 2023 +0800

    ms/move_mount: allow to add a mount into an existing group
    
    Previously a sharing group (shared and master ids pair) could only be
    inherited when a mount was created via bind mount. This patch adds the
    ability to add an existing private mount into an existing sharing group.
    
    With this functionality one can first create the desired mount tree from
    only private mounts (without the need to care about undesired mount
    propagation or mount creation order implied by sharing group
    dependencies), and then set up any desired mount sharing between
    those mounts in tree as needed.
    
    This allows CRIU to restore any set of mount namespaces, mount trees and
    sharing group trees for a container.
    
    We have many issues with restoring mounts in CRIU related to sharing
    groups and propagation:
    - reverse sharing groups vs mount tree order requires complex mounts
      reordering which mostly implies also using some temporary mounts
    (please see https://lkml.org/lkml/2021/3/23/569 for more info)
    
    - mount() syscall creates tons of mounts due to propagation
    - mount re-parenting due to propagation
    - "Mount Trap" due to propagation
    - "Non Uniform" propagation, meaning that with different tricks with
      mount order and temporary children-"lock" mounts one can create mount
      trees which can't be restored without those tricks
    (see https://www.linuxplumbersconf.org/event/7/contributions/640/)
    
    With this new functionality we can resolve all the problems with
    propagation at once.
    
    Link: https://lore.kernel.org/r/20210715100714.120228-1-ptikhomirov@virtuozzo.com
    Cc: Eric W. Biederman <ebiederm at xmission.com>
    Cc: Alexander Viro <viro at zeniv.linux.org.uk>
    Cc: Christian Brauner <christian.brauner at ubuntu.com>
    Cc: Mattias Nissler <mnissler at chromium.org>
    Cc: Aleksa Sarai <cyphar at cyphar.com>
    Cc: Andrei Vagin <avagin at gmail.com>
    Cc: linux-fsdevel at vger.kernel.org
    Cc: linux-api at vger.kernel.org
    Cc: lkml <linux-kernel at vger.kernel.org>
    Co-developed-by: Andrei Vagin <avagin at gmail.com>
    Acked-by: Christian Brauner <christian.brauner at ubuntu.com>
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    Signed-off-by: Andrei Vagin <avagin at gmail.com>
    
    Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
    
    https://jira.sw.ru/browse/PSBM-144416
    (cherry picked from commit 9ffb14ef61bab83fa818736bf3e7e6b6e182e8e2)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    =================
    Patchset description:
    mount: Port move_mount_set_group and mount_setattr
    
    We need this as in Virtuozzo criu after rebase to mainstream criu in u20
    we will switch to this new API for sharing group setting across mounts.
    
    https://jira.vzint.dev/browse/PSBM-144416
---
 fs/namespace.c          | 77 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/uapi/linux/fs.h |  3 +-
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 94f1e308b354..d10138869c91 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3017,6 +3017,78 @@ static bool check_for_nsfs_mounts(struct mount *subtree)
 	return ret;
 }
 
+static int do_set_group(struct path *from_path, struct path *to_path)
+{
+	struct mount *from, *to;
+	int err;
+
+	from = real_mount(from_path->mnt);
+	to = real_mount(to_path->mnt);
+
+	namespace_lock();
+
+	err = -EINVAL;
+	/* To and From must be mounted */
+	if (!is_mounted(&from->mnt))
+		goto out;
+	if (!is_mounted(&to->mnt))
+		goto out;
+
+	err = -EPERM;
+	/* We should be allowed to modify mount namespaces of both mounts */
+	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out;
+	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+		goto out;
+
+	err = -EINVAL;
+	/* To and From paths should be mount roots */
+	if (from_path->dentry != from_path->mnt->mnt_root)
+		goto out;
+	if (to_path->dentry != to_path->mnt->mnt_root)
+		goto out;
+
+	/* Setting sharing groups is only allowed across same superblock */
+	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
+		goto out;
+
+	/* From mount root should be wider than To mount root */
+	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
+		goto out;
+
+	/* From mount should not have locked children in place of To's root */
+	if (has_locked_children(from, to->mnt.mnt_root))
+		goto out;
+
+	/* Setting sharing groups is only allowed on private mounts */
+	if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
+		goto out;
+
+	/* From should not be private */
+	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
+		goto out;
+
+	if (IS_MNT_SLAVE(from)) {
+		/* Link To right next to From on its master's slave list so
+		 * that From's peer group stays contiguous there */
+		list_add(&to->mnt_slave, &from->mnt_slave);
+		to->mnt_master = from->mnt_master;
+	}
+
+	if (IS_MNT_SHARED(from)) {
+		to->mnt_group_id = from->mnt_group_id;
+		list_add(&to->mnt_share, &from->mnt_share);
+		lock_mount_hash();
+		set_mnt_shared(to);
+		unlock_mount_hash();
+	}
+
+	err = 0;
+out:
+	namespace_unlock();
+	return err;
+}
+
 static int do_move_mount(struct path *old_path, struct path *new_path)
 {
 	struct path parent_path = {.mnt = NULL, .dentry = NULL};
@@ -3805,7 +3877,10 @@ SYSCALL_DEFINE5(move_mount,
 	if (ret < 0)
 		goto out_to;
 
-	ret = do_move_mount(&from_path, &to_path);
+	if (flags & MOVE_MOUNT_SET_GROUP)
+		ret = do_set_group(&from_path, &to_path);
+	else
+		ret = do_move_mount(&from_path, &to_path);
 
 out_to:
 	path_put(&to_path);
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 8c9e6a255341..7d911e984b3e 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -140,7 +140,8 @@ struct inodes_stat_t {
 #define MOVE_MOUNT_T_SYMLINKS		0x00000010 /* Follow symlinks on to path */
 #define MOVE_MOUNT_T_AUTOMOUNTS		0x00000020 /* Follow automounts on to path */
 #define MOVE_MOUNT_T_EMPTY_PATH		0x00000040 /* Empty to path permitted */
-#define MOVE_MOUNT__MASK		0x00000077
+#define MOVE_MOUNT_SET_GROUP		0x00000100 /* Set sharing group instead */
+#define MOVE_MOUNT__MASK		0x00000177
 
 /* the read-only stuff doesn't really belong here, but any other place is
    probably as bad and I don't want to create yet another include file. */


More information about the Devel mailing list