[Devel] [PATCH RHEL8 COMMIT] ms/proc/mounts: add cursor

Konstantin Khorenko khorenko at virtuozzo.com
Mon Jun 21 20:56:49 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.47
------>
commit 2b5afb5891b8ad85cce0f64fe93ac3645333a767
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Mon Jun 21 20:56:49 2021 +0300

    ms/proc/mounts: add cursor
    
    Patch-set description:
    We see a race when reading mountinfo from criu line-by-line. When we
    read mountinfo line-by-line and some mounts which were already printed
    were umounted before printing next mounts we can see some of next mounts
    skipped from output (consequent cat /proc/<pid>/mountinfo shows them).
    This leads to situations where the mount tree is inconsistend without
    those missing mounts and criu fails.
    
    So mountinfo printing based on numeric position in mounts list is racy.
    We can switch to new mainstream approach witch adds mountinfo "cursor"
    to mount namespace mounts list so that even if some mounts are deleted
    from the list, we can still continue reading next mounts after "cursor"
    consistently.
    
    Original patch description:
    From: Miklos Szeredi <mszeredi at redhat.com>
    
    If mounts are deleted after a read(2) call on /proc/self/mounts (or its
    kin), the subsequent read(2) could miss a mount that comes after the
    deleted one in the list.  This is because the file position is interpreted
    as the number mount entries from the start of the list.
    
    E.g. first read gets entries #0 to #9; the seq file index will be 10.  Then
    entry #5 is deleted, resulting in #10 becoming #9 and #11 becoming #10,
    etc...  The next read will continue from entry #10, and #9 is missed.
    
    Solve this by adding a cursor entry for each open instance.  Taking the
    global namespace_sem for write seems excessive, since we are only dealing
    with a per-namespace list.  Instead add a per-namespace spinlock and use
    that together with namespace_sem taken for read to protect against
    concurrent modification of the mount list.  This may reduce parallelism of
    is_local_mountpoint(), but it's hardly a big contention point.  We could
    also use RCU freeing of cursors to make traversal not need additional
    locks, if that turns out to be neceesary.
    
    Only move the cursor once for each read (cursor is not added on open) to
    minimize cacheline invalidation.  When EOF is reached, the cursor is taken
    off the list, in order to prevent an excessive number of cursors due to
    inactive open file descriptors.
    
    Reported-by: Karel Zak <kzak at redhat.com>
    Signed-off-by: Miklos Szeredi <mszeredi at redhat.com>
    
    Changes:
    1) make mnt_is_cursor, lock_ns_list, unlock_ns_list externally visible
    2) use it in VZ specific function sync_collect_filesystems to skip
    cursor mounts from walk over mounts of mount namespace.
    
    Note: This patch can be a bit risky because we add empty "cursor" mount
    entries to the mount namespace list. This way we need to be very careful
    when iterating the list (list_for_each_entry or list_first_entry), we
    need to always skip those "cursor" mounts everywhere where we can access
    it's content, or we can get crash. But there are several places where
    mounts are connected with this mnt_list but not connected to namespace
    and it means that they can't be mixed with "cursor" entries, so
    mainstream kernel just relies on this fact and does not explicitly skip
    "cursor" for them.
    
    https://jira.sw.ru/browse/PSBM-125812
    (cherry-picked from ms commit 9f6c61f96f2d97cbb5f7fa85607bc398f843ff0f)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    (cherry-picked from vz7 commit da00800786dd ("proc/mounts: add cursor"))
    
    https://jira.sw.ru/browse/PSBM-127858
    Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 fs/mount.h            | 14 ++++++--
 fs/namespace.c        | 92 +++++++++++++++++++++++++++++++++++++++++----------
 fs/proc_namespace.c   |  4 ++-
 fs/sync.c             |  5 +++
 include/linux/mount.h |  4 ++-
 5 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/fs/mount.h b/fs/mount.h
index 9225fc7bce30..66b863fa3db0 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -9,7 +9,13 @@ struct mnt_namespace {
 	atomic_t		count;
 	struct ns_common	ns;
 	struct mount *	root;
+	/*
+	 * Traversal and modification of .list is protected by either
+	 * - taking namespace_sem for write, OR
+	 * - taking namespace_sem for read AND taking .ns_lock.
+	 */
 	struct list_head	list;
+	spinlock_t              ns_lock;
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
@@ -137,12 +143,13 @@ struct proc_mounts {
 	struct mnt_namespace *ns;
 	struct path root;
 	int (*show)(struct seq_file *, struct vfsmount *);
-	void *cached_mount;
-	u64 cached_event;
-	loff_t cached_index;
+	struct mount cursor;
 };
 
 extern const struct seq_operations mounts_op;
+extern inline void lock_ns_list(struct mnt_namespace *ns);
+extern inline void unlock_ns_list(struct mnt_namespace *ns);
+extern inline bool mnt_is_cursor(struct mount *mnt);
 
 extern bool __is_local_mountpoint(struct dentry *dentry);
 static inline bool is_local_mountpoint(struct dentry *dentry)
@@ -157,3 +164,4 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
 {
 	return ns->seq == 0;
 }
+extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/namespace.c b/fs/namespace.c
index 0096fbb71774..75aa3ae9585e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -671,6 +671,21 @@ struct vfsmount *lookup_mnt(const struct path *path)
 	return m;
 }
 
+inline void lock_ns_list(struct mnt_namespace *ns)
+{
+	spin_lock(&ns->ns_lock);
+}
+
+inline void unlock_ns_list(struct mnt_namespace *ns)
+{
+	spin_unlock(&ns->ns_lock);
+}
+
+inline bool mnt_is_cursor(struct mount *mnt)
+{
+	return mnt->mnt.mnt_flags & MNT_CURSOR;
+}
+
 /*
  * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
  *                         current mount namespace.
@@ -696,11 +711,15 @@ bool __is_local_mountpoint(struct dentry *dentry)
 		goto out;
 
 	down_read(&namespace_sem);
+	lock_ns_list(ns);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		if (mnt_is_cursor(mnt))
+			continue;
 		is_covered = (mnt->mnt_mountpoint == dentry);
 		if (is_covered)
 			break;
 	}
+	unlock_ns_list(ns);
 	up_read(&namespace_sem);
 out:
 	return is_covered;
@@ -1278,46 +1297,70 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
 }
 
 #ifdef CONFIG_PROC_FS
+static struct mount *mnt_list_next(struct mnt_namespace *ns,
+				   struct list_head *p)
+{
+	struct mount *mnt, *ret = NULL;
+
+	lock_ns_list(ns);
+	list_for_each_continue(p, &ns->list) {
+		mnt = list_entry(p, typeof(*mnt), mnt_list);
+		if (!mnt_is_cursor(mnt)) {
+			ret = mnt;
+			break;
+		}
+	}
+	unlock_ns_list(ns);
+
+	return ret;
+}
+
 /* iterator; we want it to have access to namespace_sem, thus here... */
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
 	struct proc_mounts *p = m->private;
+	struct list_head *prev;
 
 	down_read(&namespace_sem);
-	if (p->cached_event == p->ns->event) {
-		void *v = p->cached_mount;
-		if (*pos == p->cached_index)
-			return v;
-		if (*pos == p->cached_index + 1) {
-			v = seq_list_next(v, &p->ns->list, &p->cached_index);
-			return p->cached_mount = v;
-		}
-	}
+	if (!*pos) {
+		prev = &p->ns->list;
+	} else {
+		prev = &p->cursor.mnt_list;
 
-	p->cached_event = p->ns->event;
-	p->cached_mount = seq_list_start(&p->ns->list, *pos);
-	p->cached_index = *pos;
-	return p->cached_mount;
+		/* Read after we'd reached the end? */
+		if (list_empty(prev))
+			return NULL;
+	}
+	return mnt_list_next(p->ns, prev);
 }
 
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct proc_mounts *p = m->private;
+	struct mount *mnt = v;
 
-	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
-	p->cached_index = *pos;
-	return p->cached_mount;
+	++*pos;
+	return mnt_list_next(p->ns, &mnt->mnt_list);
 }
 
 static void m_stop(struct seq_file *m, void *v)
 {
+	struct proc_mounts *p = m->private;
+	struct mount *mnt = v;
+
+	lock_ns_list(p->ns);
+	if (mnt)
+		list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
+	else
+		list_del_init(&p->cursor.mnt_list);
+	unlock_ns_list(p->ns);
 	up_read(&namespace_sem);
 }
 
 static int m_show(struct seq_file *m, void *v)
 {
 	struct proc_mounts *p = m->private;
-	struct mount *r = list_entry(v, struct mount, mnt_list);
+	struct mount *r = v;
 	return p->show(m, &r->mnt);
 }
 
@@ -1327,6 +1370,15 @@ const struct seq_operations mounts_op = {
 	.stop	= m_stop,
 	.show	= m_show,
 };
+
+void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
+{
+	down_read(&namespace_sem);
+	lock_ns_list(ns);
+	list_del(&cursor->mnt_list);
+	unlock_ns_list(ns);
+	up_read(&namespace_sem);
+}
 #endif  /* CONFIG_PROC_FS */
 
 /**
@@ -3309,6 +3361,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 	INIT_LIST_HEAD(&new_ns->list);
 	INIT_LIST_HEAD(&new_ns->mntns_list);
 	init_waitqueue_head(&new_ns->poll);
+	spin_lock_init(&new_ns->ns_lock);
 	new_ns->user_ns = get_user_ns(user_ns);
 	new_ns->ucounts = ucounts;
 	return new_ns;
@@ -3821,10 +3874,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 	bool visible = false;
 
 	down_read(&namespace_sem);
+	lock_ns_list(ns);
 	list_for_each_entry(mnt, &ns->list, mnt_list) {
 		struct mount *child;
 		int mnt_flags;
 
+		if (mnt_is_cursor(mnt))
+			continue;
+
 		if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
 			continue;
 
@@ -3872,6 +3929,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
 	next:	;
 	}
 found:
+	unlock_ns_list(ns);
 	up_read(&namespace_sem);
 	return visible;
 }
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e16fb8f2049e..969f9c8fbdc0 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -279,7 +279,8 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	p->ns = ns;
 	p->root = root;
 	p->show = show;
-	p->cached_event = ~0ULL;
+	INIT_LIST_HEAD(&p->cursor.mnt_list);
+	p->cursor.mnt.mnt_flags = MNT_CURSOR;
 
 	return 0;
 
@@ -296,6 +297,7 @@ static int mounts_release(struct inode *inode, struct file *file)
 	struct seq_file *m = file->private_data;
 	struct proc_mounts *p = m->private;
 	path_put(&p->root);
+	mnt_cursor_del(p->ns, &p->cursor);
 	put_mnt_ns(p->ns);
 	return seq_release_private(inode, file);
 }
diff --git a/fs/sync.c b/fs/sync.c
index ef4b1d17fe5a..75a1e019ee7c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -134,7 +134,11 @@ static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync
 	BUG_ON(!list_empty(sync_list));
 
 	down_read(&namespace_sem);
+	lock_ns_list(mnt_ns);
 	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (mnt_is_cursor(mnt))
+			continue;
+
 		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
 			continue;
 
@@ -154,6 +158,7 @@ static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync
 		spin_unlock(&sb_lock);
 		list_add_tail(&ss->list, sync_list);
 	}
+	unlock_ns_list(mnt_ns);
 	up_read(&namespace_sem);
 	return ret;
 }
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 45b1f56c6c2f..5316b60e9028 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -49,7 +49,8 @@ struct mnt_namespace;
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
+			    MNT_CURSOR)
 
 #define MNT_INTERNAL	0x4000
 
@@ -63,6 +64,7 @@ struct mnt_namespace;
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
 #define MNT_UMOUNT		0x8000000
+#define MNT_CURSOR		0x10000000
 
 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */


More information about the Devel mailing list