[Devel] [PATCH RH7 v2] proc/mounts: add cursor

Tue Mar 16 18:51:24 MSK 2021

We've discussed with Alex that this approach should be optimal in ratio 
between code maintainability and efficiency. This approach teoretycly 
can be deceived by mount pointer reuse, but probability of it is too 
low. But positive side here is that this approach does not need to alter 
general seq file code directly, unlike the previous idea, so it looks 
less intruzive.

Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

On 3/16/21 3:21 PM, Alexander Mikhalitsyn wrote:
> Patch-set description:
> We see a race when reading mountinfo from criu line-by-line. When we
> read mountinfo line-by-line and some mounts which were already printed
> were umounted before printing next mounts we can see some of next mounts
> skipped from output (consequent cat /proc/<pid>/mountinfo shows them).
> This leads to situations where the mount tree is inconsistend without
> those missing mounts and criu fails.
> 
> So mountinfo printing based on numeric position in mounts list is racy.
> We can switch to new mainstream approach witch adds mountinfo "cursor"
> to mount namespace mounts list so that even if some mounts are deleted
> from the list, we can still continue reading next mounts after "cursor"
> consistently.
> 
> Original patch description:
> From: Miklos Szeredi <mszeredi at redhat.com>
> 
> If mounts are deleted after a read(2) call on /proc/self/mounts (or its
> kin), the subsequent read(2) could miss a mount that comes after the
> deleted one in the list.  This is because the file position is interpreted
> as the number mount entries from the start of the list.
> 
> E.g. first read gets entries #0 to #9; the seq file index will be 10.  Then
> entry #5 is deleted, resulting in #10 becoming #9 and #11 becoming #10,
> etc...  The next read will continue from entry #10, and #9 is missed.
> 
> Solve this by adding a cursor entry for each open instance.  Taking the
> global namespace_sem for write seems excessive, since we are only dealing
> with a per-namespace list.  Instead add a per-namespace spinlock and use
> that together with namespace_sem taken for read to protect against
> concurrent modification of the mount list.  This may reduce parallelism of
> is_local_mountpoint(), but it's hardly a big contention point.  We could
> also use RCU freeing of cursors to make traversal not need additional
> locks, if that turns out to be neceesary.
> 
> Only move the cursor once for each read (cursor is not added on open) to
> minimize cacheline invalidation.  When EOF is reached, the cursor is taken
> off the list, in order to prevent an excessive number of cursors due to
> inactive open file descriptors.
> 
> Reported-by: Karel Zak <kzak at redhat.com>
> Signed-off-by: Miklos Szeredi <mszeredi at redhat.com>
> 
> Changes:
> 1) make mnt_is_cursor, lock_ns_list, unlock_ns_list externally visible
> 2) use it in VZ specific function sync_collect_filesystems to skip
> cursor mounts from walk over mounts of mount namespace.
> 
> Note: This patch can be a bit risky because we add empty "cursor" mount
> entries to the mount namespace list. This way we need to be very careful
> when iterating the list (list_for_each_entry or list_first_entry), we
> need to always skip those "cursor" mounts everywhere where we can access
> it's content, or we can get crash. But there are several places where
> mounts are connected with this mnt_list but not connected to namespace
> and it means that they can't be mixed with "cursor" entries, so
> mainstream kernel just relies on this fact and does not explicitly skip
> "cursor" for them.
> 
> --
> 
> This commit was reverted previously due to
> https://jira.sw.ru/browse/PSBM-126568
> 
> Originally, it introduces mounts iterator implementation which is incompatible
> with old seq_file semantics. Corresponding patch from ms
> 3a0cfe5da871 ("fs/seq_file.c: simplify seq_file iteration code and interface")
> needs to be ported to overcome this issue. But this patch introduces many
> other unpredictable issues. So, it was easier to adopt current patch to make
> it work fine with old semantics.
> 
> https://jira.sw.ru/browse/PSBM-125812
> (cherry-picked from 9f6c61f96f2d97cbb5f7fa85607bc398f843ff0f)
> Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
> Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn at virtuozzo.com>
> ---
>   fs/mount.h            |  18 +++++-
>   fs/namespace.c        | 144 +++++++++++++++++++++++++++++++++++++-----
>   fs/proc_namespace.c   |   6 +-
>   fs/sync.c             |   5 +-
>   include/linux/fs.h    |   3 +
>   include/linux/mount.h |   4 +-
>   6 files changed, 158 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index eab4a7a02a1c..b07b3934b746 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -9,7 +9,13 @@ struct mnt_namespace {
>   	atomic_t		count;
>   	struct ns_common	ns;
>   	struct mount *	root;
> +	/*
> +	 * Traversal and modification of .list is protected by either
> +	 * - taking namespace_sem for write, OR
> +	 * - taking namespace_sem for read AND taking .ns_lock.
> +	 */
>   	struct list_head	list;
> +	spinlock_t		ns_lock;
>   	struct user_namespace	*user_ns;
>   	struct ucounts		*ucounts;
>   	u64			seq;	/* Sequence number to prevent loops */
> @@ -141,15 +147,19 @@ struct proc_mounts {
>   	struct mnt_namespace *ns;
>   	struct path root;
>   	int (*show)(struct seq_file *, struct vfsmount *);
> -	void *cached_mount;
> -	u64 cached_event;
> -	loff_t cached_index;
> +	struct mount cursor;
> +	struct mount *last_mntpos;
> +	loff_t last_pos;
>   };
>   
>   #define proc_mounts(p) (container_of((p), struct proc_mounts, m))
>   
>   extern const struct seq_operations mounts_op;
>   
> +extern inline void lock_ns_list(struct mnt_namespace *ns);
> +extern inline void unlock_ns_list(struct mnt_namespace *ns);
> +extern inline bool mnt_is_cursor(struct mount *mnt);
> +
>   extern bool __is_local_mountpoint(struct dentry *dentry);
>   static inline bool is_local_mountpoint(struct dentry *dentry)
>   {
> @@ -166,3 +176,5 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
>   {
>   	return ns->seq == 0;
>   }
> +
> +extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 2139d3825c22..042ede7abc3b 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -744,6 +744,21 @@ struct vfsmount *lookup_mnt(const struct path *path)
>   	return m;
>   }
>   
> +inline void lock_ns_list(struct mnt_namespace *ns)
> +{
> +	spin_lock(&ns->ns_lock);
> +}
> +
> +inline void unlock_ns_list(struct mnt_namespace *ns)
> +{
> +	spin_unlock(&ns->ns_lock);
> +}
> +
> +inline bool mnt_is_cursor(struct mount *mnt)
> +{
> +	return mnt->mnt.mnt_flags & MNT_CURSOR;
> +}
> +
>   /*
>    * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
>    *                         current mount namespace.
> @@ -769,11 +784,15 @@ bool __is_local_mountpoint(struct dentry *dentry)
>   		goto out;
>   
>   	down_read(&namespace_sem);
> +	lock_ns_list(ns);
>   	list_for_each_entry(mnt, &ns->list, mnt_list) {
> +		if (mnt_is_cursor(mnt))
> +			continue;
>   		is_covered = (mnt->mnt_mountpoint == dentry);
>   		if (is_covered)
>   			break;
>   	}
> +	unlock_ns_list(ns);
>   	up_read(&namespace_sem);
>   out:
>   	return is_covered;
> @@ -1408,47 +1427,125 @@ void replace_mount_options(struct super_block *sb, char *options)
>   }
>   EXPORT_SYMBOL(replace_mount_options);
>   
> +struct mount *mnt_list_next(struct mnt_namespace *ns,
> +			    struct list_head *p)
> +{
> +	struct mount *mnt, *ret = NULL;
> +
> +	lock_ns_list(ns);
> +	list_for_each_continue(p, &ns->list) {
> +		mnt = list_entry(p, typeof(*mnt), mnt_list);
> +		if (!mnt_is_cursor(mnt)) {
> +			ret = mnt;
> +			break;
> +		}
> +	}
> +	unlock_ns_list(ns);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL(mnt_list_next);
> +
>   #ifdef CONFIG_PROC_FS
>   /* iterator; we want it to have access to namespace_sem, thus here... */
>   static void *m_start(struct seq_file *m, loff_t *pos)
>   {
>   	struct proc_mounts *p = proc_mounts(m);
> +	struct list_head *prev;
> +	struct mount *mnt;
>   
>   	down_read(&namespace_sem);
> -	if (p->cached_event == p->ns->event) {
> -		void *v = p->cached_mount;
> -		if (*pos == p->cached_index)
> -			return v;
> -		if (*pos == p->cached_index + 1) {
> -			v = seq_list_next(v, &p->ns->list, &p->cached_index);
> -			return p->cached_mount = v;
> -		}
> +	if (!*pos) {
> +		prev = &p->ns->list;
> +	} else {
> +		prev = &p->cursor.mnt_list;
> +
> +		/* Read after we'd reached the end? */
> +		if (list_empty(prev))
> +			return NULL;
>   	}
>   
> -	p->cached_event = p->ns->event;
> -	p->cached_mount = seq_list_start(&p->ns->list, *pos);
> -	p->cached_index = *pos;
> -	return p->cached_mount;
> +	mnt = mnt_list_next(p->ns, prev);
> +
> +	/*
> +	 * We should be careful here. seq_file iterator
> +	 * semantics was changed in ms kernel since
> +	 * 3a0cfe5da871 ("fs/seq_file.c: simplify seq_file
> +	 *               iteration code and interface")
> +	 *
> +	 * Starting from this patch generic seq_file code
> +	 * not increments position counter of iterator by self
> +	 * (except one place to handle buggy ->next() callbacks).
> +	 * So, now when user buffer which was provided for read()
> +	 * syscall exhausted seq_file code not increments seq_file
> +	 * iterator position, but calls user provided ->next()
> +	 * callback which must increment position AND switch current
> +	 * mount to the next. Call sequence looks like:
> +	 *                   ->next() pos = 1, index = 1
> +	 *                   ->show() pos = 1, index = 1
> +	 *                   ->next() pos = 2, index = 2
> +	 * ->show() <- after that we realized that we should stop because
> +	 *             user provided read() buffer is full
> +	 *             (notice it's not about seq_file overflow!)
> +	 *     OUR KERNEL               |       MS KERNEL
> +	 *          pos = 2, index = 2  | ->next() pos = 3, index = 3
> +	 * ->stop() pos = 2, index = 2  | ->stop() pos = 3, index = 3
> +	 *                      next read() syscall
> +	 * ->start() pos = 2, index = 3 | ->start() pos = 3, index = 3
> +	 * ->show()  pos = 2, index = 3 | ->show() pos = 3, index = 3
> +	 *                   ->next()
> +	 * As we can see in both cases index was incremented. But in
> +	 * MS kernel it was increment by ->next() callback. In our
> +	 * kernel it was incremented manually in seq_file code.
> +	 *
> +	 * We should take that into account in ->start() implementation
> +	 * and detect case when index was changed bypassing ->next() callback.
> +	 * Ultra-important here is to detect case when mount in front of which
> +	 * mount cursor was is unmounted. In this case we should *not* move forward
> +	 * mount position because it is, tecnically, already moved forward :)
> +	 *
> +	 * After we step on kernels with that patch we should
> +	 * follow ms implementation 9f6c61f96f2d ("proc/mounts: add cursor")
> +	 * here and just remove following lines and p->last_pos field.
> +	 */
> +	if (mnt && (*pos == p->last_pos + 1) &&
> +	    !(p->last_mntpos && (p->last_mntpos != mnt)))
> +		mnt = mnt_list_next(p->ns, &mnt->mnt_list);
> +
> +	p->last_mntpos = mnt;
> +
> +	return mnt;
>   }
>   
>   static void *m_next(struct seq_file *m, void *v, loff_t *pos)
>   {
>   	struct proc_mounts *p = proc_mounts(m);
> +	struct mount *mnt = v;
>   
> -	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
> -	p->cached_index = *pos;
> -	return p->cached_mount;
> +	++*pos;
> +	p->last_pos = *pos;
> +	p->last_mntpos = mnt_list_next(p->ns, &mnt->mnt_list);
> +	return p->last_mntpos;
>   }
>   
>   static void m_stop(struct seq_file *m, void *v)
>   {
> +	struct proc_mounts *p = proc_mounts(m);
> +	struct mount *mnt = v;
> +
> +	lock_ns_list(p->ns);
> +	if (mnt)
> +		list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
> +	else
> +		list_del_init(&p->cursor.mnt_list);
> +	unlock_ns_list(p->ns);
>   	up_read(&namespace_sem);
>   }
>   
>   static int m_show(struct seq_file *m, void *v)
>   {
>   	struct proc_mounts *p = proc_mounts(m);
> -	struct mount *r = list_entry(v, struct mount, mnt_list);
> +	struct mount *r = v;
>   	return p->show(m, &r->mnt);
>   }
>   
> @@ -1458,6 +1555,15 @@ const struct seq_operations mounts_op = {
>   	.stop	= m_stop,
>   	.show	= m_show,
>   };
> +
> +void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
> +{
> +	down_read(&namespace_sem);
> +	lock_ns_list(ns);
> +	list_del(&cursor->mnt_list);
> +	unlock_ns_list(ns);
> +	up_read(&namespace_sem);
> +}
>   #endif  /* CONFIG_PROC_FS */
>   
>   /**
> @@ -3474,6 +3580,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
>   	INIT_LIST_HEAD(&new_ns->list);
>   	INIT_LIST_HEAD(&new_ns->mntns_list);
>   	init_waitqueue_head(&new_ns->poll);
> +	spin_lock_init(&new_ns->ns_lock);
>   	new_ns->user_ns = get_user_ns(user_ns);
>   	new_ns->ucounts = ucounts;
>   	return new_ns;
> @@ -3989,10 +4096,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
>   	bool visible = false;
>   
>   	down_read(&namespace_sem);
> +	lock_ns_list(ns);
>   	list_for_each_entry(mnt, &ns->list, mnt_list) {
>   		struct mount *child;
>   		int mnt_flags;
>   
> +		if (mnt_is_cursor(mnt))
> +			continue;
> +
>   		if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
>   			continue;
>   
> @@ -4040,6 +4151,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
>   	next:	;
>   	}
>   found:
> +	unlock_ns_list(ns);
>   	up_read(&namespace_sem);
>   	return visible;
>   }
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 335237350f76..6da491b3861f 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -277,7 +277,10 @@ static int mounts_open_common(struct inode *inode, struct file *file,
>   	p->root = root;
>   	p->m.poll_event = ns->event;
>   	p->show = show;
> -	p->cached_event = ~0ULL;
> +	INIT_LIST_HEAD(&p->cursor.mnt_list);
> +	p->cursor.mnt.mnt_flags = MNT_CURSOR;
> +	p->last_mntpos = NULL;
> +	p->last_pos = 0;
>   
>   	return 0;
>   
> @@ -295,6 +298,7 @@ static int mounts_release(struct inode *inode, struct file *file)
>   {
>   	struct proc_mounts *p = proc_mounts(file->private_data);
>   	path_put(&p->root);
> +	mnt_cursor_del(p->ns, &p->cursor);
>   	put_mnt_ns(p->ns);
>   	return seq_release(inode, file);
>   }
> diff --git a/fs/sync.c b/fs/sync.c
> index 07676a8fcec4..d994e5b5f4ed 100644
> --- a/fs/sync.c
> +++ b/fs/sync.c
> @@ -142,7 +142,8 @@ static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync
>   	BUG_ON(!list_empty(sync_list));
>   
>   	down_read(&namespace_sem);
> -	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
> +	mnt = mnt_list_next(mnt_ns, &mnt_ns->list);
> +	while (mnt) {
>   		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
>   			continue;
>   
> @@ -161,6 +162,8 @@ static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync
>   		ss->sb->s_count++;
>   		spin_unlock(&sb_lock);
>   		list_add_tail(&ss->list, sync_list);
> +
> +		mnt = mnt_list_next(mnt_ns, &mnt->mnt_list);
>   	}
>   	up_read(&namespace_sem);
>   	return ret;
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 521ff1a1f3b0..d225c6cbfebc 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -53,6 +53,7 @@ struct cred;
>   struct swap_info_struct;
>   struct seq_file;
>   struct workqueue_struct;
> +struct mnt_namespace;
>   
>   extern void __init inode_init(void);
>   extern void __init inode_init_early(void);
> @@ -3488,6 +3489,8 @@ extern int file_update_time(struct file *file);
>   extern int generic_show_options(struct seq_file *m, struct dentry *root);
>   extern void save_mount_options(struct super_block *sb, char *options);
>   extern void replace_mount_options(struct super_block *sb, char *options);
> +extern struct mount *mnt_list_next(struct mnt_namespace *ns,
> +				   struct list_head *p);
>   
>   static inline bool io_is_direct(struct file *filp)
>   {
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 8e0352af06b7..21c97ab19b6f 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -48,7 +48,8 @@ struct mnt_namespace;
>   #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
>   
>   #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
> -			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
> +			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \
> +			    MNT_CURSOR)
>   
>   #define MNT_INTERNAL	0x4000
>   
> @@ -62,6 +63,7 @@ struct mnt_namespace;
>   #define MNT_SYNC_UMOUNT		0x2000000
>   #define MNT_MARKED		0x4000000
>   #define MNT_UMOUNT		0x8000000
> +#define MNT_CURSOR		0x10000000
>   
>   struct vfsmount {
>   	struct dentry *mnt_root;	/* root of the mounted tree */
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.