[Devel] [PATCH vz9] fs: support syncing nested mount namespaces in CT

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Mon Nov 1 11:11:50 MSK 2021


Before this patch, with fs.fsync-enable == 2 (filtered sync, the default),
filesystems mounted inside a non-root mount namespace of a CT were not
synced.

This was caused by the 'sync' command in a CT explicitly iterating over
the filesystems of the CT's root mount namespace only.

Replace that with iterating over all superblocks and, for each one,
checking whether it has a mount accessible from the CT.

Tested by:
- temporarily add printk's to show which SBs are synced,
- on host:
    dd if=/dev/zero of=/root/fsimage bs=1M count=128
    mkfs.ext4 /root/fsimage
    losetup /dev/loop0 /root/fsimage
    vzctl set 300 --devnodes loop0:rw
  inside CT, shell 1
    unshare -m /bin/bash
    mount /dev/loop0 /root/mnt
    ls /root/mnt   # ensure that mount is visible (lost+found is there)
  inside CT, shell 2
    ls /root/mnt   # ensure that mount is not visible
    sync
- check logs and ensure that the SB corresponding to blkdev 7:0 was synced.

https://jira.sw.ru/browse/PSBM-44684
Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 fs/internal.h  |  10 +++
 fs/namespace.c |  17 +++++
 fs/sync.c      | 193 ++++++++++++-------------------------------------
 3 files changed, 72 insertions(+), 148 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 82e8eb32ff3d..257e51814d2c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,6 +96,16 @@ int path_mount(const char *dev_name, struct path *path,
 		const char *type_page, unsigned long flags, void *data_page);
 int path_umount(struct path *path, int flags);
 
+#ifdef CONFIG_VE
+extern bool is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb);
+#else
+static inline bool is_sb_ve_accessible(struct ve_struct *ve,
+		struct super_block *sb)
+{
+	return true;
+}
+#endif
+
 /*
  * fs_struct.c
  */
diff --git a/fs/namespace.c b/fs/namespace.c
index dc73e945b746..4a57b4f69fa9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2832,6 +2832,23 @@ static inline void ve_mount_nr_dec(struct mount *mnt)
 	mnt->ve_owner = NULL;
 }
 
+bool is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
+{
+	struct mount *mnt;
+	bool ret = false;
+
+	lock_mount_hash();
+	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
+		if (mnt->ve_owner == ve) {
+			ret = true;
+			break;
+		}
+	}
+	unlock_mount_hash();
+
+	return ret;
+}
+
 #else /* CONFIG_VE */
 
 static inline int ve_mount_allowed(void) { return 1; }
diff --git a/fs/sync.c b/fs/sync.c
index 1c78756d4749..1e6f65a40190 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -71,17 +71,38 @@ int sync_filesystem(struct super_block *sb)
 }
 EXPORT_SYMBOL(sync_filesystem);
 
+struct sync_arg {
+	struct ve_struct *ve;
+	int wait;
+};
+
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
+	struct sync_arg *sarg = arg;
+
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
+
 	if (!sb_rdonly(sb))
 		sync_inodes_sb(sb);
 }
 
 static void sync_fs_one_sb(struct super_block *sb, void *arg)
 {
+	struct sync_arg *sarg = arg;
+
+	if (sarg->ve && !is_sb_ve_accessible(sarg->ve, sb))
+		return;
+
 	if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
 	    sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, *(int *)arg);
+		sb->s_op->sync_fs(sb, sarg->wait);
+
+	/* For ve-local sync, process bdev here, since there is no easy
+	 * equivalent of is_sb_ve_accessible() for bdevs
+	 */
+	if (sarg->ve)
+		__sync_blockdev(sb->s_bdev, sarg->wait);
 }
 
 static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
@@ -99,134 +120,6 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
-struct sync_sb {
-	struct list_head list;
-	struct super_block *sb;
-};
-
-static void sync_release_filesystems(struct list_head *sync_list)
-{
-	struct sync_sb *ss, *tmp;
-
-	list_for_each_entry_safe(ss, tmp, sync_list, list) {
-		list_del(&ss->list);
-		put_super(ss->sb);
-		kfree(ss);
-	}
-}
-
-static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
-{
-	struct sync_sb *ss;
-
-	list_for_each_entry(ss, sync_list, list)
-		if (ss->sb == sb)
-			return 1;
-	return 0;
-}
-
-static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
-{
-	struct mount *mnt;
-	struct mnt_namespace *mnt_ns;
-	struct nsproxy *ve_ns;
-	struct sync_sb *ss;
-	int ret = 0;
-
-	BUG_ON(!list_empty(sync_list));
-
-	down_read(&namespace_sem);
-
-	rcu_read_lock();
-	ve_ns = rcu_dereference(ve->ve_ns);
-	if (!ve_ns) {
-		rcu_read_unlock();
-		up_read(&namespace_sem);
-		return 0;
-	}
-	mnt_ns = ve_ns->mnt_ns;
-	rcu_read_unlock();
-
-	mnt = mnt_list_next(mnt_ns, &mnt_ns->list);
-	while (mnt) {
-		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
-			goto next;
-
-		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
-		if (ss == NULL) {
-			ret = -ENOMEM;
-			break;
-		}
-		ss->sb = mnt->mnt.mnt_sb;
-		/*
-		 * We hold mount point and thus can be sure, that superblock is
-		 * alive. And it means, that we can safely increase it's usage
-		 * counter.
-		 */
-		spin_lock(&sb_lock);
-		ss->sb->s_count++;
-		spin_unlock(&sb_lock);
-		list_add_tail(&ss->list, sync_list);
-next:
-		mnt = mnt_list_next(mnt_ns, &mnt->mnt_list);
-	}
-	up_read(&namespace_sem);
-	return ret;
-}
-
-static void sync_filesystems_ve(struct ve_struct *ve, int wait)
-{
-	struct super_block *sb;
-	LIST_HEAD(sync_list);
-	struct sync_sb *ss;
-
-	/*
-	 * We don't need to care about allocating failure here. At least we
-	 * don't need to skip sync on such error.
-	 * Let's sync what we collected already instead.
-	 */
-	sync_collect_filesystems(ve, &sync_list);
-
-	list_for_each_entry(ss, &sync_list, list) {
-		sb = ss->sb;
-		down_read(&sb->s_umount);
-		if (!sb_rdonly(sb) && sb->s_root && (sb->s_flags & SB_BORN))
-			__sync_filesystem(sb, wait);
-		up_read(&sb->s_umount);
-	}
-
-	sync_release_filesystems(&sync_list);
-}
-
-static int is_sb_ve_accessible(struct ve_struct *ve, struct super_block *sb)
-{
-	struct mount *mnt;
-	struct mnt_namespace *mnt_ns;
-	struct nsproxy *ve_ns;
-	int ret = 0;
-
-	down_read(&namespace_sem);
-
-	rcu_read_lock();
-	ve_ns = rcu_dereference(ve->ve_ns);
-	if (!ve_ns) {
-		rcu_read_unlock();
-		up_read(&namespace_sem);
-		return 0;
-	}
-	mnt_ns = ve_ns->mnt_ns;
-	rcu_read_unlock();
-
-	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
-		if (mnt->mnt.mnt_sb == sb) {
-			ret = 1;
-			break;
-		}
-	}
-	up_read(&namespace_sem);
-	return ret;
-}
-
 static int __ve_fsync_behavior(struct ve_struct *ve)
 {
 	/*
@@ -266,8 +159,9 @@ int ve_fsync_behavior(void)
 void ksys_sync(void)
 {
 	struct ve_struct *ve = get_exec_env();
-	int nowait = 0, wait = 1;
+	struct sync_arg sarg;
 
+	sarg.ve = NULL;
 	if (!ve_is_super(ve)) {
 		int fsb;
 		/*
@@ -283,22 +177,22 @@ void ksys_sync(void)
 		fsb = __ve_fsync_behavior(ve);
 		if (fsb == FSYNC_NEVER)
 			return;
-
-		if (fsb == FSYNC_FILTERED) {
-			sync_filesystems_ve(ve, nowait);
-			sync_filesystems_ve(ve, wait);
-			return;
-		}
+		if (fsb == FSYNC_FILTERED)
+			sarg.ve = ve;
 	}
 
 	wakeup_flusher_threads(WB_REASON_SYNC);
-	iterate_supers(sync_inodes_one_sb, NULL);
-	iterate_supers(sync_fs_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &wait);
-	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_bdevs(fdatawait_one_bdev, NULL);
-	if (unlikely(laptop_mode))
-		laptop_sync_completion();
+	iterate_supers(sync_inodes_one_sb, &sarg);
+	sarg.wait = 0;
+	iterate_supers(sync_fs_one_sb, &sarg);
+	sarg.wait = 1;
+	iterate_supers(sync_fs_one_sb, &sarg);
+	if (!sarg.ve) {
+		iterate_bdevs(fdatawrite_one_bdev, NULL);
+		iterate_bdevs(fdatawait_one_bdev, NULL);
+		if (unlikely(laptop_mode))
+			laptop_sync_completion();
+	}
 }
 
 SYSCALL_DEFINE0(sync)
@@ -309,17 +203,20 @@ SYSCALL_DEFINE0(sync)
 
 static void do_sync_work(struct work_struct *work)
 {
-	int nowait = 0;
+	struct sync_arg sarg;
+
+	sarg.ve = NULL;
+	sarg.wait = 0;
 
 	/*
 	 * Sync twice to reduce the possibility we skipped some inodes / pages
 	 * because they were temporarily locked
 	 */
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &sarg);
+	iterate_supers(sync_fs_one_sb, &sarg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
-	iterate_supers(sync_inodes_one_sb, &nowait);
-	iterate_supers(sync_fs_one_sb, &nowait);
+	iterate_supers(sync_inodes_one_sb, &sarg);
+	iterate_supers(sync_fs_one_sb, &sarg);
 	iterate_bdevs(fdatawrite_one_bdev, NULL);
 	printk("Emergency Sync complete\n");
 	kfree(work);
-- 
2.30.2



More information about the Devel mailing list