[Devel] [PATCH RHEL7 COMMIT] ve/fs/sync: per container sync and syncfs

Konstantin Khorenko khorenko at virtuozzo.com
Sat Feb 6 05:04:49 PST 2016


The commit is pushed to "branch-rh7-3.10.0-327.3.1-vz7.10.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.3.1.vz7.10.10
------>
commit 9aac0de533e3a0801aeec40fc53e7fbc79dfbedc
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date:   Sat Feb 6 17:04:49 2016 +0400

    ve/fs/sync: per container sync and syncfs
    
    The implementation is straightforward, since we already have per-container
    writeback. Sync just gets the user beancounter and launches writeback work
    for it.
    
    https://jira.sw.ru/browse/PSBM-39583
    
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    Reviewed-by: Vladimir Davydov <vdavydov at virtuozzo.com>
---
 fs/fs-writeback.c         | 51 +++++++++++++++++++++++++++++++++++++++--------
 fs/mount.h                |  3 +++
 fs/namespace.c            |  4 ++--
 fs/super.c                |  2 +-
 fs/sync.c                 | 30 ++++++++++++++++++----------
 include/linux/fs.h        |  1 +
 include/linux/writeback.h |  5 +++++
 7 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 16bc6b3..b6b33d0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -132,7 +132,8 @@ out_unlock:
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, enum wb_reason reason)
+			struct user_beancounter *ub, bool range_cyclic,
+			enum wb_reason reason)
 {
 	struct wb_writeback_work *work;
 
@@ -151,6 +152,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
 	work->reason	= reason;
+	work->ub	= ub;
 
 	bdi_queue_work(bdi, work);
 }
@@ -170,7 +172,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, reason);
+	__bdi_start_writeback(bdi, nr_pages, NULL, true, reason);
 }
 
 /**
@@ -1149,7 +1151,8 @@ void bdi_writeback_workfn(struct work_struct *work)
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
  */
-void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
+			enum wb_reason reason)
 {
 	struct backing_dev_info *bdi;
 
@@ -1160,11 +1163,16 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, reason);
+		__bdi_start_writeback(bdi, nr_pages, ub, false, reason);
 	}
 	rcu_read_unlock();
 }
 
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+{
+	wakeup_flusher_threads_ub(nr_pages, NULL, reason);
+}
+
 /*
  * Wake up bdi's periodically to make sure dirtytime inodes gets
  * written back periodically.  We deliberately do *not* check the
@@ -1375,7 +1383,7 @@ out_unlock_inode:
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-static void wait_sb_inodes(struct super_block *sb)
+static void wait_sb_inodes(struct super_block *sb, struct user_beancounter *ub)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1403,6 +1411,11 @@ static void wait_sb_inodes(struct super_block *sb)
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
+		if (ub && (mapping->dirtied_ub != ub)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_sb_list_lock);
@@ -1438,7 +1451,8 @@ static void wait_sb_inodes(struct super_block *sb)
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO.
  */
-void writeback_inodes_sb_nr(struct super_block *sb,
+static void writeback_inodes_sb_ub_nr(struct super_block *sb,
+			    struct user_beancounter *ub,
 			    unsigned long nr,
 			    enum wb_reason reason)
 {
@@ -1450,6 +1464,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 		.done			= &done,
 		.nr_pages		= nr,
 		.reason			= reason,
+		.ub			= ub,
 	};
 
 	if (sb->s_bdi == &noop_backing_dev_info)
@@ -1458,8 +1473,22 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 }
+
+void writeback_inodes_sb_nr(struct super_block *sb,
+			    unsigned long nr,
+			    enum wb_reason reason)
+{
+
+	writeback_inodes_sb_ub_nr(sb, NULL, nr, reason);
+}
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
+void writeback_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub,
+			enum wb_reason reason)
+{
+	writeback_inodes_sb_ub_nr(sb, ub, get_nr_dirty_pages(), reason);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1521,7 +1550,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
@@ -1532,6 +1561,7 @@ void sync_inodes_sb(struct super_block *sb)
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
+		.ub		= ub,
 	};
 
 	/* Nothing to do? */
@@ -1542,7 +1572,12 @@ void sync_inodes_sb(struct super_block *sb)
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 
-	wait_sb_inodes(sb);
+	wait_sb_inodes(sb, ub);
+}
+
+void sync_inodes_sb(struct super_block *sb)
+{
+	sync_inodes_sb_ub(sb, NULL);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
 
diff --git a/fs/mount.h b/fs/mount.h
index 3e16b57..b496064 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -70,6 +70,9 @@ static inline int is_mounted(struct vfsmount *mnt)
 	return !IS_ERR_OR_NULL(real_mount(mnt));
 }
 
+extern struct rw_semaphore namespace_sem;
+extern struct mount *next_mnt(struct mount *p, struct mount *root);
+
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 
 static inline void get_mnt_ns(struct mnt_namespace *ns)
diff --git a/fs/namespace.c b/fs/namespace.c
index 62a12b7..fa9ee9e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -40,7 +40,7 @@ static int mnt_group_start = 1;
 static struct list_head mount_hashtable[HASH_SIZE];
 static struct list_head mountpoint_hashtable[HASH_SIZE];
 static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+struct rw_semaphore namespace_sem;
 
 /* /sys/fs */
 struct kobject *fs_kobj;
@@ -756,7 +756,7 @@ static void commit_tree(struct mount *mnt)
 	touch_mnt_namespace(n);
 }
 
-static struct mount *next_mnt(struct mount *p, struct mount *root)
+struct mount *next_mnt(struct mount *p, struct mount *root)
 {
 	struct list_head *next = p->mnt_mounts.next;
 	if (next == &p->mnt_mounts) {
diff --git a/fs/super.c b/fs/super.c
index d09e15a..11de99c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -258,7 +258,7 @@ static void __put_super(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 7ac77bb..8119ab4 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
@@ -17,6 +18,7 @@
 #include <linux/backing-dev.h>
 #include <linux/ve.h>
 #include "internal.h"
+#include "mount.h"
 
 #include <bc/beancounter.h>
 #include <bc/io_acct.h>
@@ -35,9 +37,9 @@ static int __sync_filesystem(struct super_block *sb,
 			     struct user_beancounter *ub, int wait)
 {
 	if (wait)
-		sync_inodes_sb(sb);
+		sync_inodes_sb_ub(sb, ub);
 	else
-		writeback_inodes_sb(sb, WB_REASON_SYNC);
+		writeback_inodes_sb_ub(sb, ub, WB_REASON_SYNC);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
@@ -99,8 +101,6 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait(bdev->bd_inode->i_mapping);
 }
 
-#if 0
-
 struct sync_sb {
 	struct list_head list;
 	struct super_block *sb;
@@ -129,8 +129,8 @@ static int sync_filesystem_collected(struct list_head *sync_list, struct super_b
 
 static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
 {
-	struct vfsmount *root = ve->root_path.mnt;
-	struct vfsmount *mnt;
+	struct mount *root = real_mount(ve->root_path.mnt);
+	struct mount *mnt;
 	struct sync_sb *ss;
 	int ret = 0;
 
@@ -138,7 +138,7 @@ static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync
 
 	down_read(&namespace_sem);
 	for (mnt = root; mnt; mnt = next_mnt(mnt, root)) {
-		if (sync_filesystem_collected(sync_list, mnt->mnt_sb))
+		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
 			continue;
 
 		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
@@ -146,7 +146,7 @@ static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync
 			ret = -ENOMEM;
 			break;
 		}
-		ss->sb = mnt->mnt_sb;
+		ss->sb = mnt->mnt.mnt_sb;
 		/*
 		 * We hold mount point and thus can be sure, that superblock is
 		 * alive. And it means, that we can safely increase it's usage
@@ -189,8 +189,6 @@ static void sync_filesystems_ve(struct ve_struct *ve, struct user_beancounter *u
 	mutex_unlock(&ve->sync_mutex);
 }
 
-#endif
-
 static int __ve_fsync_behavior(struct ve_struct *ve)
 {
 	if (ve->fsync_enable == 2)
@@ -225,7 +223,7 @@ int ve_fsync_behavior(void)
 SYSCALL_DEFINE0(sync)
 {
 	struct ve_struct *ve = get_exec_env();
-	struct user_beancounter *ub;
+	struct user_beancounter *ub, *sync_ub = NULL;
 	int nowait = 0, wait = 1;
 
 	ub = get_exec_ub();
@@ -246,6 +244,16 @@ SYSCALL_DEFINE0(sync)
 		fsb = __ve_fsync_behavior(ve);
 		if (fsb == FSYNC_NEVER)
 			goto skip;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+
+		if (sync_ub && (sync_ub != get_ub0())) {
+			wakeup_flusher_threads_ub(0, sync_ub, WB_REASON_SYNC);
+			sync_filesystems_ve(get_exec_env(), sync_ub, nowait);
+			sync_filesystems_ve(get_exec_env(), sync_ub, wait);
+			goto skip;
+		}
 	}
 
 	wakeup_flusher_threads(0, WB_REASON_SYNC);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9e6f777..b035f62 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2164,6 +2164,7 @@ void kill_anon_super(struct super_block *sb);
 void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
+void put_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a193a7e..01e5651 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -93,13 +93,18 @@ struct writeback_control {
 struct bdi_writeback;
 int inode_wait(void *);
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+void writeback_inodes_sb_ub(struct super_block *, struct user_beancounter *,
+							enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 							enum wb_reason reason);
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 				  enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb_ub(struct super_block *, struct user_beancounter *ub);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
+void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
+			enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */


More information about the Devel mailing list