[Devel] [PATCH RHEL7 COMMIT] files: Shrink big fdtable on close in is_pseudosuper mode

Konstantin Khorenko khorenko at virtuozzo.com
Thu Jan 18 18:05:15 MSK 2018


The commit is pushed to "branch-rh7-3.10.0-693.11.6.vz7.42.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.11.6.vz7.42.1
------>
commit f070e04eb6d3e9e4a4b47ae8fac0b1517dd619c2
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Thu Jan 18 18:05:15 2018 +0300

    files: Shrink big fdtable on close in is_pseudosuper mode
    
    Patchset description:
    Shrink big fdtable on criu restore
    
    This patchset avoids the memory overuse introduced by service fds on criu
    restore.
    The solution is simple: check the number of the fd being closed, and shrink
    the fdtable if that is possible. The checks happen in is_pseudosuper mode,
    so we do not affect performance in the normal work mode.
    
    The problem is that we can't solve this in 100% of cases from userspace.
    Doing it in the kernel allows us to fix it completely.
    
    https://jira.sw.ru/browse/PSBM-78827
    
    Eric Dumazet (1):
          ms/fs/file.c: don't acquire files->file_lock in fd_install()
    
    Kirill Tkhai (3):
          files: Add new argument to expand_files()
          files: Add fdtable_align() helper
          files: Shrink big fdtable on close in is_pseudosuper mode
    
    Mateusz Guzik (1):
          ms/vfs: grab the lock instead of blocking in __fd_install during resizing
    
    ============================================================
    This patch description:
    
    This trick is going to be used for criu restore, to release excess memory
    occupied by service files:
    we check the fd being closed, and if it is at least half of the current
    fdtable size (max_fds), we try to shrink the fdtable and decrease the amount
    of memory needed to store the task's fds.
    
    Currently is_pseudosuper state is used to detect restore, but it can be changed
    later if needed.
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 fs/file.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index f009eb9bf1c8..b85e8ee6143b 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -22,6 +22,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
+#include <linux/ve.h>
 
 int sysctl_nr_open __read_mostly = 1024*1024;
 int sysctl_nr_open_min = BITS_PER_LONG;
@@ -69,19 +70,22 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt, bool shrink
 {
 	unsigned int cpy, set;
 
-	BUG_ON(nfdt->max_fds < ofdt->max_fds);
+	BUG_ON((nfdt->max_fds < ofdt->max_fds) != shrink);
 
-	cpy = ofdt->max_fds * sizeof(struct file *);
+	cpy = min(ofdt->max_fds, nfdt->max_fds) * sizeof(struct file *);
 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 	memcpy(nfdt->fd, ofdt->fd, cpy);
-	memset((char *)(nfdt->fd) + cpy, 0, set);
+	if (!shrink)
+		memset((char *)(nfdt->fd) + cpy, 0, set);
 
-	cpy = ofdt->max_fds / BITS_PER_BYTE;
+	cpy = min(ofdt->max_fds, nfdt->max_fds) / BITS_PER_BYTE;
 	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
 	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
-	memset((char *)(nfdt->open_fds) + cpy, 0, set);
+	if (!shrink)
+		memset((char *)(nfdt->open_fds) + cpy, 0, set);
 	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
-	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+	if (!shrink)
+		memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
 }
 
 static unsigned int fdtable_align(unsigned int nr)
@@ -169,16 +173,26 @@ static int expand_fdtable(struct files_struct *files, int nr, bool shrink)
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
+	cur_fdt = files_fdtable(files);
 	/*
 	 * extremely unlikely race - sysctl_nr_open decreased between the check in
 	 * caller and alloc_fdtable().  Cheaper to catch it here...
 	 */
-	if (unlikely(new_fdt->max_fds <= nr)) {
+	if (unlikely((new_fdt->max_fds <= nr && !shrink) ||
+		     (shrink && new_fdt->max_fds >= cur_fdt->max_fds))) {
 		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
-	cur_fdt = files_fdtable(files);
-	BUG_ON(nr < cur_fdt->max_fds);
+	if (unlikely(shrink)) {
+		int i;
+		i = find_last_bit(cur_fdt->open_fds, cur_fdt->max_fds);
+		i = fdtable_align(i);
+		if (i == cur_fdt->max_fds) {
+			__free_fdtable(new_fdt);
+			return 1;
+		}
+	}
+	BUG_ON((nr < cur_fdt->max_fds) != shrink);
 	copy_fdtable(new_fdt, cur_fdt, shrink);
 	rcu_assign_pointer(files->fdt, new_fdt);
 	if (cur_fdt != &files->fdtab)
@@ -207,7 +221,7 @@ static int expand_files(struct files_struct *files, int nr, bool shrink)
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
-	if (nr < fdt->max_fds)
+	if (nr < fdt->max_fds && !shrink)
 		return expanded;
 
 	/* Can we expand? */
@@ -222,6 +236,15 @@ static int expand_files(struct files_struct *files, int nr, bool shrink)
 		goto repeat;
 	}
 
+	if (unlikely(shrink)) {
+		unsigned int i;
+		i = find_last_bit(fdt->open_fds, fdt->max_fds);
+		nr = i;
+		i = fdtable_align(i);
+		if (i >= fdt->max_fds)
+			return expanded;
+	}
+
 	/* All good, so we try */
 	files->resize_in_progress = true;
 	expanded = expand_fdtable(files, nr, shrink);
@@ -637,6 +660,14 @@ int __close_fd(struct files_struct *files, unsigned fd)
 	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__clear_close_on_exec(fd, fdt);
 	__put_unused_fd(files, fd);
+
+	/* Try to shrink fdt and to free memory */
+	if (unlikely(fd * 2 >= fdt->max_fds &&
+		     fd > (1024 / sizeof(struct file *))) &&
+		     get_exec_env() != get_ve0() &&
+		     get_exec_env()->is_pseudosuper)
+		expand_files(files, fd, true);
+
 	spin_unlock(&files->file_lock);
 	return filp_close(file, files);
 


More information about the Devel mailing list