[Devel] [PATCH RHEL8 COMMIT] ve/fs/files: Shrink big fdtable on close in is_pseudosuper mode

Konstantin Khorenko khorenko at virtuozzo.com
Fri Apr 30 13:52:42 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.24
------>
commit c4a4fada18ad4dab1b051e49e39065c569ad64c8
Author: Kirill Tkhai <ktkhai at virtuozzo.com>
Date:   Fri Apr 30 13:52:42 2021 +0300

    ve/fs/files: Shrink big fdtable on close in is_pseudosuper mode
    
    Patchset description:
    Shrink big fdtable on criu restore
    
    This patchset avoids the memory overuse introduced by service fds on criu
    restore.
    The solution is simple: check the number of the fd being closed, and shrink the
    fdtable if that is possible. The checks happen in is_pseudosuper mode, so we do
    not affect performance in the normal working mode.
    
    The problem is that we can't solve this for 100% of cases in userspace.
    Doing it in the kernel allows us to fix it completely.
    
    https://jira.sw.ru/browse/PSBM-78827
    
    Eric Dumazet (1):
          ms/fs/file.c: don't acquire files->file_lock in fd_install()
    
    Kirill Tkhai (3):
          files: Add new argument to expand_files()
          files: Add fdtable_align() helper
          files: Shrink big fdtable on close in is_pseudosuper mode
    
    Mateusz Guzik (1):
          ms/vfs: grab the lock instead of blocking in __fd_install during resizing
    
    ============================================================
    This patch description:
    
    This trick is going to be used for criu restore, to release excess memory
    occupied by service files:
    we check the fd being closed, and if it is at least half of the fdtable's
    max_fds, we try to shrink the fdtable and decrease the amount of memory
    needed to store the task's fds.
    
    Currently is_pseudosuper state is used to detect restore, but it can be changed
    later if needed.
    
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    Rebase to vz8:
     - Used the vz7 commit 4b024fd120c5 ("ve/fs/files: Shrink big fdtable on
       close in is_pseudosuper mode") as rebased to RH7.9, which handles the new
       copy_fd_bitmaps helper function.
    
    (cherry picked from vz7 commit 4b024fd120c5 ("ve/fs/files: Shrink big fdtable on
    close in is_pseudosuper mode"))
    Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
 fs/file.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 15 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 4f68ef0f6ee0..0ed1c4cd6eae 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -18,6 +18,7 @@
 #include <linux/bitops.h>
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/ve.h>
 
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
@@ -47,21 +48,25 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
  * spinlock held for write.
  */
 static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
-			    unsigned int count)
+			    unsigned int count, bool shrink)
 {
 	unsigned int cpy, set;
 
-	cpy = count / BITS_PER_BYTE;
+	cpy = min(count, nfdt->max_fds) / BITS_PER_BYTE;
 	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
 	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
-	memset((char *)nfdt->open_fds + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->open_fds + cpy, 0, set);
+
 	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
-	memset((char *)nfdt->close_on_exec + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->close_on_exec + cpy, 0, set);
 
-	cpy = BITBIT_SIZE(count);
+	cpy = BITBIT_SIZE(min(count, nfdt->max_fds));
 	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
 	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
-	memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->full_fds_bits + cpy, 0, set);
 }
 
 /*
@@ -72,14 +77,15 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt, bool shrink
 {
 	unsigned int cpy, set;
 
-	BUG_ON(nfdt->max_fds < ofdt->max_fds);
+	BUG_ON((nfdt->max_fds < ofdt->max_fds) != shrink);
 
-	cpy = ofdt->max_fds * sizeof(struct file *);
+	cpy = min(ofdt->max_fds, nfdt->max_fds) * sizeof(struct file *);
 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 	memcpy(nfdt->fd, ofdt->fd, cpy);
-	memset((char *)nfdt->fd + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->fd + cpy, 0, set);
 
-	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds, shrink);
 }
 
 static unsigned int fdtable_align(unsigned int nr)
@@ -170,16 +176,26 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr, bool shri
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
+	cur_fdt = files_fdtable(files);
 	/*
 	 * extremely unlikely race - sysctl_nr_open decreased between the check in
 	 * caller and alloc_fdtable().  Cheaper to catch it here...
 	 */
-	if (unlikely(new_fdt->max_fds <= nr)) {
+	if (unlikely((new_fdt->max_fds <= nr && !shrink) ||
+		     (shrink && new_fdt->max_fds >= cur_fdt->max_fds))) {
 		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
-	cur_fdt = files_fdtable(files);
-	BUG_ON(nr < cur_fdt->max_fds);
+	if (unlikely(shrink)) {
+		int i;
+		i = find_last_bit(cur_fdt->open_fds, cur_fdt->max_fds);
+		i = fdtable_align(i);
+		if (i == cur_fdt->max_fds) {
+			__free_fdtable(new_fdt);
+			return 1;
+		}
+	}
+	BUG_ON((nr < cur_fdt->max_fds) != shrink);
 	copy_fdtable(new_fdt, cur_fdt, shrink);
 	rcu_assign_pointer(files->fdt, new_fdt);
 	if (cur_fdt != &files->fdtab)
@@ -208,7 +224,7 @@ static int expand_files(struct files_struct *files, unsigned int nr, bool shrink
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
-	if (nr < fdt->max_fds)
+	if (nr < fdt->max_fds && !shrink)
 		return expanded;
 
 	/* Can we expand? */
@@ -223,6 +239,15 @@ static int expand_files(struct files_struct *files, unsigned int nr, bool shrink
 		goto repeat;
 	}
 
+	if (unlikely(shrink)) {
+		unsigned int i;
+		i = find_last_bit(fdt->open_fds, fdt->max_fds);
+		nr = i;
+		i = fdtable_align(i);
+		if (i >= fdt->max_fds)
+			return expanded;
+	}
+
 	/* All good, so we try */
 	files->resize_in_progress = true;
 	expanded = expand_fdtable(files, nr, shrink);
@@ -337,7 +362,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 		open_files = count_open_files(old_fdt);
 	}
 
-	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+	copy_fd_bitmaps(new_fdt, old_fdt, open_files, false);
 
 	old_fds = old_fdt->fd;
 	new_fds = new_fdt->fd;
@@ -643,6 +668,14 @@ int __close_fd(struct files_struct *files, unsigned fd)
 		goto out_unlock;
 	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__put_unused_fd(files, fd);
+
+	/* Try to shrink fdt and to free memory */
+	if (unlikely(fd * 2 >= fdt->max_fds &&
+		     fd > (1024 / sizeof(struct file *))) &&
+		     get_exec_env() != get_ve0() &&
+		     get_exec_env()->is_pseudosuper)
+		expand_files(files, fd, true);
+
 	spin_unlock(&files->file_lock);
 	return filp_close(file, files);
 


More information about the Devel mailing list