[Devel] [PATCH RH9 15/22] ve/fs/files: Shrink big fdtable on close in is_pseudosuper mode

Andrey Zhadchenko andrey.zhadchenko at virtuozzo.com
Thu Oct 7 13:57:39 MSK 2021


From: Kirill Tkhai <ktkhai at virtuozzo.com>

Patchset description:
Shrink big fdtable on criu restore

This patchset avoids the memory overuse introduced by service fds on criu
restore.
The solution is simple: check the number of the fd being closed, and shrink
the fdtable if possible. The checks happen only in is_pseudosuper mode, so we
do not affect performance in normal operation.

The problem is that we can't solve this in 100% of cases in userspace.
The kernel allows us to fix it completely.

https://jira.sw.ru/browse/PSBM-78827

Eric Dumazet (1):
      ms/fs/file.c: don't acquire files->file_lock in fd_install()

Kirill Tkhai (3):
      files: Add new argument to expand_files()
      files: Add fdtable_align() helper
      files: Shrink big fdtable on close in is_pseudosuper mode

Mateusz Guzik (1):
      ms/vfs: grab the lock instead of blocking in __fd_install during resizing

============================================================
This patch description:

This trick is going to be used for criu restore, to release excess memory
occupied by service files:
we check the fd being closed, and if it is at least half of the fdtable's
maximum fd number, we try to shrink the fdtable and decrease the amount of
memory needed to store the task's fds.

Currently is_pseudosuper state is used to detect restore, but it can be changed
later if needed.

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>

Reviewed-by: Cyrill Gorcunov <gorcunov at openvz.org>

Rebase to vz8:
 - Used the vz7 commit 4b024fd120c5 ("ve/fs/files: Shrink big fdtable on close
   in is_pseudosuper mode") as rebased to RH7.9, which handles the new
   copy_fd_bitmaps helper function.

(cherry picked from vz7 commit 4b024fd120c5 ("ve/fs/files: Shrink big fdtable on
close in is_pseudosuper mode"))
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>

+++
fs: Fix race with old fdt parallel reuse

We own files->resize_in_progress at this moment,
but it is only seen by those who expand the fdt.
Anyone who wants to use an fd below the old fdt's maximum number
simply doesn't look at it.

So, let's check the old maximum fd after the lock is
acquired to detect such parallel users.

https://jira.sw.ru/browse/PSBM-82984

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>

(cherry picked from vz7 commit 01eb18f336ef ("fs: Fix race with old fdt
parallel reuse"))

Fixes: c4a4fada18a "ve/fs/files: Shrink big fdtable on close in
is_pseudosuper mode"

Signed-off-by: Vasily Averin <vvs at virtuozzo.com>

(cherry picked from vz8 commit 50a72f1810df59c1aada1a5efba8fb052075693c)
Signed-off-by: Andrey Zhadchenko <andrey.zhadchenko at virtuozzo.com>
---
 fs/file.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 2163301..fabb57d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -21,6 +21,7 @@
 #include <linux/rcupdate.h>
 #include <linux/close_range.h>
 #include <net/sock.h>
+#include <linux/ve.h>
 
 #include "internal.h"
 
@@ -52,21 +53,25 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
  * spinlock held for write.
  */
 static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
-			    unsigned int count)
+			    unsigned int count, bool shrink)
 {
 	unsigned int cpy, set;
 
-	cpy = count / BITS_PER_BYTE;
+	cpy = min(count, nfdt->max_fds) / BITS_PER_BYTE;
 	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
 	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
-	memset((char *)nfdt->open_fds + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->open_fds + cpy, 0, set);
+
 	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
-	memset((char *)nfdt->close_on_exec + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->close_on_exec + cpy, 0, set);
 
-	cpy = BITBIT_SIZE(count);
+	cpy = BITBIT_SIZE(min(count, nfdt->max_fds));
 	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
 	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
-	memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->full_fds_bits + cpy, 0, set);
 }
 
 /*
@@ -77,14 +82,15 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt, bool shrink
 {
 	size_t cpy, set;
 
-	BUG_ON(nfdt->max_fds < ofdt->max_fds);
+	BUG_ON((nfdt->max_fds < ofdt->max_fds) != shrink);
 
-	cpy = ofdt->max_fds * sizeof(struct file *);
+	cpy = min(ofdt->max_fds, nfdt->max_fds) * sizeof(struct file *);
 	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
 	memcpy(nfdt->fd, ofdt->fd, cpy);
-	memset((char *)nfdt->fd + cpy, 0, set);
+	if (!shrink)
+		memset((char *)nfdt->fd + cpy, 0, set);
 
-	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
+	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds, shrink);
 }
 
 static unsigned int fdtable_align(unsigned int nr)
@@ -175,16 +181,25 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr, bool shri
 	spin_lock(&files->file_lock);
 	if (!new_fdt)
 		return -ENOMEM;
+	cur_fdt = files_fdtable(files);
 	/*
 	 * extremely unlikely race - sysctl_nr_open decreased between the check in
 	 * caller and alloc_fdtable().  Cheaper to catch it here...
 	 */
-	if (unlikely(new_fdt->max_fds <= nr)) {
+	if (unlikely((new_fdt->max_fds <= nr && !shrink) ||
+		     (shrink && new_fdt->max_fds >= cur_fdt->max_fds))) {
 		__free_fdtable(new_fdt);
 		return -EMFILE;
 	}
-	cur_fdt = files_fdtable(files);
-	BUG_ON(nr < cur_fdt->max_fds);
+	if (unlikely(shrink)) {
+		int i;
+		i = find_last_bit(cur_fdt->open_fds, cur_fdt->max_fds);
+		if (i >= new_fdt->max_fds) {
+			__free_fdtable(new_fdt);
+			return 1;
+		}
+	}
+	BUG_ON((nr < cur_fdt->max_fds) != shrink);
 	copy_fdtable(new_fdt, cur_fdt, shrink);
 	rcu_assign_pointer(files->fdt, new_fdt);
 	if (cur_fdt != &files->fdtab)
@@ -213,7 +228,7 @@ static int expand_files(struct files_struct *files, unsigned int nr, bool shrink
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
-	if (nr < fdt->max_fds)
+	if (nr < fdt->max_fds && !shrink)
 		return expanded;
 
 	/* Can we expand? */
@@ -228,6 +243,15 @@ static int expand_files(struct files_struct *files, unsigned int nr, bool shrink
 		goto repeat;
 	}
 
+	if (unlikely(shrink)) {
+		unsigned int i;
+		i = find_last_bit(fdt->open_fds, fdt->max_fds);
+		nr = i;
+		i = fdtable_align(i);
+		if (i >= fdt->max_fds)
+			return expanded;
+	}
+
 	/* All good, so we try */
 	files->resize_in_progress = true;
 	expanded = expand_fdtable(files, nr, shrink);
@@ -352,7 +376,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int
 		open_files = sane_fdtable_size(old_fdt, max_fds);
 	}
 
-	copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+	copy_fd_bitmaps(new_fdt, old_fdt, open_files, false);
 
 	old_fds = old_fdt->fd;
 	new_fds = new_fdt->fd;
@@ -618,6 +642,13 @@ static struct file *pick_file(struct files_struct *files, unsigned fd)
 	rcu_assign_pointer(fdt->fd[fd], NULL);
 	__put_unused_fd(files, fd);
 
+	/* Try to shrink fdt and to free memory */
+	if (unlikely(fd * 2 >= fdt->max_fds &&
+		     fd > (1024 / sizeof(struct file *))) &&
+		     get_exec_env() != get_ve0() &&
+		     get_exec_env()->is_pseudosuper)
+		expand_files(files, fd, true);
+
 out_unlock:
 	spin_unlock(&files->file_lock);
 	return file;
-- 
1.8.3.1



More information about the Devel mailing list