[CRIU] [PATCH v4 18/19] files: Make tasks set their own service_fd_base

Kirill Tkhai ktkhai at virtuozzo.com
Wed Jan 10 17:03:10 MSK 2018


Currently, we set rlim(RLIMIT_NOFILE) to unlimited
and use service_fd_rlim_cur to place service fds.
This leads to a significant problem: every task uses
the biggest possible files_struct in the kernel, and
it consumes excess memory after restore
compared to dump. In some situations this
may end in a restore failure, as there is not enough
memory in the memory cgroup or on the node.

The patch fixes the problem by introducing a
per-task service_fd_base. It is calculated
from the max used file fd and is placed
near the right border of the kernel-allocated memory
chunk for the task's fds (see alloc_fdtable() for
details). This reduces the kernel-allocated files_struct
to 512 fds for most processes in a standard Linux
system (I've analysed the processes on my work system).

Also, since the "standard processes" will have the same
service_fd_base, clone_service_fd() won't have to
actually dup() their service fds for them like we
do at the moment. This is one of the reasons why
we still keep service fds as a range of fds,
and do not try to use unused holes in task fds.

Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>

v2: Add handling of very big fd numbers near service_fd_rlim_cur.
v3: Fix excess accounting for nr equal to a power of 2 minus 1.
---
 criu/util.c |   57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/criu/util.c b/criu/util.c
index 93d428da7..b1b02dad6 100644
--- a/criu/util.c
+++ b/criu/util.c
@@ -540,7 +540,7 @@ int close_service_fd(enum sfd_type type)
 static void move_service_fd(struct pstree_item *me, int type, int new_id, int new_base)
 {
 	int old = get_service_fd(type);
-	int new = __get_service_fd(type, new_id);
+	int new = new_base - type - SERVICE_FD_MAX * new_id;
 	int ret;
 
 	if (old < 0)
@@ -553,24 +553,73 @@ static void move_service_fd(struct pstree_item *me, int type, int new_id, int ne
 		close(old);
 }
 
+static int choose_service_fd_base(struct pstree_item *me)
+{ /* Returns the service fd base chosen for @me, or -1 if the needed fds do not fit below service_fd_rlim_cur */
+	int nr, real_nr, fdt_nr = 1, id = rsti(me)->service_fd_id;
+
+	if (rsti(me)->fdt) {
+		/* The base is set by the owner of fdt (id 0); other tasks return the current service_fd_base */
+		if (id != 0)
+			return service_fd_base;
+		fdt_nr = rsti(me)->fdt->nr;
+	}
+	/* Now find the process's max used fd number (last entry of the fds list, presumably fd-sorted; -1 if empty) */
+	if (!list_empty(&rsti(me)->fds))
+		nr = list_entry(rsti(me)->fds.prev,
+				struct fdinfo_list_entry, ps_list)->fe->fd;
+	else
+		nr = -1;
+
+	nr = max(nr, inh_fd_max);
+	/*
+	 * Service fds go after the max fd, near the right border of the alignment:
+	 *
+	 * ...|max_fd|max_fd+1|...|sfd first|...|sfd last (aligned)|
+	 *
+	 * So they take the highest numbers of the area allocated by the kernel.
+	 * See linux alloc_fdtable() for details.
+	 */
+	nr += (SERVICE_FD_MAX - SERVICE_FD_MIN) * fdt_nr;
+	nr += 16; /* Safety pad */
+	real_nr = nr; /* padded requirement before alignment; checked against the rlim fallback below */
+
+	nr /= (1024 / sizeof(void *)); /* count in units of 1024 / sizeof(void *) fds */
+	nr = 1 << (32 - __builtin_clz(nr)); /* NOTE(review): __builtin_clz(0) is undefined; nr may be 0 here if the padded count is below one unit -- confirm */
+	nr *= (1024 / sizeof(void *));
+
+	if (nr > service_fd_rlim_cur) {
+		/* The right border is bigger than rlim: use rlim rounded down to the unit, if it still covers real_nr */
+		nr = round_down(service_fd_rlim_cur, (1024 / sizeof(void *)));
+		if (nr < real_nr) {
+			pr_err("Can't chose service_fd_base: %d %d\n", nr, real_nr);
+			return -1;
+		}
+	}
+
+	return nr;
+}
+
 int clone_service_fd(struct pstree_item *me)
 {
 	int id, new_base, i, ret = -1;
 
-	new_base = service_fd_base;
+	new_base = choose_service_fd_base(me);
 	id = rsti(me)->service_fd_id;
 
-	if (service_fd_id == id)
+	if (new_base == -1)
+		return -1;
+	if (service_fd_base == new_base && service_fd_id == id)
 		return 0;
 
 	/* Dup sfds in memmove() style: they may overlap */
-	if (get_service_fd(LOG_FD_OFF) > __get_service_fd(LOG_FD_OFF, id))
+	if (get_service_fd(LOG_FD_OFF) > new_base - LOG_FD_OFF - SERVICE_FD_MAX * id)
 		for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++)
 			move_service_fd(me, i, id, new_base);
 	else
 		for (i = SERVICE_FD_MAX - 1; i > SERVICE_FD_MIN; i--)
 			move_service_fd(me, i, id, new_base);
 
+	service_fd_base = new_base;
 	service_fd_id = id;
 	ret = 0;
 



More information about the CRIU mailing list