[CRIU] [PATCH 10/10] files: Use sys_kcmp to find file descriptor duplicates

Cyrill Gorcunov gorcunov at openvz.org
Fri Feb 24 18:24:31 EST 2012


We switch generic-object-id concept with sys_kcmp approach,
which implies changes of image format a bit (and since it's
early time for project overall, we're allowed to).

In short -- previously every file descriptor had an ID
generated by a kernel and exported via procfs. If the
appropriate file descriptors were the same objects in
kernel memory -- the IDs did match up to bit. It allows
us to figure out which files are actually the identical
ones and should be restored in a special way.

Once sys_kcmp system call were merged into the kernel,
we've got a new opprotunity -- to use this call instead.
The syscall basically compare kernel objects and returns
ordered results suitable for objects sorting in userspace.

For us it mens we treat every file descriptor as a combination
of 'id' and 'salt'. While 'id' is serves for fast comparison between
fds, the 'salt' is kind of a second key, which guarantees uniqueness
of id+salt tuple over all file descritors found in process
(or group of processes).

To be able to find and dump file descriptors in a single pass we collect
every fd we found into a global rbtree, where (!) each node might become
a root for a subtree as well.

The main tree carries only primary ids, ie nonequal ones. If we find
some id which is already in tree, we need to make sure that it's
either indeed a duplicate or not.

For this we use sys_kcmp syscall and if we find that in real the file
descriptors are different -- we simply add a unique 'salt' value and
put new fd into a subtree.

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 Makefile           |    1 +
 cr-dump.c          |   77 +++++++++++++++-----
 cr-show.c          |    5 +-
 file-ids.c         |  209 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 files.c            |   26 ++++---
 include/file-ids.h |   34 +++++++++
 include/files.h    |    3 +-
 include/image.h    |    5 +-
 8 files changed, 325 insertions(+), 35 deletions(-)
 create mode 100644 file-ids.c
 create mode 100644 include/file-ids.h

diff --git a/Makefile b/Makefile
index 00d731f..4ca6cb1 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ OBJS		+= log.o
 OBJS		+= libnetlink.o
 OBJS		+= sockets.o
 OBJS		+= files.o
+OBJS		+= file-ids.o
 OBJS		+= namespaces.o
 OBJS		+= uts_ns.o
 OBJS		+= ipc_ns.o
diff --git a/cr-dump.c b/cr-dump.c
index c7b450a..b738bb9 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -20,6 +20,7 @@
 
 #include "types.h"
 #include "list.h"
+#include "file-ids.h"
 
 #include "compiler.h"
 #include "crtools.h"
@@ -31,6 +32,7 @@
 #include "image.h"
 #include "proc_parse.h"
 #include "parasite-syscall.h"
+#include "file-ids.h"
 
 #ifndef CONFIG_X86_64
 # error No x86-32 support yet
@@ -89,7 +91,10 @@ struct fd_parms {
 	unsigned long	fd_name;
 	unsigned long	pos;
 	unsigned int	flags;
-	char		*id;
+
+	u64		id;
+	u64		salt;
+	pid_t		pid;
 };
 
 static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
@@ -120,10 +125,23 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
 	e.flags = p->flags;
 	e.pos	= p->pos;
 	e.addr	= p->fd_name;
-	if (p->id)
-		memcpy(e.id, p->id, FD_ID_SIZE);
-	else
-		memzero(e.id, FD_ID_SIZE);
+
+	if (likely(!fd_is_special(&e))) {
+		struct fd_id_entry *fd_id_entry;
+
+		fd_id_entry = fd_id_entry_collect(p->id, p->pid, p->fd_name);
+		if (!fd_id_entry) {
+			pr_err("No memory to collect fidinfo for file %li (pid %d)\n",
+				p->fd_name, p->pid);
+			goto err;
+		}
+
+		e.id	= fd_id_entry->id;
+		e.salt	= fd_id_entry->salt;
+	} else {
+		e.id	= FD_ID_INVALID;
+		e.salt	= FD_SALT_INVALID;
+	}
 
 	pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
 		type, len, p->flags, p->pos, p->fd_name);
@@ -144,7 +162,13 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
 	int fd, ret;
 
 	/* Dump /proc/pid/cwd */
-	params = (struct fd_parms){ .fd_name = FDINFO_CWD, };
+	params = (struct fd_parms) {
+		.fd_name	= FDINFO_CWD,
+		.id		= FD_ID_INVALID,
+		.salt		= FD_SALT_INVALID,
+		.pid		= FD_PID_INVALID,
+	};
+
 	fd = open_proc(pid, "cwd");
 	if (fd < 0)
 		return -1;
@@ -153,7 +177,13 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
 		return ret;
 
 	/* Dump /proc/pid/exe */
-	params = (struct fd_parms){ .fd_name = FDINFO_EXE, };
+	params = (struct fd_parms) {
+		.fd_name	= FDINFO_EXE,
+		.id		= FD_ID_INVALID,
+		.salt		= FD_SALT_INVALID,
+		.pid		= FD_PID_INVALID,
+	};
+
 	fd = open_proc(pid, "exe");
 	if (fd < 0)
 		return -1;
@@ -281,8 +311,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd,
 
 	if (S_ISREG(st_buf.st_mode) ||
 	    S_ISDIR(st_buf.st_mode) ||
-	    (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR))
+	    (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) {
+
+		p->id = GEN_FD_ID(st_buf.st_dev, st_buf.st_ino, p->pos);
+
 		return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1);
+	}
 
 	if (S_ISFIFO(st_buf.st_mode)) {
 		if (fstatfs(lfd, &stfs_buf) < 0) {
@@ -312,16 +346,20 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p)
 		return -1;
 
 	p->fd_name = atoi(fd);
-	ret = fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", &p->pos, &p->flags, p->id);
+	ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
 	fclose(file);
 
-	if (ret != 3) {
-		pr_err("Bad format of fdinfo file (%d items, want 3)\n", ret);
+	if (ret != 2) {
+		pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
 		return -1;
 	}
 
-	pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n",
-		pid, fd, p->pos, p->flags, p->id);
+	pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
+		pid, fd, p->pos, p->flags);
+
+	p->pid	= pid;
+	p->id	= FD_ID_INVALID;
+	p->salt	= FD_SALT_INVALID;
 
 	return 0;
 }
@@ -352,8 +390,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
 		return -1;
 
 	while ((de = readdir(fd_dir))) {
-		char id[FD_ID_SIZE];
-		struct fd_parms p = { .id = id };
+		struct fd_parms p;
 		int lfd;
 
 		if (de->d_name[0] == '.')
@@ -407,9 +444,10 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
 			} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
 				   vma_entry_is(vma, VMA_FILE_SHARED)) {
 				struct fd_parms p = {
-					.fd_name = vma->start,
-					.pos = 0,
-					.id = NULL,
+					.fd_name	= vma->start,
+					.id		= FD_ID_INVALID,
+					.salt		= FD_SALT_INVALID,
+					.pid		= pid,
 				};
 
 				if (vma->prot & PROT_WRITE &&
@@ -1248,6 +1286,8 @@ int cr_dump_tasks(pid_t pid, struct cr_options *opts)
 	struct pstree_item *item;
 	int i, ret = -1;
 
+	fd_id_entry_init();
+
 	pr_info("========================================\n");
 	if (!opts->leader_only)
 		pr_info("Dumping process group (pid: %d)\n", pid);
@@ -1299,6 +1339,7 @@ err:
 	pstree_switch_state(&pstree_list, opts);
 	free_pstree(&pstree_list);
 	close_cr_fdset(&cr_fdset);
+	fd_id_entry_fini();
 
 	return ret;
 }
diff --git a/cr-show.c b/cr-show.c
index 2f9cb84..a187138 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -85,8 +85,9 @@ static void show_files(int fd_files)
 		if (ret <= 0)
 			goto out;
 
-		pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx id: %s",
-				e.type, e.len, e.flags, e.pos, e.addr, e.id);
+		pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx "
+			"id: %16lx salt: %16lx",
+			e.type, e.len, e.flags, e.pos, e.addr, e.id, e.salt);
 
 		if (e.len) {
 			int ret = read(fd_files, local_buf, e.len);
diff --git a/file-ids.c b/file-ids.c
new file mode 100644
index 0000000..c3f8755
--- /dev/null
+++ b/file-ids.c
@@ -0,0 +1,209 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "types.h"
+#include "file-ids.h"
+#include "rbtree.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "util.h"
+
+struct rb_root fd_id_root = RB_ROOT;
+static unsigned long fd_id_entries_salt = 1;
+
+void fd_id_entry_init(void)
+{
+	/* Has nothing to do yet */
+}
+
+void fd_id_entry_fini(void)
+{
+	/* Clear fd_id_entries */
+	fd_id_entries_salt = 1;
+}
+
+static void print_fd_id_tree(void)
+{
+	struct rb_node *n, *sn;
+
+	pr_debug("---------> fd_id_root\n");
+	for (n = rb_first(&fd_id_root); n; n = rb_next(n)) {
+		struct fd_id_entry *e = rb_entry(n, struct fd_id_entry, node);
+
+		pr_debug("\tid %16lx salt %16lx pid %6d fd %4d\n",
+			 e->id, e->salt, e->pid, e->fd);
+
+		if (!RB_EMPTY_ROOT(&e->subtree_root)) {
+			for (sn = rb_first(&e->subtree_root); sn; sn = rb_next(sn)) {
+				struct fd_id_entry *e = rb_entry(sn, struct fd_id_entry, subtree_node);
+
+				pr_debug("\t\tid %16lx salt %16lx pid %6d fd %4d\n",
+					 e->id, e->salt, e->pid, e->fd);
+			}
+		}
+	}
+	pr_debug("----------\n");
+}
+
+static struct fd_id_entry *alloc_fd_id_entry(u64 id, pid_t pid, int fd)
+{
+	struct fd_id_entry *e;
+
+	pr_debug("alloc-fd:\tid %16lx pid %6d fd %4d\n", id, pid, fd);
+
+	e = xmalloc(sizeof(*e));
+	if (!e) {
+		pr_err("No memory to allocate FD-ID: id %16lx pid %6d fd %4d\n",
+			id, pid, fd);
+		goto err;
+	}
+
+	e->salt	= FD_SALT_INVALID;
+	e->id	= id;
+	e->pid	= pid;
+	e->fd	= fd;
+
+	rb_init_node(&e->node);
+	rb_init_node(&e->subtree_node);
+	rb_attach_node(&e->subtree_root, &e->subtree_node);
+err:
+	return e;
+}
+
+static struct fd_id_entry *
+lookup_alloc_subtree(struct fd_id_entry *e, u64 id, pid_t pid, int fd, bool do_alloc)
+{
+	struct rb_node *node = e->subtree_root.rb_node;
+	struct fd_id_entry *sub = NULL;
+
+	pr_debug("sublookup-fd:\tid %16lx pid %6d fd %4d\n", id, pid, fd);
+
+	while (node) {
+		struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, subtree_node);
+		int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
+
+		pr_debug("sublookup-fd:\t\tid (%2d) %16lx (%6d) -- %16lx (%6d)\n",
+			 ret, id, pid, this->id, this->pid);
+
+		if (ret < 0)
+			node = node->rb_left;
+		else if (ret > 0)
+			node = node->rb_right;
+		else
+			return this;
+	}
+
+	if (do_alloc) {
+		struct rb_node **new = &(e->subtree_root.rb_node);
+		struct rb_node *parent = NULL;
+
+		sub = alloc_fd_id_entry(id, pid, fd);
+		if (!sub)
+			goto err;
+
+		sub->salt = fd_id_entries_salt++;
+
+		while (*new) {
+			struct fd_id_entry *this = rb_entry(*new, struct fd_id_entry, subtree_node);
+			int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
+
+			pr_debug("sublookup-fd:\t\tid (%2d) %16lx (%6d) -- %16lx (%6d)\n",
+				 ret, id, pid, this->id, this->pid);
+
+			parent = *new;
+			if (ret < 0)
+				new = &((*new)->rb_left);
+			else if (ret > 0)
+				new = &((*new)->rb_right);
+			else
+				BUG_ON(1);
+			/* Must no be dups here */
+		}
+
+		pr_debug("subinsert-fd:\tid %16lx pid %6d fd %4d salt %16lx\n",
+			 sub->id, sub->pid, sub->fd, sub->salt);
+
+		rb_link_node(&sub->subtree_node, parent, new);
+		rb_insert_color(&sub->subtree_node, &e->subtree_root);
+
+		print_fd_id_tree();
+	}
+
+err:
+	return sub;
+}
+
+static struct fd_id_entry *
+lookup_alloc(u64 id, pid_t pid, int fd, bool do_alloc)
+{
+	struct rb_node *node = fd_id_root.rb_node;
+	struct fd_id_entry *e = NULL;
+
+	pr_debug("lookup-fd:\tid %16lx pid %6d fd %4d\n", id, pid, fd);
+	print_fd_id_tree();
+
+	while (node) {
+		struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, node);
+
+		pr_debug("lookup-fd:\t\tid %16lx (%6d) -- %16lx (%6d)\n",
+			 id, pid, this->id, this->pid);
+
+		if (id < this->id)
+			node = node->rb_left;
+		else if (id > this->id)
+			node = node->rb_right;
+		else
+			return lookup_alloc_subtree(this, id, pid, fd, do_alloc);
+	}
+
+	if (do_alloc) {
+		struct rb_node **new = &(fd_id_root.rb_node);
+		struct rb_node *parent = NULL;
+
+		e = alloc_fd_id_entry(id, pid, fd);
+		if (!e)
+			goto err;
+
+		while (*new) {
+			struct fd_id_entry *this = rb_entry(*new, struct fd_id_entry, node);
+
+			pr_debug("lookup-fd:\t\tid %16lx (%6d) -- %16lx (%6d)\n",
+				 id, pid, this->id, this->pid);
+
+			parent = *new;
+			if (id < this->id)
+				new = &((*new)->rb_left);
+			else if (id > this->id)
+				new = &((*new)->rb_right);
+			else
+				BUG_ON(1);
+			/* Must no be dups here */
+		}
+
+		pr_debug("insert-fd:\tid %16lx pid %6d fd %4d salt %16lx\n",
+			 e->id, e->pid, e->fd, e->salt);
+
+		rb_link_node(&e->node, parent, new);
+		rb_insert_color(&e->node, &fd_id_root);
+
+		print_fd_id_tree();
+	}
+
+err:
+	return e;
+}
+
+struct fd_id_entry *fd_id_entry_collect(u64 id, pid_t pid, int fd)
+{
+	pr_debug("collect-fd:\tid %16lx pid %6d fd %4d\n", id, pid, fd);
+	return lookup_alloc(id, pid, fd, true);
+}
diff --git a/files.c b/files.c
index 56b3311..f86cb0e 100644
--- a/files.c
+++ b/files.c
@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void)
 	return 0;
 }
 
-static struct fdinfo_desc *find_fd(char *id)
+static struct fdinfo_desc *find_fd(u64 id, u64 salt)
 {
 	struct fdinfo_desc *fi;
 	int i;
 
 	for (i = 0; i < nr_fdinfo_descs; i++) {
 		fi = fdinfo_descs + i;
-		if (!strncmp(fi->id, id, FD_ID_SIZE))
+		if (fi->id == id && fi->salt == salt)
 			return fi;
 	}
 
@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 {
 	int i;
 	struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
-	struct fdinfo_desc	*desc;
+	struct fdinfo_desc *desc;
 
-	pr_info("Collect fdinfo pid=%d fd=%ld id=%s\n", pid, e->addr, e->id);
+	pr_info("Collect fdinfo pid=%d fd=%ld id=%16lx salt=%16lx\n",
+		pid, e->addr, e->id, e->salt);
 
 	nr_fdinfo_list++;
 	if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) {
@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 
 	for (i = 0; i < nr_fdinfo_descs; i++) {
 		desc = &fdinfo_descs[i];
-		if (strncmp(desc->id, (char *) e->id, FD_ID_SIZE))
+
+		if (desc->id != e->id || desc->salt != e->salt)
 			continue;
 
 		fdinfo_descs[i].users++;
@@ -111,13 +113,15 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 	}
 
 	desc = &fdinfo_descs[nr_fdinfo_descs];
-	memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs]));
+	memzero(desc, sizeof(*desc));
 
-	memcpy(desc->id, e->id, FD_ID_SIZE);
-	desc->addr= e->addr;
-	desc->pid = pid;
-	desc->users = 1;
+	desc->id	= e->id;
+	desc->salt	= e->salt;
+	desc->addr	= e->addr;
+	desc->pid	= pid;
+	desc->users	= 1;
 	INIT_LIST_HEAD(&desc->list);
+
 	list_add(&le->list, &desc->list);
 	nr_fdinfo_descs++;
 
@@ -392,7 +396,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta
 	u32 mag;
 	int ret = 0;
 
-	struct fdinfo_desc *fi = find_fd((char *)fe->id);
+	struct fdinfo_desc *fi = find_fd(fe->id, fe->salt);
 
 	if (move_img_fd(fdinfo_fd, (int)fe->addr))
 		return -1;
diff --git a/include/file-ids.h b/include/file-ids.h
new file mode 100644
index 0000000..5a6fc70
--- /dev/null
+++ b/include/file-ids.h
@@ -0,0 +1,34 @@
+#ifndef FILE_IDS_H__
+#define FILE_IDS_H__
+
+#include "compiler.h"
+#include "types.h"
+#include "rbtree.h"
+
+#define FD_ID_INVALID	-1UL
+#define FD_SALT_INVALID	-2UL
+#define FD_PID_INVALID	((int)-3UL)
+
+struct fd_id_entry {
+	struct rb_node	node;
+
+	struct rb_root	subtree_root;
+	struct rb_node	subtree_node;
+
+	u64		id;
+	u64		salt;
+
+	pid_t		pid;
+	int		fd;
+} __aligned(sizeof(long));
+
+#define GEN_FD_ID(dev, ino, pos)	\
+	((u64)(dev) ^ (u64)(ino) ^ (u64)(pos))
+
+extern void fd_id_entry_init(void);
+extern void fd_id_entry_fini(void);
+extern void fd_id_entry_inc_salt(void);
+
+extern struct fd_id_entry *fd_id_entry_collect(u64 id, pid_t pid, int fd);
+
+#endif /* FILE_IDS_H__ */
diff --git a/include/files.h b/include/files.h
index f311e93..b360eac 100644
--- a/include/files.h
+++ b/include/files.h
@@ -22,7 +22,8 @@ struct fmap_fd {
 };
 
 struct fdinfo_desc {
-	char			id[FD_ID_SIZE];
+	u64			id;
+	u64			salt;
 	u64			addr;
 	int			pid;
 	u32			real_pid;	/* futex */
diff --git a/include/image.h b/include/image.h
index c9b7209..730b84b 100644
--- a/include/image.h
+++ b/include/image.h
@@ -39,15 +39,14 @@
 #define PAGE_RSS	1
 #define PAGE_ANON	2
 
-#define FD_ID_SIZE	50
-
 struct fdinfo_entry {
 	u8	type;
 	u8	len;
 	u16	flags;
 	u32	pos;
 	u64	addr;
-	u8	id[FD_ID_SIZE];
+	u64	id;
+	u64	salt;
 	u8	name[0];
 } __packed;
 
-- 
1.7.7.6



More information about the CRIU mailing list