[CRIU] [crtools-bot for Cyrill Gorcunov ] files: Use sys_kcmp to find file descriptor duplicates v4

Cyrill Gorcunov gorcunov at openvz.org
Tue Feb 28 10:13:47 EST 2012


The commit is pushed to "master" and will appear on git://github.com/cyrillos/crtools.git
------>
commit 2acc741a3aba3c5b1c87cf8af00ec82bee46f9ed
Author: Cyrill Gorcunov <gorcunov at openvz.org>
Date:   Tue Feb 28 18:27:28 2012 +0400

    files: Use sys_kcmp to find file descriptor duplicates v4
    
    We switch generic-object-id concept with sys_kcmp approach,
    which implies changes of image format a bit (and since it's
    early time for project overall, we're allowed to).
    
    In short -- previously every file descriptor had an ID
    generated by a kernel and exported via procfs. If the
    appropriate file descriptors were the same objects in
    kernel memory -- the IDs did match up to bit. It allows
    us to figure out which files were actually the identical
    ones and should be restored in a special way.
    
    Once sys_kcmp system call was merged into the kernel,
    we've got a new opprotunity -- to use this syscall instead.
    The syscall basically compares kernel objects and returns
    ordered results suitable for objects sorting in a userspace.
    
    For us it means -- we treat every file descriptor as a combination
    of 'genid' and 'subid'. While 'genid' serves for fast comparison
    between fds, the 'subid' is kind of a second key, which guarantees
    uniqueness of genid+subid tuple over all file descritors found
    in a process (or group of processes).
    
    To be able to find and dump file descriptors in a single pass we
    collect every fd into a global rbtree, where (!) each node might
    become a root for a subtree as well.
    
    The main tree carries only non-equal genid. If we find genid which
    is already in tree, we need to make sure that it's either indeed
    a duplicate or not. For this we use sys_kcmp syscall and if we
    find that file descriptors are different -- we simply put new
    fd into a subtree.
    
    Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
    Acked-by: Pavel Emelyanov <xemul at parallels.com>
---
 Makefile           |    1 +
 cr-dump.c          |   70 +++++++++++++++++++-------
 cr-show.c          |    5 +-
 file-ids.c         |  145 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 files.c            |   25 +++++----
 include/compiler.h |    2 +
 include/file-ids.h |   34 ++++++++++++
 include/files.h    |    2 +-
 include/image.h    |    4 +-
 include/rbtree.h   |    9 +++
 10 files changed, 262 insertions(+), 35 deletions(-)

diff --git a/Makefile b/Makefile
index 00d731f..4ca6cb1 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ OBJS		+= log.o
 OBJS		+= libnetlink.o
 OBJS		+= sockets.o
 OBJS		+= files.o
+OBJS		+= file-ids.o
 OBJS		+= namespaces.o
 OBJS		+= uts_ns.o
 OBJS		+= ipc_ns.o
diff --git a/cr-dump.c b/cr-dump.c
index c7b450a..4b4cbe2 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -20,6 +20,7 @@
 
 #include "types.h"
 #include "list.h"
+#include "file-ids.h"
 
 #include "compiler.h"
 #include "crtools.h"
@@ -31,6 +32,7 @@
 #include "image.h"
 #include "proc_parse.h"
 #include "parasite-syscall.h"
+#include "file-ids.h"
 
 #ifndef CONFIG_X86_64
 # error No x86-32 support yet
@@ -89,7 +91,9 @@ struct fd_parms {
 	unsigned long	fd_name;
 	unsigned long	pos;
 	unsigned int	flags;
-	char		*id;
+
+	u64		id;
+	pid_t		pid;
 };
 
 static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
@@ -120,10 +124,24 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
 	e.flags = p->flags;
 	e.pos	= p->pos;
 	e.addr	= p->fd_name;
-	if (p->id)
-		memcpy(e.id, p->id, FD_ID_SIZE);
-	else
-		memzero(e.id, FD_ID_SIZE);
+
+	if (likely(!fd_is_special(&e))) {
+		struct fd_id_entry *fd_id_entry;
+
+		/*
+		 * Make sure the union is still correlate with structure
+		 * we write to disk.
+		 */
+		BUILD_BUG_ON(sizeof(fd_id_entry->u.key) != sizeof(e.id));
+
+		fd_id_entry = fd_id_entry_collect((u32)p->id, p->pid, p->fd_name);
+		if (!fd_id_entry)
+			goto err;
+
+		/* Now it might have completely new ID here */
+		e.id	= fd_id_entry->u.id;
+	} else
+		e.id	= FD_ID_INVALID;
 
 	pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
 		type, len, p->flags, p->pos, p->fd_name);
@@ -144,7 +162,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
 	int fd, ret;
 
 	/* Dump /proc/pid/cwd */
-	params = (struct fd_parms){ .fd_name = FDINFO_CWD, };
+	params = (struct fd_parms) {
+		.fd_name	= FDINFO_CWD,
+		.id		= FD_ID_INVALID,
+		.pid		= FD_PID_INVALID,
+	};
+
 	fd = open_proc(pid, "cwd");
 	if (fd < 0)
 		return -1;
@@ -153,7 +176,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
 		return ret;
 
 	/* Dump /proc/pid/exe */
-	params = (struct fd_parms){ .fd_name = FDINFO_EXE, };
+	params = (struct fd_parms) {
+		.fd_name	= FDINFO_EXE,
+		.id		= FD_ID_INVALID,
+		.pid		= FD_PID_INVALID,
+	};
+
 	fd = open_proc(pid, "exe");
 	if (fd < 0)
 		return -1;
@@ -281,8 +309,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd,
 
 	if (S_ISREG(st_buf.st_mode) ||
 	    S_ISDIR(st_buf.st_mode) ||
-	    (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR))
+	    (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) {
+
+		p->id = MAKE_FD_GENID(st_buf.st_dev, st_buf.st_ino, p->pos);
+
 		return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1);
+	}
 
 	if (S_ISFIFO(st_buf.st_mode)) {
 		if (fstatfs(lfd, &stfs_buf) < 0) {
@@ -312,16 +344,19 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p)
 		return -1;
 
 	p->fd_name = atoi(fd);
-	ret = fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", &p->pos, &p->flags, p->id);
+	ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
 	fclose(file);
 
-	if (ret != 3) {
-		pr_err("Bad format of fdinfo file (%d items, want 3)\n", ret);
+	if (ret != 2) {
+		pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
 		return -1;
 	}
 
-	pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n",
-		pid, fd, p->pos, p->flags, p->id);
+	pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
+		pid, fd, p->pos, p->flags);
+
+	p->pid	= pid;
+	p->id	= FD_ID_INVALID;
 
 	return 0;
 }
@@ -352,8 +387,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
 		return -1;
 
 	while ((de = readdir(fd_dir))) {
-		char id[FD_ID_SIZE];
-		struct fd_parms p = { .id = id };
+		struct fd_parms p;
 		int lfd;
 
 		if (de->d_name[0] == '.')
@@ -407,9 +441,9 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
 			} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
 				   vma_entry_is(vma, VMA_FILE_SHARED)) {
 				struct fd_parms p = {
-					.fd_name = vma->start,
-					.pos = 0,
-					.id = NULL,
+					.fd_name	= vma->start,
+					.id		= FD_ID_INVALID,
+					.pid		= pid,
 				};
 
 				if (vma->prot & PROT_WRITE &&
diff --git a/cr-show.c b/cr-show.c
index 2f9cb84..dd3cb2f 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -85,8 +85,9 @@ static void show_files(int fd_files)
 		if (ret <= 0)
 			goto out;
 
-		pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx id: %s",
-				e.type, e.len, e.flags, e.pos, e.addr, e.id);
+		pr_info("type: %02x len: %02x flags: %4x pos: %8x "
+			"addr: %16lx id: %16lx",
+			e.type, e.len, e.flags, e.pos, e.addr, e.id);
 
 		if (e.len) {
 			int ret = read(fd_files, local_buf, e.len);
diff --git a/file-ids.c b/file-ids.c
new file mode 100644
index 0000000..aae3464
--- /dev/null
+++ b/file-ids.c
@@ -0,0 +1,145 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "types.h"
+#include "file-ids.h"
+#include "rbtree.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "image.h"
+#include "util.h"
+
+/*
+ * We track shared files by global rbtree, where each node might
+ * be a root for subtree. The reason for that is the nature of data
+ * we obtain from operating system.
+ *
+ * Basically OS provides us two ways to distinguish files
+ *
+ *  - information obtained from fstat call
+ *  - shiny new sys_kcmp system call (which may compare the file descriptor
+ *    pointers inside the kernel and provide us order info)
+ *
+ * So, to speedup procedure of searching for shared file descriptors
+ * we use both techniques. From fstat call we get that named general file
+ * IDs (genid) which are carried in the main rbtree.
+ *
+ * In case if two genid are the same -- we need to use a second way and
+ * call for sys_kcmp. Thus, if kernel tells us that files have identical
+ * genid but in real they are different from kernel point of view -- we assign
+ * a second unique key (subid) to such file descriptor and put it into a subtree.
+ *
+ * So the tree will look like
+ *
+ *               (root)
+ *               genid-1
+ *              /    \
+ *         genid-2  genid-3
+ *            / \    / \
+ *
+ * Where each genid node might be a sub-rbtree as well
+ *
+ *               (genid-N)
+ *               /      \
+ *           subid-1   subid-2
+ *            / \       / \
+ *
+ * Carrying two rbtree at once allow us to minimize the number
+ * of sys_kcmp syscalls, also to collect and dump file descriptors
+ * in one pass.
+ */
+
+struct rb_root fd_id_root = RB_ROOT;
+static unsigned long fd_id_entries_subid = 1;
+
+static struct fd_id_entry *alloc_fd_id_entry(u32 genid, pid_t pid, int fd)
+{
+	struct fd_id_entry *e;
+
+	e = xmalloc(sizeof(*e));
+	if (!e)
+		goto err;
+
+	e->u.key.subid	= fd_id_entries_subid++;
+	e->u.key.genid	= genid;
+	e->pid		= pid;
+	e->fd		= fd;
+
+	/* Make sure no overflow here */
+	BUG_ON(!e->u.key.subid);
+
+	rb_init_node(&e->node);
+	rb_init_node(&e->subtree_node);
+	rb_attach_node(&e->subtree_root, &e->subtree_node);
+err:
+	return e;
+}
+
+static struct fd_id_entry *
+lookup_alloc_subtree(struct fd_id_entry *e, u32 genid, pid_t pid, int fd)
+{
+	struct rb_node *node = e->subtree_root.rb_node;
+	struct fd_id_entry *sub = NULL;
+
+	struct rb_node **new = &e->subtree_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (node) {
+		struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, subtree_node);
+		int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
+
+		parent = *new;
+		if (ret < 0)
+			node = node->rb_left, new = &((*new)->rb_left);
+		else if (ret > 0)
+			node = node->rb_right, new = &((*new)->rb_right);
+		else
+			return this;
+	}
+
+	sub = alloc_fd_id_entry(genid, pid, fd);
+	if (!sub)
+		goto err;
+
+	rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
+err:
+	return sub;
+}
+
+struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd)
+{
+	struct rb_node *node = fd_id_root.rb_node;
+	struct fd_id_entry *e = NULL;
+
+	struct rb_node **new = &fd_id_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (node) {
+		struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, node);
+
+		parent = *new;
+		if (genid < this->u.key.genid)
+			node = node->rb_left, new = &((*new)->rb_left);
+		else if (genid > this->u.key.genid)
+			node = node->rb_right, new = &((*new)->rb_right);
+		else
+			return lookup_alloc_subtree(this, genid, pid, fd);
+	}
+
+	e = alloc_fd_id_entry(genid, pid, fd);
+	if (!e)
+		goto err;
+
+	rb_link_and_balance(&fd_id_root, &e->node, parent, new);
+err:
+	return e;
+}
diff --git a/files.c b/files.c
index 56b3311..6c3d8b3 100644
--- a/files.c
+++ b/files.c
@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void)
 	return 0;
 }
 
-static struct fdinfo_desc *find_fd(char *id)
+static struct fdinfo_desc *find_fd(u64 id)
 {
 	struct fdinfo_desc *fi;
 	int i;
 
 	for (i = 0; i < nr_fdinfo_descs; i++) {
 		fi = fdinfo_descs + i;
-		if (!strncmp(fi->id, id, FD_ID_SIZE))
+		if (fi->id == id)
 			return fi;
 	}
 
@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 {
 	int i;
 	struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
-	struct fdinfo_desc	*desc;
+	struct fdinfo_desc *desc;
 
-	pr_info("Collect fdinfo pid=%d fd=%ld id=%s\n", pid, e->addr, e->id);
+	pr_info("Collect fdinfo pid=%d fd=%ld id=%16lx\n",
+		pid, e->addr, e->id);
 
 	nr_fdinfo_list++;
 	if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) {
@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 
 	for (i = 0; i < nr_fdinfo_descs; i++) {
 		desc = &fdinfo_descs[i];
-		if (strncmp(desc->id, (char *) e->id, FD_ID_SIZE))
+
+		if (desc->id != e->id)
 			continue;
 
 		fdinfo_descs[i].users++;
@@ -111,13 +113,14 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 	}
 
 	desc = &fdinfo_descs[nr_fdinfo_descs];
-	memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs]));
+	memzero(desc, sizeof(*desc));
 
-	memcpy(desc->id, e->id, FD_ID_SIZE);
-	desc->addr= e->addr;
-	desc->pid = pid;
-	desc->users = 1;
+	desc->id	= e->id;
+	desc->addr	= e->addr;
+	desc->pid	= pid;
+	desc->users	= 1;
 	INIT_LIST_HEAD(&desc->list);
+
 	list_add(&le->list, &desc->list);
 	nr_fdinfo_descs++;
 
@@ -392,7 +395,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta
 	u32 mag;
 	int ret = 0;
 
-	struct fdinfo_desc *fi = find_fd((char *)fe->id);
+	struct fdinfo_desc *fi = find_fd(fe->id);
 
 	if (move_img_fd(fdinfo_fd, (int)fe->addr))
 		return -1;
diff --git a/include/compiler.h b/include/compiler.h
index c969906..621cedb 100644
--- a/include/compiler.h
+++ b/include/compiler.h
@@ -16,6 +16,8 @@
 #define NORETURN 		__attribute__((__noreturn__))
 #define __packed		__attribute__((__packed__))
 #define __used			__attribute__((__used__))
+#define __maybe_unused		__attribute__((unused))
+#define __always_unused		__attribute__((unused))
 
 #define __section(S)		__attribute__ ((__section__(#S)))
 
diff --git a/include/file-ids.h b/include/file-ids.h
new file mode 100644
index 0000000..3001f80
--- /dev/null
+++ b/include/file-ids.h
@@ -0,0 +1,34 @@
+#ifndef FILE_IDS_H__
+#define FILE_IDS_H__
+
+#include "compiler.h"
+#include "types.h"
+#include "rbtree.h"
+
+#define FD_ID_INVALID		(-1UL)
+#define FD_PID_INVALID		((int)-2UL)
+
+struct fd_id_entry {
+	struct rb_node	node;
+
+	struct rb_root	subtree_root;
+	struct rb_node	subtree_node;
+
+	union {
+		struct {
+			u32		genid;	/* generic id, may have duplicates */
+			u32		subid;	/* subid is always unique */
+		} key;
+		u64			id;
+	} u;
+
+	pid_t		pid;
+	int		fd;
+} __aligned(sizeof(long));
+
+#define MAKE_FD_GENID(dev, ino, pos) \
+	(((u32)(dev) ^ (u32)(ino) ^ (u32)(pos)))
+
+extern struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd);
+
+#endif /* FILE_IDS_H__ */
diff --git a/include/files.h b/include/files.h
index f311e93..eb6a219 100644
--- a/include/files.h
+++ b/include/files.h
@@ -22,7 +22,7 @@ struct fmap_fd {
 };
 
 struct fdinfo_desc {
-	char			id[FD_ID_SIZE];
+	u64			id;
 	u64			addr;
 	int			pid;
 	u32			real_pid;	/* futex */
diff --git a/include/image.h b/include/image.h
index c9b7209..09d7bc4 100644
--- a/include/image.h
+++ b/include/image.h
@@ -39,15 +39,13 @@
 #define PAGE_RSS	1
 #define PAGE_ANON	2
 
-#define FD_ID_SIZE	50
-
 struct fdinfo_entry {
 	u8	type;
 	u8	len;
 	u16	flags;
 	u32	pos;
 	u64	addr;
-	u8	id[FD_ID_SIZE];
+	u64	id;
 	u8	name[0];
 } __packed;
 
diff --git a/include/rbtree.h b/include/rbtree.h
index 1d56bb0..2da5877 100644
--- a/include/rbtree.h
+++ b/include/rbtree.h
@@ -82,4 +82,13 @@ static void rb_link_node(struct rb_node *node, struct rb_node *parent,
 	*rb_link = node;
 }
 
+static void rb_link_and_balance(struct rb_root *root,
+				struct rb_node *node,
+				struct rb_node *parent,
+				struct rb_node **rb_link)
+{
+	rb_link_node(node, parent, rb_link);
+	rb_insert_color(node, root);
+}
+
 #endif /* RBTREE_H__ */


More information about the CRIU mailing list