[CRIU] Re: [PATCH 10/10] files: Use sys_kcmp to find file descriptor duplicates

Cyrill Gorcunov gorcunov at openvz.org
Mon Feb 27 07:43:53 EST 2012


On Mon, Feb 27, 2012 at 11:26:58AM +0400, Pavel Emelyanov wrote:
> > +void fd_id_entry_init(void)
> > +{
> > +       /* Has nothing to do yet */
> > +}
> > +
> > +void fd_id_entry_fini(void)
> > +{
> > +       /* Clear fd_id_entries */
> > +       fd_id_entries_salt = 1;
> > +}
> 
> These two are useless. Plz, remove.
> 

Pavel, take a look please, I've tried to address
all your concerns but we can't squash ID into one
u64 number, since offset if file is actually 8 byte
value, so I renamed id to genid and subid. What
do you think?

	Cyrill
---
From: Cyrill Gorcunov <gorcunov at openvz.org>
Subject: [PATCH 10/10] files: Use sys_kcmp to find file descriptor duplicates v2

We switch generic-object-id concept with sys_kcmp approach,
which implies changes of image format a bit (and since it's
early time for project overall, we're allowed to).

In short -- previously every file descriptor had an ID
generated by a kernel and exported via procfs. If the
appropriate file descriptors were the same objects in
kernel memory -- the IDs did match up to bit. It allows
us to figure out which files are actually the identical
ones and should be restored in a special way.

Once sys_kcmp system call were merged into the kernel,
we've got a new opprotunity -- to use this call instead.
The syscall basically compare kernel objects and returns
ordered results suitable for objects sorting in userspace.

For us it mens we treat every file descriptor as a combination
of 'genid' and 'subid'. While 'genid' is serves for fast comparison
between fds, the 'subid' is kind of a second key, which guarantees
uniqueness of genid+subid tuple over all file descritors found in process
(or group of processes).

To be able to find and dump file descriptors in a single pass we collect
every fd we found into a global rbtree, where (!) each node might become
a root for a subtree as well.

The main tree carries only primary ids, ie nonequal ones. If we find
some id which is already in tree, we need to make sure that it's
either indeed a duplicate or not.

For this we use sys_kcmp syscall and if we find that in real the file
descriptors are different -- we simply add a unique 'subid' value and
put new fd into a subtree.

v2:
 - drop debug code
 - drop reundant functions
 - rename id and salt to genid and subid,
   since position in file is 8 bytes value
   we can't shrink structure and pack everything
   into one 64 bit field

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 Makefile           |    1 +
 cr-dump.c          |   74 +++++++++++++++++++++++++--------
 cr-show.c          |    6 ++-
 file-ids.c         |  115 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 files.c            |   26 +++++++-----
 include/file-ids.h |   31 ++++++++++++++
 include/files.h    |    3 +-
 include/image.h    |    5 +-
 8 files changed, 226 insertions(+), 35 deletions(-)
 create mode 100644 file-ids.c
 create mode 100644 include/file-ids.h

diff --git a/Makefile b/Makefile
index 00d731f..4ca6cb1 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ OBJS		+= log.o
 OBJS		+= libnetlink.o
 OBJS		+= sockets.o
 OBJS		+= files.o
+OBJS		+= file-ids.o
 OBJS		+= namespaces.o
 OBJS		+= uts_ns.o
 OBJS		+= ipc_ns.o
diff --git a/cr-dump.c b/cr-dump.c
index c7b450a..e0816aa 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -20,6 +20,7 @@
 
 #include "types.h"
 #include "list.h"
+#include "file-ids.h"
 
 #include "compiler.h"
 #include "crtools.h"
@@ -31,6 +32,7 @@
 #include "image.h"
 #include "proc_parse.h"
 #include "parasite-syscall.h"
+#include "file-ids.h"
 
 #ifndef CONFIG_X86_64
 # error No x86-32 support yet
@@ -89,7 +91,10 @@ struct fd_parms {
 	unsigned long	fd_name;
 	unsigned long	pos;
 	unsigned int	flags;
-	char		*id;
+
+	u64		genid;
+	u64		subid;
+	pid_t		pid;
 };
 
 static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
@@ -120,10 +125,23 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
 	e.flags = p->flags;
 	e.pos	= p->pos;
 	e.addr	= p->fd_name;
-	if (p->id)
-		memcpy(e.id, p->id, FD_ID_SIZE);
-	else
-		memzero(e.id, FD_ID_SIZE);
+
+	if (likely(!fd_is_special(&e))) {
+		struct fd_id_entry *fd_id_entry;
+
+		fd_id_entry = fd_id_entry_collect(p->genid, p->pid, p->fd_name);
+		if (!fd_id_entry) {
+			pr_err("No memory to collect fidinfo for file %li (pid %d)\n",
+				p->fd_name, p->pid);
+			goto err;
+		}
+
+		e.genid	= fd_id_entry->genid;
+		e.subid	= fd_id_entry->subid;
+	} else {
+		e.genid	= FD_GENID_INVALID;
+		e.subid	= FD_SUBID_INVALID;
+	}
 
 	pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
 		type, len, p->flags, p->pos, p->fd_name);
@@ -144,7 +162,13 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
 	int fd, ret;
 
 	/* Dump /proc/pid/cwd */
-	params = (struct fd_parms){ .fd_name = FDINFO_CWD, };
+	params = (struct fd_parms) {
+		.fd_name	= FDINFO_CWD,
+		.genid		= FD_GENID_INVALID,
+		.subid		= FD_SUBID_INVALID,
+		.pid		= FD_PID_INVALID,
+	};
+
 	fd = open_proc(pid, "cwd");
 	if (fd < 0)
 		return -1;
@@ -153,7 +177,13 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
 		return ret;
 
 	/* Dump /proc/pid/exe */
-	params = (struct fd_parms){ .fd_name = FDINFO_EXE, };
+	params = (struct fd_parms) {
+		.fd_name	= FDINFO_EXE,
+		.genid		= FD_GENID_INVALID,
+		.subid		= FD_SUBID_INVALID,
+		.pid		= FD_PID_INVALID,
+	};
+
 	fd = open_proc(pid, "exe");
 	if (fd < 0)
 		return -1;
@@ -281,8 +311,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd,
 
 	if (S_ISREG(st_buf.st_mode) ||
 	    S_ISDIR(st_buf.st_mode) ||
-	    (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR))
+	    (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) {
+
+		p->genid = MAKE_FD_GENID(st_buf.st_dev, st_buf.st_ino, p->pos);
+
 		return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1);
+	}
 
 	if (S_ISFIFO(st_buf.st_mode)) {
 		if (fstatfs(lfd, &stfs_buf) < 0) {
@@ -312,16 +346,20 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p)
 		return -1;
 
 	p->fd_name = atoi(fd);
-	ret = fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", &p->pos, &p->flags, p->id);
+	ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
 	fclose(file);
 
-	if (ret != 3) {
-		pr_err("Bad format of fdinfo file (%d items, want 3)\n", ret);
+	if (ret != 2) {
+		pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
 		return -1;
 	}
 
-	pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n",
-		pid, fd, p->pos, p->flags, p->id);
+	pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
+		pid, fd, p->pos, p->flags);
+
+	p->pid	= pid;
+	p->genid= FD_GENID_INVALID;
+	p->subid= FD_SUBID_INVALID;
 
 	return 0;
 }
@@ -352,8 +390,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
 		return -1;
 
 	while ((de = readdir(fd_dir))) {
-		char id[FD_ID_SIZE];
-		struct fd_parms p = { .id = id };
+		struct fd_parms p;
 		int lfd;
 
 		if (de->d_name[0] == '.')
@@ -407,9 +444,10 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
 			} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
 				   vma_entry_is(vma, VMA_FILE_SHARED)) {
 				struct fd_parms p = {
-					.fd_name = vma->start,
-					.pos = 0,
-					.id = NULL,
+					.fd_name	= vma->start,
+					.genid		= FD_GENID_INVALID,
+					.subid		= FD_SUBID_INVALID,
+					.pid		= pid,
 				};
 
 				if (vma->prot & PROT_WRITE &&
diff --git a/cr-show.c b/cr-show.c
index 2f9cb84..b911cac 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -85,8 +85,10 @@ static void show_files(int fd_files)
 		if (ret <= 0)
 			goto out;
 
-		pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx id: %s",
-				e.type, e.len, e.flags, e.pos, e.addr, e.id);
+		pr_info("type: %02x len: %02x flags: %4x pos: %8x "
+			"addr: %16lx genid: %16lx subid: %16lx",
+			e.type, e.len, e.flags, e.pos, e.addr,
+			e.genid, e.subid);
 
 		if (e.len) {
 			int ret = read(fd_files, local_buf, e.len);
diff --git a/file-ids.c b/file-ids.c
new file mode 100644
index 0000000..7dcaa2a
--- /dev/null
+++ b/file-ids.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "types.h"
+#include "file-ids.h"
+#include "rbtree.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "util.h"
+
+struct rb_root fd_id_root = RB_ROOT;
+static unsigned long fd_id_entries_subid = 1;
+
+static struct fd_id_entry *alloc_fd_id_entry(u64 genid, pid_t pid, int fd)
+{
+	struct fd_id_entry *e;
+
+	e = xmalloc(sizeof(*e));
+	if (!e) {
+		pr_err("No memory to allocate fd_id_entry: "
+		       "genid %16lx pid %6d fd %4d\n",
+			genid, pid, fd);
+		goto err;
+	}
+
+	e->subid= FD_SUBID_INVALID;
+	e->genid= genid;
+	e->pid	= pid;
+	e->fd	= fd;
+
+	rb_init_node(&e->node);
+	rb_init_node(&e->subtree_node);
+	rb_attach_node(&e->subtree_root, &e->subtree_node);
+err:
+	return e;
+}
+
+static struct fd_id_entry *
+lookup_alloc_subtree(struct fd_id_entry *e, u64 genid, pid_t pid, int fd)
+{
+	struct rb_node *node = e->subtree_root.rb_node;
+	struct fd_id_entry *sub = NULL;
+
+	struct rb_node **new = &e->subtree_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (node) {
+		struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, subtree_node);
+		int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
+
+		parent = *new;
+		if (ret < 0)
+			node = node->rb_left, new = &((*new)->rb_left);
+		else if (ret > 0)
+			node = node->rb_right, new = &((*new)->rb_right);
+		else
+			return this;
+	}
+
+	sub = alloc_fd_id_entry(genid, pid, fd);
+	if (!sub)
+		goto err;
+
+	sub->subid = fd_id_entries_subid++;
+
+	rb_link_node(&sub->subtree_node, parent, new);
+	rb_insert_color(&sub->subtree_node, &e->subtree_root);
+err:
+	return sub;
+}
+
+static struct fd_id_entry *
+lookup_alloc(u64 genid, pid_t pid, int fd)
+{
+	struct rb_node *node = fd_id_root.rb_node;
+	struct fd_id_entry *e = NULL;
+
+	struct rb_node **new = &fd_id_root.rb_node;
+	struct rb_node *parent = NULL;
+
+	while (node) {
+		struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, node);
+
+		parent = *new;
+		if (genid < this->genid)
+			node = node->rb_left, new = &((*new)->rb_left);
+		else if (genid > this->genid)
+			node = node->rb_right, new = &((*new)->rb_right);
+		else
+			return lookup_alloc_subtree(this, genid, pid, fd);
+	}
+
+	e = alloc_fd_id_entry(genid, pid, fd);
+	if (!e)
+		goto err;
+
+	rb_link_node(&e->node, parent, new);
+	rb_insert_color(&e->node, &fd_id_root);
+err:
+	return e;
+}
+
+struct fd_id_entry *fd_id_entry_collect(u64 genid, pid_t pid, int fd)
+{
+	return lookup_alloc(genid, pid, fd);
+}
diff --git a/files.c b/files.c
index 56b3311..3137ca6 100644
--- a/files.c
+++ b/files.c
@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void)
 	return 0;
 }
 
-static struct fdinfo_desc *find_fd(char *id)
+static struct fdinfo_desc *find_fd(u64 genid, u64 subid)
 {
 	struct fdinfo_desc *fi;
 	int i;
 
 	for (i = 0; i < nr_fdinfo_descs; i++) {
 		fi = fdinfo_descs + i;
-		if (!strncmp(fi->id, id, FD_ID_SIZE))
+		if (fi->genid == genid && fi->subid == subid)
 			return fi;
 	}
 
@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 {
 	int i;
 	struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
-	struct fdinfo_desc	*desc;
+	struct fdinfo_desc *desc;
 
-	pr_info("Collect fdinfo pid=%d fd=%ld id=%s\n", pid, e->addr, e->id);
+	pr_info("Collect fdinfo pid=%d fd=%ld genid=%16lx subid=%16lx\n",
+		pid, e->addr, e->genid, e->subid);
 
 	nr_fdinfo_list++;
 	if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) {
@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 
 	for (i = 0; i < nr_fdinfo_descs; i++) {
 		desc = &fdinfo_descs[i];
-		if (strncmp(desc->id, (char *) e->id, FD_ID_SIZE))
+
+		if (desc->genid != e->genid || desc->subid != e->subid)
 			continue;
 
 		fdinfo_descs[i].users++;
@@ -111,13 +113,15 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
 	}
 
 	desc = &fdinfo_descs[nr_fdinfo_descs];
-	memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs]));
+	memzero(desc, sizeof(*desc));
 
-	memcpy(desc->id, e->id, FD_ID_SIZE);
-	desc->addr= e->addr;
-	desc->pid = pid;
-	desc->users = 1;
+	desc->genid	= e->genid;
+	desc->subid	= e->subid;
+	desc->addr	= e->addr;
+	desc->pid	= pid;
+	desc->users	= 1;
 	INIT_LIST_HEAD(&desc->list);
+
 	list_add(&le->list, &desc->list);
 	nr_fdinfo_descs++;
 
@@ -392,7 +396,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta
 	u32 mag;
 	int ret = 0;
 
-	struct fdinfo_desc *fi = find_fd((char *)fe->id);
+	struct fdinfo_desc *fi = find_fd(fe->genid, fe->subid);
 
 	if (move_img_fd(fdinfo_fd, (int)fe->addr))
 		return -1;
diff --git a/include/file-ids.h b/include/file-ids.h
new file mode 100644
index 0000000..6d712a2
--- /dev/null
+++ b/include/file-ids.h
@@ -0,0 +1,31 @@
+#ifndef FILE_IDS_H__
+#define FILE_IDS_H__
+
+#include "compiler.h"
+#include "types.h"
+#include "rbtree.h"
+
+#define FD_GENID_INVALID	(-1UL)
+#define FD_SUBID_INVALID	(-2UL)
+#define FD_PID_INVALID		((int)-3UL)
+
+struct fd_id_entry {
+	struct rb_node	node;
+
+	struct rb_root	subtree_root;
+	struct rb_node	subtree_node;
+
+	u64		genid;	/* generic id, may have duplicates */
+	u64		subid;	/* subid is always unique */
+
+	pid_t		pid;
+	int		fd;
+} __aligned(sizeof(long));
+
+/* Position in file is u64 so we can't shrink it */
+#define MAKE_FD_GENID(dev, ino, pos) \
+	((u64)(dev) ^ (u64)(ino) ^ (u64)(pos))
+
+extern struct fd_id_entry *fd_id_entry_collect(u64 genid, pid_t pid, int fd);
+
+#endif /* FILE_IDS_H__ */
diff --git a/include/files.h b/include/files.h
index f311e93..707e270 100644
--- a/include/files.h
+++ b/include/files.h
@@ -22,7 +22,8 @@ struct fmap_fd {
 };
 
 struct fdinfo_desc {
-	char			id[FD_ID_SIZE];
+	u64			genid;
+	u64			subid;
 	u64			addr;
 	int			pid;
 	u32			real_pid;	/* futex */
diff --git a/include/image.h b/include/image.h
index c9b7209..7e5d6e6 100644
--- a/include/image.h
+++ b/include/image.h
@@ -39,15 +39,14 @@
 #define PAGE_RSS	1
 #define PAGE_ANON	2
 
-#define FD_ID_SIZE	50
-
 struct fdinfo_entry {
 	u8	type;
 	u8	len;
 	u16	flags;
 	u32	pos;
 	u64	addr;
-	u8	id[FD_ID_SIZE];
+	u64	genid;
+	u64	subid;
 	u8	name[0];
 } __packed;
 
-- 
1.7.7.6



More information about the CRIU mailing list