[CRIU] Re: [PATCH 10/10] files: Use sys_kcmp to find file descriptor
duplicates
Cyrill Gorcunov
gorcunov at openvz.org
Tue Feb 28 08:10:28 EST 2012
On Tue, Feb 28, 2012 at 02:26:31PM +0400, Pavel Emelyanov wrote:
> On 02/27/2012 07:21 PM, Cyrill Gorcunov wrote:
> > On Mon, Feb 27, 2012 at 06:01:41PM +0400, Cyrill Gorcunov wrote:
> >>
> >> Yes, 0 will work as well but I would prefer to keep some predefined
> >> value other than 0 which actually gives us a good hint in debugging
> >> purpose.
> >>
> >
> > Does this one look better?
>
> Almost perfect. See comments inline.
>
The final one.
Cyrill
---
From: Cyrill Gorcunov <gorcunov at openvz.org>
Date: Tue, 28 Feb 2012 17:07:43 +0400
Subject: [PATCH] files: Use sys_kcmp to find file descriptor duplicates v3
We switch generic-object-id concept with sys_kcmp approach,
which implies changes of image format a bit (and since it's
early time for project overall, we're allowed to).
In short -- previously every file descriptor had an ID
generated by a kernel and exported via procfs. If the
appropriate file descriptors were the same objects in
kernel memory -- the IDs did match up to bit. It allows
us to figure out which files are actually the identical
ones and should be restored in a special way.
Once sys_kcmp system call were merged into the kernel,
we've got a new opprotunity -- to use this call instead.
The syscall basically compare kernel objects and returns
ordered results suitable for objects sorting in userspace.
For us it mens we treat every file descriptor as a combination
of 'genid' and 'subid'. While 'genid' is serves for fast comparison
between fds, the 'subid' is kind of a second key, which guarantees
uniqueness of genid+subid tuple over all file descritors found in process
(or group of processes).
To be able to find and dump file descriptors in a single pass we collect
every fd we found into a global rbtree, where (!) each node might become
a root for a subtree as well.
The main tree carries only primary ids, ie nonequal ones. If we find
some id which is already in tree, we need to make sure that it's
either indeed a duplicate or not.
For this we use sys_kcmp syscall and if we find that in real the file
descriptors are different -- we simply add a unique 'subid' value and
put new fd into a subtree.
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
Makefile | 1 +
cr-dump.c | 70 +++++++++++++++++++-------
cr-show.c | 5 +-
file-ids.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++
files.c | 25 +++++----
include/compiler.h | 2 +
include/file-ids.h | 34 ++++++++++++
include/files.h | 2 +-
include/image.h | 4 +-
include/rbtree.h | 9 +++
10 files changed, 261 insertions(+), 35 deletions(-)
create mode 100644 file-ids.c
create mode 100644 include/file-ids.h
diff --git a/Makefile b/Makefile
index 00d731f..4ca6cb1 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ OBJS += log.o
OBJS += libnetlink.o
OBJS += sockets.o
OBJS += files.o
+OBJS += file-ids.o
OBJS += namespaces.o
OBJS += uts_ns.o
OBJS += ipc_ns.o
diff --git a/cr-dump.c b/cr-dump.c
index c7b450a..4b4cbe2 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -20,6 +20,7 @@
#include "types.h"
#include "list.h"
+#include "file-ids.h"
#include "compiler.h"
#include "crtools.h"
@@ -31,6 +32,7 @@
#include "image.h"
#include "proc_parse.h"
#include "parasite-syscall.h"
+#include "file-ids.h"
#ifndef CONFIG_X86_64
# error No x86-32 support yet
@@ -89,7 +91,9 @@ struct fd_parms {
unsigned long fd_name;
unsigned long pos;
unsigned int flags;
- char *id;
+
+ u64 id;
+ pid_t pid;
};
static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
@@ -120,10 +124,24 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
e.flags = p->flags;
e.pos = p->pos;
e.addr = p->fd_name;
- if (p->id)
- memcpy(e.id, p->id, FD_ID_SIZE);
- else
- memzero(e.id, FD_ID_SIZE);
+
+ if (likely(!fd_is_special(&e))) {
+ struct fd_id_entry *fd_id_entry;
+
+ /*
+ * Make sure the union is still correlate with structure
+ * we write to disk.
+ */
+ BUILD_BUG_ON(sizeof(fd_id_entry->u.key) != sizeof(e.id));
+
+ fd_id_entry = fd_id_entry_collect((u32)p->id, p->pid, p->fd_name);
+ if (!fd_id_entry)
+ goto err;
+
+ /* Now it might have completely new ID here */
+ e.id = fd_id_entry->u.id;
+ } else
+ e.id = FD_ID_INVALID;
pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
type, len, p->flags, p->pos, p->fd_name);
@@ -144,7 +162,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
int fd, ret;
/* Dump /proc/pid/cwd */
- params = (struct fd_parms){ .fd_name = FDINFO_CWD, };
+ params = (struct fd_parms) {
+ .fd_name = FDINFO_CWD,
+ .id = FD_ID_INVALID,
+ .pid = FD_PID_INVALID,
+ };
+
fd = open_proc(pid, "cwd");
if (fd < 0)
return -1;
@@ -153,7 +176,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
return ret;
/* Dump /proc/pid/exe */
- params = (struct fd_parms){ .fd_name = FDINFO_EXE, };
+ params = (struct fd_parms) {
+ .fd_name = FDINFO_EXE,
+ .id = FD_ID_INVALID,
+ .pid = FD_PID_INVALID,
+ };
+
fd = open_proc(pid, "exe");
if (fd < 0)
return -1;
@@ -281,8 +309,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd,
if (S_ISREG(st_buf.st_mode) ||
S_ISDIR(st_buf.st_mode) ||
- (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR))
+ (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) {
+
+ p->id = MAKE_FD_GENID(st_buf.st_dev, st_buf.st_ino, p->pos);
+
return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1);
+ }
if (S_ISFIFO(st_buf.st_mode)) {
if (fstatfs(lfd, &stfs_buf) < 0) {
@@ -312,16 +344,19 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p)
return -1;
p->fd_name = atoi(fd);
- ret = fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", &p->pos, &p->flags, p->id);
+ ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
fclose(file);
- if (ret != 3) {
- pr_err("Bad format of fdinfo file (%d items, want 3)\n", ret);
+ if (ret != 2) {
+ pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
return -1;
}
- pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n",
- pid, fd, p->pos, p->flags, p->id);
+ pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
+ pid, fd, p->pos, p->flags);
+
+ p->pid = pid;
+ p->id = FD_ID_INVALID;
return 0;
}
@@ -352,8 +387,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
return -1;
while ((de = readdir(fd_dir))) {
- char id[FD_ID_SIZE];
- struct fd_parms p = { .id = id };
+ struct fd_parms p;
int lfd;
if (de->d_name[0] == '.')
@@ -407,9 +441,9 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
vma_entry_is(vma, VMA_FILE_SHARED)) {
struct fd_parms p = {
- .fd_name = vma->start,
- .pos = 0,
- .id = NULL,
+ .fd_name = vma->start,
+ .id = FD_ID_INVALID,
+ .pid = pid,
};
if (vma->prot & PROT_WRITE &&
diff --git a/cr-show.c b/cr-show.c
index 2f9cb84..dd3cb2f 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -85,8 +85,9 @@ static void show_files(int fd_files)
if (ret <= 0)
goto out;
- pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx id: %s",
- e.type, e.len, e.flags, e.pos, e.addr, e.id);
+ pr_info("type: %02x len: %02x flags: %4x pos: %8x "
+ "addr: %16lx id: %16lx",
+ e.type, e.len, e.flags, e.pos, e.addr, e.id);
if (e.len) {
int ret = read(fd_files, local_buf, e.len);
diff --git a/file-ids.c b/file-ids.c
new file mode 100644
index 0000000..b10d260
--- /dev/null
+++ b/file-ids.c
@@ -0,0 +1,144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "types.h"
+#include "file-ids.h"
+#include "rbtree.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "image.h"
+#include "util.h"
+
+/*
+ * We track shared files by global rbtree, where each node might
+ * be a root for subtree. The reason for that is the nature of data
+ * we obtain from operating system.
+ *
+ * Basically OS provides us two ways to distinguish files
+ *
+ * - information obtained from fstat call
+ * - shiny new sys_kcmp system call (which may compare the file descriptor
+ * pointers inside the kernel and provide us order info)
+ *
+ * So, to speedup procedure of searching for shared file descriptors
+ * we use both techniques. From fstat call we get that named general file
+ * IDs (genid) which are carried in the main rbtree.
+ *
+ * In case if two genid are the same -- we need to use a second way and
+ * call for sys_kcmp. Thus, if kernel tells us that files have identical
+ * genid but in real they are different from kernel point of view -- we assign
+ * a second unique key (subid) to such file descriptor and put it into a subtree.
+ *
+ * So the tree will look like
+ *
+ * (root)
+ * genid-1
+ * / \
+ * genid-2 genid-3
+ * / \ / \
+ *
+ * Where each genid node might be a sub-rbtree as well
+ *
+ * (genid-N)
+ * / \
+ * subid-1 subid-2
+ * / \ / \
+ *
+ * Carrying two rbtree at once allow us to minimize the number
+ * of sys_kcmp syscalls, also to collect and dump file descriptors
+ * in one pass.
+ */
+
+struct rb_root fd_id_root = RB_ROOT;
+static unsigned long fd_id_entries_subid = 1;
+
+static struct fd_id_entry *alloc_fd_id_entry(u32 genid, pid_t pid, int fd)
+{
+ struct fd_id_entry *e;
+
+ e = xmalloc(sizeof(*e));
+ if (!e)
+ goto err;
+
+ e->u.key.subid = fd_id_entries_subid++; /* just for the symmetry */
+ e->u.key.genid = genid;
+ e->pid = pid;
+ e->fd = fd;
+
+ rb_init_node(&e->node);
+ rb_init_node(&e->subtree_node);
+ rb_attach_node(&e->subtree_root, &e->subtree_node);
+err:
+ return e;
+}
+
+static struct fd_id_entry *
+lookup_alloc_subtree(struct fd_id_entry *e, u32 genid, pid_t pid, int fd)
+{
+ struct rb_node *node = e->subtree_root.rb_node;
+ struct fd_id_entry *sub = NULL;
+
+ struct rb_node **new = &e->subtree_root.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (node) {
+ struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, subtree_node);
+ int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
+
+ parent = *new;
+ if (ret < 0)
+ node = node->rb_left, new = &((*new)->rb_left);
+ else if (ret > 0)
+ node = node->rb_right, new = &((*new)->rb_right);
+ else
+ return this;
+ }
+
+ sub = alloc_fd_id_entry(genid, pid, fd);
+ if (!sub)
+ goto err;
+
+ sub->u.key.subid = fd_id_entries_subid++;
+
+ rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
+err:
+ return sub;
+}
+
+struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd)
+{
+ struct rb_node *node = fd_id_root.rb_node;
+ struct fd_id_entry *e = NULL;
+
+ struct rb_node **new = &fd_id_root.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (node) {
+ struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, node);
+
+ parent = *new;
+ if (genid < this->u.key.genid)
+ node = node->rb_left, new = &((*new)->rb_left);
+ else if (genid > this->u.key.genid)
+ node = node->rb_right, new = &((*new)->rb_right);
+ else
+ return lookup_alloc_subtree(this, genid, pid, fd);
+ }
+
+ e = alloc_fd_id_entry(genid, pid, fd);
+ if (!e)
+ goto err;
+
+ rb_link_and_balance(&fd_id_root, &e->node, parent, new);
+err:
+ return e;
+}
diff --git a/files.c b/files.c
index 56b3311..6c3d8b3 100644
--- a/files.c
+++ b/files.c
@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void)
return 0;
}
-static struct fdinfo_desc *find_fd(char *id)
+static struct fdinfo_desc *find_fd(u64 id)
{
struct fdinfo_desc *fi;
int i;
for (i = 0; i < nr_fdinfo_descs; i++) {
fi = fdinfo_descs + i;
- if (!strncmp(fi->id, id, FD_ID_SIZE))
+ if (fi->id == id)
return fi;
}
@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
{
int i;
struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
- struct fdinfo_desc *desc;
+ struct fdinfo_desc *desc;
- pr_info("Collect fdinfo pid=%d fd=%ld id=%s\n", pid, e->addr, e->id);
+ pr_info("Collect fdinfo pid=%d fd=%ld id=%16lx\n",
+ pid, e->addr, e->id);
nr_fdinfo_list++;
if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) {
@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
for (i = 0; i < nr_fdinfo_descs; i++) {
desc = &fdinfo_descs[i];
- if (strncmp(desc->id, (char *) e->id, FD_ID_SIZE))
+
+ if (desc->id != e->id)
continue;
fdinfo_descs[i].users++;
@@ -111,13 +113,14 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
}
desc = &fdinfo_descs[nr_fdinfo_descs];
- memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs]));
+ memzero(desc, sizeof(*desc));
- memcpy(desc->id, e->id, FD_ID_SIZE);
- desc->addr= e->addr;
- desc->pid = pid;
- desc->users = 1;
+ desc->id = e->id;
+ desc->addr = e->addr;
+ desc->pid = pid;
+ desc->users = 1;
INIT_LIST_HEAD(&desc->list);
+
list_add(&le->list, &desc->list);
nr_fdinfo_descs++;
@@ -392,7 +395,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta
u32 mag;
int ret = 0;
- struct fdinfo_desc *fi = find_fd((char *)fe->id);
+ struct fdinfo_desc *fi = find_fd(fe->id);
if (move_img_fd(fdinfo_fd, (int)fe->addr))
return -1;
diff --git a/include/compiler.h b/include/compiler.h
index c969906..621cedb 100644
--- a/include/compiler.h
+++ b/include/compiler.h
@@ -16,6 +16,8 @@
#define NORETURN __attribute__((__noreturn__))
#define __packed __attribute__((__packed__))
#define __used __attribute__((__used__))
+#define __maybe_unused __attribute__((unused))
+#define __always_unused __attribute__((unused))
#define __section(S) __attribute__ ((__section__(#S)))
diff --git a/include/file-ids.h b/include/file-ids.h
new file mode 100644
index 0000000..3001f80
--- /dev/null
+++ b/include/file-ids.h
@@ -0,0 +1,34 @@
+#ifndef FILE_IDS_H__
+#define FILE_IDS_H__
+
+#include "compiler.h"
+#include "types.h"
+#include "rbtree.h"
+
+#define FD_ID_INVALID (-1UL)
+#define FD_PID_INVALID ((int)-2UL)
+
+struct fd_id_entry {
+ struct rb_node node;
+
+ struct rb_root subtree_root;
+ struct rb_node subtree_node;
+
+ union {
+ struct {
+ u32 genid; /* generic id, may have duplicates */
+ u32 subid; /* subid is always unique */
+ } key;
+ u64 id;
+ } u;
+
+ pid_t pid;
+ int fd;
+} __aligned(sizeof(long));
+
+#define MAKE_FD_GENID(dev, ino, pos) \
+ (((u32)(dev) ^ (u32)(ino) ^ (u32)(pos)))
+
+extern struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd);
+
+#endif /* FILE_IDS_H__ */
diff --git a/include/files.h b/include/files.h
index f311e93..eb6a219 100644
--- a/include/files.h
+++ b/include/files.h
@@ -22,7 +22,7 @@ struct fmap_fd {
};
struct fdinfo_desc {
- char id[FD_ID_SIZE];
+ u64 id;
u64 addr;
int pid;
u32 real_pid; /* futex */
diff --git a/include/image.h b/include/image.h
index c9b7209..09d7bc4 100644
--- a/include/image.h
+++ b/include/image.h
@@ -39,15 +39,13 @@
#define PAGE_RSS 1
#define PAGE_ANON 2
-#define FD_ID_SIZE 50
-
struct fdinfo_entry {
u8 type;
u8 len;
u16 flags;
u32 pos;
u64 addr;
- u8 id[FD_ID_SIZE];
+ u64 id;
u8 name[0];
} __packed;
diff --git a/include/rbtree.h b/include/rbtree.h
index 1d56bb0..2da5877 100644
--- a/include/rbtree.h
+++ b/include/rbtree.h
@@ -82,4 +82,13 @@ static void rb_link_node(struct rb_node *node, struct rb_node *parent,
*rb_link = node;
}
+static void rb_link_and_balance(struct rb_root *root,
+ struct rb_node *node,
+ struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ rb_link_node(node, parent, rb_link);
+ rb_insert_color(node, root);
+}
+
#endif /* RBTREE_H__ */
--
1.7.7.6
More information about the CRIU
mailing list