[CRIU] Re: [PATCH 10/10] files: Use sys_kcmp to find file descriptor
duplicates
Cyrill Gorcunov
gorcunov at openvz.org
Tue Feb 28 09:29:46 EST 2012
On Tue, Feb 28, 2012 at 06:09:38PM +0400, Cyrill Gorcunov wrote:
> > > +
> > > + sub->u.key.subid = fd_id_entries_subid++;
> >
> > It's already incremented in alloc_fd_id_entry :)
>
> Sure, it's redundant, will drop.
>
> >
> > > + rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
> >
> > Plus, you can merge call to rb_link_and_balance into alloc_fd_id_entry (and rename
> > the latter to better reflect what it's doing.
> >
>
> No, i better should not. The searches are done in different trees
> (and as result different rb_link, parent and such will be there).
> So I would prefer to not mess searching/linking code.
>
This one should fit. Note I've added
alloc_fd_id_entry
...
/* Make sure no overflow here */
BUG_ON(!e->u.key.subid);
just to be sure all tuples will never ever have duplicates.
Cyrill
---
From: Cyrill Gorcunov <gorcunov at openvz.org>
Date: Tue, 28 Feb 2012 18:27:28 +0400
Subject: [PATCH] files: Use sys_kcmp to find file descriptor duplicates v4
We switch generic-object-id concept with sys_kcmp approach,
which implies changes of image format a bit (and since it's
early time for project overall, we're allowed to).
In short -- previously every file descriptor had an ID
generated by a kernel and exported via procfs. If the
appropriate file descriptors were the same objects in
kernel memory -- the IDs did match up to bit. It allows
us to figure out which files were actually the identical
ones and should be restored in a special way.
Once sys_kcmp system call was merged into the kernel,
we've got a new opprotunity -- to use this syscall instead.
The syscall basically compares kernel objects and returns
ordered results suitable for objects sorting in a userspace.
For us it means -- we treat every file descriptor as a combination
of 'genid' and 'subid'. While 'genid' serves for fast comparison
between fds, the 'subid' is kind of a second key, which guarantees
uniqueness of genid+subid tuple over all file descritors found
in a process (or group of processes).
To be able to find and dump file descriptors in a single pass we
collect every fd into a global rbtree, where (!) each node might
become a root for a subtree as well.
The main tree carries only non-equal genid. If we find genid which
is already in tree, we need to make sure that it's either indeed
a duplicate or not. For this we use sys_kcmp syscall and if we
find that file descriptors are different -- we simply put new
fd into a subtree.
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
Makefile | 1 +
cr-dump.c | 70 +++++++++++++++++++-------
cr-show.c | 5 +-
file-ids.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++
files.c | 25 +++++----
include/compiler.h | 2 +
include/file-ids.h | 34 ++++++++++++
include/files.h | 2 +-
include/image.h | 4 +-
include/rbtree.h | 9 +++
10 files changed, 262 insertions(+), 35 deletions(-)
create mode 100644 file-ids.c
create mode 100644 include/file-ids.h
diff --git a/Makefile b/Makefile
index 00d731f..4ca6cb1 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ OBJS += log.o
OBJS += libnetlink.o
OBJS += sockets.o
OBJS += files.o
+OBJS += file-ids.o
OBJS += namespaces.o
OBJS += uts_ns.o
OBJS += ipc_ns.o
diff --git a/cr-dump.c b/cr-dump.c
index c7b450a..4b4cbe2 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -20,6 +20,7 @@
#include "types.h"
#include "list.h"
+#include "file-ids.h"
#include "compiler.h"
#include "crtools.h"
@@ -31,6 +32,7 @@
#include "image.h"
#include "proc_parse.h"
#include "parasite-syscall.h"
+#include "file-ids.h"
#ifndef CONFIG_X86_64
# error No x86-32 support yet
@@ -89,7 +91,9 @@ struct fd_parms {
unsigned long fd_name;
unsigned long pos;
unsigned int flags;
- char *id;
+
+ u64 id;
+ pid_t pid;
};
static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
@@ -120,10 +124,24 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
e.flags = p->flags;
e.pos = p->pos;
e.addr = p->fd_name;
- if (p->id)
- memcpy(e.id, p->id, FD_ID_SIZE);
- else
- memzero(e.id, FD_ID_SIZE);
+
+ if (likely(!fd_is_special(&e))) {
+ struct fd_id_entry *fd_id_entry;
+
+ /*
+ * Make sure the union is still correlate with structure
+ * we write to disk.
+ */
+ BUILD_BUG_ON(sizeof(fd_id_entry->u.key) != sizeof(e.id));
+
+ fd_id_entry = fd_id_entry_collect((u32)p->id, p->pid, p->fd_name);
+ if (!fd_id_entry)
+ goto err;
+
+ /* Now it might have completely new ID here */
+ e.id = fd_id_entry->u.id;
+ } else
+ e.id = FD_ID_INVALID;
pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
type, len, p->flags, p->pos, p->fd_name);
@@ -144,7 +162,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
int fd, ret;
/* Dump /proc/pid/cwd */
- params = (struct fd_parms){ .fd_name = FDINFO_CWD, };
+ params = (struct fd_parms) {
+ .fd_name = FDINFO_CWD,
+ .id = FD_ID_INVALID,
+ .pid = FD_PID_INVALID,
+ };
+
fd = open_proc(pid, "cwd");
if (fd < 0)
return -1;
@@ -153,7 +176,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
return ret;
/* Dump /proc/pid/exe */
- params = (struct fd_parms){ .fd_name = FDINFO_EXE, };
+ params = (struct fd_parms) {
+ .fd_name = FDINFO_EXE,
+ .id = FD_ID_INVALID,
+ .pid = FD_PID_INVALID,
+ };
+
fd = open_proc(pid, "exe");
if (fd < 0)
return -1;
@@ -281,8 +309,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd,
if (S_ISREG(st_buf.st_mode) ||
S_ISDIR(st_buf.st_mode) ||
- (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR))
+ (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) {
+
+ p->id = MAKE_FD_GENID(st_buf.st_dev, st_buf.st_ino, p->pos);
+
return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1);
+ }
if (S_ISFIFO(st_buf.st_mode)) {
if (fstatfs(lfd, &stfs_buf) < 0) {
@@ -312,16 +344,19 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p)
return -1;
p->fd_name = atoi(fd);
- ret = fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", &p->pos, &p->flags, p->id);
+ ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
fclose(file);
- if (ret != 3) {
- pr_err("Bad format of fdinfo file (%d items, want 3)\n", ret);
+ if (ret != 2) {
+ pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
return -1;
}
- pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n",
- pid, fd, p->pos, p->flags, p->id);
+ pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
+ pid, fd, p->pos, p->flags);
+
+ p->pid = pid;
+ p->id = FD_ID_INVALID;
return 0;
}
@@ -352,8 +387,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
return -1;
while ((de = readdir(fd_dir))) {
- char id[FD_ID_SIZE];
- struct fd_parms p = { .id = id };
+ struct fd_parms p;
int lfd;
if (de->d_name[0] == '.')
@@ -407,9 +441,9 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
vma_entry_is(vma, VMA_FILE_SHARED)) {
struct fd_parms p = {
- .fd_name = vma->start,
- .pos = 0,
- .id = NULL,
+ .fd_name = vma->start,
+ .id = FD_ID_INVALID,
+ .pid = pid,
};
if (vma->prot & PROT_WRITE &&
diff --git a/cr-show.c b/cr-show.c
index 2f9cb84..dd3cb2f 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -85,8 +85,9 @@ static void show_files(int fd_files)
if (ret <= 0)
goto out;
- pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx id: %s",
- e.type, e.len, e.flags, e.pos, e.addr, e.id);
+ pr_info("type: %02x len: %02x flags: %4x pos: %8x "
+ "addr: %16lx id: %16lx",
+ e.type, e.len, e.flags, e.pos, e.addr, e.id);
if (e.len) {
int ret = read(fd_files, local_buf, e.len);
diff --git a/file-ids.c b/file-ids.c
new file mode 100644
index 0000000..aae3464
--- /dev/null
+++ b/file-ids.c
@@ -0,0 +1,145 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "types.h"
+#include "file-ids.h"
+#include "rbtree.h"
+
+#include "compiler.h"
+#include "syscall.h"
+#include "image.h"
+#include "util.h"
+
+/*
+ * We track shared files by global rbtree, where each node might
+ * be a root for subtree. The reason for that is the nature of data
+ * we obtain from operating system.
+ *
+ * Basically OS provides us two ways to distinguish files
+ *
+ * - information obtained from fstat call
+ * - shiny new sys_kcmp system call (which may compare the file descriptor
+ * pointers inside the kernel and provide us order info)
+ *
+ * So, to speedup procedure of searching for shared file descriptors
+ * we use both techniques. From fstat call we get that named general file
+ * IDs (genid) which are carried in the main rbtree.
+ *
+ * In case if two genid are the same -- we need to use a second way and
+ * call for sys_kcmp. Thus, if kernel tells us that files have identical
+ * genid but in real they are different from kernel point of view -- we assign
+ * a second unique key (subid) to such file descriptor and put it into a subtree.
+ *
+ * So the tree will look like
+ *
+ * (root)
+ * genid-1
+ * / \
+ * genid-2 genid-3
+ * / \ / \
+ *
+ * Where each genid node might be a sub-rbtree as well
+ *
+ * (genid-N)
+ * / \
+ * subid-1 subid-2
+ * / \ / \
+ *
+ * Carrying two rbtree at once allow us to minimize the number
+ * of sys_kcmp syscalls, also to collect and dump file descriptors
+ * in one pass.
+ */
+
+struct rb_root fd_id_root = RB_ROOT;
+static unsigned long fd_id_entries_subid = 1;
+
+static struct fd_id_entry *alloc_fd_id_entry(u32 genid, pid_t pid, int fd)
+{
+ struct fd_id_entry *e;
+
+ e = xmalloc(sizeof(*e));
+ if (!e)
+ goto err;
+
+ e->u.key.subid = fd_id_entries_subid++;
+ e->u.key.genid = genid;
+ e->pid = pid;
+ e->fd = fd;
+
+ /* Make sure no overflow here */
+ BUG_ON(!e->u.key.subid);
+
+ rb_init_node(&e->node);
+ rb_init_node(&e->subtree_node);
+ rb_attach_node(&e->subtree_root, &e->subtree_node);
+err:
+ return e;
+}
+
+static struct fd_id_entry *
+lookup_alloc_subtree(struct fd_id_entry *e, u32 genid, pid_t pid, int fd)
+{
+ struct rb_node *node = e->subtree_root.rb_node;
+ struct fd_id_entry *sub = NULL;
+
+ struct rb_node **new = &e->subtree_root.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (node) {
+ struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, subtree_node);
+ int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
+
+ parent = *new;
+ if (ret < 0)
+ node = node->rb_left, new = &((*new)->rb_left);
+ else if (ret > 0)
+ node = node->rb_right, new = &((*new)->rb_right);
+ else
+ return this;
+ }
+
+ sub = alloc_fd_id_entry(genid, pid, fd);
+ if (!sub)
+ goto err;
+
+ rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
+err:
+ return sub;
+}
+
+struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd)
+{
+ struct rb_node *node = fd_id_root.rb_node;
+ struct fd_id_entry *e = NULL;
+
+ struct rb_node **new = &fd_id_root.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (node) {
+ struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, node);
+
+ parent = *new;
+ if (genid < this->u.key.genid)
+ node = node->rb_left, new = &((*new)->rb_left);
+ else if (genid > this->u.key.genid)
+ node = node->rb_right, new = &((*new)->rb_right);
+ else
+ return lookup_alloc_subtree(this, genid, pid, fd);
+ }
+
+ e = alloc_fd_id_entry(genid, pid, fd);
+ if (!e)
+ goto err;
+
+ rb_link_and_balance(&fd_id_root, &e->node, parent, new);
+err:
+ return e;
+}
diff --git a/files.c b/files.c
index 56b3311..6c3d8b3 100644
--- a/files.c
+++ b/files.c
@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void)
return 0;
}
-static struct fdinfo_desc *find_fd(char *id)
+static struct fdinfo_desc *find_fd(u64 id)
{
struct fdinfo_desc *fi;
int i;
for (i = 0; i < nr_fdinfo_descs; i++) {
fi = fdinfo_descs + i;
- if (!strncmp(fi->id, id, FD_ID_SIZE))
+ if (fi->id == id)
return fi;
}
@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
{
int i;
struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
- struct fdinfo_desc *desc;
+ struct fdinfo_desc *desc;
- pr_info("Collect fdinfo pid=%d fd=%ld id=%s\n", pid, e->addr, e->id);
+ pr_info("Collect fdinfo pid=%d fd=%ld id=%16lx\n",
+ pid, e->addr, e->id);
nr_fdinfo_list++;
if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) {
@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
for (i = 0; i < nr_fdinfo_descs; i++) {
desc = &fdinfo_descs[i];
- if (strncmp(desc->id, (char *) e->id, FD_ID_SIZE))
+
+ if (desc->id != e->id)
continue;
fdinfo_descs[i].users++;
@@ -111,13 +113,14 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
}
desc = &fdinfo_descs[nr_fdinfo_descs];
- memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs]));
+ memzero(desc, sizeof(*desc));
- memcpy(desc->id, e->id, FD_ID_SIZE);
- desc->addr= e->addr;
- desc->pid = pid;
- desc->users = 1;
+ desc->id = e->id;
+ desc->addr = e->addr;
+ desc->pid = pid;
+ desc->users = 1;
INIT_LIST_HEAD(&desc->list);
+
list_add(&le->list, &desc->list);
nr_fdinfo_descs++;
@@ -392,7 +395,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta
u32 mag;
int ret = 0;
- struct fdinfo_desc *fi = find_fd((char *)fe->id);
+ struct fdinfo_desc *fi = find_fd(fe->id);
if (move_img_fd(fdinfo_fd, (int)fe->addr))
return -1;
diff --git a/include/compiler.h b/include/compiler.h
index c969906..621cedb 100644
--- a/include/compiler.h
+++ b/include/compiler.h
@@ -16,6 +16,8 @@
#define NORETURN __attribute__((__noreturn__))
#define __packed __attribute__((__packed__))
#define __used __attribute__((__used__))
+#define __maybe_unused __attribute__((unused))
+#define __always_unused __attribute__((unused))
#define __section(S) __attribute__ ((__section__(#S)))
diff --git a/include/file-ids.h b/include/file-ids.h
new file mode 100644
index 0000000..3001f80
--- /dev/null
+++ b/include/file-ids.h
@@ -0,0 +1,34 @@
+#ifndef FILE_IDS_H__
+#define FILE_IDS_H__
+
+#include "compiler.h"
+#include "types.h"
+#include "rbtree.h"
+
+#define FD_ID_INVALID (-1UL)
+#define FD_PID_INVALID ((int)-2UL)
+
+struct fd_id_entry {
+ struct rb_node node;
+
+ struct rb_root subtree_root;
+ struct rb_node subtree_node;
+
+ union {
+ struct {
+ u32 genid; /* generic id, may have duplicates */
+ u32 subid; /* subid is always unique */
+ } key;
+ u64 id;
+ } u;
+
+ pid_t pid;
+ int fd;
+} __aligned(sizeof(long));
+
+#define MAKE_FD_GENID(dev, ino, pos) \
+ (((u32)(dev) ^ (u32)(ino) ^ (u32)(pos)))
+
+extern struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd);
+
+#endif /* FILE_IDS_H__ */
diff --git a/include/files.h b/include/files.h
index f311e93..eb6a219 100644
--- a/include/files.h
+++ b/include/files.h
@@ -22,7 +22,7 @@ struct fmap_fd {
};
struct fdinfo_desc {
- char id[FD_ID_SIZE];
+ u64 id;
u64 addr;
int pid;
u32 real_pid; /* futex */
diff --git a/include/image.h b/include/image.h
index c9b7209..09d7bc4 100644
--- a/include/image.h
+++ b/include/image.h
@@ -39,15 +39,13 @@
#define PAGE_RSS 1
#define PAGE_ANON 2
-#define FD_ID_SIZE 50
-
struct fdinfo_entry {
u8 type;
u8 len;
u16 flags;
u32 pos;
u64 addr;
- u8 id[FD_ID_SIZE];
+ u64 id;
u8 name[0];
} __packed;
diff --git a/include/rbtree.h b/include/rbtree.h
index 1d56bb0..2da5877 100644
--- a/include/rbtree.h
+++ b/include/rbtree.h
@@ -82,4 +82,13 @@ static void rb_link_node(struct rb_node *node, struct rb_node *parent,
*rb_link = node;
}
+static void rb_link_and_balance(struct rb_root *root,
+ struct rb_node *node,
+ struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ rb_link_node(node, parent, rb_link);
+ rb_insert_color(node, root);
+}
+
#endif /* RBTREE_H__ */
--
1.7.7.6
More information about the CRIU
mailing list