[CRIU] [RFC] fds TX/RX

Cyrill Gorcunov gorcunov at openvz.org
Fri Mar 16 09:29:47 EDT 2012


Hi,

here is a patch I would like to discuss. It handles
tx/rx of file descriptors. Sometimes in make tests
I see that the log file was not closed properly after
restore; I'm working on it now. But to draw the overall
picture, the patch is below.

	Cyrill
---
From: Cyrill Gorcunov <gorcunov at openvz.org>
Date: Fri, 16 Mar 2012 17:19:35 +0400
Subject: [PATCH] dump: Transfer file descriptors the dumpee has into our
 space via SCM

There was an idea from Pavel to move all file descriptors the dumpee
has into our space via the SCM facility. Then we can do anything we need
with the descriptors directly without calling parasite code anymore.

In particular it will be needed for fowners dumping.

While the patch looks big, in reality it does the following
simple steps

 - at the start of the dumping procedure crtools scans the dumpee's
   /proc/pid/fd directory and collects the fds found
   there

 - then crtools injects the parasite code and passes the set of
   collected fds into the parasite; the parasite in turn sends
   these descriptors back to our space via SCM messages

 - once the fds are received, the further dumping code operates
   on our new 'local' fds.

To make this all happen the following structures were brought in

 - struct fds_scm_map which carries both original and new file
   descriptors with 1:1 map

 - struct scm_fdset which serves for fast transmission/receiving
   of file descriptors

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 Makefile                   |    1 +
 cr-dump.c                  |  261 +++++++++++++++++++++++---------------------
 fds-scm-map.c              |  140 +++++++++++++++++++++++
 include/fds-scm-map.h      |   22 ++++
 include/parasite-syscall.h |    1 +
 include/parasite.h         |   13 ++-
 include/syscall-codes.h    |    1 +
 include/syscall.h          |    5 +
 include/types.h            |   18 +++
 include/util-net.h         |   33 ++++++
 include/util.h             |   13 ++
 parasite-syscall.c         |  100 +++++++++++++++---
 parasite.c                 |   65 +++++++++++
 util-net.c                 |  104 +++++++++++++++++-
 14 files changed, 635 insertions(+), 142 deletions(-)
 create mode 100644 fds-scm-map.c
 create mode 100644 include/fds-scm-map.h

diff --git a/Makefile b/Makefile
index 94161f9..c9322ec 100644
--- a/Makefile
+++ b/Makefile
@@ -46,6 +46,7 @@ OBJS		+= file-ids.o
 OBJS		+= namespaces.o
 OBJS		+= uts_ns.o
 OBJS		+= ipc_ns.o
+OBJS		+= fds-scm-map.o
 
 OBJS-BLOB	+= parasite.o
 SRCS-BLOB	+= $(patsubst %.o,%.c,$(OBJS-BLOB))
diff --git a/cr-dump.c b/cr-dump.c
index 7ca75a6..7fc1c1f 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -32,19 +32,23 @@
 #include "image.h"
 #include "proc_parse.h"
 #include "parasite-syscall.h"
+#include "fds-scm-map.h"
 
 #ifndef CONFIG_X86_64
 # error No x86-32 support yet
 #endif
 
 struct fd_parms {
-	unsigned long	fd_name;
-	unsigned long	pos;
-	unsigned int	flags;
-	unsigned int	type;
+	unsigned int		scm_fd_locl;	/* our fd number transferred from parasite */
+	unsigned int		scm_fd_orig;	/* corresponding original fd */
+	struct vma_entry	*vma_entry;	/* Used for VMA file maps */
 
-	u64		id;
-	pid_t		pid;
+	unsigned long		pos;
+	unsigned int		flags;
+	unsigned int		type;
+
+	u64			id;
+	pid_t			pid;
 };
 
 static char big_buffer[PATH_MAX];
@@ -97,16 +101,15 @@ err:
 	return ret;
 }
 
-static int dump_one_reg_file(const struct fd_parms *p, int lfd,
-			     const struct cr_fdset *cr_fdset,
-			     bool do_close_lfd)
+static int dump_one_reg_file(const struct fd_parms *params,
+			     const struct cr_fdset *cr_fdset)
 {
 	struct fdinfo_entry e;
 	char fd_str[128];
 	int len;
 	int ret = -1;
 
-	snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", lfd);
+	snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", params->scm_fd_locl);
 	len = readlink(fd_str, big_buffer, sizeof(big_buffer) - 1);
 	if (len < 0) {
 		pr_perror("Can't readlink %s", fd_str);
@@ -114,17 +117,19 @@ static int dump_one_reg_file(const struct fd_parms *p, int lfd,
 	}
 
 	big_buffer[len] = '\0';
-	pr_info("Dumping path for %lx fd via self %d [%s]\n",
-		p->fd_name, lfd, big_buffer);
-
-	if (do_close_lfd)
-		close(lfd);
+	pr_info("Dumping path for %d fd [%s]\n",
+		params->scm_fd_orig, big_buffer);
 
-	e.type	= p->type;
+	e.type	= params->type;
 	e.len	= len;
-	e.flags = p->flags;
-	e.pos	= p->pos;
-	e.addr	= p->fd_name;
+	e.flags = params->flags;
+	e.pos	= params->pos;
+
+	if (params->type == FDINFO_MAP)
+		e.addr	= params->vma_entry->start;
+	else
+		e.addr	= params->scm_fd_orig;
+
 	e.id	= FD_ID_INVALID;
 
 	if (likely(!fd_is_special(&e))) {
@@ -136,7 +141,7 @@ static int dump_one_reg_file(const struct fd_parms *p, int lfd,
 		 */
 		BUILD_BUG_ON(sizeof(entry->u.key) != sizeof(e.id));
 
-		entry = fd_id_entry_collect((u32)p->id, p->pid, p->fd_name);
+		entry = fd_id_entry_collect((u32)params->id, params->pid, params->scm_fd_orig);
 		if (!entry)
 			goto err;
 
@@ -145,7 +150,7 @@ static int dump_one_reg_file(const struct fd_parms *p, int lfd,
 	}
 
 	pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
-		p->type, len, p->flags, p->pos, p->fd_name);
+		params->type, len, params->flags, params->pos, e.addr);
 
 	if (write_img(cr_fdset->fds[CR_FD_FDINFO], &e))
 		goto err;
@@ -163,35 +168,41 @@ static int dump_task_special_files(pid_t pid, const struct cr_fdset *cr_fdset)
 	int fd, ret;
 
 	/* Dump /proc/pid/cwd */
+	fd = open_proc(pid, "cwd");
+	if (fd < 0)
+		return -1;
 	params = (struct fd_parms) {
+		.scm_fd_orig	= fd,
+		.scm_fd_locl	= fd,
 		.id		= FD_ID_INVALID,
 		.pid		= FD_PID_INVALID,
 		.type		= FDINFO_CWD,
 	};
 
-	fd = open_proc(pid, "cwd");
-	if (fd < 0)
-		return -1;
-	ret = dump_one_reg_file(&params, fd, cr_fdset, 1);
+	ret = dump_one_reg_file(&params, cr_fdset);
+	close(fd);
 	if (ret)
 		return ret;
 
 	/* Dump /proc/pid/exe */
+	fd = open_proc(pid, "exe");
+	if (fd < 0)
+		return -1;
 	params = (struct fd_parms) {
+		.scm_fd_orig	= fd,
+		.scm_fd_locl	= fd,
 		.id		= FD_ID_INVALID,
 		.pid		= FD_PID_INVALID,
 		.type		= FDINFO_EXE,
 	};
 
-	fd = open_proc(pid, "exe");
-	if (fd < 0)
-		return -1;
-	ret = dump_one_reg_file(&params, fd, cr_fdset, 1);
+	ret = dump_one_reg_file(&params, cr_fdset);
+	close(fd);
 
 	return ret;
 }
 
-static int dump_pipe_and_data(int lfd, struct pipe_entry *e,
+static int dump_pipe_and_data(int fd, struct pipe_entry *e,
 			      const struct cr_fdset *cr_fdset)
 {
 	int fd_pipes;
@@ -208,13 +219,13 @@ static int dump_pipe_and_data(int lfd, struct pipe_entry *e,
 		goto err;
 	}
 
-	pipe_size = fcntl(lfd, F_GETPIPE_SZ);
+	pipe_size = fcntl(fd, F_GETPIPE_SZ);
 	if (pipe_size < 0) {
 		pr_err("Can't obtain piped data size\n");
 		goto err;
 	}
 
-	has_bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK);
+	has_bytes = tee(fd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK);
 	if (has_bytes < 0) {
 		if (errno != EAGAIN) {
 			pr_perror("Can't pick pipe data");
@@ -246,108 +257,86 @@ err:
 	return ret;
 }
 
-static int dump_one_pipe(const struct fd_parms *p, unsigned int id, int lfd,
+static int dump_one_pipe(const struct fd_parms *p, unsigned int id,
 			 const struct cr_fdset *cr_fdset)
 {
 	struct pipe_entry e;
 	int ret = -1;
 	struct statfs stfs_buf;
 
-	if (fstatfs(lfd, &stfs_buf) < 0) {
-		pr_perror("Can't fstatfs on %ld", p->fd_name);
+	if (fstatfs(p->scm_fd_locl, &stfs_buf) < 0) {
+		pr_perror("Can't fstatfs on %d", p->scm_fd_orig);
 		return -1;
 	}
 
 	if (stfs_buf.f_type != PIPEFS_MAGIC) {
-		pr_err("Dumping of FIFO's is not supported: %ld\n", p->fd_name);
+		pr_err("Dumping of FIFO's is not supported: %d\n", p->scm_fd_orig);
 		return -1;
 	}
 
-	pr_info("Dumping pipe %ld/%x flags %x\n", p->fd_name, id, p->flags);
+	pr_info("Dumping pipe %d/%x flags %x\n", p->scm_fd_orig, id, p->flags);
 
-	e.fd		= p->fd_name;
+	e.fd		= p->scm_fd_orig;
 	e.pipeid	= id;
 	e.flags		= p->flags;
+	e.bytes		= 0;
 
-	if (p->flags & O_WRONLY) {
-		e.bytes = 0;
+	if (p->flags & O_WRONLY)
 		ret = write_img(cr_fdset->fds[CR_FD_PIPES], &e);
-	} else
-		ret = dump_pipe_and_data(lfd, &e, cr_fdset);
+	else
+		ret = dump_pipe_and_data(p->scm_fd_locl, &e, cr_fdset);
 
 err:
 	if (!ret)
 		pr_info("Dumped pipe: fd: %8x pipeid: %8x flags: %8x bytes: %8x\n",
 			e.fd, e.pipeid, e.flags, e.bytes);
 	else
-		pr_err("Dumping pipe %ld/%x flags %x\n", p->fd_name, id, p->flags);
+		pr_err("Dumping pipe %d/%x flags %x\n", p->scm_fd_orig, id, p->flags);
 
 	return ret;
 }
 
-static int read_fd_params(pid_t pid, const char *fd, struct fd_parms *p)
-{
-	FILE *file;
-	int ret;
-
-	file = fopen_proc(pid, "fdinfo/%s", fd);
-	if (!file)
-		return -1;
-
-	p->fd_name = atoi(fd);
-	ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
-	fclose(file);
-
-	if (ret != 2) {
-		pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
-		return -1;
-	}
-
-	pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
-		pid, fd, p->pos, p->flags);
-
-	p->pid	= pid;
-	p->id	= FD_ID_INVALID;
-
-	return 0;
-}
-
-static int dump_one_fd(pid_t pid, int pid_fd_dir, const char *d_name,
+static int dump_one_fd(struct fd_parms *params,
 		       const struct cr_fdset *cr_fdset,
 		       struct sk_queue *sk_queue)
 {
 	struct stat fd_stat;
 	int err = -1;
-	struct fd_parms p;
-	int lfd;
 
-	if (read_fd_params(pid, d_name, &p))
-		return -1;
-
-	lfd = openat(pid_fd_dir, d_name, O_RDONLY);
-	if (lfd < 0) {
-		err = try_dump_socket(pid, p.fd_name, cr_fdset, sk_queue);
-		if (err != 1)
-			return err;
+	pr_info("%d fdinfo %d (%d): pos: %16lx flags: %16o\n",
+		params->pid, params->scm_fd_orig, params->scm_fd_locl,
+		params->pos, params->flags);
 
-		pr_perror("Failed to open %d/%ld", pid_fd_dir, p.fd_name);
-		return -1;
+	/*
+	 * Check if it's a socket.
+	 */
+	{
+		int fd = open_proc_nocheck(params->pid, "fd/%d", params->scm_fd_orig);
+		if (fd < 0) {
+			err = try_dump_socket(params->pid, params->scm_fd_orig,
+					      cr_fdset, sk_queue);
+			if (err != 1)
+				return err;
+
+			pr_perror("Failed to open %d", params->scm_fd_orig);
+			return -1;
+		} else
+			close(fd);
 	}
 
-	if (fstat(lfd, &fd_stat) < 0) {
-		pr_perror("Can't get stat on %ld", p.fd_name);
-		goto out_close;
+	if (fstat(params->scm_fd_locl, &fd_stat) < 0) {
+		pr_perror("Can't get stat on %d", params->scm_fd_orig);
+		return -1;
 	}
 
 	if (S_ISCHR(fd_stat.st_mode) &&
 	    (major(fd_stat.st_rdev) == TTY_MAJOR ||
 	     major(fd_stat.st_rdev) == UNIX98_PTY_SLAVE_MAJOR)) {
 		/* skip only standard destriptors */
-		if (p.fd_name < 3) {
-			err = 0;
-			pr_info("... Skipping tty ... %d/%ld\n",
-				pid_fd_dir, p.fd_name);
-			goto out_close;
+		if (params->scm_fd_orig < 3) {
+			pr_info("... Skipping tty ... %d\n",
+				params->scm_fd_orig);
+			return 0;
 		}
 		goto err;
 	}
@@ -356,62 +345,69 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, const char *d_name,
 	    S_ISDIR(fd_stat.st_mode) ||
 	    (S_ISCHR(fd_stat.st_mode) && major(fd_stat.st_rdev) == MEM_MAJOR)) {
 
-		p.id = MAKE_FD_GENID(fd_stat.st_dev, fd_stat.st_ino, p.pos);
-		p.type = FDINFO_REG;
+		params->id	= MAKE_FD_GENID(fd_stat.st_dev, fd_stat.st_ino, params->pos);
+		params->type	= FDINFO_REG;
 
-		return dump_one_reg_file(&p, lfd, cr_fdset, 1);
+		return dump_one_reg_file(params, cr_fdset);
 	}
 
 	if (S_ISFIFO(fd_stat.st_mode))
-		return dump_one_pipe(&p, fd_stat.st_ino, lfd, cr_fdset);
+		return dump_one_pipe(params, fd_stat.st_ino, cr_fdset);
 
 err:
-	pr_err("Can't dump file %ld of that type [%x]\n", p.fd_name, fd_stat.st_mode);
+	pr_err("Can't dump file %d of that type [%x]\n",
+		params->scm_fd_orig, fd_stat.st_mode);
 
-out_close:
-	close_safe(&lfd);
 	return err;
 }
 
-static int dump_task_files(pid_t pid, const struct cr_fdset *cr_fdset,
-			   struct sk_queue *sk_queue)
+static int dump_task_files_seized(struct parasite_ctl *ctl,
+				  struct fds_scm_map *fds_map,
+				  const struct cr_fdset *cr_fdset,
+				  struct sk_queue *sk_queue)
 {
-	struct dirent *de;
-	unsigned long pos;
-	unsigned int flags;
-	DIR *fd_dir;
+	struct fd_parms params;
+	unsigned int i;
+	int ret;
 
 	pr_info("\n");
-	pr_info("Dumping opened files (pid: %d)\n", pid);
+	pr_info("Dumping opened files (pid: %d)\n", ctl->pid);
 	pr_info("----------------------------------------\n");
 
 	/*
+	 * Parasite should transfer file descriptors
+	 * to our space.
+	 */
+	ret = parasite_tx_fds_seized(ctl, fds_map->fd_orig, fds_map->fd_locl, fds_map->nr_fds);
+	if (ret)
+		return -1;
+
+	/*
 	 * Dump special files at the beginning. We might need
 	 * to re-read them in restorer, so better to make it
 	 * fast.
 	 */
-	if (dump_task_special_files(pid, cr_fdset)) {
+	if (dump_task_special_files(ctl->pid, cr_fdset)) {
 		pr_err("Can't dump special files\n");
 		return -1;
 	}
 
-	fd_dir = opendir_proc(pid, "fd");
-	if (!fd_dir)
-		return -1;
+	for (i = 0; i < fds_map->nr_fds; i++) {
 
-	while ((de = readdir(fd_dir))) {
-		if (!strcmp(de->d_name, "."))
-			continue;
-		if (!strcmp(de->d_name, ".."))
-			continue;
-		if (dump_one_fd(pid, dirfd(fd_dir), de->d_name, cr_fdset,
-				sk_queue))
+		params.scm_fd_locl	= fds_map->fd_locl[i];
+		params.scm_fd_orig	= fds_map->fd_orig[i];
+		params.pos		= lseek(params.scm_fd_locl, 0, SEEK_CUR);
+		params.flags		= fcntl(params.scm_fd_locl, F_GETFL);
+		params.pid		= ctl->pid;
+		params.id		= FD_ID_INVALID;
+
+		ret = dump_one_fd(&params, cr_fdset, sk_queue);
+		if (ret)
 			return -1;
 	}
 
 	pr_info("----------------------------------------\n");
 
-	closedir(fd_dir);
 	return 0;
 }
 
@@ -447,21 +443,24 @@ static int dump_task_mappings(pid_t pid, const struct list_head *vma_area_list,
 			if (write_img(cr_fdset->fds[CR_FD_SHMEM], &e))
 				goto err;
 		} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
-				vma_entry_is(vma, VMA_FILE_SHARED)) {
-			struct fd_parms p = {
-				.fd_name	= vma->start,
+			   vma_entry_is(vma, VMA_FILE_SHARED)) {
+
+			struct fd_parms params = {
+				.scm_fd_locl	= vma_area->vm_file_fd,
+				.scm_fd_orig	= vma_area->vm_file_fd,
+				.vma_entry	= vma,
 				.id		= FD_ID_INVALID,
 				.pid		= pid,
 				.type		= FDINFO_MAP,
 			};
 
 			if (vma->prot & PROT_WRITE &&
-					vma_entry_is(vma, VMA_FILE_SHARED))
-				p.flags = O_RDWR;
+			    vma_entry_is(vma, VMA_FILE_SHARED))
+				params.flags = O_RDWR;
 			else
-				p.flags = O_RDONLY;
+				params.flags = O_RDONLY;
 
-			ret = dump_one_reg_file(&p, vma_area->vm_file_fd, cr_fdset, 0);
+			ret = dump_one_reg_file(&params, cr_fdset);
 			if (ret)
 				goto err;
 		}
@@ -1287,6 +1286,7 @@ static int dump_one_task(const struct pstree_item *item, struct cr_fdset *cr_fds
 	int ret = -1;
 	struct parasite_dump_misc misc;
 	struct sk_queue sk_queue = { };
+	struct fds_scm_map *fds_map = NULL;
 
 	pr_info("========================================\n");
 	pr_info("Dumping task (pid: %d)\n", pid);
@@ -1315,9 +1315,9 @@ static int dump_one_task(const struct pstree_item *item, struct cr_fdset *cr_fds
 		goto err;
 	}
 
-	ret = dump_task_files(pid, cr_fdset, &sk_queue);
+	ret = fds_scm_collect_orig(pid, &fds_map);
 	if (ret) {
-		pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
+		pr_err("Collecting fds (pid: %d) failed with %d\n", pid, ret);
 		goto err;
 	}
 
@@ -1327,6 +1327,12 @@ static int dump_one_task(const struct pstree_item *item, struct cr_fdset *cr_fds
 		goto err;
 	}
 
+	ret = dump_task_files_seized(parasite_ctl, fds_map, cr_fdset, &sk_queue);
+	if (ret) {
+		pr_err("Dump files (pid: %d) failed with parasite\n", pid);
+		goto err;
+	}
+
 	ret = parasite_dump_pages_seized(parasite_ctl, &vma_area_list, cr_fdset);
 	if (ret) {
 		pr_err("Can't dump pages (pid: %d) with parasite\n", pid);
@@ -1394,11 +1400,14 @@ static int dump_one_task(const struct pstree_item *item, struct cr_fdset *cr_fds
 	}
 
 	free_mappings(&vma_area_list);
+	fds_scm_map_destroy(&fds_map);
 
 err:
 	close_pid_proc();
 err_free:
 	free_mappings(&vma_area_list);
+	fds_scm_map_destroy(&fds_map);
+
 	return ret;
 }
 
diff --git a/fds-scm-map.c b/fds-scm-map.c
new file mode 100644
index 0000000..9b79065
--- /dev/null
+++ b/fds-scm-map.c
@@ -0,0 +1,140 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+#include "types.h"
+#include "compiler.h"
+#include "crtools.h"
+#include "util.h"
+
+#include "fds-scm-map.h"
+
+#ifndef CONFIG_X86_64
+# error No x86-32 support yet
+#endif
+
+struct fds_scm_map *fds_scm_map_create(void)
+{
+	struct fds_scm_map *map;
+
+	map = xzalloc(sizeof(*map));
+	if (map) {
+		map->fd_locl = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (map->fd_locl == MAP_FAILED)
+			goto err_nomem;
+
+		map->fd_orig = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+		if (map->fd_orig == MAP_FAILED) {
+			munmap(map->fd_locl, PAGE_SIZE);
+			goto err_nomem;
+		}
+
+		map->size_fd_orig = PAGE_SIZE;
+		map->size_fd_locl = PAGE_SIZE;
+	}
+
+	return map;
+
+err_nomem:
+	pr_perror("Can't allocate memory for SCM fd map");
+	xfree(map);
+	return NULL;
+}
+
+void fds_scm_map_destroy(struct fds_scm_map **map)
+{
+	if (!map || !*map)
+		return;
+
+	if ((*map)->fd_locl != MAP_FAILED)
+		munmap((*map)->fd_locl, (*map)->size_fd_locl);
+
+	if ((*map)->fd_orig != MAP_FAILED)
+		munmap((*map)->fd_orig, (*map)->size_fd_orig);
+
+	xfree(*map);
+	*map = NULL;
+}
+
+int fds_scm_map_expand(struct fds_scm_map *map)
+{
+	int *new;
+
+	new = mremap(map->fd_locl, map->size_fd_locl,
+		     map->size_fd_locl + PAGE_SIZE, MREMAP_MAYMOVE);
+	if (new == MAP_FAILED)
+		goto err_nomem;
+
+	map->fd_locl = new;
+	map->size_fd_locl += PAGE_SIZE;
+
+	new = mremap(map->fd_orig, map->size_fd_orig,
+		     map->size_fd_orig + PAGE_SIZE, MREMAP_MAYMOVE);
+	if (new == MAP_FAILED)
+		goto err_nomem;
+
+	map->fd_orig = new;
+	map->size_fd_orig += PAGE_SIZE;
+
+	return 0;
+err_nomem:
+	pr_perror("Can't expand memory for SCM fd map");
+	return -ENOMEM;
+}
+
+int fds_scm_collect_orig(pid_t pid, struct fds_scm_map **map)
+{
+	unsigned int n = 0;
+	struct dirent *d;
+	DIR *dir;
+
+	BUG_ON(*map);
+
+	pr_info("\n");
+	pr_info("Collecting fds (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	*map = fds_scm_map_create();
+	if (!*map)
+		return -1;
+
+	dir = opendir_proc(pid, "fd");
+	if (!dir)
+		return -1;
+
+	while ((d = readdir(dir))) {
+		if (d->d_name[0] == '.')
+			continue;
+
+		if (SCM_MAP_NEED_EXPAND((*map)->size_fd_orig, n + 1)) {
+			if (fds_scm_map_expand(*map))
+				return -ENOMEM;
+		}
+
+		(*map)->fd_orig[n] = atoi(d->d_name);
+		n++;
+	}
+
+	pr_info("Found %d file descriptors\n", n);
+
+	(*map)->nr_fds = n;
+
+	pr_info("----------------------------------------\n");
+	return 0;
+
+err_nomem:
+	pr_perror("Can't allocate memory for %d\n", pid);
+	return -ENOMEM;
+}
diff --git a/include/fds-scm-map.h b/include/fds-scm-map.h
new file mode 100644
index 0000000..393d28f
--- /dev/null
+++ b/include/fds-scm-map.h
@@ -0,0 +1,22 @@
+#ifndef FDS_SCM_MAP
+#define FDS_SCM_MAP
+
+#include <sys/types.h>
+
+extern struct fds_scm_map *fds_scm_map_create(void);
+extern void fds_scm_map_destroy(struct fds_scm_map **map);
+extern int fds_scm_map_expand(struct fds_scm_map *map);
+extern int fds_scm_collect_orig(pid_t pid, struct fds_scm_map **map);
+
+struct fds_scm_map {
+	int		*fd_locl;
+	int		*fd_orig;
+
+	unsigned int	nr_fds;
+	unsigned long	size_fd_locl;
+	unsigned long	size_fd_orig;
+};
+
+#define SCM_MAP_NEED_EXPAND(size, nr)	(((size) / sizeof(int)) <= (nr))
+
+#endif /* FDS_SCM_MAP */
diff --git a/include/parasite-syscall.h b/include/parasite-syscall.h
index 5ae1554..1319f2e 100644
--- a/include/parasite-syscall.h
+++ b/include/parasite-syscall.h
@@ -31,6 +31,7 @@ struct parasite_ctl {
 
 extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct cr_fdset *cr_fdset);
 extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct cr_fdset *cr_fdset);
+extern int parasite_tx_fds_seized(struct parasite_ctl *ctl, int *fds_tx, int *fds_rx, unsigned int nr_fds);
 
 struct parasite_dump_misc;
 extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc);
diff --git a/include/parasite.h b/include/parasite.h
index dbad64d..8d6787d 100644
--- a/include/parasite.h
+++ b/include/parasite.h
@@ -8,10 +8,11 @@
 #include "compiler.h"
 #include "image.h"
 #include "sockets.h"
+#include "util-net.h"
 
 #define __parasite_head		__used __section(.parasite.head.text)
 
-#define PARASITE_STACK_SIZE	2048
+#define PARASITE_STACK_SIZE	4096
 #define PARASITE_ARG_SIZE	8196
 
 #define PARASITE_MAX_SIZE	(64 << 10)
@@ -30,6 +31,7 @@ enum {
 	PARASITE_CMD_DUMP_MISC,
 	PARASITE_CMD_DUMP_TID_ADDR,
 	PARASITE_CMD_DUMP_SK_QUEUES,
+	PARASITE_CMD_SCM_FD,
 
 	PARASITE_CMD_MAX,
 };
@@ -91,6 +93,15 @@ struct parasite_dump_sk_queues {
 	struct sk_queue_item	items[0];
 };
 
+struct parasite_scm_fd {
+	parasite_status_t	status;
+
+	struct scm_fdset	fdset;
+
+	struct sockaddr_un	saddr;
+	int			sun_len;
+};
+
 /*
  * Some useful offsets
  */
diff --git a/include/syscall-codes.h b/include/syscall-codes.h
index 86adf75..0c284b9 100644
--- a/include/syscall-codes.h
+++ b/include/syscall-codes.h
@@ -35,6 +35,7 @@
 #define __NR_wait4		61
 #define __NR_kill		62
 #define __NR_flock		73
+#define __NR_getdents		78
 #define __NR_unlink		87
 #define __NR_setresuid		117
 #define __NR_setresgid		119
diff --git a/include/syscall.h b/include/syscall.h
index 31a5d9a..0206ab9 100644
--- a/include/syscall.h
+++ b/include/syscall.h
@@ -298,6 +298,11 @@ static always_inline long sys_flock(unsigned long fd, unsigned long cmd)
 	return syscall2(__NR_flock, fd, cmd);
 }
 
+static always_inline long sys_getdents(unsigned int fd, struct linux_dirent *dirent, unsigned int count)
+{
+	return syscall3(__NR_getdents, (long)fd, (long)dirent, (long)count);
+}
+
 static void always_inline local_sleep(long seconds)
 {
 	struct timespec req, rem;
diff --git a/include/types.h b/include/types.h
index 0f87b4b..45220c8 100644
--- a/include/types.h
+++ b/include/types.h
@@ -195,4 +195,22 @@ enum kcmp_type {
 	KCMP_TYPES,
 };
 
+struct linux_dirent {
+	unsigned long	d_ino;
+	unsigned long	d_off;
+	unsigned short	d_reclen;
+	char		d_name[256];
+};
+
+/* For UNIX sockets data */
+#ifndef UIO_FASTIOV
+# define UIO_FASTIOV	8
+#endif
+#ifndef UIO_MAXIOV
+#define UIO_MAXIOV	1024
+#endif
+#ifndef SCM_MAX_FD
+# define SCM_MAX_FD	253
+#endif
+
 #endif /* CR_TYPES_H_ */
diff --git a/include/util-net.h b/include/util-net.h
index c5a8c46..ae63a48 100644
--- a/include/util-net.h
+++ b/include/util-net.h
@@ -1,6 +1,19 @@
 #ifndef UTIL_NET_H_
 #define UTIL_NET_H_
 
+#include <sys/socket.h>
+#include <sys/un.h>
+
+/*
+ * Because the kernel does kmalloc for user data passed
+ * in SCM messages, and there is SCM_MAX_FD as a limit
+ * for descriptors passed at once -- we're trying to
+ * eliminate pressure on the kernel memory manager and use
+ * a predefined, known to work well, size of the message buffer.
+ */
+#define CR_SCM_MSG_SIZE	(1024)
+#define CR_SCM_MAX_FD	(252)
+
 #define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \
 			(size_t)((struct sockaddr_un *) 0)->sun_path)
 
@@ -8,6 +21,26 @@
 #define SO_PEEK_OFF            42
 #endif
 
+struct scm_fdset {
+	struct msghdr		hdr;
+	struct iovec		iov;
+	char			msg_buf[CR_SCM_MSG_SIZE];
+	int			__pad;
+	union {
+		int		nr_fds_rx;
+		int		nr_fds_tx;
+		int		__nr_fds;
+	};
+};
+
+extern int *scm_fdset_init(struct scm_fdset *fdset);
+extern void scm_fdset_set_addr(struct scm_fdset *fdset, struct sockaddr_un *saddr, int saddr_len);
+extern int *scm_fdset_first(struct scm_fdset *fdset);
+extern int scm_fdset_update(struct scm_fdset *fdset, int nr_fds);
+extern int scm_fdset_send(int sock, struct scm_fdset *fdset);
+extern int scm_fdset_recv(int sock, struct scm_fdset *fdset);
+
 extern int send_fd(int sock, struct sockaddr_un *saddr, int len, int fd);
 extern int recv_fd(int sock);
+
 #endif
diff --git a/include/util.h b/include/util.h
index 915a3b5..963665b 100644
--- a/include/util.h
+++ b/include/util.h
@@ -11,6 +11,8 @@
 #include <errno.h>
 
 #include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
 #include <dirent.h>
 
 #include "compiler.h"
@@ -189,6 +191,17 @@ int do_open_proc(pid_t pid, int flags, const char *fmt, ...);
 		__fd;						\
 	})
 
+#define __open_proc_nocheck(pid, flags, fmt, ...)		\
+	({							\
+		int __fd = do_open_proc(pid, flags,		\
+					fmt, ##__VA_ARGS__);	\
+								\
+		__fd;						\
+	})
+
+#define open_proc_nocheck(pid, fmt, ...)			\
+	__open_proc_nocheck(pid, O_RDONLY, fmt, ##__VA_ARGS__)
+
 /* int open_proc(pid_t pid, const char *fmt, ...); */
 #define open_proc(pid, fmt, ...)				\
 	__open_proc(pid, O_RDONLY, fmt, ##__VA_ARGS__)
diff --git a/parasite-syscall.c b/parasite-syscall.c
index 35c3ed7..deac303 100644
--- a/parasite-syscall.c
+++ b/parasite-syscall.c
@@ -320,13 +320,13 @@ static int munmap_seized(struct parasite_ctl *ctl, void *addr, size_t length)
 	return ret;
 }
 
-static int gen_parasite_saddr(struct sockaddr_un *saddr, pid_t pid)
+static int gen_parasite_saddr(struct sockaddr_un *saddr, int salt)
 {
 	int sun_len;
 
 	saddr->sun_family = AF_UNIX;
 	snprintf(saddr->sun_path, UNIX_PATH_MAX,
-			"X/crtools-pr-%d", pid);
+			"X/crtools-pr-%d", salt);
 
 	sun_len = SUN_LEN(saddr);
 	*saddr->sun_path = '\0';
@@ -399,17 +399,19 @@ out:
 	return ret;
 }
 
-static int parasite_init(struct parasite_ctl *ctl, pid_t pid)
+static int parasite_init(struct parasite_ctl *ctl)
 {
 	struct parasite_init_args args = { };
+	int ret;
+
+	args.sun_len = gen_parasite_saddr(&args.saddr, ctl->pid);
 
-	args.sun_len = gen_parasite_saddr(&args.saddr, pid);
+	ret = parasite_execute(PARASITE_CMD_INIT, ctl, (parasite_status_t *)&args, sizeof(args));
 
-	return parasite_execute(PARASITE_CMD_INIT, ctl,
-				(parasite_status_t *)&args, sizeof(args));
+	return ret;
 }
 
-static int parasite_set_logfd(struct parasite_ctl *ctl, pid_t pid)
+static int parasite_set_logfd(struct parasite_ctl *ctl)
 {
 	parasite_status_t args = { };
 	int ret;
@@ -419,10 +421,8 @@ static int parasite_set_logfd(struct parasite_ctl *ctl, pid_t pid)
 		return ret;
 
 	ret = parasite_execute(PARASITE_CMD_SET_LOGFD, ctl, &args, sizeof(args));
-	if (ret < 0)
-		return ret;
 
-	return 0;
+	return ret;
 }
 
 int parasite_dump_tid_addr_seized(struct parasite_ctl *ctl, pid_t pid, unsigned int **tid_addr)
@@ -457,6 +457,78 @@ int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_mis
 				sizeof(struct parasite_dump_misc));
 }
 
+/* Transmit file descriptors from dumpee into our space */
+int parasite_tx_fds_seized(struct parasite_ctl *ctl, int *fds_tx, int *fds_rx, unsigned int nr_fds)
+{
+	struct parasite_scm_fd args	= { };
+	parasite_status_t *st		= &args.status;
+	struct scm_fdset *fdset		= &args.fdset;
+	int ret = -1;
+	int sock;
+
+	args.sun_len = gen_parasite_saddr(&args.saddr, (int)-2u);
+
+	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
+	if (sock < 0) {
+		pr_perror("Can't create socket");
+		return -1;
+	}
+
+	ret = bind(sock, (struct sockaddr *)&args.saddr, args.sun_len);
+	if (ret < 0) {
+		pr_perror("Can't bind socket");
+		goto err;
+	}
+
+	while (nr_fds) {
+		int *__fds_tx;
+		int *__fds_rx;
+
+		scm_fdset_init(fdset);
+		scm_fdset_update(fdset, nr_fds);
+
+		nr_fds -= fdset->__nr_fds;
+
+		__fds_tx = scm_fdset_first(fdset);
+
+		memcpy(__fds_tx, fds_tx, sizeof(int) * fdset->nr_fds_tx);
+		fds_tx += fdset->nr_fds_tx;
+
+		ret = parasite_execute(PARASITE_CMD_SCM_FD, ctl, st, sizeof(args));
+		if (ret) {
+			pr_err("SCM_RIGHTS failed on TX: %d\n", ret);
+			goto err;
+		}
+
+		scm_fdset_init(fdset);
+		scm_fdset_set_addr(fdset, &args.saddr, args.sun_len);
+
+		ret = scm_fdset_recv(sock, fdset);
+		if (ret) {
+			pr_err("SCM_RIGHTS failed on RX: %d (%s)\n",
+				ret, strerror(-ret));
+			goto err;
+		}
+		__fds_rx = scm_fdset_first(fdset);
+
+#if 1
+		{
+			int i = 0;
+			for (i = 0; i < fdset->nr_fds_rx; i++)
+				pr_debug("\tRX:    %016x\n", __fds_rx[i]);
+		}
+#endif
+
+		memcpy(fds_rx, __fds_rx, sizeof(int) * fdset->nr_fds_rx);
+		fds_rx += fdset->nr_fds_rx;
+	}
+	ret = 0;
+
+err:
+	close(sock);
+	return ret;
+}
+
 int parasite_dump_socket_info(struct parasite_ctl *ctl, struct cr_fdset *fdset,
 			      struct sk_queue *queue)
 {
@@ -678,8 +750,8 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct list_head *vma_are
 		goto err;
 	}
 
-	ctl->pid	= pid;
-	ctl->syscall_ip	= vma_area->vma.start;
+	ctl->pid		= pid;
+	ctl->syscall_ip		= vma_area->vma.start;
 
 	/*
 	 * Inject syscall instruction and remember original code,
@@ -734,7 +806,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct list_head *vma_are
 	ctl->addr_cmd		= (void *)PARASITE_CMD_ADDR((unsigned long)ctl->local_map);
 	ctl->addr_args		= (void *)PARASITE_ARGS_ADDR((unsigned long)ctl->local_map);
 
-	ret = parasite_init(ctl, pid);
+	ret = parasite_init(ctl);
 	if (ret) {
 		pr_err("%d: Can't create a transport socket\n", pid);
 		goto err_restore;
@@ -742,7 +814,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct list_head *vma_are
 
 	ctl->signals_blocked = 1;
 
-	ret = parasite_set_logfd(ctl, pid);
+	ret = parasite_set_logfd(ctl);
 	if (ret) {
 		pr_err("%d: Can't set a logging descriptor\n", pid);
 		goto err_restore;
diff --git a/parasite.c b/parasite.c
index 53b34a3..2de2cd1 100644
--- a/parasite.c
+++ b/parasite.c
@@ -85,6 +85,27 @@ static unsigned long builtin_strlen(char *str)
 	return len;
 }
 
+static int builtin_atoi(char *str)
+{
+	int ret = str[0] - '0';
+	int sign = 1;
+
+	while (*str == ' ')
+		str++;
+
+	if (*str == '-')
+		sign = -1;
+
+	while (*str++) {
+		int v = str[0] - '0';
+		if (v < 0 || v > 9)
+			break;
+		ret = ret * 10 + v;
+	}
+
+	return ret * sign;
+}
+
 static const unsigned char hex[] = "0123456789abcdef";
 static char *long2hex(unsigned long v)
 {
@@ -512,6 +533,47 @@ err_dmp:
 	return ret;
 }
 
+static int scm_fd(struct parasite_scm_fd *args)
+{
+	parasite_status_t *st	= &args->status;
+	struct scm_fdset *fdset	= &args->fdset;
+	int __nr_fds		= fdset->__nr_fds;
+	int ret;
+
+	/*
+	 * Need to reinit and bind the address,
+	 * this fdset came from a different address
+	 * space. Note we don't touch the fd data it carries.
+	 */
+	scm_fdset_init(fdset);
+	scm_fdset_set_addr(fdset, &args->saddr, args->sun_len);
+	scm_fdset_update(fdset, __nr_fds);
+
+#if 1
+	sys_write_msg("\tNRfds: ");
+	sys_write_msg(long2hex(fdset->__nr_fds));
+	sys_write_msg("\n");
+
+	{
+		int i;
+		int *fd = scm_fdset_first(fdset);
+		for (i = 0; i < fdset->__nr_fds; i++) {
+			sys_write_msg("\tTX:    ");
+			sys_write_msg(long2hex(fd[i]));
+			sys_write_msg("\n");
+		}
+	}
+#endif
+
+	ret = scm_fdset_send(tsock, fdset);
+	if (ret <= 0) {
+		sys_write_msg("scm_fdset_send failed\n");
+		SET_PARASITE_ERR(st, ret);
+	}
+
+	return 0;
+}
+
 static int init(struct parasite_init_args *args)
 {
 	parasite_status_t *st = &args->status;
@@ -579,6 +641,7 @@ static int __used parasite_service(unsigned long cmd, void *args)
 	BUILD_BUG_ON(sizeof(struct parasite_dump_misc) > PARASITE_ARG_SIZE);
 	BUILD_BUG_ON(sizeof(struct parasite_dump_tid_addr) > PARASITE_ARG_SIZE);
 	BUILD_BUG_ON(sizeof(struct parasite_dump_sk_queues) > PARASITE_ARG_SIZE);
+	BUILD_BUG_ON(sizeof(struct parasite_scm_fd) > PARASITE_ARG_SIZE);
 
 	switch (cmd) {
 	case PARASITE_CMD_INIT:
@@ -603,6 +666,8 @@ static int __used parasite_service(unsigned long cmd, void *args)
 		return dump_tid_addr((struct parasite_dump_tid_addr *)args);
 	case PARASITE_CMD_DUMP_SK_QUEUES:
 		return dump_skqueues((struct parasite_dump_sk_queues *)args);
+	case PARASITE_CMD_SCM_FD:
+		return scm_fd((struct parasite_scm_fd *)args);
 	default:
 		{
 			parasite_status_t *st = (parasite_status_t *)args;
diff --git a/util-net.c b/util-net.c
index dad02c4..492bdda 100644
--- a/util-net.c
+++ b/util-net.c
@@ -1,8 +1,110 @@
 #include <sys/socket.h>
 #include <sys/un.h>
+#include <errno.h>
 
+#include "compiler.h"
+#include "types.h"
 #include "syscall.h"
 
+#include "util-net.h"
+
+/* Setup the number of FDs placed into FDs array */
+int scm_fdset_update(struct scm_fdset *fdset, int nr_fds)
+{
+	int min_fd = min(nr_fds, CR_SCM_MAX_FD);
+	struct cmsghdr *cmsg;
+
+	cmsg				= CMSG_FIRSTHDR(&fdset->hdr);
+	fdset->hdr.msg_controllen	= CMSG_LEN(sizeof(int) * min_fd);
+	cmsg->cmsg_len			= fdset->hdr.msg_controllen;
+	fdset->__nr_fds			= min_fd;
+
+	return min_fd;
+}
+
+/* Returns pointer to the FDs array */
+int *scm_fdset_first(struct scm_fdset *fdset)
+{
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&fdset->hdr);
+	return (int *)CMSG_DATA(cmsg);
+}
+
+/* Returns a pointer to the data where FDs should be placed */
+int *scm_fdset_init(struct scm_fdset *fdset)
+{
+	struct cmsghdr *cmsg;
+
+	BUILD_BUG_ON(CR_SCM_MAX_FD > SCM_MAX_FD);
+	BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD)));
+
+	fdset->__nr_fds			= CR_SCM_MAX_FD;
+	fdset->__pad			= 0;
+
+	fdset->iov.iov_base		= &fdset->__pad;
+	fdset->iov.iov_len		= sizeof(fdset->__pad);
+
+	/*
+	 * msg_name and msg_namelen should be assigned separately.
+	 */
+	fdset->hdr.msg_iov		= &fdset->iov;
+	fdset->hdr.msg_iovlen		= 1;
+
+	fdset->hdr.msg_control		= &fdset->msg_buf;
+	fdset->hdr.msg_controllen	= CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD);
+
+	cmsg				= CMSG_FIRSTHDR(&fdset->hdr);
+	cmsg->cmsg_len			= fdset->hdr.msg_controllen;
+	cmsg->cmsg_level		= SOL_SOCKET;
+	cmsg->cmsg_type			= SCM_RIGHTS;
+
+	return scm_fdset_first(fdset);
+}
+
+/* Just for fine-grained assignment */
+void scm_fdset_set_addr(struct scm_fdset *fdset, struct sockaddr_un *saddr, int saddr_len)
+{
+	fdset->hdr.msg_name		= (struct sockaddr *)saddr;
+	fdset->hdr.msg_namelen		= saddr_len;
+}
+
+/* Returns the number of fds passed or negative on error */
+int scm_fdset_send(int sock, struct scm_fdset *fdset)
+{
+	int ret = sys_sendmsg(sock, &fdset->hdr, 0);
+	if (ret <= 0)
+		return ret;
+
+	return fdset->nr_fds_tx;
+}
+
+/*
+ * Returns 0 on success or a negative value on error; the number of
+ * received FDs is stored in nr_fds_rx. The fdset should be already init'ed.
+ */
+int scm_fdset_recv(int sock, struct scm_fdset *fdset)
+{
+	struct cmsghdr *cmsg;
+	int min_fd;
+	int ret = 0;
+
+	scm_fdset_update(fdset, CR_SCM_MAX_FD);
+
+	ret = sys_recvmsg(sock, &fdset->hdr, 0);
+	if (ret < 0)
+		return ret;
+
+	cmsg = CMSG_FIRSTHDR(&fdset->hdr);
+	if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
+		return -EINVAL;
+
+	min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
+	min_fd = min(min_fd, CR_SCM_MAX_FD);
+
+	fdset->nr_fds_rx = min_fd;
+
+	return 0;
+}
+
 int send_fd(int sock, struct sockaddr_un *saddr, int len, int fd)
 {
 	char cmsgbuf[CMSG_SPACE(sizeof(int))];
@@ -57,7 +159,7 @@ int recv_fd(int sock)
 		return ret;
 
 	cmsg = CMSG_FIRSTHDR(&msg);
-	if (!cmsg || !cmsg->cmsg_type == SCM_RIGHTS)
+	if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
 		return -2;
 
 	cmsg_data = (int *)CMSG_DATA(cmsg);
-- 
1.7.7.6



More information about the CRIU mailing list