[CRIU] Process Migration using Sockets v2 - Patch 1/2

Rodrigo Bruno rbruno at gsd.inesc-id.pt
Fri Oct 9 07:31:23 PDT 2015


Hi,

this patch changes existing code in CRIU to make remote process migration possible.

Remote process migration goes as follows:

1. some image is open (write permission) by CRIU dump. If the remote option is true, 
instead of openning a real disk file, this opens a socket connection to the 
image-proxy (local daemon). The file descriptor is returned and CRIU uses it as it 
would use a normal disk-backed file descriptor.

2. the image-proxy waits until CRIU dump closes the file. Then the image-proxy
forwards the image to the image-cache (remote daemon).

3. the image-cache caches the image in memory and waits.

4. CRIU restore opens an image (read permission). If the remote option is true, it
will open a connection to image-cache and ask for the specific file. If the file 
exists, the image-cache sends the file through the established connection. CRIU
restore uses the connection file descriptor as it would use a normal disk-backed 
file descriptor.

Protocol details:

1. all data channels (i.e., sockets used to send/receive images) are unix sockets
except for the image-cache channel used to receive images from the image-proxy.

CRIU Dump <-- UNIX SOCKET --> image-proxy ---> TCP SOCKET ---> image-cache ---> UNIX SOCKET ---> CRIU Restore
(notice the two-sided arrow between dump and image-proxy. This is necessary because 
predumps read image files from the image-proxy)

2. read (or get) and write (or put) connections use different server sockets, one for 
receiving read requests and another for write requests

3. Read connections are initiated as follows:
	a) the requester writes an image header
	b) the requester reads an image header
	c) the requester checks if the received image header matches.

4. Write connections only write the image header before sending the actual content.

5. the image header is a protobuf object that contains two strings: the image name
and the image namespace (the namespace identifies the process that created the image).

6. only connections between the image-proxy and the image-cache (which are TCP) check
for file boundaries. Imagine the image-proxy starts to forward an image to image-cache:
	a) write image-header
	b) write image size (uint64_t)
	c) the actual content, then close
When the connection is closed, the image-cache checks if it received the correct number of
bytes.

In this patch, two new files are added:
- include/image-remote.h
	This file presents the interface (set of functions) that are used by CRIU
to migrate processes using sockets. They provide ways for CRIU to open read and write
image connections, check is a particular file exists, etc.
- image-remote.c
	This file implements the functions described in include/image-remote.h


Signed-off-by: Rodrigo Bruno <rbruno at gsd.inesc-id.pt>
>From 1793ba84942e03900755378a6fc2642b8734c13d Mon Sep 17 00:00:00 2001
From: rodrigo-bruno <rbruno at gsd.inesc-id.pt>
Date: Fri, 9 Oct 2015 14:37:31 +0100
Subject: [PATCH] Process migration using sockets (1/2)

---
 Makefile.crtools            |   4 +
 cr-dedup.c                  |   1 +
 cr-dump.c                   |  16 +++
 crtools.c                   |  23 +++-
 image-remote.c              | 258 ++++++++++++++++++++++++++++++++++++++++++++
 image.c                     |  56 ++++++++--
 include/cr_options.h        |   1 +
 include/image-remote.h      |  87 +++++++++++++++
 include/protobuf-desc.h     |   2 +
 page-read.c                 |  50 +++++++--
 page-xfer.c                 |  31 +++++-
 protobuf-desc.c             |   1 +
 protobuf/Makefile           |   1 +
 protobuf/remote-image.proto |   8 ++
 14 files changed, 516 insertions(+), 23 deletions(-)
 create mode 100644 image-remote.c
 create mode 100644 include/image-remote.h
 create mode 100644 protobuf/remote-image.proto

diff --git a/Makefile.crtools b/Makefile.crtools
index 80f704f..4a74fa8 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -6,6 +6,10 @@ obj-y	+= crtools.o
 obj-y	+= security.o
 obj-y	+= image.o
 obj-y	+= image-desc.o
+obj-y	+= image-remote.o
+obj-y	+= image-proxy.o
+obj-y	+= image-cache.o
+obj-y	+= image-remote-pvt.o
 obj-y	+= net.o
 obj-y	+= tun.o
 obj-y	+= proc_parse.o
diff --git a/cr-dedup.c b/cr-dedup.c
index b453c3e..77f0b39 100644
--- a/cr-dedup.c
+++ b/cr-dedup.c
@@ -9,6 +9,7 @@
 
 #define MAX_BUNCH_SIZE 256
 
+/* TODO - patch this for using remote migration using sockets */
 static int cr_dedup_one_pagemap(int pid);
 
 int cr_dedup(void)
diff --git a/cr-dump.c b/cr-dump.c
index 3af077b..f8d931a 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -83,6 +83,8 @@
 
 #include "asm/dump.h"
 
+#include "image-remote.h"
+
 static char loc_buf[PAGE_SIZE];
 
 static void close_vma_file(struct vma_area *vma)
@@ -1343,6 +1345,11 @@ int cr_pre_dump_tasks(pid_t pid)
 	LIST_HEAD(ctls);
 	struct parasite_ctl *ctl, *n;
 
+	if (opts.remote && push_namespace() < 0) {
+		pr_err("Failed to push image namespace.\n");
+		goto err;
+	}
+
 	if (!opts.track_mem) {
 		pr_info("Enforcing memory tracking for pre-dump.\n");
 		opts.track_mem = true;
@@ -1448,6 +1455,11 @@ int cr_dump_tasks(pid_t pid)
 	pr_info("Dumping processes (pid: %d)\n", pid);
 	pr_info("========================================\n");
 
+	if (opts.remote && push_namespace() < 0) {
+		pr_err("Failed to push image namepsace.\n");
+		goto err;
+	}
+
 	if (init_stats(DUMP_STATS))
 		goto err;
 
@@ -1550,6 +1562,10 @@ err:
 	if (disconnect_from_page_server())
 		ret = -1;
 
+	if (opts.remote) {
+		finish_remote_dump();
+	}
+
 	close_cr_imgset(&glob_imgset);
 
 	if (bfd_flush_images())
diff --git a/crtools.c b/crtools.c
index ea8b889..4f1d2ed 100644
--- a/crtools.c
+++ b/crtools.c
@@ -43,6 +43,8 @@
 
 #include "setproctitle.h"
 
+#include "image-remote.h"
+
 struct cr_options opts;
 
 void init_opts(void)
@@ -62,6 +64,8 @@ void init_opts(void)
 	opts.cpu_cap = CPU_CAP_DEFAULT;
 	opts.manage_cgroups = CG_MODE_DEFAULT;
 	opts.ps_socket = -1;
+	opts.addr = PROXY_FWD_HOST;
+	opts.ps_port = CACHE_PUT;
 	opts.ghost_limit = DEFAULT_GHOST_LIMIT;
 }
 
@@ -252,6 +256,9 @@ int main(int argc, char *argv[], char *envp[])
 		{ "freeze-cgroup",		required_argument,	0, 1068 },
 		{ "ghost-limit",		required_argument,	0, 1069 },
 		{ "irmap-scan-path",		required_argument,	0, 1070 },
+		{ "remote",			no_argument,		0, 1071 },
+		{ "image-cache",		no_argument,		0, 1072 },
+		{ "image-proxy",		required_argument,	0, 1073 },
 		{ },
 	};
 
@@ -494,6 +501,9 @@ int main(int argc, char *argv[], char *envp[])
 			if (irmap_scan_path_add(optarg))
 				return -1;
 			break;
+		case 1071:
+			opts.remote = true;
+			break;
 		case 'M':
 			{
 				char *aux;
@@ -642,6 +652,12 @@ int main(int argc, char *argv[], char *envp[])
 	if (!strcmp(argv[optind], "page-server"))
 		return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
 
+	if (!strcmp(argv[optind], "image-cache"))
+		return image_cache(opts.ps_port);
+
+	if (!strcmp(argv[optind], "image-proxy"))
+		return image_proxy(opts.addr, opts.ps_port);
+
 	if (!strcmp(argv[optind], "service"))
 		return cr_service(opts.daemon_mode);
 
@@ -668,6 +684,8 @@ usage:
 "  criu page-server\n"
 "  criu service [<options>]\n"
 "  criu dedup\n"
+"  criu image-cache [<options>]\n"
+"  criu image-proxy [<options>]\n"
 "\n"
 "Commands:\n"
 "  dump           checkpoint a process/tree identified by pid\n"
@@ -680,6 +698,8 @@ usage:
 "  dedup          remove duplicates in memory dump\n"
 "  cpuinfo dump   writes cpu information into image file\n"
 "  cpuinfo check  validates cpu information read from image file\n"
+"  image-cache    launch image-cache, used for process live migration\n"
+"  image-proxy    launch image-proxy, used for process live migration\n"
 	);
 
 	if (usage_error) {
@@ -706,6 +726,7 @@ usage:
 "                        restore making it the parent of the restored process\n"
 "  --freeze-cgroup\n"
 "                        use cgroup freezer to collect processes\n"
+"  --remote              dump images directly to remote node\n"
 "\n"
 "* Special resources support:\n"
 "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
@@ -762,7 +783,7 @@ usage:
 "                        when used on restore, as soon as page is restored, it\n"
 "                        will be punched from the image.\n"
 "\n"
-"Page/Service server options:\n"
+"Page/Service/image-cache/image-proxy server options:\n"
 "  --address ADDR        address of server or service\n"
 "  --port PORT           port of page server\n"
 "  -d|--daemon           run in the background after creating socket\n"
diff --git a/image-remote.c b/image-remote.c
new file mode 100644
index 0000000..79220f3
--- /dev/null
+++ b/image-remote.c
@@ -0,0 +1,258 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include "xmalloc.h"
+#include "criu-log.h"
+#include "image-remote.h"
+#include "image-remote-pvt.h"
+#include "protobuf/remote-image.pb-c.h"
+#include "protobuf-desc.h"
+
+#define PB_REMOTE_IMAGE_SIZE PATHLEN
+
+static char** parents = NULL;
+static int  parents_occ = 0;
+static char* namespace = NULL;
+
+int read_remote_image_connection(char* namespace, char* path)
+{
+	int sockfd;
+	char path_buf[PATHLEN], ns_buf[PATHLEN];
+
+	sockfd = setup_UNIX_client_socket(GET_IMG_PATH);
+	if (sockfd < 0) {
+	       return -1;
+	}
+
+	if (write_header(sockfd, namespace, path) < 0) {
+		pr_perror("Error writing header for %s:%s", path, namespace);
+		return -1;
+	}
+
+	if (read_header(sockfd, ns_buf, path_buf) < 0) {
+		pr_perror("Error reading header for %s:%s", path, namespace);
+		return -1;
+	}
+
+	if (!strncmp(path_buf, path, PATHLEN) && !strncmp(ns_buf, namespace, PATHLEN)) {
+		pr_info("Image cache does have %s:%s\n", path, namespace);
+		return sockfd;
+	}
+	else if (!strncmp(path_buf, DUMP_FINISH, PATHLEN)) {
+		pr_info("Image cache does not have %s:%s\n", path, namespace);
+		close(sockfd);
+		return -1;
+	}
+	else {
+		pr_perror("Image cache returned erroneous name %s\n", path);
+		close(sockfd);
+		return -1;
+	}
+}
+
+int write_remote_image_connection(char* namespace, char* path)
+{
+	int sockfd = setup_UNIX_client_socket(PUT_IMG_PATH);
+	if (sockfd < 0)
+		return -1;
+
+	if (write_header(sockfd, namespace, path) < 0) {
+		pr_perror("Error writing header for %s:%s", path, namespace);
+		return -1;
+	}
+
+	pr_info("[write_remote_image_connection] fd=%d\n", sockfd);
+
+	return sockfd;
+}
+
+int finish_remote_dump()
+{
+	pr_info("Dump side is calling finish\n");
+	int fd = write_remote_image_connection(NULL_NAMESPACE, DUMP_FINISH);
+	if (fd == -1) {
+		pr_perror("Unable to open finish dump connection");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+int skip_remote_bytes(int fd, unsigned long len)
+{
+    static char buf[4096];
+    int n = 0;
+    unsigned long curr = 0;
+
+    for(; curr < len; ) {
+	    n = read(fd, buf, MIN(len - curr, 4096));
+	    if (n == 0) {
+		pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)",
+			curr, len);
+		return -1;
+	    }
+	    else if (n > 0) {
+		    curr += n;
+	    }
+	    else {
+		pr_perror("Error while skipping bytes from stream (%lx/%lx)",
+			curr, len);
+		return -1;
+	    }
+    }
+    if ( curr != len) {
+	    pr_perror("Unable to skip the current number of bytes: %lx instead of %lx",
+		    curr, len);
+	    return -1;
+    }
+    return 0;
+}
+
+static int write_namespaces()
+{
+	int n;
+	RemoteNamespaceEntry rn = REMOTE_NAMESPACE_ENTRY__INIT;
+
+	int sockfd = write_remote_image_connection(NULL_NAMESPACE, PARENT_IMG);
+	if (sockfd < 0) {
+		pr_perror("Unable to open namespace push connection");
+		return -1;
+	}
+
+	rn.n_namespace_ = parents_occ;
+	rn.namespace_ = xmalloc(sizeof(char*) * rn.n_namespace_);
+	if (!rn.namespace_) {
+		pr_perror("Unable to allocate namespace array");
+		close(sockfd);
+		return -1;
+	}
+
+	for (n = 0; n < rn.n_namespace_; n++) {
+		rn.namespace_[n] = xmalloc(sizeof(char) * PATHLEN);
+		if (!rn.namespace_[n]) {
+			pr_perror("Unable to allocate namespace");
+			close(sockfd);
+			return -1;
+		}
+		strncpy(rn.namespace_[n], parents[n], PATHLEN);
+	}
+
+	n = pb_write_obj(sockfd, &rn, PB_REMOTE_NAMESPACE);
+
+	for (n = 0; n < rn.n_namespace_; n++)
+		xfree(rn.namespace_[n]);
+	xfree(rn.namespace_);
+
+	close(sockfd);
+	return n;
+}
+
+static int read_namespaces()
+{
+	int n, sockfd;
+	RemoteNamespaceEntry* rn;
+	size_t namespaces = 0;
+	parents_occ = 0;
+
+	sockfd = read_remote_image_connection(NULL_NAMESPACE, PARENT_IMG);
+	if (sockfd < 0) {
+		pr_perror("Unable to open namespace get connection");
+		return -1;
+	}
+
+	n = pb_read_obj(sockfd, (void**)&rn, PB_REMOTE_NAMESPACE);
+	if (n)
+		namespaces = rn->n_namespace_;
+	else if (n < 0) {
+	    pr_perror("Unable to read remote namepsaces");
+	    close(sockfd);
+	    return n;
+	}
+
+	/* The extra char pointer is for the current namespace that is added
+	 if we are doing a dump or pre-dump operation. */
+	parents = malloc((namespaces + 1) * sizeof(char*));
+	if (!parents) {
+	    pr_perror("Unable to allocate parents");
+	    close(sockfd);
+	    return -1;
+	}
+
+	while (parents_occ < namespaces) {
+	    parents[parents_occ] = malloc(sizeof(char) * PATHLEN);
+	    if (!parents[parents_occ]) {
+		    pr_perror("Unable to allocate parent buffer");
+		    close(sockfd);
+		    return -1;
+	    }
+	    strncpy(parents[parents_occ], rn->namespace_[parents_occ], PATHLEN);
+	    parents_occ++;
+	}
+
+	free(rn);
+	close(sockfd);
+	return parents_occ;
+}
+
+int push_namespace()
+{
+    if (read_namespaces() < 0) {
+	    pr_perror("Failed to push namespace");
+	    return -1;
+    }
+
+    parents[parents_occ] = malloc(sizeof(char) * PATHLEN);
+    if (!parents[parents_occ]) {
+	    pr_perror("Unable to allocate parent buffer");
+	    return -1;
+    }
+    strncpy(parents[parents_occ++], namespace, PATHLEN);
+
+    if (write_namespaces() < 0) {
+	    pr_perror("Failed to push namespaces");
+	    return -1;
+    }
+
+    return parents_occ;
+}
+
+void init_namespace(char* ns)
+{
+	namespace = ns;
+}
+
+char* get_current_namespace()
+{
+	return namespace;
+}
+
+int get_current_namespace_fd()
+{
+	int i = 0;
+
+	if (parents_occ == 0 && read_namespaces() < 0)
+		return -1;
+
+	for (; i < parents_occ; i++) {
+	    if(!strncmp(parents[i], namespace, PATHLEN))
+		return i;
+	}
+	pr_perror("Error, could not find current namespace fd");
+	return -1;
+}
+
+char* get_namespace(int dfd)
+{
+	if (parents_occ == 0 && read_namespaces() < 0) {
+		pr_perror("No namespace in parent hierarchy");
+		return NULL;
+	}
+	if (dfd >= parents_occ || dfd < 0)
+		return NULL;
+	else
+		return parents[dfd];
+}
diff --git a/image.c b/image.c
index dc9d6a1..9fff629 100644
--- a/image.c
+++ b/image.c
@@ -12,6 +12,7 @@
 #include "protobuf.h"
 #include "protobuf/inventory.pb-c.h"
 #include "protobuf/pagemap.pb-c.h"
+#include "image-remote.h"
 
 bool fdinfo_per_id = false;
 bool ns_per_id = false;
@@ -308,16 +309,52 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of
 
 	flags = oflags & ~(O_NOBUF | O_SERVICE);
 
-	ret = openat(dfd, path, flags, CR_FD_PERM);
-	if (ret < 0) {
-		if (!(flags & O_CREAT) && (errno == ENOENT)) {
-			pr_info("No %s image\n", path);
+	if(opts.remote && (imgset_template[type].magic != STATS_MAGIC)) {
+		char* namespace = NULL;
+
+		/* if dfd is the service fd or is unset then we need the current
+		 * namesapce. Else we need a parent namespace. */
+		if (dfd == get_service_fd(IMG_FD_OFF) || dfd == -1)
+			namespace = get_current_namespace();
+		else
+			namespace = get_namespace(dfd);
+
+		/* TODO - Find out what is the best solution for this file. */
+		if (imgset_template[type].magic == IRMAP_CACHE_MAGIC) {
+			ret = -1;
+		}
+		else if(namespace == NULL) {
+			ret = -1;
+		}
+		else if (flags == O_RDONLY) {
+			pr_info("do_open_remote_image RDONLY path=%s namespace=%s\n",
+			  path, namespace);
+			ret = read_remote_image_connection(namespace, path);
+		}
+		else {
+			pr_info("do_open_remote_image WDONLY path=%s namespace=%s\n",
+			  path, namespace);
+			ret = write_remote_image_connection(namespace, path);
+		}
+
+		if (ret < 0) {
+			pr_info("No %s (dfd=%d) image\n", path, dfd);
 			img->_x.fd = EMPTY_IMG_FD;
 			goto skip_magic;
 		}
-
-		pr_perror("Unable to open %s", path);
-		goto err;
+	}
+	else {
+		ret = openat(dfd, path, flags, CR_FD_PERM);
+		if (ret < 0) {
+			if (!(flags & O_CREAT) && (errno == ENOENT)) {
+				pr_info("No %s image\n", path);
+				img->_x.fd = EMPTY_IMG_FD;
+				goto skip_magic;
+			}
+
+			pr_perror("Unable to open %s", path);
+			goto err;
+		}
 	}
 
 	img->_x.fd = ret;
@@ -410,7 +447,10 @@ int open_image_dir(char *dir)
 	close(fd);
 	fd = ret;
 
-	if (opts.img_parent) {
+	if (opts.remote) {
+		init_namespace(dir);
+	}
+	else if (opts.img_parent) {
 		ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
 		if (ret < 0 && errno != EEXIST) {
 			pr_perror("Can't link parent snapshot");
diff --git a/include/cr_options.h b/include/cr_options.h
index af130dd..4c611ee 100644
--- a/include/cr_options.h
+++ b/include/cr_options.h
@@ -91,6 +91,7 @@ struct cr_options {
 	bool			enable_external_masters;
 	bool			aufs;		/* auto-deteced, not via cli */
 	bool			overlayfs;
+	bool			remote;
 	size_t			ghost_limit;
 	struct list_head	irmap_scan_paths;
 };
diff --git a/include/image-remote.h b/include/image-remote.h
new file mode 100644
index 0000000..20e0d36
--- /dev/null
+++ b/include/image-remote.h
@@ -0,0 +1,87 @@
+#include <limits.h>
+
+#ifndef IMAGE_REMOTE_H
+#define	IMAGE_REMOTE_H
+
+#define PATHLEN PATH_MAX
+#define DUMP_FINISH "DUMP_FINISH"
+#define PARENT_IMG "parent"
+#define NULL_NAMESPACE "null"
+
+/* This flag is used to enable local debugging (dump + proxy + cache + restore)
+ * on the same machine. The idea is that both dump and restore processes are
+ * orthogonal to this. */
+#define LOCAL_DEVEL 1
+
+/* Read image connections are sent to GET_IMG_PATH (local UNIX socket). Write
+ * image connections are sent to PUT_IMG_PATH. Both image-proxy and image-cache
+ * need to accept both read and write connections. All connetions are local (i.e.
+ * using UNIX sockets) except for write connections to image-cache. These are
+ * used to migrate image files from the image-proxy to the image-cache. These
+ * connections (one per image file) use TCP. */
+
+#define GET_IMG_PATH "/tmp/criu-remote-get.sock"
+#define PUT_IMG_PATH "/tmp/criu-remote-put.sock"
+
+#define PROXY_GET LOCAL_DEVEL ? "/tmp/criu-proxy-get.sock" : GET_IMG_PATH
+#define PROXY_PUT PUT_IMG_PATH
+
+#define CACHE_GET GET_IMG_PATH
+/* Only the this channel is TCP.*/
+#define CACHE_PUT 9996
+
+#define PROXY_FWD_PORT CACHE_PUT
+#define PROXY_FWD_HOST "localhost"
+
+/* Warning: This may be problematic because of double evaluation... */
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+/* Called by restore to get the fd correspondent to a particular path. This call
+ * will block until the connection is received. */
+extern int read_remote_image_connection(char* namespace, char* path);
+
+/* Called by dump to create a socket connection to the restore side. The socket
+ * fd is returned for further writing operations. */
+extern int write_remote_image_connection(char* namespace, char* path );
+
+/* Called by dump when everything is dumped. This function creates a new
+ * connection with a special control name. The recover side uses it to ack that
+ * no more files are coming. */
+extern int finish_remote_dump();
+
+/* Starts an image proxy daemon (dump side). It receives image files through
+ * socket connections and forwards them to the image cache (restore side). */
+extern int image_proxy(char* cache_host, unsigned short cache_port);
+
+/* Starts an image cache daemon (restore side). It receives image files through
+ * socket connections and caches them until they are requested by the restore
+ * process. */
+extern int image_cache(unsigned short cache_port);
+
+/* Reads (discards) 'len' bytes from fd. This is used to emulate the function
+ * lseek, which is used to advance the file needle. */
+int skip_remote_bytes(int fd, unsigned long len);
+
+/* To support iterative migration (multiple pre-dumps before the final dump
+ * and subsequent restore, the concept of namespace is introduced. Each image
+ * is tagged with one namespace and we build a hierarchy of namespaces to
+ * represent the dependency between pagemaps. Currently, the images dir is
+ * used as namespace when the operation is marked as remote. */
+
+/* Sets the current namespace */
+void init_namespace(char* ns);
+
+/* Returns a pointer to the char array containing the current namesapce. */
+char* get_current_namespace();
+
+/* Returns an integer (virtual fd) representing the current namespace. */
+int get_current_namespace_fd();
+
+/* Returns the namespace associated with the virtual fd (given as argument). */
+char* get_namespace(int dfd);
+
+/* Pushes the current namespace into the namespace hierarchy. The hierarchy is
+ * read, modified, and written. */
+int push_namespace();
+
+#endif	/* IMAGE_REMOTE_H */
diff --git a/include/protobuf-desc.h b/include/protobuf-desc.h
index ab7e4f2..232b814 100644
--- a/include/protobuf-desc.h
+++ b/include/protobuf-desc.h
@@ -55,6 +55,8 @@ enum {
 	PB_CPUINFO,
 	PB_USERNS,
 	PB_NETNS,
+	PB_REMOTE_IMAGE,
+	PB_REMOTE_NAMESPACE,
 
 	/* PB_AUTOGEN_STOP */
 
diff --git a/page-read.c b/page-read.c
index 832c057..e690354 100644
--- a/page-read.c
+++ b/page-read.c
@@ -10,6 +10,8 @@
 #include "protobuf.h"
 #include "protobuf/pagemap.pb-c.h"
 
+#include "image-remote.h"
+
 #ifndef SEEK_DATA
 #define SEEK_DATA	3
 #define SEEK_HOLE	4
@@ -90,7 +92,12 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
 		return;
 
 	pr_debug("\tpr%u Skip %lx bytes from page-dump\n", pr->id, len);
-	if (!pr->pe->in_parent)
+	if (!pr->pe->in_parent && opts.remote) {
+		if (skip_remote_bytes(img_raw_fd(pr->pi), len) < 0) {
+			pr_perror("Unable to skip remote bytes");
+		}
+	}
+	else if (!pr->pe->in_parent)
 		lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
 	pr->cvaddr += len;
 }
@@ -132,6 +139,20 @@ new_pagemap:
 	}
 }
 
+/* This read_ is a stronger way to read something from a fd. */
+static size_t read_(int fd, char* buff, size_t size)
+{
+	size_t n = 0;
+	size_t curr = 0;
+	while(1) {
+		n  = read(fd, buff + curr, size - curr);
+		if(n < 1)
+			return n;
+		curr += n;
+		if(curr == size)
+			return size;
+	}
+}
 static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, void *buf)
 {
 	int ret;
@@ -146,10 +167,11 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, void *bu
 			return ret;
 	} else {
 		int fd = img_raw_fd(pr->pi);
-		off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
+		/* TODO - lseek is not possible to sockets. Need to find a solution. */
+		off_t current_vaddr = opts.remote ? 0 : lseek(fd, 0, SEEK_CUR);
 		pr_debug("\tpr%u Read page %lx from self %lx/%"PRIx64"\n", pr->id,
 				vaddr, pr->cvaddr, current_vaddr);
-		ret = read(fd, buf, PAGE_SIZE);
+		ret = read_(fd, buf, PAGE_SIZE);
 		if (ret != PAGE_SIZE) {
 			pr_perror("Can't read mapping page %d", ret);
 			return -1;
@@ -195,9 +217,18 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
 	int pfd, ret;
 	struct page_read *parent = NULL;
 
-	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
-	if (pfd < 0 && errno == ENOENT)
-		goto out;
+	if(opts.remote) {
+		/* dfd is either the service fd or an image namespace */
+		pfd = dfd == get_service_fd(IMG_FD_OFF) ? get_current_namespace_fd() : dfd;
+		pfd -= 1;
+		if(get_namespace(pfd) == NULL)
+			goto out;
+	}
+	else {
+		pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
+		if (pfd < 0 && errno == ENOENT)
+			goto out;
+	}
 
 	parent = xmalloc(sizeof(*parent));
 	if (!parent)
@@ -211,8 +242,8 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
 		xfree(parent);
 		parent = NULL;
 	}
-
-	close(pfd);
+	if(!opts.remote)
+		close(pfd);
 out:
 	pr->parent = parent;
 	return 0;
@@ -220,7 +251,8 @@ out:
 err_free:
 	xfree(parent);
 err_cl:
-	close(pfd);
+	if(!opts.remote)
+		close(pfd);
 	return -1;
 }
 
diff --git a/page-xfer.c b/page-xfer.c
index 7465ed8..d2a49de 100644
--- a/page-xfer.c
+++ b/page-xfer.c
@@ -17,6 +17,8 @@
 #include "protobuf.h"
 #include "protobuf/pagemap.pb-c.h"
 
+#include "image-remote.h"
+
 struct page_server_iov {
 	u32	cmd;
 	u32	nr_pages;
@@ -742,12 +744,22 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
 		int ret;
 		int pfd;
 
-		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
-		if (pfd < 0 && errno == ENOENT)
-			goto out;
+		if (opts.remote) {
+			pfd = get_current_namespace_fd() - 1;
+			if (get_namespace(pfd) == NULL)
+				goto out;
+		}
+		else {
+			pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
+			if (pfd < 0 && errno == ENOENT)
+				goto out;
+		}
 
 		xfer->parent = xmalloc(sizeof(*xfer->parent));
-		if (!xfer->parent) {
+		if (!xfer->parent && opts.remote) {
+			return -1;
+		}
+		else if (!xfer->parent) {
 			close(pfd);
 			return -1;
 		}
@@ -757,7 +769,8 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
 			pr_perror("No parent image found, though parent directory is set");
 			xfree(xfer->parent);
 			xfer->parent = NULL;
-			close(pfd);
+			if(!opts.remote)
+				close(pfd);
 			goto out;
 		}
 		close(pfd);
@@ -807,6 +820,12 @@ int check_parent_local_xfer(int fd_type, int id)
 }
 
+int check_parent_remote_xfer()
+{
+	int pfd = get_current_namespace_fd() - 1;
+	return get_namespace(pfd) == NULL ? 0 : 1;
+}
+
 static int page_server_check_parent(int sk, struct page_server_iov *pi)
 {
 	int type, ret;
@@ -852,6 +871,8 @@ int check_parent_page_xfer(int fd_type, long id)
 {
 	if (opts.use_page_server)
 		return check_parent_server_xfer(fd_type, id);
+	else if (opts.remote)
+		return check_parent_remote_xfer();
 	else
 		return check_parent_local_xfer(fd_type, id);
 }
diff --git a/protobuf-desc.c b/protobuf-desc.c
index 873fd3b..2b58aab 100644
--- a/protobuf-desc.c
+++ b/protobuf-desc.c
@@ -61,6 +61,7 @@
 #include "protobuf/timerfd.pb-c.h"
 #include "protobuf/cpuinfo.pb-c.h"
 #include "protobuf/userns.pb-c.h"
+#include "protobuf/remote-image.pb-c.h"
 
 struct cr_pb_message_desc cr_pb_descs[PB_MAX];
 
diff --git a/protobuf/Makefile b/protobuf/Makefile
index 0b11852..f685fc6 100644
--- a/protobuf/Makefile
+++ b/protobuf/Makefile
@@ -48,6 +48,7 @@ proto-obj-y	+= tty.o
 proto-obj-y	+= file-lock.o
 proto-obj-y	+= rlimit.o
 proto-obj-y	+= pagemap.o
+proto-obj-y	+= remote-image.o
 proto-obj-y	+= siginfo.o
 proto-obj-y	+= rpc.o
 proto-obj-y	+= ext-file.o
diff --git a/protobuf/remote-image.proto b/protobuf/remote-image.proto
new file mode 100644
index 0000000..d2fad2e
--- /dev/null
+++ b/protobuf/remote-image.proto
@@ -0,0 +1,8 @@
+message remote_image_entry {
+	required string name		= 1;
+	required string namespace	= 2;
+}
+
+message remote_namespace_entry {
+	repeated string namespace	= 1;
+}
-- 
1.9.1



-- 
Rodrigo Bruno <rbruno at gsd.inesc-id.pt>


More information about the CRIU mailing list