[CRIU] Process Migration using Sockets v3 - Patch 1/2

Rodrigo Bruno rbruno at gsd.inesc-id.pt
Wed Nov 4 04:20:58 PST 2015


Hi, 

I am sending a new version of the patch.

I tried to address all the issues you pointed out.

Main changes regarding the previous version:

- changed the term used to indicate the creator of a particular image from a 
previous dump or predump. Before, it was 'namespace', but that collided with
another part of the code that also uses this term. I am now using 'snapshot_id';

- created a new mode (O_FORCE_LOCAL) for files that should not be sent
through the network.

- changed file names
image-remote.c -> img-remote.c
include/image-remote.h -> include/img-remote.h

- simplified the way of looking for parent snapshots (this was an issue in
page-read.c and page-xfer.c)

- added two different headers, one for local image operations (read, write, 
append) and one for remote image operations (forward from proxy to cache).

- snapshot_ids are now appended instead of writing the whole stack each time
we add a new snapshot_id.

I also revised all the code trying to make it more readable.


Signed-off-by: Rodrigo Bruno <rbruno at gsd.inesc-id.pt>
>From 71ae29995a2dfa972cd02bcb3870ca947bb04d80 Mon Sep 17 00:00:00 2001
From: rodrigo-bruno <rbruno at gsd.inesc-id.pt>
Date: Wed, 4 Nov 2015 11:45:28 +0000
Subject: [PATCH] Process migration using sockets (1/2)

---
 cr-dedup.c                  |   1 +
 cr-dump.c                   |  16 +++
 crtools.c                   |  21 +++-
 image-desc.c                |   4 +-
 image.c                     |  54 ++++++++--
 img-remote.c                | 245 ++++++++++++++++++++++++++++++++++++++++++++
 include/cr_options.h        |   1 +
 include/image.h             |   3 +-
 include/img-remote.h        |  95 +++++++++++++++++
 include/protobuf-desc.h     |   8 +-
 include/util.h              |   1 +
 page-read.c                 |  43 ++++++--
 page-xfer.c                 |  24 ++++-
 protobuf-desc.c             |   1 +
 protobuf/Makefile           |   1 +
 protobuf/remote-image.proto |  20 ++++
 util.c                      |  14 +++
 17 files changed, 523 insertions(+), 29 deletions(-)
 create mode 100644 img-remote.c
 create mode 100644 include/img-remote.h
 create mode 100644 protobuf/remote-image.proto

diff --git a/cr-dedup.c b/cr-dedup.c
index b453c3e..77f0b39 100644
--- a/cr-dedup.c
+++ b/cr-dedup.c
@@ -9,6 +9,7 @@
 
 #define MAX_BUNCH_SIZE 256
 
+/* TODO - patch this for using remote migration using sockets */
 static int cr_dedup_one_pagemap(int pid);
 
 int cr_dedup(void)
diff --git a/cr-dump.c b/cr-dump.c
index 3af077b..1172ded 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -83,6 +83,8 @@
 
 #include "asm/dump.h"
 
+#include "img-remote.h"
+
 static char loc_buf[PAGE_SIZE];
 
 static void close_vma_file(struct vma_area *vma)
@@ -1343,6 +1345,11 @@ int cr_pre_dump_tasks(pid_t pid)
 	LIST_HEAD(ctls);
 	struct parasite_ctl *ctl, *n;
 
+	if (opts.remote && push_snapshot_id() < 0) {
+		pr_err("Failed to push image namespace.\n");
+		goto err;
+	}
+
 	if (!opts.track_mem) {
 		pr_info("Enforcing memory tracking for pre-dump.\n");
 		opts.track_mem = true;
@@ -1448,6 +1455,11 @@ int cr_dump_tasks(pid_t pid)
 	pr_info("Dumping processes (pid: %d)\n", pid);
 	pr_info("========================================\n");
 
+	if (opts.remote && push_snapshot_id() < 0) {
+		pr_err("Failed to push image namepsace.\n");
+		goto err;
+	}
+
 	if (init_stats(DUMP_STATS))
 		goto err;
 
@@ -1612,6 +1624,10 @@ err:
 
 	close_service_fd(CR_PROC_FD_OFF);
 
+        if (opts.remote) {
+		finish_remote_dump();
+	}
+
 	if (ret) {
 		kill_inventory();
 		pr_err("Dumping FAILED.\n");
diff --git a/crtools.c b/crtools.c
index ea8b889..eb19ade 100644
--- a/crtools.c
+++ b/crtools.c
@@ -43,6 +43,8 @@
 
 #include "setproctitle.h"
 
+#include "img-remote.h"
+
 struct cr_options opts;
 
 void init_opts(void)
@@ -62,6 +64,8 @@ void init_opts(void)
 	opts.cpu_cap = CPU_CAP_DEFAULT;
 	opts.manage_cgroups = CG_MODE_DEFAULT;
 	opts.ps_socket = -1;
+	opts.addr = PROXY_FWD_HOST;
+	opts.ps_port = CACHE_WRITE_PORT;
 	opts.ghost_limit = DEFAULT_GHOST_LIMIT;
 }
 
@@ -252,6 +256,7 @@ int main(int argc, char *argv[], char *envp[])
 		{ "freeze-cgroup",		required_argument,	0, 1068 },
 		{ "ghost-limit",		required_argument,	0, 1069 },
 		{ "irmap-scan-path",		required_argument,	0, 1070 },
+		{ "remote",			no_argument,		0, 1071 },
 		{ },
 	};
 
@@ -494,6 +499,9 @@ int main(int argc, char *argv[], char *envp[])
 			if (irmap_scan_path_add(optarg))
 				return -1;
 			break;
+		case 1071:
+			opts.remote = true;
+			break;
 		case 'M':
 			{
 				char *aux;
@@ -642,6 +650,12 @@ int main(int argc, char *argv[], char *envp[])
 	if (!strcmp(argv[optind], "page-server"))
 		return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
 
+	if (!strcmp(argv[optind], "image-cache"))
+		return image_cache(opts.ps_port);
+
+	if (!strcmp(argv[optind], "image-proxy"))
+		return image_proxy(opts.addr, opts.ps_port);
+
 	if (!strcmp(argv[optind], "service"))
 		return cr_service(opts.daemon_mode);
 
@@ -668,6 +682,8 @@ usage:
 "  criu page-server\n"
 "  criu service [<options>]\n"
 "  criu dedup\n"
+"  criu image-cache [<options>]\n"
+"  criu image-proxy [<options>]\n"
 "\n"
 "Commands:\n"
 "  dump           checkpoint a process/tree identified by pid\n"
@@ -680,6 +696,8 @@ usage:
 "  dedup          remove duplicates in memory dump\n"
 "  cpuinfo dump   writes cpu information into image file\n"
 "  cpuinfo check  validates cpu information read from image file\n"
+"  image-cache    launch image-cache, used for process live migration\n"
+"  image-proxy    launch image-proxy, used for process live migration\n"
 	);
 
 	if (usage_error) {
@@ -706,6 +724,7 @@ usage:
 "                        restore making it the parent of the restored process\n"
 "  --freeze-cgroup\n"
 "                        use cgroup freezer to collect processes\n"
+"  --remote              dump images directly to remote node\n"
 "\n"
 "* Special resources support:\n"
 "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
@@ -762,7 +781,7 @@ usage:
 "                        when used on restore, as soon as page is restored, it\n"
 "                        will be punched from the image.\n"
 "\n"
-"Page/Service server options:\n"
+"Page/Service/image-cache/image-proxy server options:\n"
 "  --address ADDR        address of server or service\n"
 "  --port PORT           port of page server\n"
 "  -d|--daemon           run in the background after creating socket\n"
diff --git a/image-desc.c b/image-desc.c
index 773f2fa..48e0116 100644
--- a/image-desc.c
+++ b/image-desc.c
@@ -94,13 +94,13 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
 	[CR_FD_STATS] = {
 		.fmt	= "stats-%s",
 		.magic	= STATS_MAGIC,
-		.oflags = O_SERVICE,
+		.oflags = O_SERVICE | O_FORCE_LOCAL,
 	},
 
 	[CR_FD_IRMAP_CACHE] = {
 		.fmt	= "irmap-cache",
 		.magic	= IRMAP_CACHE_MAGIC,
-		.oflags = O_SERVICE,
+		.oflags = O_SERVICE | O_FORCE_LOCAL,
 	},
 
 	[CR_FD_FILE_LOCKS_PID] = {
diff --git a/image.c b/image.c
index dc9d6a1..a519788 100644
--- a/image.c
+++ b/image.c
@@ -12,6 +12,7 @@
 #include "protobuf.h"
 #include "protobuf/inventory.pb-c.h"
 #include "protobuf/pagemap.pb-c.h"
+#include "img-remote.h"
 
 bool fdinfo_per_id = false;
 bool ns_per_id = false;
@@ -306,18 +307,50 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of
 {
 	int ret, flags;
 
-	flags = oflags & ~(O_NOBUF | O_SERVICE);
+	flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL);
 
-	ret = openat(dfd, path, flags, CR_FD_PERM);
-	if (ret < 0) {
-		if (!(flags & O_CREAT) && (errno == ENOENT)) {
-			pr_info("No %s image\n", path);
+	if(opts.remote && !(oflags & O_FORCE_LOCAL)) {
+		char* snapshot_id = NULL;
+
+		/* Note: if dfd is the service fd then we need the current
+                 * snapshot_id. Else we need a parent snapshot_id. */
+		if (dfd == get_service_fd(IMG_FD_OFF))
+			snapshot_id = get_curr_snapshot_id();
+		else
+			snapshot_id = get_snapshot_id_from_idx(dfd);
+
+		if(snapshot_id == NULL) {
+			ret = -1;
+		}
+		else if (flags == O_RDONLY) {
+			pr_info("do_open_remote_image RDONLY path=%s snapshot_id=%s\n",
+			  path, snapshot_id);
+			ret = read_remote_image_connection(snapshot_id, path);
+		}
+		else {
+			pr_info("do_open_remote_image WDONLY path=%s snapshot_id=%s\n",
+			  path, snapshot_id);
+			ret = write_remote_image_connection(snapshot_id, path, O_WRONLY);
+		}
+
+		if (ret < 0) {
+			pr_info("No %s (dfd=%d) image\n", path, dfd);
 			img->_x.fd = EMPTY_IMG_FD;
 			goto skip_magic;
 		}
-
-		pr_perror("Unable to open %s", path);
-		goto err;
+	}
+	else {
+		ret = openat(dfd, path, flags, CR_FD_PERM);
+		if (ret < 0) {
+			if (!(flags & O_CREAT) && (errno == ENOENT)) {
+				pr_info("No %s image\n", path);
+				img->_x.fd = EMPTY_IMG_FD;
+				goto skip_magic;
+			}
+
+			pr_perror("Unable to open %s", path);
+			goto err;
+		}
 	}
 
 	img->_x.fd = ret;
@@ -410,7 +443,10 @@ int open_image_dir(char *dir)
 	close(fd);
 	fd = ret;
 
-	if (opts.img_parent) {
+	if (opts.remote) {
+		init_snapshot_id(dir);
+	}
+	else if (opts.img_parent) {
 		ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
 		if (ret < 0 && errno != EEXIST) {
 			pr_perror("Can't link parent snapshot");
diff --git a/img-remote.c b/img-remote.c
new file mode 100644
index 0000000..3c0affe
--- /dev/null
+++ b/img-remote.c
@@ -0,0 +1,297 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include "xmalloc.h"
+#include "criu-log.h"
+#include "img-remote.h"
+#include "img-remote-proto.h"
+#include "protobuf/remote-image.pb-c.h"
+#include "protobuf-desc.h"
+#include <fcntl.h>
+
+#define PB_LOCAL_IMAGE_SIZE PATHLEN
+
+/* Current snapshot id. init_snapshot_id() stores the caller's pointer here
+ * without copying the string. */
+static char* snapshot_id = NULL;
+
+/* Snapshot ids read by pull_snapshot_ids(), in the order they were read. */
+LIST_HEAD(snapshot_head);
+
+/* A snapshot is a dump or pre-dump operation. Each snapshot is identified by an
+ * ID which corresponds to the working directory specified by the user. */
+struct snapshot {
+	char snapshot_id[PATHLEN];
+	struct list_head l;
+};
+
+/* Allocates a list node holding a copy of snapshot_id, truncated to
+ * PATHLEN - 1 bytes. Returns NULL if the allocation fails. */
+struct snapshot* new_snapshot(char* snapshot_id)
+{
+	struct snapshot* s = malloc(sizeof(*s));
+
+	if (!s) {
+		pr_perror("Failed to allocate snapshot structure");
+		return NULL;
+	}
+
+	/* strncpy() does not terminate on truncation, so terminate by hand. */
+	strncpy(s->snapshot_id, snapshot_id, PATHLEN - 1);
+	s->snapshot_id[PATHLEN - 1] = '\0';
+	return s;
+}
+
+/* Appends a snapshot to the tail of the global snapshot list. */
+void add_snapshot(struct snapshot* snapshot)
+{
+	list_add_tail(&(snapshot->l), &snapshot_head);
+}
+
+/* Connects to READ_IMG_PATH and requests image 'path' of 'snapshot_id'.
+ * Returns the connected socket when the reply header reports no error and
+ * -1 otherwise; the socket is closed on every failure path. */
+int read_remote_image_connection(char* snapshot_id, char* path)
+{
+	int error;
+	int sockfd = setup_UNIX_client_socket(READ_IMG_PATH);
+
+	if (sockfd < 0)
+		return -1;
+
+	if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) {
+		pr_perror("Error writing header for %s:%s", path, snapshot_id);
+		goto err;
+	}
+
+	if (read_reply_header(sockfd, &error) < 0) {
+		pr_perror("Error reading reply header for %s:%s", path, snapshot_id);
+		goto err;
+	}
+
+	if (!error) {
+		pr_info("Image does exist (%s:%s)\n", path, snapshot_id);
+		return sockfd;
+	}
+
+	/* 'error' comes from the reply header, not from errno, so pr_perror()
+	 * would append an unrelated strerror() text here. */
+	if (error == ENOENT)
+		pr_info("Image does not exist (%s:%s)\n", path, snapshot_id);
+	else
+		pr_err("Unexpected error returned: %d (%s:%s)\n", error, path, snapshot_id);
+err:
+	close(sockfd);
+	return -1;
+}
+
+/* Connects to WRITE_IMG_PATH and sends a header for image 'path' of
+ * 'snapshot_id' with the given open flags. Returns the connected socket, or
+ * -1 on failure (the socket is closed in that case). */
+int write_remote_image_connection(char* snapshot_id, char* path, int flags)
+{
+	int sockfd = setup_UNIX_client_socket(WRITE_IMG_PATH);
+
+	if (sockfd < 0)
+		return -1;
+
+	if (write_header(sockfd, snapshot_id, path, flags) < 0) {
+		pr_perror("Error writing header for %s:%s", path, snapshot_id);
+		close(sockfd);
+		return -1;
+	}
+	return sockfd;
+}
+
+/* Opens and immediately closes a write connection for the special
+ * DUMP_FINISH image. Returns 0 on success, -1 if it could not be opened. */
+int finish_remote_dump()
+{
+	int fd;
+
+	pr_info("Dump side is calling finish\n");
+	fd = write_remote_image_connection(NULL_SNAPSHOT_ID, DUMP_FINISH, O_WRONLY);
+	if (fd < 0) {
+		pr_perror("Unable to open finish dump connection");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+/* Reads and discards 'len' bytes from fd. Returns 0 once all of them were
+ * consumed, -1 on end of stream or read error. */
+int skip_remote_bytes(int fd, unsigned long len)
+{
+	static char buf[4096];
+	unsigned long curr = 0;
+	ssize_t n;
+
+	while (curr < len) {
+		n = read(fd, buf, MIN(len - curr, sizeof(buf)));
+		if (n == 0) {
+			/* End of stream does not set errno, so use pr_err(). */
+			pr_err("Unexpected end of stream (skipping %lx/%lx bytes)\n",
+				curr, len);
+			return -1;
+		}
+		if (n < 0) {
+			pr_perror("Error while skipping bytes from stream (%lx/%lx)",
+				curr, len);
+			return -1;
+		}
+		curr += n;
+	}
+	return 0;
+}
+
+/* Reads snapshot ids from the PARENT_IMG image and appends each one to
+ * snapshot_head. Returns 0 when pb_read_obj() reports 0, and a negative
+ * value if reading or allocating fails. */
+static int pull_snapshot_ids()
+{
+	int n, sockfd;
+	SnapshotIdEntry* ls;
+	struct snapshot* s;
+
+	sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG);
+	if (sockfd < 0) {
+		pr_perror("Unable to open snapshot id read connection");
+		return -1;
+	}
+
+	while (1) {
+		n = pb_read_obj(sockfd, (void**)&ls, PB_SNAPSHOT_ID);
+		if (n < 0)
+			pr_perror("Unable to read remote snapshot ids");
+		if (n <= 0)
+			break;
+
+		s = new_snapshot(ls->snapshot_id);
+		if (!s) {
+			pr_perror("Unable create new snapshot structure");
+			n = -1;
+			break;
+		}
+		add_snapshot(s);
+		pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id);
+		/* NOTE(review): 'ls' is not released after use. The original
+		 * only had an unreachable free(ls) after this loop; confirm how
+		 * objects returned by pb_read_obj() must be freed. */
+	}
+
+	close(sockfd);
+	return n;
+}
+
+/* Appends the current snapshot id to the PARENT_IMG image. Returns the
+ * result of pb_write_obj(), or -1 if nothing could be written. */
+int push_snapshot_id()
+{
+	int n, sockfd;
+	SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT;
+
+	/* init_snapshot_id() must have run; strncpy() from NULL would crash. */
+	if (!snapshot_id) {
+		pr_err("Snapshot id is not initialized\n");
+		return -1;
+	}
+
+	sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG, O_APPEND);
+	if (sockfd < 0) {
+		pr_perror("Unable to open snapshot id push connection");
+		return -1;
+	}
+
+	rn.snapshot_id = xmalloc(sizeof(char) * PATHLEN);
+	if (!rn.snapshot_id) {
+		pr_perror("Unable to allocate snapshot id buffer");
+		close(sockfd);
+		return -1;
+	}
+	strncpy(rn.snapshot_id, snapshot_id, PATHLEN - 1);
+	rn.snapshot_id[PATHLEN - 1] = '\0';
+
+	n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID);
+
+	xfree(rn.snapshot_id);
+	close(sockfd);
+	return n;
+}
+
+/* Sets the current snapshot id. The pointer is stored as is, so 'si' must
+ * stay valid for as long as the snapshot id is used. */
+void init_snapshot_id(char* si)
+{
+	snapshot_id = si;
+}
+
+/* Returns the current snapshot id (NULL until init_snapshot_id() is called). */
+char* get_curr_snapshot_id()
+{
+	return snapshot_id;
+}
+
+/* Returns the position of the current snapshot id in snapshot_head, pulling
+ * the list first if it is empty. Returns -1 if it is not found. */
+int get_curr_snapshot_id_idx()
+{
+	struct snapshot* si;
+	int idx = 0;
+
+	if (!snapshot_id) {
+		pr_err("Snapshot id is not initialized\n");
+		return -1;
+	}
+
+	/* A failed pull leaves the list short; the lookup below then fails. */
+	if (list_empty(&snapshot_head))
+		pull_snapshot_ids();
+
+	list_for_each_entry(si, &snapshot_head, l) {
+		if (!strncmp(si->snapshot_id, snapshot_id, PATHLEN))
+			return idx;
+		idx++;
+	}
+
+	pr_err("Error, could not find current snapshot id (%s) fd\n", snapshot_id);
+	return -1;
+}
+
+/* Returns the snapshot id stored at position 'idx' of snapshot_head, pulling
+ * the list first if it is empty. Returns NULL if there is no such entry. */
+char* get_snapshot_id_from_idx(int idx)
+{
+	struct snapshot* si;
+	int pos = 0;
+
+	/* get_curr_parent_snapshot_id() passes a negative index when there is
+	 * no parent; that is a normal outcome, not an error worth logging. */
+	if (idx < 0)
+		return NULL;
+
+	if (list_empty(&snapshot_head))
+		pull_snapshot_ids();
+
+	list_for_each_entry(si, &snapshot_head, l) {
+		if (pos == idx)
+			return si->snapshot_id;
+		pos++;
+	}
+
+	pr_err("Error, could not find snapshot id for idx %d\n", idx);
+	return NULL;
+}
+
+/* Returns the snapshot id that precedes the current one, or NULL if the
+ * current snapshot id has no predecessor or cannot be found. */
+char* get_curr_parent_snapshot_id()
+{
+	return get_snapshot_id_from_idx(get_curr_snapshot_id_idx() - 1);
+}
diff --git a/include/cr_options.h b/include/cr_options.h
index af130dd..4c611ee 100644
--- a/include/cr_options.h
+++ b/include/cr_options.h
@@ -91,6 +91,7 @@ struct cr_options {
 	bool			enable_external_masters;
 	bool			aufs;		/* auto-deteced, not via cli */
 	bool			overlayfs;
+	bool			remote;
 	size_t			ghost_limit;
 	struct list_head	irmap_scan_paths;
 };
diff --git a/include/image.h b/include/image.h
index 305febf..329cad3 100644
--- a/include/image.h
+++ b/include/image.h
@@ -85,7 +85,7 @@
  *  - unsupported
  *  	stands for any unknown memory areas, usually means
  *  	we don't know how to work with it and should stop
- *  	processing exiting with error; while the rest of bits
+ *  	processing exiting with error; while the rest of bits
  *  	are part of image ABI, this particular one must never
  *  	be used in image.
  */
@@ -128,6 +128,7 @@ extern bool img_common_magic;
 #define O_DUMP		(O_WRONLY | O_CREAT | O_TRUNC)
 #define O_SHOW		(O_RDONLY | O_NOBUF)
 #define O_RSTR		(O_RDONLY)
+#define O_FORCE_LOCAL	(O_SYNC)
 
 struct cr_img {
 	union {
diff --git a/include/img-remote.h b/include/img-remote.h
new file mode 100644
index 0000000..f3bb810
--- /dev/null
+++ b/include/img-remote.h
@@ -0,0 +1,95 @@
+#include <limits.h>
+
+#ifndef IMAGE_REMOTE_H
+#define	IMAGE_REMOTE_H
+
+#define PATHLEN PATH_MAX
+#define DUMP_FINISH "DUMP_FINISH"
+#define PARENT_IMG "parent"
+#define NULL_SNAPSHOT_ID "null"
+
+/* This flag is used to enable local debugging (dump + proxy + cache + restore)
+ * on the same machine. The idea is that both dump and restore processes are
+ * orthogonal to this. */
+#define LOCAL_DEVEL 1
+
+#if LOCAL_DEVEL
+/* In local Devel, the image-proxy will open a local socket on WRITE_IMG_PATH and
+ * the image-cache will open a local socket on READ_IMG_PATH. */
+#define PROXY_IMG_PATH "/tmp/criu-remote-proxy.sock"
+#define CACHE_IMG_PATH "/tmp/criu-remote-cache.sock"
+#define READ_IMG_PATH CACHE_IMG_PATH
+#define WRITE_IMG_PATH PROXY_IMG_PATH
+#else
+#define IMG_PATH "/tmp/criu-remote.sock"
+#define READ_IMG_PATH IMG_PATH
+#define WRITE_IMG_PATH IMG_PATH
+#define PROXY_IMG_PATH IMG_PATH
+#define CACHE_IMG_PATH IMG_PATH
+#endif
+
+/* Only this channel is TCP. */
+#define CACHE_WRITE_PORT 9996
+#define PROXY_FWD_PORT CACHE_WRITE_PORT
+#define PROXY_FWD_HOST "localhost"
+
+/* Warning: This may be problematic because of double evaluation... */
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+/* Called by restore to get the fd corresponding to a particular path. This call
+ * will block until the connection is received. */
+int read_remote_image_connection(char* snapshot_id, char* path);
+
+/* Called by dump to create a socket connection to the restore side. The socket
+ * fd is returned for further writing operations. */
+int write_remote_image_connection(char* snapshot_id, char* path, int flags);
+
+/* Called by dump when everything is dumped. This function creates a new
+ * connection with a special control name. The recover side uses it to ack that
+ * no more files are coming. */
+int finish_remote_dump();
+
+/* Starts an image proxy daemon (dump side). It receives image files through
+ * socket connections and forwards them to the image cache (restore side). */
+int image_proxy(char* cache_host, unsigned short cache_port);
+
+/* Starts an image cache daemon (restore side). It receives image files through
+ * socket connections and caches them until they are requested by the restore
+ * process. */
+int image_cache(unsigned short cache_port);
+
+/* Reads (discards) 'len' bytes from fd. This is used to emulate the function
+ * lseek, which is used to advance the file needle. */
+int skip_remote_bytes(int fd, unsigned long len);
+
+/* To support iterative migration, the concept of snapshot_id is introduced 
+ * (only when remote migration is enabled). Each image is tagged with one 
+ * snapshot_id. The snapshot_id is the image directory used for the operation
+ * that creates the image (either predump or dump). Images stored in memory
+ * (both in Image Proxy and Image Cache) are identified by their name and 
+ * snapshot_id. Snapshot_ids are ordered so that we can find parent pagemaps
+ * (that will be used when restoring the process). */
+
+/* Sets the current snapshot_id */
+void init_snapshot_id(char* ns);
+
+/* Returns the current snapshot_id. */
+char* get_curr_snapshot_id();
+
+/* Returns the snapshot_id index representing the current snapshot_id. This 
+ * index represents the hierarchy position. For example: images tagged with 
+ * the snapshot_id with index 1 are more recent than the images tagged with
+ * the snapshot_id with index 0. */
+int get_curr_snapshot_id_idx();
+
+/* Returns the snapshot_id associated with the snapshot_id index. */
+char* get_snapshot_id_from_idx(int idx);
+
+/* Pushes the current snapshot_id into the snapshot_id hierarchy (into the Image
+ * Proxy and Image Cache). */
+int push_snapshot_id();
+
+/* Returns the snapshot id that precedes the current snapshot_id. */
+char* get_curr_parent_snapshot_id();
+
+#endif
\ No newline at end of file
diff --git a/include/protobuf-desc.h b/include/protobuf-desc.h
index ab7e4f2..ef7544d 100644
--- a/include/protobuf-desc.h
+++ b/include/protobuf-desc.h
@@ -55,16 +55,20 @@ enum {
 	PB_CPUINFO,
 	PB_USERNS,
 	PB_NETNS,
+	PB_REMOTE_IMAGE,
+	PB_LOCAL_IMAGE,         /* 50 */
+	PB_LOCAL_IMAGE_REPLY,
+	PB_SNAPSHOT_ID,
 
 	/* PB_AUTOGEN_STOP */
 
 	PB_PAGEMAP_HEAD,
-	PB_IDS,		/* 50 */
+	PB_IDS,
 	PB_SIGACT,
 	PB_NETDEV,
 	PB_REMAP_FPATH,
 	PB_SK_QUEUES,
-	PB_IPCNS_MSG,
+	PB_IPCNS_MSG,		/* 60 */
 	PB_IPCNS_MSG_ENT,
 
 	PB_MAX,
diff --git a/include/util.h b/include/util.h
index f2300a9..428fc65 100644
--- a/include/util.h
+++ b/include/util.h
@@ -260,5 +260,6 @@ FILE *fopenat(int dirfd, char *path, char *cflags);
 void split(char *str, char token, char ***out, int *n);
 
 int fd_has_data(int lfd);
+size_t read_into_buffer(int fd, char* buff, size_t size);
 
 #endif /* __CR_UTIL_H__ */
diff --git a/page-read.c b/page-read.c
index 832c057..40c38bc 100644
--- a/page-read.c
+++ b/page-read.c
@@ -6,10 +6,13 @@
 #include "cr_options.h"
 #include "servicefd.h"
 #include "page-read.h"
+#include "util.h"
 
 #include "protobuf.h"
 #include "protobuf/pagemap.pb-c.h"
 
+#include "img-remote.h"
+
 #ifndef SEEK_DATA
 #define SEEK_DATA	3
 #define SEEK_HOLE	4
@@ -90,7 +93,12 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
 		return;
 
 	pr_debug("\tpr%u Skip %lx bytes from page-dump\n", pr->id, len);
-	if (!pr->pe->in_parent)
+	if (!pr->pe->in_parent && opts.remote) {
+		if (skip_remote_bytes(img_raw_fd(pr->pi), len) < 0) {
+			pr_perror("Unable to skip remote bytes");
+		}
+	}
+	else if (!pr->pe->in_parent)
 		lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
 	pr->cvaddr += len;
 }
@@ -146,10 +154,11 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, void *bu
 			return ret;
 	} else {
 		int fd = img_raw_fd(pr->pi);
-		off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
+		/* TODO - lseek is not possible to sockets. Need to find a solution. */
+		off_t current_vaddr = opts.remote ? 0 : lseek(fd, 0, SEEK_CUR);
 		pr_debug("\tpr%u Read page %lx from self %lx/%"PRIx64"\n", pr->id,
 				vaddr, pr->cvaddr, current_vaddr);
-		ret = read(fd, buf, PAGE_SIZE);
+		ret = read_into_buffer(fd, buf, PAGE_SIZE);
 		if (ret != PAGE_SIZE) {
 			pr_perror("Can't read mapping page %d", ret);
 			return -1;
@@ -195,9 +204,24 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
 	int pfd, ret;
 	struct page_read *parent = NULL;
 
-	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
-	if (pfd < 0 && errno == ENOENT)
-		goto out;
+	if(opts.remote) {
+		/* Note: we are replacing a real directory FD for a snapshot_id
+                 * index. Since we need the parent of the current snapshot_id,
+                 * we want the current snapshot_id index minus one. It is
+                 * possible that dfd is already a snapshot_id index. We test it
+                 * by comparing it to the service FD. When opening an image (see
+                 * do_open_image) we convert the snapshot_id index into a real
+                 * snapshot_id. */
+		pfd = dfd == get_service_fd(IMG_FD_OFF) ?
+                    get_curr_snapshot_id_idx() - 1 : dfd - 1;
+		if(pfd < 0)
+			goto out;
+	}
+	else {
+		pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
+		if (pfd < 0 && errno == ENOENT)
+			goto out;
+	}
 
 	parent = xmalloc(sizeof(*parent));
 	if (!parent)
@@ -211,8 +235,8 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
 		xfree(parent);
 		parent = NULL;
 	}
-
-	close(pfd);
+	if(!opts.remote)
+		close(pfd);
 out:
 	pr->parent = parent;
 	return 0;
@@ -220,7 +244,8 @@ out:
 err_free:
 	xfree(parent);
 err_cl:
-	close(pfd);
+	if(!opts.remote)
+		close(pfd);
 	return -1;
 }
 
diff --git a/page-xfer.c b/page-xfer.c
index 7465ed8..43ee1bd 100644
--- a/page-xfer.c
+++ b/page-xfer.c
@@ -17,6 +17,8 @@
 #include "protobuf.h"
 #include "protobuf/pagemap.pb-c.h"
 
+#include "img-remote.h"
+
 struct page_server_iov {
 	u32	cmd;
 	u32	nr_pages;
@@ -742,12 +744,21 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
 		int ret;
 		int pfd;
 
-		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
-		if (pfd < 0 && errno == ENOENT)
-			goto out;
+		if (opts.remote) {
+			if (get_curr_parent_snapshot_id() == NULL)
+				goto out;
+		}
+		else {
+			pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
+			if (pfd < 0 && errno == ENOENT)
+				goto out;
+		}
 
 		xfer->parent = xmalloc(sizeof(*xfer->parent));
-		if (!xfer->parent) {
+		if (!xfer->parent && opts.remote) {
+			return -1;
+		}
+		else if (!xfer->parent) {
 			close(pfd);
 			return -1;
 		}
@@ -757,7 +768,8 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
 			pr_perror("No parent image found, though parent directory is set");
 			xfree(xfer->parent);
 			xfer->parent = NULL;
-			close(pfd);
+			if(!opts.remote)
+				close(pfd);
 			goto out;
 		}
 		close(pfd);
@@ -852,6 +864,8 @@ int check_parent_page_xfer(int fd_type, long id)
 {
 	if (opts.use_page_server)
 		return check_parent_server_xfer(fd_type, id);
+	else if (opts.remote)
+		return get_curr_parent_snapshot_id() == NULL ? 0 : 1;
 	else
 		return check_parent_local_xfer(fd_type, id);
 }
diff --git a/protobuf-desc.c b/protobuf-desc.c
index 873fd3b..2b58aab 100644
--- a/protobuf-desc.c
+++ b/protobuf-desc.c
@@ -61,6 +61,7 @@
 #include "protobuf/timerfd.pb-c.h"
 #include "protobuf/cpuinfo.pb-c.h"
 #include "protobuf/userns.pb-c.h"
+#include "protobuf/remote-image.pb-c.h"
 
 struct cr_pb_message_desc cr_pb_descs[PB_MAX];
 
diff --git a/protobuf/Makefile b/protobuf/Makefile
index 0b11852..f685fc6 100644
--- a/protobuf/Makefile
+++ b/protobuf/Makefile
@@ -48,6 +48,7 @@ proto-obj-y	+= tty.o
 proto-obj-y	+= file-lock.o
 proto-obj-y	+= rlimit.o
 proto-obj-y	+= pagemap.o
+proto-obj-y	+= remote-image.o
 proto-obj-y	+= siginfo.o
 proto-obj-y	+= rpc.o
 proto-obj-y	+= ext-file.o
diff --git a/protobuf/remote-image.proto b/protobuf/remote-image.proto
new file mode 100644
index 0000000..481098a
--- /dev/null
+++ b/protobuf/remote-image.proto
@@ -0,0 +1,20 @@
+message local_image_entry {
+	required string name		= 1;
+	required string snapshot_id	= 2;
+	required uint32 open_mode	= 3;
+}
+
+message remote_image_entry {
+	required string name		= 1;
+	required string snapshot_id	= 2;
+	required uint32 open_mode	= 3;
+	required uint64 size		= 4;
+}
+
+message local_image_reply_entry {
+	required uint32 error		= 1;
+}
+
+message snapshot_id_entry {
+	required string snapshot_id	= 1;
+}
diff --git a/util.c b/util.c
index b916eca..a643466 100644
--- a/util.c
+++ b/util.c
@@ -845,3 +845,21 @@ int fd_has_data(int lfd)
 
 	return ret;
 }
+
+/* Reads exactly 'size' bytes from fd into buff, looping over short reads.
+ * Returns 'size' on success, 0 if the stream ends first, or (size_t)-1 if
+ * read() fails (errno is left as set by read()). */
+size_t read_into_buffer(int fd, char* buff, size_t size)
+{
+	size_t curr = 0;
+	ssize_t n;
+
+	while (curr < size) {
+		/* n must be signed: read() returns -1 on error. */
+		n = read(fd, buff + curr, size - curr);
+		if (n < 1)
+			return n;
+		curr += n;
+	}
+	return size;
+}
-- 
1.9.1

-- 
Rodrigo Bruno <rbruno at gsd.inesc-id.pt>


More information about the CRIU mailing list