[CRIU] [Process Migration using Sockets v4 - Patch 1/3]

Pavel Emelyanov xemul at parallels.com
Tue Dec 15 02:56:02 PST 2015


On 12/12/2015 01:18 AM, Rodrigo Bruno wrote:
> Hi, 
> 
> I think I fixed all the comments you added in the last version of the patch.
> 
> It took me a little longer to send the new version because I found a bug when
> a subprocess that was snapshotted by a predump no longer exists when the dump takes
> place. I was not considering that situation (but now it is perfectly okay).
> 
> I also took some time to use the checkpatch.pl from linux. I fixed all the code.
> 
> Here is the first part of the patch:

OK, great :) I think this version is ready for merging. To do that I need
a patch commit message from you -- a good description of what's going on,
protocol details, everything you wrote before but that was left in previous
e-mails. Please send me one as plain text; I'll paste it into the patch
before committing.

Patch #2 needs a commit message as well, so please write one for it
too.

Meanwhile I'll split patch #1 and commit some pieces that make sense
on their own (next time, please do this yourself in advance).

As far as patch #3 is concerned -- such a small test is too little for such
a huge change :( Please look at the test/zdtm.py launcher: it is already able
to run tests using the page-server, so change this launcher to add a
mode that dumps via page cache and proxy.

-- Pavel

> Signed-off-by: Rodrigo Bruno <rbruno at gsd.inesc-id.pt>
>>From 029e179269163fc069fbbcf26cb86f056940dc7d Mon Sep 17 00:00:00 2001
> From: rodrigo-bruno <rbruno at gsd.inesc-id.pt>
> Date: Fri, 11 Dec 2015 22:07:12 +0000
> Subject: [PATCH 1/3] Process Migration using Sockets v4 (1/3)
> 
> ---
>  Makefile.crtools            |   4 +
>  cr-dedup.c                  |   1 +
>  cr-dump.c                   |  16 +++
>  cr-service.c                |   6 +-
>  crtools.c                   |  37 +++++-
>  image-desc.c                |   4 +-
>  image.c                     |  27 ++++-
>  img-remote.c                | 272 ++++++++++++++++++++++++++++++++++++++++++++
>  include/cr_options.h        |   5 +-
>  include/image.h             |   3 +-
>  include/img-remote.h        |  79 +++++++++++++
>  include/protobuf-desc.h     |   8 +-
>  include/util.h              |   1 +
>  page-read.c                 |  41 +++++--
>  page-xfer.c                 |  40 ++++---
>  protobuf-desc.c             |   1 +
>  protobuf/Makefile           |   1 +
>  protobuf/remote-image.proto |  20 ++++
>  util.c                      |  15 +++
>  19 files changed, 542 insertions(+), 39 deletions(-)
>  create mode 100644 img-remote.c
>  create mode 100644 include/img-remote.h
>  create mode 100644 protobuf/remote-image.proto
> 
> diff --git a/Makefile.crtools b/Makefile.crtools
> index 80f704f..3af28b1 100644
> --- a/Makefile.crtools
> +++ b/Makefile.crtools
> @@ -6,6 +6,10 @@ obj-y	+= crtools.o
>  obj-y	+= security.o
>  obj-y	+= image.o
>  obj-y	+= image-desc.o
> +obj-y	+= img-remote.o
> +obj-y	+= img-proxy.o
> +obj-y	+= img-cache.o
> +obj-y	+= img-remote-proto.o
>  obj-y	+= net.o
>  obj-y	+= tun.o
>  obj-y	+= proc_parse.o
> diff --git a/cr-dedup.c b/cr-dedup.c
> index b453c3e..77f0b39 100644
> --- a/cr-dedup.c
> +++ b/cr-dedup.c
> @@ -9,6 +9,7 @@
>  
>  #define MAX_BUNCH_SIZE 256
>  
> +/* TODO - patch this to work with remote migration using sockets */
>  static int cr_dedup_one_pagemap(int pid);
>  
>  int cr_dedup(void)
> diff --git a/cr-dump.c b/cr-dump.c
> index 3af077b..81ea880 100644
> --- a/cr-dump.c
> +++ b/cr-dump.c
> @@ -83,6 +83,8 @@
>  
>  #include "asm/dump.h"
>  
> +#include "img-remote.h"
> +
>  static char loc_buf[PAGE_SIZE];
>  
>  static void close_vma_file(struct vma_area *vma)
> @@ -1343,6 +1345,11 @@ int cr_pre_dump_tasks(pid_t pid)
>  	LIST_HEAD(ctls);
>  	struct parasite_ctl *ctl, *n;
>  
> +	if (opts.remote && push_snapshot_id() < 0) {
> +		pr_err("Failed to push image namespace.\n");
> +		goto err;
> +	}
> +
>  	if (!opts.track_mem) {
>  		pr_info("Enforcing memory tracking for pre-dump.\n");
>  		opts.track_mem = true;
> @@ -1448,6 +1455,11 @@ int cr_dump_tasks(pid_t pid)
>  	pr_info("Dumping processes (pid: %d)\n", pid);
>  	pr_info("========================================\n");
>  
> +	if (opts.remote && push_snapshot_id() < 0) {
> +		pr_err("Failed to push image namepsace.\n");
> +		goto err;
> +	}
> +
>  	if (init_stats(DUMP_STATS))
>  		goto err;
>  
> @@ -1612,6 +1624,10 @@ err:
>  
>  	close_service_fd(CR_PROC_FD_OFF);
>  
> +	if (opts.remote) {
> +		finish_remote_dump();
> +	}
> +
>  	if (ret) {
>  		kill_inventory();
>  		pr_err("Dumping FAILED.\n");
> diff --git a/cr-service.c b/cr-service.c
> index 938ea9e..02ec0ed 100644
> --- a/cr-service.c
> +++ b/cr-service.c
> @@ -340,7 +340,7 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
>  	if (req->ps) {
>  		opts.use_page_server = true;
>  		opts.addr = req->ps->address;
> -		opts.ps_port = htons((short)req->ps->port);
> +		opts.port = htons((short)req->ps->port);
>  
>  		if (req->ps->has_fd) {
>  			if (!opts.swrk_restore)
> @@ -649,7 +649,7 @@ static int start_page_server_req(int sk, CriuOpts *req)
>  		if (setup_opts_from_req(sk, req))
>  			goto out_ch;
>  
> -		setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.ps_port);
> +		setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port);
>  
>  		pr_debug("Starting page server\n");
>  
> @@ -658,7 +658,7 @@ static int start_page_server_req(int sk, CriuOpts *req)
>  			goto out_ch;
>  
>  		info.pid = pid;
> -		info.port = opts.ps_port;
> +		info.port = opts.port;
>  
>  		count = write(start_pipe[1], &info, sizeof(info));
>  		if (count != sizeof(info))
> diff --git a/crtools.c b/crtools.c
> index ea8b889..b1f6115 100644
> --- a/crtools.c
> +++ b/crtools.c
> @@ -43,6 +43,8 @@
>  
>  #include "setproctitle.h"
>  
> +#include "img-remote.h"
> +
>  struct cr_options opts;
>  
>  void init_opts(void)
> @@ -62,6 +64,10 @@ void init_opts(void)
>  	opts.cpu_cap = CPU_CAP_DEFAULT;
>  	opts.manage_cgroups = CG_MODE_DEFAULT;
>  	opts.ps_socket = -1;
> +	opts.addr = DEFAULT_CACHE_HOST;
> +	opts.port = DEFAULT_CACHE_PORT;
> +	opts.local_cache_path = DEFAULT_IMG_PATH;
> +	opts.local_proxy_path = DEFAULT_IMG_PATH;
>  	opts.ghost_limit = DEFAULT_GHOST_LIMIT;
>  }
>  
> @@ -252,6 +258,9 @@ int main(int argc, char *argv[], char *envp[])
>  		{ "freeze-cgroup",		required_argument,	0, 1068 },
>  		{ "ghost-limit",		required_argument,	0, 1069 },
>  		{ "irmap-scan-path",		required_argument,	0, 1070 },
> +		{ "remote",			no_argument,		0, 1071 },
> +		{ "local-cache-path",		required_argument,	0, 1072 },
> +		{ "local-proxy-path",		required_argument,	0, 1073 },
>  		{ },
>  	};
>  
> @@ -405,8 +414,8 @@ int main(int argc, char *argv[], char *envp[])
>  			opts.addr = optarg;
>  			break;
>  		case 1052:
> -			opts.ps_port = htons(atoi(optarg));
> -			if (!opts.ps_port)
> +			opts.port = htons(atoi(optarg));
> +			if (!opts.port)
>  				goto bad_arg;
>  			break;
>  		case 'j':
> @@ -494,6 +503,15 @@ int main(int argc, char *argv[], char *envp[])
>  			if (irmap_scan_path_add(optarg))
>  				return -1;
>  			break;
> +		case 1071:
> +			opts.remote = true;
> +			break;
> +		case 1072:
> +			opts.local_cache_path = optarg;
> +			break;
> +		case 1073:
> +			opts.local_proxy_path = optarg;
> +			break;
>  		case 'M':
>  			{
>  				char *aux;
> @@ -642,6 +660,12 @@ int main(int argc, char *argv[], char *envp[])
>  	if (!strcmp(argv[optind], "page-server"))
>  		return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
>  
> +	if (!strcmp(argv[optind], "image-cache"))
> +		return image_cache(opts.local_cache_path, opts.port);
> +
> +	if (!strcmp(argv[optind], "image-proxy"))
> +		return image_proxy(opts.local_proxy_path, opts.addr, opts.port);
> +
>  	if (!strcmp(argv[optind], "service"))
>  		return cr_service(opts.daemon_mode);
>  
> @@ -668,6 +692,8 @@ usage:
>  "  criu page-server\n"
>  "  criu service [<options>]\n"
>  "  criu dedup\n"
> +"  criu image-cache [<options>]\n"
> +"  criu image-proxy [<options>]\n"
>  "\n"
>  "Commands:\n"
>  "  dump           checkpoint a process/tree identified by pid\n"
> @@ -680,6 +706,8 @@ usage:
>  "  dedup          remove duplicates in memory dump\n"
>  "  cpuinfo dump   writes cpu information into image file\n"
>  "  cpuinfo check  validates cpu information read from image file\n"
> +"  image-cache    launch destination-side cache for images sent from the source-side\n"
> +"  image-proxy    launch source-side proxy to sent images to the destination-side\n"
>  	);
>  
>  	if (usage_error) {
> @@ -706,6 +734,7 @@ usage:
>  "                        restore making it the parent of the restored process\n"
>  "  --freeze-cgroup\n"
>  "                        use cgroup freezer to collect processes\n"
> +"  --remote              dump/restore images directly to/from remote node using image-proxy/image-cache\n"
>  "\n"
>  "* Special resources support:\n"
>  "  -x|--" USK_EXT_PARAM "inode,.." "      allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
> @@ -762,9 +791,9 @@ usage:
>  "                        when used on restore, as soon as page is restored, it\n"
>  "                        will be punched from the image.\n"
>  "\n"
> -"Page/Service server options:\n"
> +"Page/Service/image-cache/image-proxy server options:\n"
>  "  --address ADDR        address of server or service\n"
> -"  --port PORT           port of page server\n"
> +"  --port PORT           port of server or service\n"
>  "  -d|--daemon           run in the background after creating socket\n"
>  "\n"
>  "Other options:\n"
> diff --git a/image-desc.c b/image-desc.c
> index 773f2fa..48e0116 100644
> --- a/image-desc.c
> +++ b/image-desc.c
> @@ -94,13 +94,13 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
>  	[CR_FD_STATS] = {
>  		.fmt	= "stats-%s",
>  		.magic	= STATS_MAGIC,
> -		.oflags = O_SERVICE,
> +		.oflags = O_SERVICE | O_FORCE_LOCAL,
>  	},
>  
>  	[CR_FD_IRMAP_CACHE] = {
>  		.fmt	= "irmap-cache",
>  		.magic	= IRMAP_CACHE_MAGIC,
> -		.oflags = O_SERVICE,
> +		.oflags = O_SERVICE | O_FORCE_LOCAL,
>  	},
>  
>  	[CR_FD_FILE_LOCKS_PID] = {
> diff --git a/image.c b/image.c
> index dc9d6a1..4ca740e 100644
> --- a/image.c
> +++ b/image.c
> @@ -12,6 +12,7 @@
>  #include "protobuf.h"
>  #include "protobuf/inventory.pb-c.h"
>  #include "protobuf/pagemap.pb-c.h"
> +#include "img-remote.h"
>  
>  bool fdinfo_per_id = false;
>  bool ns_per_id = false;
> @@ -306,9 +307,26 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of
>  {
>  	int ret, flags;
>  
> -	flags = oflags & ~(O_NOBUF | O_SERVICE);
> +	flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL);
>  
> -	ret = openat(dfd, path, flags, CR_FD_PERM);
> +	if (opts.remote && !(oflags & O_FORCE_LOCAL)) {
> +		char *snapshot_id = NULL;
> +
> +		snapshot_id = get_snapshot_id_from_idx(dfd);
> +
> +		if (snapshot_id == NULL)
> +			ret = -1;
> +		else if (flags == O_RDONLY) {
> +			pr_info("do_open_remote_image RDONLY path=%s snapshot_id=%s\n",
> +			  path, snapshot_id);
> +			ret = read_remote_image_connection(snapshot_id, path);
> +		} else {
> +			pr_info("do_open_remote_image WDONLY path=%s snapshot_id=%s\n",
> +			  path, snapshot_id);
> +			ret = write_remote_image_connection(snapshot_id, path, O_WRONLY);
> +		}
> +	} else
> +		ret = openat(dfd, path, flags, CR_FD_PERM);
>  	if (ret < 0) {
>  		if (!(flags & O_CREAT) && (errno == ENOENT)) {
>  			pr_info("No %s image\n", path);
> @@ -319,7 +337,6 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of
>  		pr_perror("Unable to open %s", path);
>  		goto err;
>  	}
> -
>  	img->_x.fd = ret;
>  	if (oflags & O_NOBUF)
>  		bfd_setraw(&img->_x);
> @@ -410,7 +427,9 @@ int open_image_dir(char *dir)
>  	close(fd);
>  	fd = ret;
>  
> -	if (opts.img_parent) {
> +	if (opts.remote) {
> +		init_snapshot_id(dir);
> +	} else if (opts.img_parent) {
>  		ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
>  		if (ret < 0 && errno != EEXIST) {
>  			pr_perror("Can't link parent snapshot");
> diff --git a/img-remote.c b/img-remote.c
> new file mode 100644
> index 0000000..7f4f400
> --- /dev/null
> +++ b/img-remote.c
> @@ -0,0 +1,272 @@
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <netinet/in.h>
> +#include <netdb.h>
> +#include "xmalloc.h"
> +#include "criu-log.h"
> +#include "img-remote.h"
> +#include "img-remote-proto.h"
> +#include "protobuf/remote-image.pb-c.h"
> +#include "protobuf-desc.h"
> +#include <fcntl.h>
> +#include "servicefd.h"
> +#include "compiler.h"
> +#include "cr_options.h"
> +
> +#define PB_LOCAL_IMAGE_SIZE PATHLEN
> +
> +static char *snapshot_id;
> +
> +LIST_HEAD(snapshot_head);
> +
> +/* A snapshot is a dump or pre-dump operation. Each snapshot is identified by an
> + * ID which corresponds to the working directory specified by the user.
> + */
> +struct snapshot {
> +	char snapshot_id[PATHLEN];
> +	struct list_head l;
> +};
> +
> +struct snapshot *new_snapshot(char *snapshot_id)
> +{
> +	struct snapshot *s = malloc(sizeof(struct snapshot));
> +
> +	if (!s)
> +		pr_perror("Failed to allocate snapshot structure");
> +	strncpy(s->snapshot_id, snapshot_id, PATHLEN);
> +	return s;
> +}
> +
> +void add_snapshot(struct snapshot *snapshot)
> +{
> +	list_add_tail(&(snapshot->l), &snapshot_head);
> +}
> +
> +static char *get_local_img_path()
> +{
> +	static char *local_img_path = NULL;
> +
> +	if (local_img_path != NULL)
> +		return local_img_path;
> +
> +	if (strcmp(opts.local_cache_path, DEFAULT_IMG_PATH))
> +		local_img_path = opts.local_cache_path;
> +	else if (strcmp(opts.local_proxy_path, DEFAULT_IMG_PATH))
> +		local_img_path = opts.local_proxy_path;
> +
> +	return local_img_path;
> +}
> +
> +int read_remote_image_connection(char *snapshot_id, char *path)
> +{
> +	int error;
> +	int sockfd = setup_UNIX_client_socket(get_local_img_path());
> +
> +	if (sockfd < 0) {
> +		pr_perror("Error opening local connection for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +
> +	if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) {
> +		pr_perror("Error writing header for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +
> +	if (read_reply_header(sockfd, &error) < 0) {
> +		pr_perror("Error reading reply header for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +	errno = error;
> +	if (!error)
> +		return sockfd;
> +	else if (error == ENOENT) {
> +		pr_info("Image does not exist (%s:%s)\n", path, snapshot_id);
> +		close(sockfd);
> +		return -1;
> +	}
> +	pr_perror("Unexpected error returned: %d (%s:%s)\n", error, path, snapshot_id);
> +	close(sockfd);
> +	return -1;
> +}
> +
> +int write_remote_image_connection(char *snapshot_id, char *path, int flags)
> +{
> +	int sockfd = setup_UNIX_client_socket(get_local_img_path());
> +
> +	if (sockfd < 0)
> +		return -1;
> +
> +	if (write_header(sockfd, snapshot_id, path, flags) < 0) {
> +		pr_perror("Error writing header for %s:%s", path, snapshot_id);
> +		return -1;
> +	}
> +	return sockfd;
> +}
> +
> +int finish_remote_dump()
> +{
> +	pr_info("Dump side is calling finish\n");
> +	int fd = write_remote_image_connection(NULL_SNAPSHOT_ID, DUMP_FINISH, O_WRONLY);
> +
> +	if (fd == -1) {
> +		pr_perror("Unable to open finish dump connection");
> +		return -1;
> +	}
> +
> +	close(fd);
> +	return 0;
> +}
> +
> +int skip_remote_bytes(int fd, unsigned long len)
> +{
> +	static char buf[4096];
> +	int n = 0;
> +	unsigned long curr = 0;
> +
> +	for (; curr < len; ) {
> +		n = read(fd, buf, min(len - curr, (unsigned long)4096));
> +		if (n == 0) {
> +			pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)",
> +				curr, len);
> +			return -1;
> +		} else if (n > 0) {
> +			curr += n;
> +		} else {
> +			pr_perror("Error while skipping bytes from stream (%lx/%lx)",
> +				curr, len);
> +			return -1;
> +		}
> +	}
> +
> +	if (curr != len) {
> +		pr_perror("Unable to skip the current number of bytes: %lx instead of %lx",
> +			curr, len);
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int pull_snapshot_ids()
> +{
> +	int n, sockfd;
> +	SnapshotIdEntry *ls;
> +	struct snapshot *s = NULL;
> +
> +	sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG);
> +
> +	/* The connection was successful but there is no file. */
> +	if (sockfd < 0 && errno == ENOENT)
> +		return 0;
> +	else if (sockfd < 0) {
> +		pr_perror("Unable to open snapshot id read connection");
> +		return -1;
> +	}
> +
> +	while (1) {
> +		n = pb_read_obj(sockfd, (void **)&ls, PB_SNAPSHOT_ID);
> +		if (!n) {
> +			close(sockfd);
> +			return n;
> +		} else if (n < 0) {
> +			pr_perror("Unable to read remote snapshot ids");
> +			close(sockfd);
> +			return n;
> +		}
> +
> +		s = new_snapshot(ls->snapshot_id);
> +		if (!s) {
> +			pr_perror("Unable create new snapshot structure");
> +			close(sockfd);
> +			return -1;
> +		}
> +		add_snapshot(s);
> +		pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id);
> +	}
> +	free(ls);
> +	close(sockfd);
> +	return n;
> +}
> +
> +int push_snapshot_id()
> +{
> +	int n;
> +	SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT;
> +	int sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG, O_APPEND);
> +
> +	if (sockfd < 0) {
> +		pr_perror("Unable to open snapshot id push connection");
> +		return -1;
> +	}
> +
> +	rn.snapshot_id = xmalloc(sizeof(char) * PATHLEN);
> +	if (!rn.snapshot_id) {
> +		pr_perror("Unable to allocate snapshot id buffer");
> +		close(sockfd);
> +		return -1;
> +	}
> +	strncpy(rn.snapshot_id, snapshot_id, PATHLEN);
> +
> +	n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID);
> +
> +	xfree(rn.snapshot_id);
> +	close(sockfd);
> +	return n;
> +}
> +
> +void init_snapshot_id(char *si)
> +{
> +	snapshot_id = si;
> +}
> +
> +char *get_curr_snapshot_id()
> +{
> +	return snapshot_id;
> +}
> +
> +int get_curr_snapshot_id_idx()
> +{
> +	struct snapshot *si;
> +	int idx = 0;
> +
> +	if (list_empty(&snapshot_head))
> +		pull_snapshot_ids();
> +
> +	list_for_each_entry(si, &snapshot_head, l) {
> +	if (!strncmp(si->snapshot_id, snapshot_id, PATHLEN))
> +			return idx;
> +		idx++;
> +	}
> +
> +	pr_perror("Error, could not find current snapshot id (%s) fd", snapshot_id);
> +	return -1;
> +}
> +
> +char *get_snapshot_id_from_idx(int idx)
> +{
> +	struct snapshot *si;
> +
> +	if (list_empty(&snapshot_head))
> +		pull_snapshot_ids();
> +
> +	/* Note: if idx is the service fd then we need the current
> +	 * snapshot_id idx. Else we need a parent snapshot_id idx.
> +	 */
> +	if (idx == get_service_fd(IMG_FD_OFF))
> +		idx = get_curr_snapshot_id_idx();
> +
> +	list_for_each_entry(si, &snapshot_head, l) {
> +		if (!idx)
> +			return si->snapshot_id;
> +		idx--;
> +	}
> +
> +	pr_perror("Error, could not find snapshot id for idx %d", idx);
> +	return NULL;
> +}
> +
> +int get_curr_parent_snapshot_id_idx()
> +{
> +	return get_curr_snapshot_id_idx() - 1;
> +}
> diff --git a/include/cr_options.h b/include/cr_options.h
> index af130dd..22f7e0d 100644
> --- a/include/cr_options.h
> +++ b/include/cr_options.h
> @@ -74,7 +74,7 @@ struct cr_options {
>  	struct list_head	inherit_fds;
>  	char			*libdir;
>  	bool			use_page_server;
> -	unsigned short		ps_port;
> +	unsigned short		port;
>  	char			*addr;
>  	int			ps_socket;
>  	bool			track_mem;
> @@ -91,6 +91,9 @@ struct cr_options {
>  	bool			enable_external_masters;
>  	bool			aufs;		/* auto-deteced, not via cli */
>  	bool			overlayfs;
> +	bool			remote;
> +	char			*local_cache_path;
> +	char			*local_proxy_path;
>  	size_t			ghost_limit;
>  	struct list_head	irmap_scan_paths;
>  };
> diff --git a/include/image.h b/include/image.h
> index 305febf..70363f6 100644
> --- a/include/image.h
> +++ b/include/image.h
> @@ -85,7 +85,7 @@
>   *  - unsupported
>   *  	stands for any unknown memory areas, usually means
>   *  	we don't know how to work with it and should stop
> - *  	processing exiting with error; while the rest of bits
> + *  	processing exiting with error; while the rest of bits
>   *  	are part of image ABI, this particular one must never
>   *  	be used in image.
>   */
> @@ -128,6 +128,7 @@ extern bool img_common_magic;
>  #define O_DUMP		(O_WRONLY | O_CREAT | O_TRUNC)
>  #define O_SHOW		(O_RDONLY | O_NOBUF)
>  #define O_RSTR		(O_RDONLY)
> +#define O_FORCE_LOCAL   (O_SYNC)
>  
>  struct cr_img {
>  	union {
> diff --git a/include/img-remote.h b/include/img-remote.h
> new file mode 100644
> index 0000000..706b67f
> --- /dev/null
> +++ b/include/img-remote.h
> @@ -0,0 +1,79 @@
> +#include <limits.h>
> +
> +#ifndef IMAGE_REMOTE_H
> +#define	IMAGE_REMOTE_H
> +
> +#define PATHLEN PATH_MAX
> +#define DUMP_FINISH "DUMP_FINISH"
> +#define PARENT_IMG "parent"
> +#define NULL_SNAPSHOT_ID "null"
> +#define DEFAULT_IMG_PATH "/tmp/criu-img-path.sock"
> +#define DEFAULT_CACHE_PORT 9996
> +#define DEFAULT_CACHE_HOST "localhost"
> +
> +/* Called by restore to get the fd corresponding to a particular path. This
> + * call will block until the connection is received.
> + */
> +int read_remote_image_connection(char *snapshot_id, char *path);
> +
> +/* Called by dump to create a socket connection to the restore side. The socket
> + * fd is returned for further writing operations.
> + */
> +int write_remote_image_connection(char *snapshot_id, char *path, int flags);
> +
> +/* Called by dump when everything is dumped. This function creates a new
> + * connection with a special control name. The recover side uses it to ack that
> + * no more files are coming.
> + */
> +int finish_remote_dump();
> +
> +/* Starts an image proxy daemon (dump side). It receives image files through
> + * socket connections and forwards them to the image cache (restore side).
> + */
> +int image_proxy(char *local_proxy_path, char *cache_host, unsigned short cache_port);
> +
> +/* Starts an image cache daemon (restore side). It receives image files through
> + * socket connections and caches them until they are requested by the restore
> + * process.
> + */
> +int image_cache(char *local_cache_path, unsigned short cache_port);
> +
> +/* Reads (discards) 'len' bytes from fd. This is used to emulate the function
> + * lseek, which is used to advance the file offset.
> + */
> +int skip_remote_bytes(int fd, unsigned long len);
> +
> +/* To support iterative migration, the concept of snapshot_id is introduced
> + * (only when remote migration is enabled). Each image is tagged with one
> + * snapshot_id. The snapshot_id is the image directory used for the operation
> + * that creates the image (either predump or dump). Images stored in memory
> + * (both in Image Proxy and Image Cache) are identified by their name and
> + * snapshot_id. Snapshot_ids are ordered so that we can find parent pagemaps
> + * (that will be used when restoring the process).
> + */
> +
> +/* Sets the current snapshot_id */
> +void init_snapshot_id(char *ns);
> +
> +/* Returns the current snapshot_id. */
> +char *get_curr_snapshot_id();
> +
> +/* Returns the snapshot_id index representing the current snapshot_id. This
> + * index represents the hierarchy position. For example: images tagged with
> + * the snapshot_id with index 1 are more recent than the images tagged with
> + * the snapshot_id with index 0.
> + */
> +int get_curr_snapshot_id_idx();
> +
> +/* Returns the snapshot_id associated with the snapshot_id index. */
> +char *get_snapshot_id_from_idx(int idx);
> +
> +/* Pushes the current snapshot_id into the snapshot_id hierarchy (into the Image
> + * Proxy and Image Cache).
> + */
> +int push_snapshot_id();
> +
> +/* Returns the snapshot id index that precedes the current snapshot_id. */
> +int get_curr_parent_snapshot_id_idx();
> +
> +#endif
> diff --git a/include/protobuf-desc.h b/include/protobuf-desc.h
> index ab7e4f2..ad9b37f 100644
> --- a/include/protobuf-desc.h
> +++ b/include/protobuf-desc.h
> @@ -55,16 +55,20 @@ enum {
>  	PB_CPUINFO,
>  	PB_USERNS,
>  	PB_NETNS,
> +	PB_REMOTE_IMAGE,        /* Header for images sent from proxy to cache.*/
> +	PB_LOCAL_IMAGE,         /* Header for reading/writing images from/to proxy or cache. */
> +	PB_LOCAL_IMAGE_REPLY,	/* Header for reading/writing images reply. */
> +	PB_SNAPSHOT_ID,         /* Contains a single id. Used for reading/writing ids from proxy or cache. */
>  
>  	/* PB_AUTOGEN_STOP */
>  
>  	PB_PAGEMAP_HEAD,
> -	PB_IDS,		/* 50 */
> +	PB_IDS,
>  	PB_SIGACT,
>  	PB_NETDEV,
>  	PB_REMAP_FPATH,
>  	PB_SK_QUEUES,
> -	PB_IPCNS_MSG,
> +	PB_IPCNS_MSG,		/* 60 */
>  	PB_IPCNS_MSG_ENT,
>  
>  	PB_MAX,
> diff --git a/include/util.h b/include/util.h
> index f2300a9..a732e3f 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -260,5 +260,6 @@ FILE *fopenat(int dirfd, char *path, char *cflags);
>  void split(char *str, char token, char ***out, int *n);
>  
>  int fd_has_data(int lfd);
> +size_t read_into_buffer(int fd, char *buff, size_t size);
>  
>  #endif /* __CR_UTIL_H__ */
> diff --git a/page-read.c b/page-read.c
> index 832c057..6eae937 100644
> --- a/page-read.c
> +++ b/page-read.c
> @@ -6,10 +6,13 @@
>  #include "cr_options.h"
>  #include "servicefd.h"
>  #include "page-read.h"
> +#include "util.h"
>  
>  #include "protobuf.h"
>  #include "protobuf/pagemap.pb-c.h"
>  
> +#include "img-remote.h"
> +
>  #ifndef SEEK_DATA
>  #define SEEK_DATA	3
>  #define SEEK_HOLE	4
> @@ -90,7 +93,10 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
>  		return;
>  
>  	pr_debug("\tpr%u Skip %lx bytes from page-dump\n", pr->id, len);
> -	if (!pr->pe->in_parent)
> +	if (!pr->pe->in_parent && opts.remote) {
> +		if (skip_remote_bytes(img_raw_fd(pr->pi), len) < 0)
> +			pr_perror("Unable to skip remote bytes");
> +	} else if (!pr->pe->in_parent)
>  		lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
>  	pr->cvaddr += len;
>  }
> @@ -146,10 +152,11 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, void *bu
>  			return ret;
>  	} else {
>  		int fd = img_raw_fd(pr->pi);
> -		off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
> +		/* TODO - lseek is not possible on sockets. Need to find a solution. */
> +		off_t current_vaddr = opts.remote ? 0 : lseek(fd, 0, SEEK_CUR);
>  		pr_debug("\tpr%u Read page %lx from self %lx/%"PRIx64"\n", pr->id,
>  				vaddr, pr->cvaddr, current_vaddr);
> -		ret = read(fd, buf, PAGE_SIZE);
> +		ret = read_into_buffer(fd, buf, PAGE_SIZE);
>  		if (ret != PAGE_SIZE) {
>  			pr_perror("Can't read mapping page %d", ret);
>  			return -1;
> @@ -195,9 +202,24 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
>  	int pfd, ret;
>  	struct page_read *parent = NULL;
>  
> -	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
> -	if (pfd < 0 && errno == ENOENT)
> -		goto out;
> +	if (opts.remote) {
> +		/* Note: we are replacing a real directory FD with a snapshot_id
> +		 * index. Since we need the parent of the current snapshot_id,
> +		 * we want the current snapshot_id index minus one. It is
> +		 * possible that dfd is already a snapshot_id index. We test it
> +		 * by comparing it to the service FD. When opening an image (see
> +		 * do_open_image) we convert the snapshot_id index into a real
> +		 * snapshot_id.
> +		 */
> +		pfd = dfd == get_service_fd(IMG_FD_OFF) ?
> +			get_curr_snapshot_id_idx() - 1 : dfd - 1;
> +		if (pfd < 0)
> +			goto out;
> +	} else {
> +		pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
> +		if (pfd < 0 && errno == ENOENT)
> +			goto out;
> +	}
>  
>  	parent = xmalloc(sizeof(*parent));
>  	if (!parent)
> @@ -211,8 +233,8 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
>  		xfree(parent);
>  		parent = NULL;
>  	}
> -
> -	close(pfd);
> +	if (!opts.remote)
> +		close(pfd);
>  out:
>  	pr->parent = parent;
>  	return 0;
> @@ -220,7 +242,8 @@ out:
>  err_free:
>  	xfree(parent);
>  err_cl:
> -	close(pfd);
> +	if (!opts.remote)
> +		close(pfd);
>  	return -1;
>  }
>  
> diff --git a/page-xfer.c b/page-xfer.c
> index 7465ed8..2513347 100644
> --- a/page-xfer.c
> +++ b/page-xfer.c
> @@ -17,6 +17,8 @@
>  #include "protobuf.h"
>  #include "protobuf/pagemap.pb-c.h"
>  
> +#include "img-remote.h"
> +
>  struct page_server_iov {
>  	u32	cmd;
>  	u32	nr_pages;
> @@ -290,7 +292,7 @@ static int get_sockaddr_in(struct sockaddr_in *addr)
>  		return -1;
>  	}
>  
> -	addr->sin_port = opts.ps_port;
> +	addr->sin_port = opts.port;
>  	return 0;
>  }
>  
> @@ -310,7 +312,7 @@ int cr_page_server(bool daemon_mode, int cfd)
>  		goto no_server;
>  	}
>  
> -	pr_info("Starting page server on port %u\n", (int)ntohs(opts.ps_port));
> +	pr_info("Starting page server on port %u\n", (int)ntohs(opts.port));
>  
>  	sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
>  	if (sk < 0) {
> @@ -332,14 +334,14 @@ int cr_page_server(bool daemon_mode, int cfd)
>  	}
>  
>  	/* Get socket port in case of autobind */
> -	if (opts.ps_port == 0) {
> +	if (opts.port == 0) {
>  		if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) {
>  			pr_perror("Can't get page server name");
>  			goto out;
>  		}
>  
> -		opts.ps_port = ntohs(saddr.sin_port);
> -		pr_info("Using %u port\n", opts.ps_port);
> +		opts.port = ntohs(saddr.sin_port);
> +		pr_info("Using %u port\n", opts.port);
>  	}
>  
>  no_server:
> @@ -404,7 +406,7 @@ int connect_to_page_server(void)
>  	}
>  
>  	pr_info("Connecting to server %s:%u\n",
> -			opts.addr, (int)ntohs(opts.ps_port));
> +			opts.addr, (int)ntohs(opts.port));
>  
>  	page_server_sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
>  	if (page_server_sk < 0) {
> @@ -436,7 +438,7 @@ int disconnect_from_page_server(void)
>  		return 0;
>  
>  	pr_info("Disconnect from the page server %s:%u\n",
> -			opts.addr, (int)ntohs(opts.ps_port));
> +			opts.addr, (int)ntohs(opts.port));
>  
>  	if (opts.ps_socket != -1)
>  		/*
> @@ -742,12 +744,20 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>  		int ret;
>  		int pfd;
>  
> -		pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> -		if (pfd < 0 && errno == ENOENT)
> -			goto out;
> +		if (opts.remote) {
> +			pfd = get_curr_parent_snapshot_id_idx();
> +			if (pfd == -1)
> +				goto out;
> +		} else {
> +			pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> +			if (pfd < 0 && errno == ENOENT)
> +				goto out;
> +		}
>  
>  		xfer->parent = xmalloc(sizeof(*xfer->parent));
> -		if (!xfer->parent) {
> +		if (!xfer->parent && opts.remote) {
> +			return -1;
> +		} else if (!xfer->parent) {
>  			close(pfd);
>  			return -1;
>  		}
> @@ -757,10 +767,12 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>  			pr_perror("No parent image found, though parent directory is set");
>  			xfree(xfer->parent);
>  			xfer->parent = NULL;
> -			close(pfd);
> +			if (!opts.remote)
> +				close(pfd);
>  			goto out;
>  		}
> -		close(pfd);
> +		if (!opts.remote)
> +			close(pfd);
>  	}
>  
>  out:
> @@ -852,6 +864,8 @@ int check_parent_page_xfer(int fd_type, long id)
>  {
>  	if (opts.use_page_server)
>  		return check_parent_server_xfer(fd_type, id);
> +	else if (opts.remote)
> +		return get_curr_parent_snapshot_id_idx() == -1 ? 0 : 1;
>  	else
>  		return check_parent_local_xfer(fd_type, id);
>  }
> diff --git a/protobuf-desc.c b/protobuf-desc.c
> index 873fd3b..2b58aab 100644
> --- a/protobuf-desc.c
> +++ b/protobuf-desc.c
> @@ -61,6 +61,7 @@
>  #include "protobuf/timerfd.pb-c.h"
>  #include "protobuf/cpuinfo.pb-c.h"
>  #include "protobuf/userns.pb-c.h"
> +#include "protobuf/remote-image.pb-c.h"
>  
>  struct cr_pb_message_desc cr_pb_descs[PB_MAX];
>  
> diff --git a/protobuf/Makefile b/protobuf/Makefile
> index 0b11852..f685fc6 100644
> --- a/protobuf/Makefile
> +++ b/protobuf/Makefile
> @@ -48,6 +48,7 @@ proto-obj-y	+= tty.o
>  proto-obj-y	+= file-lock.o
>  proto-obj-y	+= rlimit.o
>  proto-obj-y	+= pagemap.o
> +proto-obj-y	+= remote-image.o
>  proto-obj-y	+= siginfo.o
>  proto-obj-y	+= rpc.o
>  proto-obj-y	+= ext-file.o
> diff --git a/protobuf/remote-image.proto b/protobuf/remote-image.proto
> new file mode 100644
> index 0000000..1212627
> --- /dev/null
> +++ b/protobuf/remote-image.proto
> @@ -0,0 +1,20 @@
> +message local_image_entry {
> +	required string name		= 1;
> +	required string snapshot_id	= 2;
> +	required uint32 open_mode	= 3;
> +}
> +
> +message remote_image_entry {
> +	required string name		= 1;
> +	required string snapshot_id	= 2;
> +	required uint32 open_mode	= 3;
> +	required uint64 size		= 4;
> +}
> +
> +message local_image_reply_entry {
> +	required uint32 error           = 1;
> +}
> +
> +message snapshot_id_entry {
> +	required string snapshot_id	= 1;
> +}
> diff --git a/util.c b/util.c
> index b916eca..70e7bff 100644
> --- a/util.c
> +++ b/util.c
> @@ -845,3 +845,18 @@ int fd_has_data(int lfd)
>  
>  	return ret;
>  }
> +
> +size_t read_into_buffer(int fd, char *buff, size_t size)
> +{
> +	size_t n = 0;
> +	size_t curr = 0;
> +
> +	while (1) {
> +		n  = read(fd, buff + curr, size - curr);
> +		if (n < 1)
> +			return n;
> +		curr += n;
> +		if (curr == size)
> +			return size;
> +	}
> +}
> 



More information about the CRIU mailing list