[CRIU] [PATCH RFC v3 2/4] mem: Introduce image-proxy/image-cache & remote option
Pavel Emelyanov
xemul at virtuozzo.com
Fri Aug 12 07:16:38 PDT 2016
On 08/10/2016 04:19 PM, Katerina Koukiou wrote:
> This patch introduces --remote option and image-proxy/image-cache processes.
> This leaves user the option to decide if the checkpoint data are to be stored
> on disk or sent through socket to the image-proxy.
> The latter forwards the data to the destination node where image-cache receives
> them.
>
> The overall communication is performed as follows:
> rc_node CRIU dump -> (sends images using a local socket) -> image-proxy
> |
> V
> dst_node: CRIU restore <- (receives images from a local socket) <- image-cache
>
> Running criu with --remote option is like this:
>
> dst_node# criu image-cache --port <port> -o /tmp/image-cache.log
> --local-cache-path <local_cache_path> ...
> dst_node# criu restore --remote -o /tmp/image-cache.log
> --local-cache-path <local_cache_path> ...
> src_node# criu image-proxy --port <port> --address <dst_node> -o /tmp/image-proxy.log
> --local-proxy-path <local_proxy_path> ...
> src_node# criu dump -t <pid> --remote -o /tmp/dump.log
> --local-proxy-path <local_proxy_path> ...
Katerina, thanks A LOT for picking this up :) Please, find my comments inline.
I remember that you are doing this as part of your GSoC project. How much time do
you still have to play with it?
> Signed-off-by: Rodrigo Bruno <rbruno at gsd.inesc-id.pt>
> Signed-off-by: Katerina Koukiou <k.koukiou at gmail.com>
> ---
> criu/Makefile.crtools | 4 +
> criu/cr-dump.c | 17 +++
> criu/crtools.c | 30 ++++-
> criu/image-desc.c | 4 +-
> criu/image.c | 28 ++++-
> criu/img-remote-proto.c | 4 +
> criu/img-remote.c | 278 +++++++++++++++++++++++++++++++++++++++++++
> criu/include/cr_options.h | 3 +
> criu/include/image.h | 1 +
> criu/include/img-remote.h | 79 ++++++++++++
> criu/include/protobuf-desc.h | 4 +
> criu/page-xfer.c | 26 +++-
> criu/pagemap.c | 53 +++++++--
> criu/protobuf-desc.c | 1 +
> images/Makefile | 1 +
> images/remote-image.proto | 20 ++++
> 16 files changed, 529 insertions(+), 24 deletions(-)
Although I don't dispute that these patches work :) this is too much for a
single patch. Let's try to split it into pieces.
Suggestions on how to split and other comments below.
> create mode 100644 criu/img-remote.c
> create mode 100644 criu/include/img-remote.h
> create mode 100644 images/remote-image.proto
>
> diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
> index 8e9c3b4..95a0521 100644
> --- a/criu/Makefile.crtools
> +++ b/criu/Makefile.crtools
> @@ -26,6 +26,10 @@ obj-y += files-reg.o
> obj-y += fsnotify.o
> obj-y += image-desc.o
> obj-y += image.o
> +obj-y += img-remote.o
> +obj-y += img-proxy.o
> +obj-y += img-cache.o
> +obj-y += img-remote-proto.o
> obj-y += ipc_ns.o
> obj-y += irmap.o
> obj-y += kcmp-ids.o
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 06ff2d7..5b5a232 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -84,6 +84,8 @@
>
> #include "asm/dump.h"
>
> +#include "img-remote.h"
> +
> static char loc_buf[PAGE_SIZE];
>
> void free_mappings(struct vm_area_list *vma_area_list)
> @@ -1504,6 +1506,11 @@ int cr_pre_dump_tasks(pid_t pid)
> struct pstree_item *item;
> int ret = -1;
>
> + if (opts.remote && push_snapshot_id() < 0) {
> + pr_err("Failed to push image namespace.\n");
> + goto err;
> + }
> +
> root_item = alloc_pstree_item();
> if (!root_item)
> goto err;
> @@ -1660,6 +1667,11 @@ static int cr_dump_finish(int ret)
>
> close_service_fd(CR_PROC_FD_OFF);
>
> + if (opts.remote && (finish_remote_dump() < 0)) {
> + pr_err("Finish remote dump failed.\n");
> + return post_dump_ret ? : 1;
> + }
> +
> if (ret) {
> pr_err("Dumping FAILED.\n");
> } else {
> @@ -1680,6 +1692,11 @@ int cr_dump_tasks(pid_t pid)
> pr_info("Dumping processes (pid: %d)\n", pid);
> pr_info("========================================\n");
>
> + if (opts.remote && push_snapshot_id() < 0) {
> + pr_err("Failed to push image namespace.\n");
> + goto err;
> + }
> +
> root_item = alloc_pstree_item();
> if (!root_item)
> goto err;
> diff --git a/criu/crtools.c b/criu/crtools.c
> index 7e11c22..2d1ece2 100644
> --- a/criu/crtools.c
> +++ b/criu/crtools.c
> @@ -48,6 +48,7 @@
> #include "namespaces.h"
> #include "setproctitle.h"
> #include "sysctl.h"
> +#include "img-remote.h"
>
> struct cr_options opts;
>
> @@ -72,6 +73,10 @@ void init_opts(void)
> opts.ghost_limit = DEFAULT_GHOST_LIMIT;
> opts.timeout = DEFAULT_TIMEOUT;
> opts.empty_ns = 0;
> + opts.addr = DEFAULT_CACHE_HOST;
> + opts.port = DEFAULT_CACHE_PORT;
> + opts.local_cache_path = DEFAULT_IMG_PATH;
> + opts.local_proxy_path = DEFAULT_IMG_PATH;
> }
>
> static int parse_join_ns(const char *ptr)
> @@ -324,6 +329,9 @@ int main(int argc, char *argv[], char *envp[])
> { "cgroup-props-file", required_argument, 0, 1081 },
> { "cgroup-dump-controller", required_argument, 0, 1082 },
> { SK_INFLIGHT_PARAM, no_argument, 0, 1083 },
> + { "remote", no_argument, 0, 1084 },
> + { "local-cache-path", required_argument, 0, 1085 },
> + { "local-proxy-path", required_argument, 0, 1086 },
These are paths to sockets, aren't they? We've had the same functionality
in lazy-restore and decided to drop configurable paths to sockets and
just put sockets with hard-coded names into work-dir.
> { },
> };
>
> @@ -639,6 +647,15 @@ int main(int argc, char *argv[], char *envp[])
> pr_msg("Will skip in-flight TCP connections\n");
> opts.tcp_skip_in_flight = true;
> break;
> + case 1084:
> + opts.remote = true;
> + break;
> + case 1085:
> + opts.local_cache_path = optarg;
> + break;
> + case 1086:
> + opts.local_proxy_path = optarg;
> + break;
> case 'V':
> pr_msg("Version: %s\n", CRIU_VERSION);
> if (strcmp(CRIU_GITID, "0"))
> @@ -794,6 +811,12 @@ int main(int argc, char *argv[], char *envp[])
> if (!strcmp(argv[optind], "page-server"))
> return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
>
> + if (!strcmp(argv[optind], "image-cache"))
> + return image_cache(opts.local_cache_path, opts.port);
> +
> + if (!strcmp(argv[optind], "image-proxy"))
> + return image_proxy(opts.local_proxy_path, opts.addr, opts.port);
> +
> if (!strcmp(argv[optind], "service"))
> return cr_service(opts.daemon_mode);
>
> @@ -821,6 +844,8 @@ usage:
> " criu service [<options>]\n"
> " criu dedup\n"
> " criu lazy-pages -D DIR [<options>]\n"
> +" criu image-cache [<options>]\n"
> +" criu image-proxy [<options>]\n"
> "\n"
> "Commands:\n"
> " dump checkpoint a process/tree identified by pid\n"
> @@ -833,6 +858,8 @@ usage:
> " dedup remove duplicates in memory dump\n"
> " cpuinfo dump writes cpu information into image file\n"
> " cpuinfo check validates cpu information read from image file\n"
> +" image-cache launch destination-side cache for images sent from the source-side\n"
> +" image-proxy launch source-side proxy to sent images to the destination-side\n"
> );
>
> if (usage_error) {
> @@ -864,6 +891,7 @@ usage:
> " this requires running a second instance of criu\n"
> " in lazy-pages mode: 'criu lazy-pages -D DIR'\n"
> " --lazy-pages and lazy-pages mode require userfaultfd\n"
> +" --remote dump/restore images directly to/from remote node using image-proxy/image-cache\n"
> "\n"
> "* Special resources support:\n"
> " -x|--" USK_EXT_PARAM "inode,.." " allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
> @@ -973,7 +1001,7 @@ usage:
> "\n"
> "Page/Service server options:\n"
> " --address ADDR address of server or service\n"
> -" --port PORT port of page server\n"
> +" --port PORT port of page serve or service\n"
> " -d|--daemon run in the background after creating socket\n"
> "\n"
> "Other options:\n"
> diff --git a/criu/image-desc.c b/criu/image-desc.c
> index 2b31354..e146ef8 100644
> --- a/criu/image-desc.c
> +++ b/criu/image-desc.c
> @@ -102,13 +102,13 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
> [CR_FD_STATS] = {
> .fmt = "stats-%s",
> .magic = STATS_MAGIC,
> - .oflags = O_SERVICE,
> + .oflags = O_SERVICE | O_FORCE_LOCAL,
> },
>
> [CR_FD_IRMAP_CACHE] = {
> .fmt = "irmap-cache",
> .magic = IRMAP_CACHE_MAGIC,
> - .oflags = O_SERVICE,
> + .oflags = O_SERVICE | O_FORCE_LOCAL,
Please introduce the O_FORCE_LOCAL flag in a separate patch.
> },
>
> [CR_FD_FILE_LOCKS_PID] = {
> diff --git a/criu/image.c b/criu/image.c
> index a3bb285..38a8ea9 100644
> --- a/criu/image.c
> +++ b/criu/image.c
> @@ -13,6 +13,7 @@
> #include "protobuf.h"
> #include "images/inventory.pb-c.h"
> #include "images/pagemap.pb-c.h"
> +#include "img-remote.h"
>
> bool ns_per_id = false;
> bool img_common_magic = true;
> @@ -309,11 +310,28 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of
> {
> int ret, flags;
>
> - flags = oflags & ~(O_NOBUF | O_SERVICE);
> + flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL);
>
> - ret = openat(dfd, path, flags, CR_FD_PERM);
> + if (opts.remote && !(oflags & O_FORCE_LOCAL)) {
> + char *snapshot_id = NULL;
> +
> + snapshot_id = get_snapshot_id_from_idx(dfd);
> +
> + if (snapshot_id == NULL)
> + ret = -1;
> + else if (flags == O_RDONLY) {
> + pr_debug("do_open_remote_image RDONLY path=%s snapshot_id=%s\n",
> + path, snapshot_id);
> + ret = read_remote_image_connection(snapshot_id, path);
> + } else {
> + pr_debug("do_open_remote_image WDONLY path=%s snapshot_id=%s\n",
> + path, snapshot_id);
> + ret = write_remote_image_connection(snapshot_id, path, O_WRONLY);
> + }
Can we have this if () branch moved into a separate helper function?
> + } else
> + ret = openat(dfd, path, flags, CR_FD_PERM);
> if (ret < 0) {
> - if (!(flags & O_CREAT) && (errno == ENOENT)) {
> + if (!(flags & O_CREAT) && (errno == ENOENT || ret == -ENOENT)) {
> pr_info("No %s image\n", path);
> img->_x.fd = EMPTY_IMG_FD;
> goto skip_magic;
> @@ -413,7 +431,9 @@ int open_image_dir(char *dir)
> close(fd);
> fd = ret;
>
> - if (opts.img_parent) {
> + if (opts.remote) {
> + init_snapshot_id(dir);
> + } else if (opts.img_parent) {
> ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
> if (ret < 0 && errno != EEXIST) {
> pr_perror("Can't link parent snapshot");
> diff --git a/criu/img-remote-proto.c b/criu/img-remote-proto.c
> index d8fd8cd..19cfe35 100644
> --- a/criu/img-remote-proto.c
> +++ b/criu/img-remote-proto.c
> @@ -236,6 +236,10 @@ int setup_TCP_client_socket(char *hostname, int port)
>
> int setup_UNIX_server_socket(char *path)
> {
> + if (!path) {
> + pr_err("Path should not be empty\n");
> + return -1;
> + }
Please send this fixlet as a separate patch.
> struct sockaddr_un addr;
> int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
>
> diff --git a/criu/img-remote.c b/criu/img-remote.c
> new file mode 100644
> index 0000000..9e244c6
> --- /dev/null
> +++ b/criu/img-remote.c
> @@ -0,0 +1,278 @@
> +#include <unistd.h>
> +#include <stdlib.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <netinet/in.h>
> +#include <netdb.h>
> +#include "xmalloc.h"
> +#include "criu-log.h"
> +#include "img-remote.h"
> +#include "img-remote-proto.h"
> +#include "images/remote-image.pb-c.h"
> +#include "protobuf-desc.h"
> +#include <fcntl.h>
> +#include "servicefd.h"
> +#include "compiler.h"
> +#include "cr_options.h"
> +
> +#define PB_LOCAL_IMAGE_SIZE PATHLEN
> +
> +static char *snapshot_id;
> +
> +LIST_HEAD(snapshot_head);
> +
> +/* A snapshot is a dump or pre-dump operation. Each snapshot is identified by an
> + * ID which corresponds to the working directory specefied by the user.
> + */
> +struct snapshot {
> + char snapshot_id[PATHLEN];
> + struct list_head l;
> +};
> +
> +struct snapshot *new_snapshot(char *snapshot_id)
> +{
> + struct snapshot *s = malloc(sizeof(struct snapshot));
> +
> + if (!s) {
> + pr_perror("Failed to allocate snapshot structure");
> + return NULL;
> + }
> + strncpy(s->snapshot_id, snapshot_id, PATHLEN);
> + return s;
> +}
> +
> +void add_snapshot(struct snapshot *snapshot)
> +{
> + list_add_tail(&(snapshot->l), &snapshot_head);
> +}
> +
> +static char *get_local_img_path(void)
> +{
> + static char *local_img_path = NULL;
> +
> + if (local_img_path != NULL)
> + return local_img_path;
> +
> + if (strcmp(opts.local_cache_path, DEFAULT_IMG_PATH))
> + local_img_path = opts.local_cache_path;
> + else if (strcmp(opts.local_proxy_path, DEFAULT_IMG_PATH))
> + local_img_path = opts.local_proxy_path;
> + else if (opts.local_proxy_path || opts.local_cache_path)
> + local_img_path = DEFAULT_IMG_PATH;
> + else
> + pr_err("Local img path is missing. Possible missing "
> + "--local-{cache,proxy}-path option\n");
> +
> + return local_img_path;
> +}
> +
> +int read_remote_image_connection(char *snapshot_id, char *path)
> +{
> + int error;
> + int sockfd = setup_UNIX_client_socket(get_local_img_path());
> +
> + if (sockfd < 0) {
> + pr_perror("Error opening local connection for %s:%s", path, snapshot_id);
> + return -1;
> + }
> +
> + if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) {
> + pr_perror("Error writing header for %s:%s", path, snapshot_id);
> + return -1;
> + }
> +
> + if (read_reply_header(sockfd, &error) < 0) {
> + pr_perror("Error reading reply header for %s:%s", path, snapshot_id);
> + return -1;
> + }
> + if (!error)
> + return sockfd;
> + else if (error == ENOENT) {
> + pr_info("Image does not exist (%s:%s)\n", path, snapshot_id);
> + close(sockfd);
> + return -ENOENT;
> + }
> + pr_perror("Unexpected error returned: %d (%s:%s)\n", error, path, snapshot_id);
> + close(sockfd);
> + return -1;
> +}
> +
> +int write_remote_image_connection(char *snapshot_id, char *path, int flags)
> +{
> + int sockfd = setup_UNIX_client_socket(get_local_img_path());
> +
> + if (sockfd < 0)
> + return -1;
> +
> + if (write_header(sockfd, snapshot_id, path, flags) < 0) {
> + pr_perror("Error writing header for %s:%s", path, snapshot_id);
> + return -1;
> + }
> + return sockfd;
> +}
> +
> +int finish_remote_dump(void)
> +{
> + pr_info("Dump side is calling finish\n");
> + int fd = write_remote_image_connection(NULL_SNAPSHOT_ID, DUMP_FINISH, O_WRONLY);
> +
> + if (fd == -1) {
> + pr_perror("Unable to open finish dump connection");
> + return -1;
> + }
> +
> + close(fd);
> + return 0;
> +}
> +
> +int skip_remote_bytes(int fd, unsigned long len)
> +{
> + static char buf[4096];
> + int n = 0;
> + unsigned long curr = 0;
> +
> + for (; curr < len; ) {
> + n = read(fd, buf, min(len - curr, (unsigned long)4096));
> + if (n == 0) {
> + pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)",
> + curr, len);
> + return -1;
> + } else if (n > 0) {
> + curr += n;
> + } else {
> + pr_perror("Error while skipping bytes from stream (%lx/%lx)",
> + curr, len);
> + return -1;
> + }
> + }
> +
> + if (curr != len) {
> + pr_perror("Unable to skip the current number of bytes: %lx instead of %lx",
> + curr, len);
> + return -1;
> + }
> + return 0;
> +}
> +
> +static int pull_snapshot_ids(void)
> +{
> + int n, sockfd;
> + SnapshotIdEntry *ls;
> + struct snapshot *s = NULL;
> +
> + sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG);
> +
> + /* The connection was successful but there is not file. */
> + if (sockfd < 0 && errno == ENOENT)
> + return 0;
> + else if (sockfd < 0) {
> + pr_perror("Unable to open snapshot id read connection");
> + return -1;
> + }
> +
> + while (1) {
> + n = pb_read_obj(sockfd, (void **)&ls, PB_SNAPSHOT_ID);
> + if (!n) {
> + close(sockfd);
> + return n;
> + } else if (n < 0) {
> + pr_perror("Unable to read remote snapshot ids");
> + close(sockfd);
> + return n;
> + }
> +
> + s = new_snapshot(ls->snapshot_id);
> + if (!s) {
> + pr_perror("Unable create new snapshot structure");
> + close(sockfd);
> + return -1;
> + }
> + add_snapshot(s);
> + pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id);
> + }
> + free(ls);
> + close(sockfd);
> + return n;
> +}
> +
> +int push_snapshot_id(void)
> +{
> + int n;
> + SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT;
> + int sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG, O_APPEND);
> +
> + if (sockfd < 0) {
> + pr_perror("Unable to open snapshot id push connection");
> + return -1;
> + }
> +
> + rn.snapshot_id = xmalloc(sizeof(char) * PATHLEN);
> + if (!rn.snapshot_id) {
> + pr_perror("Unable to allocate snapshot id buffer");
> + close(sockfd);
> + return -1;
> + }
> + strncpy(rn.snapshot_id, snapshot_id, PATHLEN);
> +
> + n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID);
> +
> + xfree(rn.snapshot_id);
> + close(sockfd);
> + return n;
> +}
> +
> +void init_snapshot_id(char *si)
> +{
> + snapshot_id = si;
> +}
> +
> +char *get_curr_snapshot_id(void)
> +{
> + return snapshot_id;
> +}
> +
> +int get_curr_snapshot_id_idx(void)
> +{
> + struct snapshot *si;
> + int idx = 0;
> +
> + if (list_empty(&snapshot_head))
> + pull_snapshot_ids();
> +
> + list_for_each_entry(si, &snapshot_head, l) {
> + if (!strncmp(si->snapshot_id, snapshot_id, PATHLEN))
> + return idx;
> + idx++;
> + }
> +
> + pr_perror("Error, could not find current snapshot id (%s) fd", snapshot_id);
> + return -1;
> +}
> +
> +char *get_snapshot_id_from_idx(int idx)
> +{
> + struct snapshot *si;
> +
> + if (list_empty(&snapshot_head))
> + pull_snapshot_ids();
> +
> + /* Note: if idx is the service fd then we need the current
> + * snapshot_id idx. Else we need a parent snapshot_id idx.
> + */
> + if (idx == get_service_fd(IMG_FD_OFF))
> + idx = get_curr_snapshot_id_idx();
> +
> + list_for_each_entry(si, &snapshot_head, l) {
> + if (!idx)
> + return si->snapshot_id;
> + idx--;
> + }
> +
> + pr_perror("Error, could not find snapshot id for idx %d", idx);
> + return NULL;
> +}
> +
> +int get_curr_parent_snapshot_id_idx(void)
> +{
> + return get_curr_snapshot_id_idx() - 1;
> +}
> diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
> index 35c1ace..6f1b202 100644
> --- a/criu/include/cr_options.h
> +++ b/criu/include/cr_options.h
> @@ -113,6 +113,9 @@ struct cr_options {
> bool lazy_pages;
> bool tcp_skip_in_flight;
> char *work_dir;
> + bool remote;
> + char *local_cache_path;
> + char *local_proxy_path;
> };
>
> extern struct cr_options opts;
> diff --git a/criu/include/image.h b/criu/include/image.h
> index 65b7b0a..9ba6ab8 100644
> --- a/criu/include/image.h
> +++ b/criu/include/image.h
> @@ -104,6 +104,7 @@ extern bool img_common_magic;
> #define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC)
> #define O_SHOW (O_RDONLY | O_NOBUF)
> #define O_RSTR (O_RDONLY)
> +#define O_FORCE_LOCAL (O_SYNC)
>
> struct cr_img {
> union {
> diff --git a/criu/include/img-remote.h b/criu/include/img-remote.h
> new file mode 100644
> index 0000000..706b67f
> --- /dev/null
> +++ b/criu/include/img-remote.h
> @@ -0,0 +1,79 @@
> +#include <limits.h>
> +
> +#ifndef IMAGE_REMOTE_H
> +#define IMAGE_REMOTE_H
> +
> +#define PATHLEN PATH_MAX
> +#define DUMP_FINISH "DUMP_FINISH"
> +#define PARENT_IMG "parent"
> +#define NULL_SNAPSHOT_ID "null"
> +#define DEFAULT_IMG_PATH "/tmp/criu-img-path.sock"
> +#define DEFAULT_CACHE_PORT 9996
> +#define DEFAULT_CACHE_HOST "localhost"
> +
> +/* Called by restore to get the fd correspondent to a particular path. This call
> + * will block until the connection is received.
> + */
> +int read_remote_image_connection(char *snapshot_id, char *path);
> +
> +/* Called by dump to create a socket connection to the restore side. The socket
> + * fd is returned for further writing operations.
> + */
> +int write_remote_image_connection(char *snapshot_id, char *path, int flags);
> +
> +/* Called by dump when everything is dumped. This function creates a new
> + * connection with a special control name. The recover side uses it to ack that
> + * no more files are coming.
> + */
> +int finish_remote_dump();
> +
> +/* Starts an image proxy daemon (dump side). It receives image files through
> + * socket connections and forwards them to the image cache (restore side).
> + */
> +int image_proxy(char *local_proxy_path, char *cache_host, unsigned short cache_port);
> +
> +/* Starts an image cache daemon (restore side). It receives image files through
> + * socket connections and caches them until they are requested by the restore
> + * process.
> + */
> +int image_cache(char *local_cache_path, unsigned short cache_port);
> +
> +/* Reads (discards) 'len' bytes from fd. This is used to emulate the function
> + * lseek, which is used to advance the file needle.
> + */
> +int skip_remote_bytes(int fd, unsigned long len);
> +
> +/* To support iterative migration, the concept of snapshot_id is introduced
> + * (only when remote migration is enabled). Each image is tagged with one
> + * snapshot_id. The snapshot_id is the image directory used for the operation
> + * that creates the image (either predump or dump). Images stored in memory
> + * (both in Image Proxy and Image Cache) are identified by their name and
> + * snapshot_id. Snapshot_ids are ordered so that we can find parent pagemaps
> + * (that will be used when restoring the process).
> + */
> +
> +/* Sets the current snapshot_id */
> +void init_snapshot_id(char *ns);
> +
> +/* Returns the current snapshot_id. */
> +char *get_curr_snapshot_id();
> +
> +/* Returns the snapshot_id index representing the current snapshot_id. This
> + * index represents the hierarchy position. For example: images tagged with
> + * the snapshot_id with index 1 are more recent than the images tagged with
> + * the snapshot_id with index 0.
> + */
> +int get_curr_snapshot_id_idx();
> +
> +/* Returns the snapshot_id associated with the snapshot_id index. */
> +char *get_snapshot_id_from_idx(int idx);
> +
> +/* Pushes the current snapshot_id into the snapshot_id hierarchy (into the Image
> + * Proxy and Image Cache).
> + */
> +int push_snapshot_id();
> +
> +/* Returns the snapshot id index that preceeds the current snapshot_id. */
> +int get_curr_parent_snapshot_id_idx();
> +
> +#endif
> diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h
> index 6c76b49..43ac534 100644
> --- a/criu/include/protobuf-desc.h
> +++ b/criu/include/protobuf-desc.h
> @@ -59,6 +59,10 @@ enum {
> PB_BINFMT_MISC, /* 50 */
> PB_TTY_DATA,
> PB_AUTOFS,
> + PB_REMOTE_IMAGE, /* Header for images sent from proxy to cache.*/
> + PB_LOCAL_IMAGE, /* Header for reading/writing images from/to proxy or cache. */
> + PB_LOCAL_IMAGE_REPLY, /* Header for reading/writing images reply. */
> + PB_SNAPSHOT_ID, /* Contains a single id. Used for reading/writing ids from proxy or cache. */
>
> /* PB_AUTOGEN_STOP */
>
> diff --git a/criu/page-xfer.c b/criu/page-xfer.c
> index 0da20e2..678ed85 100644
> --- a/criu/page-xfer.c
> +++ b/criu/page-xfer.c
> @@ -19,6 +19,8 @@
> #include "pstree.h"
> #include "parasite-syscall.h"
>
> +#include "img-remote.h"
> +
> static int page_server_sk = -1;
>
> struct page_server_iov {
> @@ -310,7 +312,8 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
>
> xfer->parent = xmalloc(sizeof(*xfer->parent));
> if (!xfer->parent) {
> - close(pfd);
> + if (!opts.remote)
> + close(pfd);
> return -1;
> }
>
> @@ -319,10 +322,12 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
> pr_perror("No parent image found, though parent directory is set");
> xfree(xfer->parent);
> xfer->parent = NULL;
> - close(pfd);
> + if (!opts.remote)
> + close(pfd);
> goto out;
> }
> - close(pfd);
> + if (!opts.remote)
> + close(pfd);
> }
>
> out:
> @@ -459,9 +464,16 @@ int check_parent_local_xfer(int fd_type, int id)
> struct stat st;
> int ret, pfd;
>
> - pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> - if (pfd < 0 && errno == ENOENT)
> - return 0;
> + if (opts.remote) {
> + pfd = get_curr_parent_snapshot_id_idx();
> + pr_err("Unable to get parent snapsgot id");
> + if (pfd == -1)
> + return -1;
> + } else {
> + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
> + if (pfd < 0 && errno == ENOENT)
> + return 0;
> + }
>
> snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id);
> ret = fstatat(pfd, path, &st, 0);
> @@ -523,6 +535,8 @@ int check_parent_page_xfer(int fd_type, long id)
> {
> if (opts.use_page_server)
> return check_parent_server_xfer(fd_type, id);
> + else if (opts.remote)
> + return get_curr_parent_snapshot_id_idx() == -1 ? 0 : 1;
> else
> return check_parent_local_xfer(fd_type, id);
> }
> diff --git a/criu/pagemap.c b/criu/pagemap.c
> index 227d561..81b31c5 100644
> --- a/criu/pagemap.c
> +++ b/criu/pagemap.c
> @@ -11,6 +11,8 @@
> #include "protobuf.h"
> #include "images/pagemap.pb-c.h"
>
> +#include "img-remote.h"
> +
> #ifndef SEEK_DATA
> #define SEEK_DATA 3
> #define SEEK_HOLE 4
> @@ -139,6 +141,8 @@ static int get_pagemap(struct page_read *pr, struct iovec *iov)
> if (!pe->zero)
> break;
> put_pagemap(pr);
> +
> + pe = pr->pmes[pr->curr_pme];
What is this hunk doing?
> }
>
> pagemap2iovec(pe, iov);
> @@ -160,7 +164,7 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
> return;
>
> pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
> - if (!pr->pe->in_parent && !pr->pe->zero && !pr->pe->lazy)
> + if (!pr->pe->in_parent && !pr->pe->zero && !pr->pe->lazy && !opts.remote)
This place also needs an explanation.
> pr->pi_off += len;
> pr->cvaddr += len;
> }
> @@ -268,12 +272,18 @@ static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr,
> } else {
> int fd = img_raw_fd(pr->pi);
> off_t current_vaddr = lseek(fd, pr->pi_off, SEEK_SET);
> + size_t curr = 0;
>
> pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, current_vaddr);
> - ret = read(fd, buf, len);
> - if (ret != len) {
> - pr_perror("Can't read mapping page %d", ret);
> - return -1;
> + while (1) {
> + ret = read(fd, buf + curr, len - curr);
> + if (ret < 1) {
> + pr_perror("Can't read mapping page %d", ret);
> + return -1;
> + }
> + curr += ret;
> + if (curr == len)
> + break;
Please send this as a separate patch.
> }
>
> pr->pi_off += len;
> @@ -345,9 +355,24 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
> int pfd, ret;
> struct page_read *parent = NULL;
>
> - pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
> - if (pfd < 0 && errno == ENOENT)
> - goto out;
> + if (opts.remote) {
> + /* Note: we are replacing a real directory FD for a snapshot_id
> + * index. Since we need the parent of the current snapshot_id,
> + * we want the current snapshot_id index minus one. It is
> + * possible that dfd is already a snapshot_id index. We test it
> + * by comparing it to the service FD. When opening an image (see
> + * do_open_image) we convert the snapshot_id index into a real
> + * snapshot_id.
> + */
> + pfd = dfd == get_service_fd(IMG_FD_OFF) ?
> + get_curr_snapshot_id_idx() - 1 : dfd - 1;
> + if (pfd < 0)
> + goto out;
> + } else {
> + pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
> + if (pfd < 0 && errno == ENOENT)
> + goto out;
> + }
>
> parent = xmalloc(sizeof(*parent));
> if (!parent)
> @@ -362,7 +387,8 @@ static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
> parent = NULL;
> }
>
> - close(pfd);
> + if (!opts.remote)
> + close(pfd);
> out:
> pr->parent = parent;
> return 0;
> @@ -370,7 +396,8 @@ out:
> err_free:
> xfree(parent);
> err_cl:
> - close(pfd);
> + if (!opts.remote)
> + close(pfd);
> return -1;
> }
>
> @@ -387,7 +414,11 @@ static int init_pagemaps(struct page_read *pr)
> off_t fsize;
> int nr_pmes, nr_realloc;
>
> - fsize = img_raw_size(pr->pmi);
> + if (!opts.remote)
> + fsize = img_raw_size(pr->pmi);
> + else
> + fsize = 1024; /*FIXME*/
> +
> if (fsize < 0)
> return -1;
>
> diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c
> index 9352a76..c1850f9 100644
> --- a/criu/protobuf-desc.c
> +++ b/criu/protobuf-desc.c
> @@ -64,6 +64,7 @@
> #include "images/seccomp.pb-c.h"
> #include "images/binfmt-misc.pb-c.h"
> #include "images/autofs.pb-c.h"
> +#include "images/remote-image.pb-c.h"
>
> struct cr_pb_message_desc cr_pb_descs[PB_MAX];
>
> diff --git a/images/Makefile b/images/Makefile
> index cf50794..3753d62 100644
> --- a/images/Makefile
> +++ b/images/Makefile
> @@ -60,6 +60,7 @@ proto-obj-y += binfmt-misc.o
> proto-obj-y += time.o
> proto-obj-y += sysctl.o
> proto-obj-y += autofs.o
> +proto-obj-y += remote-image.o
>
> CFLAGS += -iquote $(obj)/
>
> diff --git a/images/remote-image.proto b/images/remote-image.proto
> new file mode 100644
> index 0000000..1212627
> --- /dev/null
> +++ b/images/remote-image.proto
> @@ -0,0 +1,20 @@
> +message local_image_entry {
> + required string name = 1;
> + required string snapshot_id = 2;
> + required uint32 open_mode = 3;
> +}
> +
> +message remote_image_entry {
> + required string name = 1;
> + required string snapshot_id = 2;
> + required uint32 open_mode = 3;
> + required uint64 size = 4;
> +}
> +
> +message local_image_reply_entry {
> + required uint32 error = 1;
> +}
> +
> +message snapshot_id_entry {
> + required string snapshot_id = 1;
> +}
>
That's the protocol. Can we have it separately (and the code too), please?
-- Pavel
More information about the CRIU
mailing list