[CRIU] [PATCH v6 7/9] unix: Add support of ghost sockets

Cyrill Gorcunov gorcunov at gmail.com
Mon May 21 23:05:59 MSK 2018


Unix sockets may be connected via a deleted socket name;
moreover, the name may be reused (i.e. same sun_path but
different inodes).

To be able to handle them we do a few tricks:

 - when collecting sockets we figure out whether the "deleted"
   mark is present on the socket and, if so, we order the
   creation and deletion of such sockets with a mutex, together
   with adding missing directories, and save their descriptors
   in fdstore if there are peers connected to them

 - on restore we connect via procfs/fd/X as suggested by
   Andrew Vagin

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
 criu/cr-restore.c      |   4 +
 criu/include/sockets.h |   1 +
 criu/sk-unix.c         | 357 +++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 309 insertions(+), 53 deletions(-)

diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index e969c24cd1d8..645a0e724970 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -384,6 +384,10 @@ static int root_prepare_shared(void)
 	if (ret)
 		goto err;
 
+	ret = unix_prepare_root_shared();
+	if (ret)
+		goto err;
+
 	ret = add_fake_unix_queuers();
 	if (ret)
 		goto err;
diff --git a/criu/include/sockets.h b/criu/include/sockets.h
index 1d0e1f29304c..f2085ace70b2 100644
--- a/criu/include/sockets.h
+++ b/criu/include/sockets.h
@@ -60,6 +60,7 @@ extern int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg
 
 extern int unix_sk_id_add(unsigned int ino);
 extern int unix_sk_ids_parse(char *optarg);
+extern int unix_prepare_root_shared(void);
 
 extern int do_dump_opt(int sk, int level, int name, void *val, int len);
 #define dump_opt(s, l, n, f)	do_dump_opt(s, l, n, f, sizeof(*f))
diff --git a/criu/sk-unix.c b/criu/sk-unix.c
index 88859da02f35..c0b607d3a1d0 100644
--- a/criu/sk-unix.c
+++ b/criu/sk-unix.c
@@ -9,6 +9,7 @@
 #include <sys/un.h>
 #include <stdlib.h>
 #include <dlfcn.h>
+#include <libgen.h>
 
 #include "libnetlink.h"
 #include "cr_options.h"
@@ -31,6 +32,7 @@
 #include "fdstore.h"
 #include "fdinfo.h"
 #include "kerndat.h"
+#include "rst-malloc.h"
 
 #include "protobuf.h"
 #include "images/sk-unix.pb-c.h"
@@ -89,11 +91,21 @@ struct unix_sk_desc {
 	UnixSkEntry		*ue;
 };
 
+/*
+ * The mutex_ghost is accessed from different tasks,
+ * so make sure it is in shared memory.
+ */
+static mutex_t *mutex_ghost;
+
 static LIST_HEAD(unix_sockets);
+static LIST_HEAD(unix_ghost_addr);
 
 static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d,
 			     UnixSkEntry *ue, const struct fd_parms *p);
 
+struct unix_sk_info;
+static int unlink_sk(struct unix_sk_info *ui);
+
 struct unix_sk_listen_icon {
 	unsigned int			peer_ino;
 	struct unix_sk_desc		*sk_desc;
@@ -886,12 +898,15 @@ struct unix_sk_info {
 	char			*name;
 	char			*name_dir;
 	unsigned		flags;
+	int			fdstore_id;
 	struct unix_sk_info	*peer;
 	struct pprep_head	peer_resolve; /* XXX : union with the above? */
 	struct file_desc	d;
 	struct list_head	connected; /* List of sockets, connected to me */
 	struct list_head	node; /* To link in peer's connected list  */
 	struct list_head	scm_fles;
+	struct list_head	ghost_node;
+	size_t			ghost_dir_pos;
 
 	/*
 	 * For DGRAM sockets with queues, we should only restore the queue
@@ -916,6 +931,8 @@ struct scm_fle {
 
 #define USK_PAIR_MASTER		0x1
 #define USK_PAIR_SLAVE		0x2
+#define USK_GHOST_FDSTORE	0x4
+#define USK_GHOST_RENAMED	0x8
 
 static struct unix_sk_info *find_unix_sk_by_ino(int ino)
 {
@@ -1241,6 +1258,7 @@ static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd,
 
 static int post_open_standalone(struct file_desc *d, int fd)
 {
+	int fdstore_fd = -1, procfs_self_dir = -1, len;
 	struct unix_sk_info *ui;
 	struct unix_sk_info *peer;
 	struct sockaddr_un addr;
@@ -1269,22 +1287,49 @@ static int post_open_standalone(struct file_desc *d, int fd)
 
 	memset(&addr, 0, sizeof(addr));
 	addr.sun_family = AF_UNIX;
-	memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
 
 	pr_info("\tConnect %d to %d\n", ui->ue->ino, peer->ue->ino);
 
-	if (prep_unix_sk_cwd(peer, &cwd_fd, NULL, &ns_fd))
+	if (prep_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd))
 		return -1;
 
-	if (connect(fd, (struct sockaddr *)&addr,
-				sizeof(addr.sun_family) +
-				peer->ue->name.len) < 0) {
+	if (peer->flags & USK_GHOST_FDSTORE) {
+		procfs_self_dir = open_proc(getpid(), "fd");
+		fdstore_fd = fdstore_get(peer->fdstore_id);
+
+		if (fdstore_fd < 0 || procfs_self_dir < 0)
+			goto err_revert_and_exit;
+
+		/*
+		 * WARNING: After this call we rely on revert_unix_sk_cwd
+		 * to restore the former directories so that connect
+		 * will operate inside proc/$pid/fd/X.
+		 */
+		if (fchdir(procfs_self_dir)) {
+			pr_perror("Can't change to procfs");
+			goto err_revert_and_exit;
+		}
+		len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%d", fdstore_fd);
+	} else {
+		memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
+		len = peer->ue->name.len;
+	}
+
+	/*
+	 * Make sure the target is not being renamed at the moment
+	 * we're connecting, for the sake of ghost sockets.
+	 */
+	mutex_lock(mutex_ghost);
+	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) {
 		pr_perror("Can't connect %d socket", ui->ue->ino);
-		revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
-		return -1;
+		goto err_revert_and_exit;
 	}
+	mutex_unlock(mutex_ghost);
+
 	ui->is_connected = true;
 
+	close_safe(&procfs_self_dir);
+	close_safe(&fdstore_fd);
 	revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
 
 restore_queue:
@@ -1296,48 +1341,126 @@ static int post_open_standalone(struct file_desc *d, int fd)
 	if (ui->queuer && !ui->queuer->peer_queue_restored)
 		return 1;
 	return restore_sk_common(fd, ui);
+
+err_revert_and_exit:
+	close_safe(&procfs_self_dir);
+	close_safe(&fdstore_fd);
+	revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
+	return -1;
 }
 
-static int bind_deleted_unix_sk(int sk, struct unix_sk_info *ui,
-					struct sockaddr_un *addr)
+static int keep_deleted(struct unix_sk_info *ui)
 {
-	char temp[PATH_MAX];
-	int ret;
+	if (ui->flags & USK_GHOST_FDSTORE) {
+		int fd = open(ui->name, O_PATH);
+		if (fd < 0) {
+			pr_perror("ghost: Can't open id %#x ino %d addr %s",
+				  ui->ue->id, ui->ue->ino, ui->name);
+			return -1;
+		}
+		ui->fdstore_id = fdstore_add(fd);
+		pr_debug("ghost: id %#x %d fdstore_id %d %s\n",
+			 ui->ue->id, ui->ue->ino, ui->fdstore_id, ui->name);
+		close(fd);
+		return ui->fdstore_id;
+	}
+	return 0;
+}
 
-	pr_info("found duplicate unix socket bound at %s\n", addr->sun_path);
+static int drop_deleted(struct unix_sk_info *ui)
+{
+	if (ui->ue->deleted)
+		return unlink_sk(ui);
+	return 0;
+}
 
-	ret = snprintf(temp, sizeof(temp),
-			"%s-%s-%d", addr->sun_path, "criu-temp", getpid());
-	/* this shouldn't happen, since sun_addr is only 108 chars long */
-	if (ret < 0 || ret >= sizeof(temp)) {
-		pr_err("snprintf of %s failed?\n", addr->sun_path);
-		return -1;
+#define UNIX_GHOST_FMT "%s.criu-sk-ghost"
+
+/*
+ * When path where socket lives is deleted, we need to reconstruct
+ * it back up but allow caller to remove it after.
+ */
+static int bind_on_deleted(int sk, struct unix_sk_info *ui)
+{
+	char path[PATH_MAX], *pos;
+	struct sockaddr_un addr;
+	int ret;
+
+	if (ui->ue->name.len >= sizeof(path)) {
+		pr_err("ghost: Too long name for socket\n");
+		return -ENOSPC;
 	}
 
-	ret = rename(addr->sun_path, temp);
-	if (ret < 0) {
-		pr_perror("couldn't move socket for binding");
-		return -1;
+	memcpy(path, ui->name, ui->ue->name.len);
+	path[ui->ue->name.len] = '\0';
+
+	for (pos = strrchr(path, '/'); pos;
+	     pos = strrchr(path, '/')) {
+		*pos = '\0';
+
+		ret = access(path, R_OK | W_OK | X_OK);
+		if (ret == 0) {
+			ui->ghost_dir_pos = pos - path;
+			pr_debug("ghost: detected F_OK %s\n", path);
+			break;
+		}
+
+		if (errno != ENOENT) {
+			ret = -errno;
+			pr_perror("ghost: Can't access %s\n", path);
+			return ret;
+		}
 	}
 
-	ret = bind(sk, (struct sockaddr *)addr,
-			sizeof(addr->sun_family) + ui->ue->name.len);
-	if (ret < 0) {
-		pr_perror("Can't bind socket after move");
-		return -1;
+	memcpy(path, ui->name, ui->ue->name.len);
+	path[ui->ue->name.len] = '\0';
+
+	pos = dirname(path);
+	pr_debug("ghost: creating %s\n", pos);
+	ret = mkdirpat(AT_FDCWD, pos, 0755);
+	if (ret) {
+		errno = -ret;
+		pr_perror("ghost: Can't create %s\n", pos);
+		return ret;
 	}
 
-	ret = rename(temp, addr->sun_path);
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
+
+	ret = bind(sk, (struct sockaddr *)&addr,
+		   sizeof(addr.sun_family) + ui->ue->name.len);
 	if (ret < 0) {
-		pr_perror("couldn't move socket back");
-		return -1;
+		/*
+		 * In case there is some real living socket
+		 * with the same name, just move it aside for a
+		 * while; we will move it back once the ghost
+		 * socket is processed.
+		 */
+		if (errno == EADDRINUSE) {
+			char path[PATH_MAX];
+
+			snprintf(path, sizeof(path), UNIX_GHOST_FMT, ui->name);
+			if (rename(ui->name, path)) {
+				ret = -errno;
+				pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s\n",
+					  ui->ue->id, ui->ue->ino, ui->name, path);
+				return ret;
+			}
+			ui->flags |= USK_GHOST_RENAMED;
+			pr_debug("ghost: id %#x ino %d renamed %s -> %s\n",
+				 ui->ue->id, ui->ue->ino, ui->name, path);
+			ret = bind(sk, (struct sockaddr *)&addr,
+				   sizeof(addr.sun_family) + ui->ue->name.len);
+		}
+		if (ret < 0) {
+			ret = -errno;
+			pr_perror("ghost: Can't bind on socket id %#x ino %d addr %s",
+				  ui->ue->id, ui->ue->ino, ui->name);
+			return ret;
+		}
 	}
 
-	/* we've handled the deleted-ness of this
-	 * socket and we don't want to delete it later
-	 * since it's not /this/ socket.
-	 */
-	ui->ue->deleted = false;
 	return 0;
 }
 
@@ -1365,22 +1488,40 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
 	addr.sun_family = AF_UNIX;
 	memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
 
-	if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, NULL, &ns_fd))
+	if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd))
 		return -1;
 
-	ret = bind(sk, (struct sockaddr *)&addr,
-			sizeof(addr.sun_family) + ui->ue->name.len);
+	/*
+	 * Order binding for the sake of ghost sockets. We might rename
+	 * an existing socket to some temp name, bind the ghost, delete
+	 * it, and finally move the former back, so while we're doing
+	 * this we must not be interrupted by connections from other
+	 * sockets.
+	 *
+	 * FIXME: Probably worth making it per-address rather than
+	 * global, for optimization's sake.
+	 */
+	mutex_lock(mutex_ghost);
+
+	if (ui->flags & USK_GHOST_FDSTORE) {
+		pr_debug("ghost: bind id %#x ino %d addr %s\n",
+			 ui->ue->id, ui->ue->ino, ui->name);
+		ret = bind_on_deleted(sk, ui);
+		if (ret)
+			errno = -ret;
+	} else {
+		pr_debug("bind id %#x ino %d addr %s\n",
+			 ui->ue->id, ui->ue->ino, ui->name);
+		ret = bind(sk, (struct sockaddr *)&addr,
+			   sizeof(addr.sun_family) + ui->ue->name.len);
+	}
 	if (ret < 0) {
-		if (ui->ue->has_deleted && ui->ue->deleted && errno == EADDRINUSE) {
-			if (bind_deleted_unix_sk(sk, ui, &addr))
-				goto done;
-		} else {
-			pr_perror("Can't bind socket");
-			goto done;
-		}
+		pr_perror("Can't bind id %#x ino %d addr %s",
+			  ui->ue->id, ui->ue->ino, ui->name);
+		goto done;
 	}
 
-	if (*ui->name && ui->ue->file_perms) {
+	if (ui->ue->file_perms) {
 		FilePermsEntry *perms = ui->ue->file_perms;
 		char fname[PATH_MAX];
 
@@ -1403,8 +1544,8 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
 		}
 	}
 
-	if (ui->ue->deleted && unlink((char *)ui->ue->name.data) < 0) {
-		pr_perror("failed to unlink %s", ui->ue->name.data);
+	if (keep_deleted(ui) < 0) {
+		pr_err("Can't save socket in fdstore\n");
 		goto done;
 	}
 
@@ -1416,6 +1557,9 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
 	exit_code = 0;
 done:
 	revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
+	if (drop_deleted(ui))
+		exit_code = -1;
+	mutex_unlock(mutex_ghost);
 	return exit_code;
 }
 
@@ -1551,11 +1695,20 @@ static int setup_second_end(int *sks, struct fdinfo_list_entry *second_end)
 static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd)
 {
 	struct unix_sk_info *queuer = ui->queuer;
-	struct fdinfo_list_entry *fle;
+	struct unix_sk_info *peer = ui->peer;
+	struct fdinfo_list_entry *fle, *fle_peer;
 	int sk;
 
 	fle = file_master(&ui->d);
 	pr_info_opening("standalone", ui, fle);
+
+	if (peer && (peer->flags & USK_GHOST_FDSTORE)) {
+		fle_peer = file_master(&peer->d);
+		if (fle_peer->stage < FLE_OPEN) {
+			return 1;
+		}
+	}
+
 	if (fle->stage == FLE_OPEN)
 		return post_open_standalone(&ui->d, fle->fe->fd);
 
@@ -1758,15 +1911,15 @@ static struct file_desc_ops unix_desc_ops = {
  * Make FS clean from sockets we're about to
  * restore. See for how we bind them for details
  */
-static void unlink_sk(struct unix_sk_info *ui)
+static int unlink_sk(struct unix_sk_info *ui)
 {
-	int ret, cwd_fd = -1, root_fd = -1, ns_fd = -1;
+	int ret = 0, cwd_fd = -1, root_fd = -1, ns_fd = -1;
 
 	if (!ui->name || ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN))
-		return;
+		return 0;
 
 	if (prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, NULL))
-		return;
+		return -1;
 
 	ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0;
 	if (ret < 0 && errno != ENOENT) {
@@ -1774,13 +1927,55 @@ static void unlink_sk(struct unix_sk_info *ui)
 			ui->ue->ino, ui->ue->peer,
 			ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
 			ui->name_dir ? ui->name_dir : "-");
+		ret = -errno;
+		goto out;
 	} else if (ret == 0) {
 		pr_debug("Unlinked socket %d peer %d (name %s dir %s)\n",
 			 ui->ue->ino, ui->ue->peer,
 			 ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
 			 ui->name_dir ? ui->name_dir : "-");
 	}
+
+	if (ui->ghost_dir_pos) {
+		char path[PATH_MAX], *pos;
+
+		memcpy(path, ui->name, ui->ue->name.len);
+		path[ui->ue->name.len] = '\0';
+
+		for (pos = strrchr(path, '/');
+		     pos && (pos - path) > ui->ghost_dir_pos;
+		     pos = strrchr(path, '/')) {
+			*pos = '\0';
+			if (rmdir(path)) {
+				ret = - errno;
+				pr_perror("ghost: Can't remove %s\n", path);
+				goto out;
+			}
+			pr_debug("ghost: Removed %s\n", path);
+		}
+	}
+
+	/*
+	 * If it was a ghost socket we should move original
+	 * socket back into place.
+	 */
+	if (ui->flags & USK_GHOST_RENAMED) {
+		char path[PATH_MAX];
+
+		snprintf(path, sizeof(path), UNIX_GHOST_FMT, ui->name);
+		if (rename(path, ui->name)) {
+			pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s\n",
+				  ui->ue->id, ui->ue->ino, path, ui->name);
+			ret = -errno;
+		} else {
+			pr_debug("ghost: id %#x ino %d addr %s -> %s\n",
+				 ui->ue->id, ui->ue->ino, path, ui->name);
+		}
+	}
+
+out:
 	revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
+	return ret;
 }
 
 static void try_resolve_unix_peer(struct unix_sk_info *ui);
@@ -1812,6 +2007,8 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
 	ui->name_dir = (void *)ue->name_dir;
 
 	ui->flags		= 0;
+	ui->fdstore_id		= -1;
+	ui->ghost_dir_pos	= 0;
 	ui->peer		= NULL;
 	ui->queuer		= NULL;
 	ui->bound		= 0;
@@ -1826,6 +2023,51 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
 	INIT_LIST_HEAD(&ui->connected);
 	INIT_LIST_HEAD(&ui->node);
 	INIT_LIST_HEAD(&ui->scm_fles);
+	INIT_LIST_HEAD(&ui->ghost_node);
+
+	return 0;
+}
+
+int unix_prepare_root_shared(void)
+{
+	struct unix_sk_info *ui, *t;
+
+	mutex_ghost = shmalloc(sizeof(*mutex_ghost));
+	if (!mutex_ghost) {
+		pr_err("ghost: Can't allocate mutex\n");
+		return -ENOMEM;
+	}
+	mutex_init(mutex_ghost);
+
+	pr_debug("ghost: Resolving addresses\n");
+
+	list_for_each_entry(ui, &unix_ghost_addr, ghost_node) {
+		pr_debug("ghost: id %#x type %s state %s ino %d peer %d address %s\n",
+			 ui->ue->id, socket_type_name(ui->ue->type),
+			 tcp_state_name(ui->ue->state),
+			 ui->ue->ino, ui->peer ? ui->peer->ue->ino : 0,
+			 ui->name);
+
+		unlink_sk(ui);
+
+		/*
+		 * Figure out who is connected to this peer,
+		 * so the name will be removed from FS only
+		 * when last one is connected.
+		 */
+		list_for_each_entry(t, &unix_sockets, list) {
+			if (t->flags & USK_GHOST_FDSTORE)
+				continue;
+			if (ui == t || t->peer != ui)
+				continue;
+
+			pr_debug("\t\tghost: id %#x type %s state %s connected to us %d -> %d\n",
+				 t->ue->id, socket_type_name(t->ue->type),
+				 tcp_state_name(t->ue->state),
+				 t->ue->ino, ui->ue->ino);
+		}
+		ui->flags |= USK_GHOST_FDSTORE;
+	}
 
 	return 0;
 }
@@ -1873,6 +2115,15 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
 		add_post_prepare_cb(&ui->peer_resolve);
 	}
 
+	if (ui->ue->deleted) {
+		if (!ui->name || !ui->ue->name.len || !ui->name[0]) {
+			pr_err("No name present, ino %d\n", ui->ue->ino);
+			return -1;
+		}
+
+		list_add_tail(&ui->ghost_node, &unix_ghost_addr);
+	}
+
 	list_add_tail(&ui->list, &unix_sockets);
 	return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops);
 }
-- 
2.14.3



More information about the CRIU mailing list