[CRIU] [PATCH v7 7/9] unix: Add support of ghost sockets
Cyrill Gorcunov
gorcunov at gmail.com
Wed May 23 19:06:13 MSK 2018
Unix sockets may be connected via a deleted socket name;
moreover, the name may be reused (i.e. the same sun_path but
different inodes).
To be able to handle them we do a few tricks:
- when collecting sockets we figure out whether the "deleted"
mark is present on the socket and, if so, we order the
creation and deletion of such sockets with a mutex, together
with adding the missing directories, and save their descriptors
in fdstore if there are peers connected to them
- on restore we connect via procfs/fd/X, as suggested by
Andrew Vagin
Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
criu/cr-restore.c | 4 +
criu/include/sockets.h | 1 +
criu/sk-unix.c | 353 +++++++++++++++++++++++++++++++++++++++++--------
3 files changed, 305 insertions(+), 53 deletions(-)
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index e969c24cd1d8..645a0e724970 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -384,6 +384,10 @@ static int root_prepare_shared(void)
if (ret)
goto err;
+ ret = unix_prepare_root_shared();
+ if (ret)
+ goto err;
+
ret = add_fake_unix_queuers();
if (ret)
goto err;
diff --git a/criu/include/sockets.h b/criu/include/sockets.h
index 1d0e1f29304c..f2085ace70b2 100644
--- a/criu/include/sockets.h
+++ b/criu/include/sockets.h
@@ -60,6 +60,7 @@ extern int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg
extern int unix_sk_id_add(unsigned int ino);
extern int unix_sk_ids_parse(char *optarg);
+extern int unix_prepare_root_shared(void);
extern int do_dump_opt(int sk, int level, int name, void *val, int len);
#define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f))
diff --git a/criu/sk-unix.c b/criu/sk-unix.c
index 88859da02f35..e39f313aaa0b 100644
--- a/criu/sk-unix.c
+++ b/criu/sk-unix.c
@@ -9,6 +9,7 @@
#include <sys/un.h>
#include <stdlib.h>
#include <dlfcn.h>
+#include <libgen.h>
#include "libnetlink.h"
#include "cr_options.h"
@@ -31,6 +32,7 @@
#include "fdstore.h"
#include "fdinfo.h"
#include "kerndat.h"
+#include "rst-malloc.h"
#include "protobuf.h"
#include "images/sk-unix.pb-c.h"
@@ -89,11 +91,21 @@ struct unix_sk_desc {
UnixSkEntry *ue;
};
+/*
+ * The mutex_ghost is accessed from different tasks,
+ * so make sure it is in shared memory.
+ */
+static mutex_t *mutex_ghost;
+
static LIST_HEAD(unix_sockets);
+static LIST_HEAD(unix_ghost_addr);
static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d,
UnixSkEntry *ue, const struct fd_parms *p);
+struct unix_sk_info;
+static int unlink_sk(struct unix_sk_info *ui);
+
struct unix_sk_listen_icon {
unsigned int peer_ino;
struct unix_sk_desc *sk_desc;
@@ -886,12 +898,15 @@ struct unix_sk_info {
char *name;
char *name_dir;
unsigned flags;
+ int fdstore_id;
struct unix_sk_info *peer;
struct pprep_head peer_resolve; /* XXX : union with the above? */
struct file_desc d;
struct list_head connected; /* List of sockets, connected to me */
struct list_head node; /* To link in peer's connected list */
struct list_head scm_fles;
+ struct list_head ghost_node;
+ size_t ghost_dir_pos;
/*
* For DGRAM sockets with queues, we should only restore the queue
@@ -916,6 +931,8 @@ struct scm_fle {
#define USK_PAIR_MASTER 0x1
#define USK_PAIR_SLAVE 0x2
+#define USK_GHOST_FDSTORE 0x4 /* bound but removed address */
+#define USK_GHOST_RENAMED 0x8 /* temporary renamed address */
static struct unix_sk_info *find_unix_sk_by_ino(int ino)
{
@@ -1241,6 +1258,7 @@ static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd,
static int post_open_standalone(struct file_desc *d, int fd)
{
+ int fdstore_fd = -1, procfs_self_dir = -1, len;
struct unix_sk_info *ui;
struct unix_sk_info *peer;
struct sockaddr_un addr;
@@ -1269,22 +1287,49 @@ static int post_open_standalone(struct file_desc *d, int fd)
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
- memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
pr_info("\tConnect %d to %d\n", ui->ue->ino, peer->ue->ino);
- if (prep_unix_sk_cwd(peer, &cwd_fd, NULL, &ns_fd))
+ if (prep_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd))
return -1;
- if (connect(fd, (struct sockaddr *)&addr,
- sizeof(addr.sun_family) +
- peer->ue->name.len) < 0) {
+ if (peer->flags & USK_GHOST_FDSTORE) {
+ procfs_self_dir = open_proc(getpid(), "fd");
+ fdstore_fd = fdstore_get(peer->fdstore_id);
+
+ if (fdstore_fd < 0 || procfs_self_dir < 0)
+ goto err_revert_and_exit;
+
+ /*
+ * WARNING: After this call we rely on revert_unix_sk_cwd
+ * to restore the former directories so that connect
+ * will operate inside proc/$pid/fd/X.
+ */
+ if (fchdir(procfs_self_dir)) {
+ pr_perror("Can't change to procfs");
+ goto err_revert_and_exit;
+ }
+ len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%d", fdstore_fd);
+ } else {
+ memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
+ len = peer->ue->name.len;
+ }
+
+ /*
+ * Make sure the target is not being renamed at the moment
+ * while we're connecting in sake of ghost sockets.
+ */
+ mutex_lock(mutex_ghost);
+ if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) {
pr_perror("Can't connect %d socket", ui->ue->ino);
- revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
- return -1;
+ goto err_revert_and_exit;
}
+ mutex_unlock(mutex_ghost);
+
ui->is_connected = true;
+ close_safe(&procfs_self_dir);
+ close_safe(&fdstore_fd);
revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
restore_queue:
@@ -1296,48 +1341,126 @@ static int post_open_standalone(struct file_desc *d, int fd)
if (ui->queuer && !ui->queuer->peer_queue_restored)
return 1;
return restore_sk_common(fd, ui);
+
+err_revert_and_exit:
+ close_safe(&procfs_self_dir);
+ close_safe(&fdstore_fd);
+ revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
+ return -1;
}
-static int bind_deleted_unix_sk(int sk, struct unix_sk_info *ui,
- struct sockaddr_un *addr)
+static int keep_deleted(struct unix_sk_info *ui)
{
- char temp[PATH_MAX];
- int ret;
+ if (ui->flags & USK_GHOST_FDSTORE) {
+ int fd = open(ui->name, O_PATH);
+ if (fd < 0) {
+ pr_perror("ghost: Can't open id %#x ino %d addr %s",
+ ui->ue->id, ui->ue->ino, ui->name);
+ return -1;
+ }
+ ui->fdstore_id = fdstore_add(fd);
+ pr_debug("ghost: id %#x %d fdstore_id %d %s\n",
+ ui->ue->id, ui->ue->ino, ui->fdstore_id, ui->name);
+ close(fd);
+ return ui->fdstore_id;
+ }
+ return 0;
+}
- pr_info("found duplicate unix socket bound at %s\n", addr->sun_path);
+static int drop_deleted(struct unix_sk_info *ui)
+{
+ if (ui->ue->deleted)
+ return unlink_sk(ui);
+ return 0;
+}
- ret = snprintf(temp, sizeof(temp),
- "%s-%s-%d", addr->sun_path, "criu-temp", getpid());
- /* this shouldn't happen, since sun_addr is only 108 chars long */
- if (ret < 0 || ret >= sizeof(temp)) {
- pr_err("snprintf of %s failed?\n", addr->sun_path);
- return -1;
+#define UNIX_GHOST_FMT "%s.criu-sk-ghost"
+
+/*
+ * When path where socket lives is deleted, we need to reconstruct
+ * it back up but allow caller to remove it after.
+ */
+static int bind_on_deleted(int sk, struct unix_sk_info *ui)
+{
+ char path[PATH_MAX], *pos;
+ struct sockaddr_un addr;
+ int ret;
+
+ if (ui->ue->name.len >= sizeof(path)) {
+ pr_err("ghost: Too long name for socket\n");
+ return -ENOSPC;
}
- ret = rename(addr->sun_path, temp);
- if (ret < 0) {
- pr_perror("couldn't move socket for binding");
- return -1;
+ memcpy(path, ui->name, ui->ue->name.len);
+ path[ui->ue->name.len] = '\0';
+
+ for (pos = strrchr(path, '/'); pos;
+ pos = strrchr(path, '/')) {
+ *pos = '\0';
+
+ ret = access(path, R_OK | W_OK | X_OK);
+ if (ret == 0) {
+ ui->ghost_dir_pos = pos - path;
+ pr_debug("ghost: detected F_OK %s\n", path);
+ break;
+ }
+
+ if (errno != ENOENT) {
+ ret = -errno;
+ pr_perror("ghost: Can't access %s\n", path);
+ return ret;
+ }
}
- ret = bind(sk, (struct sockaddr *)addr,
- sizeof(addr->sun_family) + ui->ue->name.len);
- if (ret < 0) {
- pr_perror("Can't bind socket after move");
- return -1;
+ memcpy(path, ui->name, ui->ue->name.len);
+ path[ui->ue->name.len] = '\0';
+
+ pos = dirname(path);
+ pr_debug("ghost: creating %s\n", pos);
+ ret = mkdirpat(AT_FDCWD, pos, 0755);
+ if (ret) {
+ errno = -ret;
+ pr_perror("ghost: Can't create %s\n", pos);
+ return ret;
}
- ret = rename(temp, addr->sun_path);
+ memset(&addr, 0, sizeof(addr));
+ addr.sun_family = AF_UNIX;
+ memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
+
+ ret = bind(sk, (struct sockaddr *)&addr,
+ sizeof(addr.sun_family) + ui->ue->name.len);
if (ret < 0) {
- pr_perror("couldn't move socket back");
- return -1;
+ /*
+ * In case if there some real living socket
+ * with same name just move it aside for a
+ * while, we will move it back once ghost
+ * socket is processed.
+ */
+ if (errno == EADDRINUSE) {
+ char path[PATH_MAX];
+
+ snprintf(path, sizeof(path), UNIX_GHOST_FMT, ui->name);
+ if (rename(ui->name, path)) {
+ ret = -errno;
+ pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s\n",
+ ui->ue->id, ui->ue->ino, ui->name, path);
+ return ret;
+ }
+ ui->flags |= USK_GHOST_RENAMED;
+ pr_debug("ghost: id %#x ino %d renamed %s -> %s\n",
+ ui->ue->id, ui->ue->ino, ui->name, path);
+ ret = bind(sk, (struct sockaddr *)&addr,
+ sizeof(addr.sun_family) + ui->ue->name.len);
+ }
+ if (ret < 0) {
+ ret = -errno;
+ pr_perror("ghost: Can't bind on socket id %#x ino %d addr %s",
+ ui->ue->id, ui->ue->ino, ui->name);
+ return ret;
+ }
}
- /* we've handled the deleted-ness of this
- * socket and we don't want to delete it later
- * since it's not /this/ socket.
- */
- ui->ue->deleted = false;
return 0;
}
@@ -1365,22 +1488,40 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
addr.sun_family = AF_UNIX;
memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
- if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, NULL, &ns_fd))
+ if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd))
return -1;
- ret = bind(sk, (struct sockaddr *)&addr,
- sizeof(addr.sun_family) + ui->ue->name.len);
+ /*
+ * Order binding for sake of ghost sockets. We might rename
+ * existing socket to some temp name, bind ghost, delete it,
+ * and finally move the former back, thus while we're doing
+ * this stuff we should not be interruped by connection
+ * from another sockets.
+ *
+ * FIXME: Probably wort make it per address rather for
+ * optimization sake.
+ */
+ mutex_lock(mutex_ghost);
+
+ if (ui->flags & USK_GHOST_FDSTORE) {
+ pr_debug("ghost: bind id %#x ino %d addr %s\n",
+ ui->ue->id, ui->ue->ino, ui->name);
+ ret = bind_on_deleted(sk, ui);
+ if (ret)
+ errno = -ret;
+ } else {
+ pr_debug("bind id %#x ino %d addr %s\n",
+ ui->ue->id, ui->ue->ino, ui->name);
+ ret = bind(sk, (struct sockaddr *)&addr,
+ sizeof(addr.sun_family) + ui->ue->name.len);
+ }
if (ret < 0) {
- if (ui->ue->has_deleted && ui->ue->deleted && errno == EADDRINUSE) {
- if (bind_deleted_unix_sk(sk, ui, &addr))
- goto done;
- } else {
- pr_perror("Can't bind socket");
- goto done;
- }
+ pr_perror("Can't bind id %#x ino %d addr %s",
+ ui->ue->id, ui->ue->ino, ui->name);
+ goto done;
}
- if (*ui->name && ui->ue->file_perms) {
+ if (ui->ue->file_perms) {
FilePermsEntry *perms = ui->ue->file_perms;
char fname[PATH_MAX];
@@ -1403,8 +1544,8 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
}
}
- if (ui->ue->deleted && unlink((char *)ui->ue->name.data) < 0) {
- pr_perror("failed to unlink %s", ui->ue->name.data);
+ if (keep_deleted(ui) < 0) {
+ pr_err("Can't save socket in fdstore\n");
goto done;
}
@@ -1416,6 +1557,9 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
exit_code = 0;
done:
revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
+ if (drop_deleted(ui))
+ exit_code = -1;
+ mutex_unlock(mutex_ghost);
return exit_code;
}
@@ -1551,11 +1695,27 @@ static int setup_second_end(int *sks, struct fdinfo_list_entry *second_end)
static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd)
{
struct unix_sk_info *queuer = ui->queuer;
- struct fdinfo_list_entry *fle;
+ struct unix_sk_info *peer = ui->peer;
+ struct fdinfo_list_entry *fle, *fle_peer;
int sk;
fle = file_master(&ui->d);
pr_info_opening("standalone", ui, fle);
+
+ /*
+ * If we're about to connect to the peer which
+ * has been bound to removed address we should
+ * wait until it is processed and put into fdstore
+ * engine, later we will use the engine to connect
+ * into it in a special way.
+ */
+ if (peer && (peer->flags & USK_GHOST_FDSTORE)) {
+ fle_peer = file_master(&peer->d);
+ if (fle_peer->stage < FLE_OPEN) {
+ return 1;
+ }
+ }
+
if (fle->stage == FLE_OPEN)
return post_open_standalone(&ui->d, fle->fe->fd);
@@ -1758,15 +1918,15 @@ static struct file_desc_ops unix_desc_ops = {
* Make FS clean from sockets we're about to
* restore. See for how we bind them for details
*/
-static void unlink_sk(struct unix_sk_info *ui)
+static int unlink_sk(struct unix_sk_info *ui)
{
- int ret, cwd_fd = -1, root_fd = -1, ns_fd = -1;
+ int ret = 0, cwd_fd = -1, root_fd = -1, ns_fd = -1;
if (!ui->name || ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN))
- return;
+ return 0;
if (prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, NULL))
- return;
+ return -1;
ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0;
if (ret < 0 && errno != ENOENT) {
@@ -1774,13 +1934,55 @@ static void unlink_sk(struct unix_sk_info *ui)
ui->ue->ino, ui->ue->peer,
ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
ui->name_dir ? ui->name_dir : "-");
+ ret = -errno;
+ goto out;
} else if (ret == 0) {
pr_debug("Unlinked socket %d peer %d (name %s dir %s)\n",
ui->ue->ino, ui->ue->peer,
ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
ui->name_dir ? ui->name_dir : "-");
}
+
+ if (ui->ghost_dir_pos) {
+ char path[PATH_MAX], *pos;
+
+ memcpy(path, ui->name, ui->ue->name.len);
+ path[ui->ue->name.len] = '\0';
+
+ for (pos = strrchr(path, '/');
+ pos && (pos - path) > ui->ghost_dir_pos;
+ pos = strrchr(path, '/')) {
+ *pos = '\0';
+ if (rmdir(path)) {
+ ret = - errno;
+ pr_perror("ghost: Can't remove %s\n", path);
+ goto out;
+ }
+ pr_debug("ghost: Removed %s\n", path);
+ }
+ }
+
+ /*
+ * If it was a ghost socket we should move original
+ * socket back into place.
+ */
+ if (ui->flags & USK_GHOST_RENAMED) {
+ char path[PATH_MAX];
+
+ snprintf(path, sizeof(path), UNIX_GHOST_FMT, ui->name);
+ if (rename(path, ui->name)) {
+ pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s\n",
+ ui->ue->id, ui->ue->ino, path, ui->name);
+ ret = -errno;
+ } else {
+ pr_debug("ghost: id %#x ino %d addr %s -> %s\n",
+ ui->ue->id, ui->ue->ino, path, ui->name);
+ }
+ }
+
+out:
revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
+ return ret;
}
static void try_resolve_unix_peer(struct unix_sk_info *ui);
@@ -1812,6 +2014,8 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
ui->name_dir = (void *)ue->name_dir;
ui->flags = 0;
+ ui->fdstore_id = -1;
+ ui->ghost_dir_pos = 0;
ui->peer = NULL;
ui->queuer = NULL;
ui->bound = 0;
@@ -1826,6 +2030,40 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
INIT_LIST_HEAD(&ui->connected);
INIT_LIST_HEAD(&ui->node);
INIT_LIST_HEAD(&ui->scm_fles);
+ INIT_LIST_HEAD(&ui->ghost_node);
+
+ return 0;
+}
+
+int unix_prepare_root_shared(void)
+{
+ struct unix_sk_info *ui;
+
+ mutex_ghost = shmalloc(sizeof(*mutex_ghost));
+ if (!mutex_ghost) {
+ pr_err("ghost: Can't allocate mutex\n");
+ return -ENOMEM;
+ }
+ mutex_init(mutex_ghost);
+
+ pr_debug("ghost: Resolving addresses\n");
+
+ list_for_each_entry(ui, &unix_ghost_addr, ghost_node) {
+ pr_debug("ghost: id %#x type %s state %s ino %d peer %d address %s\n",
+ ui->ue->id, socket_type_name(ui->ue->type),
+ tcp_state_name(ui->ue->state),
+ ui->ue->ino, ui->peer ? ui->peer->ue->ino : 0,
+ ui->name);
+
+ /*
+ * Drop any existing trash on the FS and mark the
+ * peer as a ghost one, so we will put it into
+ * fdstore to be able to connect into it even
+ * when the address is removed from the FS.
+ */
+ unlink_sk(ui);
+ ui->flags |= USK_GHOST_FDSTORE;
+ }
return 0;
}
@@ -1873,6 +2111,15 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
add_post_prepare_cb(&ui->peer_resolve);
}
+ if (ui->ue->deleted) {
+ if (!ui->name || !ui->ue->name.len || !ui->name[0]) {
+ pr_err("No name present, ino %d\n", ui->ue->ino);
+ return -1;
+ }
+
+ list_add_tail(&ui->ghost_node, &unix_ghost_addr);
+ }
+
list_add_tail(&ui->list, &unix_sockets);
return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops);
}
--
2.14.3
More information about the CRIU
mailing list