[CRIU] [PATCH v7 7/9] unix: Add support of ghost sockets

Andrey Vagin avagin at virtuozzo.com
Mon Jun 4 22:52:20 MSK 2018


On Wed, May 23, 2018 at 07:06:13PM +0300, Cyrill Gorcunov wrote:
> Unix sockets may be connected via a deleted socket name;
> moreover, the name may be reused (i.e. same sun_addr but
> different inodes).
> 
> To be able to handle them we do a few tricks:
> 
>  - when collecting sockets we figure out whether the "deleted"
>    mark is present on the socket and, if so, we order the
>    creation and deletion of such sockets with a mutex, together
>    with adding missing directories, and save their descriptors
>    in fdstore if there are peers connected to them
> 
>  - on restore we connect via procfs/fd/X as suggested by
>    Andrew Vagin
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
> ---
>  criu/cr-restore.c      |   4 +
>  criu/include/sockets.h |   1 +
>  criu/sk-unix.c         | 353 +++++++++++++++++++++++++++++++++++++++++--------
>  3 files changed, 305 insertions(+), 53 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index e969c24cd1d8..645a0e724970 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -384,6 +384,10 @@ static int root_prepare_shared(void)
>  	if (ret)
>  		goto err;
>  
> +	ret = unix_prepare_root_shared();
> +	if (ret)
> +		goto err;
> +
>  	ret = add_fake_unix_queuers();
>  	if (ret)
>  		goto err;
> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
> index 1d0e1f29304c..f2085ace70b2 100644
> --- a/criu/include/sockets.h
> +++ b/criu/include/sockets.h
> @@ -60,6 +60,7 @@ extern int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg
>  
>  extern int unix_sk_id_add(unsigned int ino);
>  extern int unix_sk_ids_parse(char *optarg);
> +extern int unix_prepare_root_shared(void);
>  
>  extern int do_dump_opt(int sk, int level, int name, void *val, int len);
>  #define dump_opt(s, l, n, f)	do_dump_opt(s, l, n, f, sizeof(*f))
> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
> index 88859da02f35..e39f313aaa0b 100644
> --- a/criu/sk-unix.c
> +++ b/criu/sk-unix.c
> @@ -9,6 +9,7 @@
>  #include <sys/un.h>
>  #include <stdlib.h>
>  #include <dlfcn.h>
> +#include <libgen.h>
>  
>  #include "libnetlink.h"
>  #include "cr_options.h"
> @@ -31,6 +32,7 @@
>  #include "fdstore.h"
>  #include "fdinfo.h"
>  #include "kerndat.h"
> +#include "rst-malloc.h"
>  
>  #include "protobuf.h"
>  #include "images/sk-unix.pb-c.h"
> @@ -89,11 +91,21 @@ struct unix_sk_desc {
>  	UnixSkEntry		*ue;
>  };
>  
> +/*
> + * The mutex_ghost is accessed from different tasks,
> + * so make sure it is in shared memory.
> + */
> +static mutex_t *mutex_ghost;
> +
>  static LIST_HEAD(unix_sockets);
> +static LIST_HEAD(unix_ghost_addr);
>  
>  static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d,
>  			     UnixSkEntry *ue, const struct fd_parms *p);
>  
> +struct unix_sk_info;
> +static int unlink_sk(struct unix_sk_info *ui);
> +
>  struct unix_sk_listen_icon {
>  	unsigned int			peer_ino;
>  	struct unix_sk_desc		*sk_desc;
> @@ -886,12 +898,15 @@ struct unix_sk_info {
>  	char			*name;
>  	char			*name_dir;
>  	unsigned		flags;
> +	int			fdstore_id;
>  	struct unix_sk_info	*peer;
>  	struct pprep_head	peer_resolve; /* XXX : union with the above? */
>  	struct file_desc	d;
>  	struct list_head	connected; /* List of sockets, connected to me */
>  	struct list_head	node; /* To link in peer's connected list  */
>  	struct list_head	scm_fles;
> +	struct list_head	ghost_node;
> +	size_t			ghost_dir_pos;
>  
>  	/*
>  	 * For DGRAM sockets with queues, we should only restore the queue
> @@ -916,6 +931,8 @@ struct scm_fle {
>  
>  #define USK_PAIR_MASTER		0x1
>  #define USK_PAIR_SLAVE		0x2
> +#define USK_GHOST_FDSTORE	0x4	/* bound but removed address */
> +#define USK_GHOST_RENAMED	0x8	/* temporary renamed address */
>  
>  static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>  {
> @@ -1241,6 +1258,7 @@ static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd,
>  
>  static int post_open_standalone(struct file_desc *d, int fd)
>  {
> +	int fdstore_fd = -1, procfs_self_dir = -1, len;
>  	struct unix_sk_info *ui;
>  	struct unix_sk_info *peer;
>  	struct sockaddr_un addr;
> @@ -1269,22 +1287,49 @@ static int post_open_standalone(struct file_desc *d, int fd)
>  
>  	memset(&addr, 0, sizeof(addr));
>  	addr.sun_family = AF_UNIX;
> -	memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
>  
>  	pr_info("\tConnect %d to %d\n", ui->ue->ino, peer->ue->ino);
>  
> -	if (prep_unix_sk_cwd(peer, &cwd_fd, NULL, &ns_fd))
> +	if (prep_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd))
>  		return -1;
>  
> -	if (connect(fd, (struct sockaddr *)&addr,
> -				sizeof(addr.sun_family) +
> -				peer->ue->name.len) < 0) {
> +	if (peer->flags & USK_GHOST_FDSTORE) {
> +		procfs_self_dir = open_proc(getpid(), "fd");
> +		fdstore_fd = fdstore_get(peer->fdstore_id);
> +
> +		if (fdstore_fd < 0 || procfs_self_dir < 0)
> +			goto err_revert_and_exit;
> +
> +		/*
> +		 * WARNING: After this call we rely on revert_unix_sk_cwd
> +		 * to restore the former directories so that connect
> +		 * will operate inside proc/$pid/fd/X.
> +		 */
> +		if (fchdir(procfs_self_dir)) {
> +			pr_perror("Can't change to procfs");
> +			goto err_revert_and_exit;
> +		}
> +		len = snprintf(addr.sun_path, UNIX_PATH_MAX, "%d", fdstore_fd);
> +	} else {
> +		memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
> +		len = peer->ue->name.len;
> +	}
> +
> +	/*
> +	 * Make sure the target is not being renamed at the moment
> +	 * while we're connecting in sake of ghost sockets.
> +	 */
> +	mutex_lock(mutex_ghost);
> +	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr.sun_family) + len) < 0) {
>  		pr_perror("Can't connect %d socket", ui->ue->ino);
> -		revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
> -		return -1;
> +		goto err_revert_and_exit;
>  	}
> +	mutex_unlock(mutex_ghost);
> +
>  	ui->is_connected = true;
>  
> +	close_safe(&procfs_self_dir);
> +	close_safe(&fdstore_fd);
>  	revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
>  
>  restore_queue:
> @@ -1296,48 +1341,126 @@ static int post_open_standalone(struct file_desc *d, int fd)
>  	if (ui->queuer && !ui->queuer->peer_queue_restored)
>  		return 1;
>  	return restore_sk_common(fd, ui);
> +
> +err_revert_and_exit:
> +	close_safe(&procfs_self_dir);
> +	close_safe(&fdstore_fd);
> +	revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
> +	return -1;
>  }
>  
> -static int bind_deleted_unix_sk(int sk, struct unix_sk_info *ui,
> -					struct sockaddr_un *addr)
> +static int keep_deleted(struct unix_sk_info *ui)
>  {
> -	char temp[PATH_MAX];
> -	int ret;
> +	if (ui->flags & USK_GHOST_FDSTORE) {
> +		int fd = open(ui->name, O_PATH);
> +		if (fd < 0) {
> +			pr_perror("ghost: Can't open id %#x ino %d addr %s",
> +				  ui->ue->id, ui->ue->ino, ui->name);
> +			return -1;
> +		}
> +		ui->fdstore_id = fdstore_add(fd);
> +		pr_debug("ghost: id %#x %d fdstore_id %d %s\n",
> +			 ui->ue->id, ui->ue->ino, ui->fdstore_id, ui->name);
> +		close(fd);
> +		return ui->fdstore_id;
> +	}
> +	return 0;
> +}
>  
> -	pr_info("found duplicate unix socket bound at %s\n", addr->sun_path);
> +static int drop_deleted(struct unix_sk_info *ui)
> +{
> +	if (ui->ue->deleted)
> +		return unlink_sk(ui);
> +	return 0;
> +}
>  
> -	ret = snprintf(temp, sizeof(temp),
> -			"%s-%s-%d", addr->sun_path, "criu-temp", getpid());
> -	/* this shouldn't happen, since sun_addr is only 108 chars long */
> -	if (ret < 0 || ret >= sizeof(temp)) {
> -		pr_err("snprintf of %s failed?\n", addr->sun_path);
> -		return -1;
> +#define UNIX_GHOST_FMT "%s.criu-sk-ghost"
> +
> +/*
> + * When path where socket lives is deleted, we need to reconstruct
> + * it back up but allow caller to remove it after.
> + */
> +static int bind_on_deleted(int sk, struct unix_sk_info *ui)
> +{
> +	char path[PATH_MAX], *pos;
> +	struct sockaddr_un addr;
> +	int ret;
> +
> +	if (ui->ue->name.len >= sizeof(path)) {
> +		pr_err("ghost: Too long name for socket\n");
> +		return -ENOSPC;
>  	}
>  
> -	ret = rename(addr->sun_path, temp);
> -	if (ret < 0) {
> -		pr_perror("couldn't move socket for binding");
> -		return -1;
> +	memcpy(path, ui->name, ui->ue->name.len);
> +	path[ui->ue->name.len] = '\0';
> +
> +	for (pos = strrchr(path, '/'); pos;
> +	     pos = strrchr(path, '/')) {
> +		*pos = '\0';
> +
> +		ret = access(path, R_OK | W_OK | X_OK);
> +		if (ret == 0) {
> +			ui->ghost_dir_pos = pos - path;
> +			pr_debug("ghost: detected F_OK %s\n", path);
> +			break;
> +		}
> +
> +		if (errno != ENOENT) {
> +			ret = -errno;
> +			pr_perror("ghost: Can't access %s\n", path);
> +			return ret;
> +		}
>  	}
>  
> -	ret = bind(sk, (struct sockaddr *)addr,
> -			sizeof(addr->sun_family) + ui->ue->name.len);
> -	if (ret < 0) {
> -		pr_perror("Can't bind socket after move");
> -		return -1;
> +	memcpy(path, ui->name, ui->ue->name.len);
> +	path[ui->ue->name.len] = '\0';
> +
> +	pos = dirname(path);
> +	pr_debug("ghost: creating %s\n", pos);
> +	ret = mkdirpat(AT_FDCWD, pos, 0755);
> +	if (ret) {
> +		errno = -ret;
> +		pr_perror("ghost: Can't create %s\n", pos);
> +		return ret;
>  	}
>  
> -	ret = rename(temp, addr->sun_path);
> +	memset(&addr, 0, sizeof(addr));
> +	addr.sun_family = AF_UNIX;
> +	memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
> +
> +	ret = bind(sk, (struct sockaddr *)&addr,
> +		   sizeof(addr.sun_family) + ui->ue->name.len);
>  	if (ret < 0) {
> -		pr_perror("couldn't move socket back");
> -		return -1;
> +		/*
> +		 * In case if there some real living socket
> +		 * with same name just move it aside for a
> +		 * while, we will move it back once ghost
> +		 * socket is processed.
> +		 */
> +		if (errno == EADDRINUSE) {
> +			char path[PATH_MAX];
> +
> +			snprintf(path, sizeof(path), UNIX_GHOST_FMT, ui->name);
> +			if (rename(ui->name, path)) {
> +				ret = -errno;
> +				pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s\n",
> +					  ui->ue->id, ui->ue->ino, ui->name, path);
> +				return ret;
> +			}
> +			ui->flags |= USK_GHOST_RENAMED;

Why do we use a global flag for this? Why can't we rename this file
back after bind()?

> +			pr_debug("ghost: id %#x ino %d renamed %s -> %s\n",
> +				 ui->ue->id, ui->ue->ino, ui->name, path);
> +			ret = bind(sk, (struct sockaddr *)&addr,
> +				   sizeof(addr.sun_family) + ui->ue->name.len);
> +		}
> +		if (ret < 0) {
> +			ret = -errno;
> +			pr_perror("ghost: Can't bind on socket id %#x ino %d addr %s",
> +				  ui->ue->id, ui->ue->ino, ui->name);
> +			return ret;
> +		}
>  	}
>  
> -	/* we've handled the deleted-ness of this
> -	 * socket and we don't want to delete it later
> -	 * since it's not /this/ socket.
> -	 */
> -	ui->ue->deleted = false;
>  	return 0;
>  }
>  
> @@ -1365,22 +1488,40 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
>  	addr.sun_family = AF_UNIX;
>  	memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
>  
> -	if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, NULL, &ns_fd))
> +	if (ui->name[0] && prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd))
>  		return -1;
>  
> -	ret = bind(sk, (struct sockaddr *)&addr,
> -			sizeof(addr.sun_family) + ui->ue->name.len);
> +	/*
> +	 * Order binding for sake of ghost sockets. We might rename
> +	 * existing socket to some temp name, bind ghost, delete it,
> +	 * and finally move the former back, thus while we're doing
> +	 * this stuff we should not be interruped by connection
> +	 * from another sockets.
> +	 *
> +	 * FIXME: Probably wort make it per address rather for
> +	 * optimization sake.
> +	 */
> +	mutex_lock(mutex_ghost);
> +
> +	if (ui->flags & USK_GHOST_FDSTORE) {
> +		pr_debug("ghost: bind id %#x ino %d addr %s\n",
> +			 ui->ue->id, ui->ue->ino, ui->name);
> +		ret = bind_on_deleted(sk, ui);
> +		if (ret)
> +			errno = -ret;
> +	} else {
> +		pr_debug("bind id %#x ino %d addr %s\n",
> +			 ui->ue->id, ui->ue->ino, ui->name);
> +		ret = bind(sk, (struct sockaddr *)&addr,
> +			   sizeof(addr.sun_family) + ui->ue->name.len);
> +	}
>  	if (ret < 0) {
> -		if (ui->ue->has_deleted && ui->ue->deleted && errno == EADDRINUSE) {
> -			if (bind_deleted_unix_sk(sk, ui, &addr))
> -				goto done;
> -		} else {
> -			pr_perror("Can't bind socket");
> -			goto done;
> -		}
> +		pr_perror("Can't bind id %#x ino %d addr %s",
> +			  ui->ue->id, ui->ue->ino, ui->name);
> +		goto done;
>  	}
>  
> -	if (*ui->name && ui->ue->file_perms) {
> +	if (ui->ue->file_perms) {
>  		FilePermsEntry *perms = ui->ue->file_perms;
>  		char fname[PATH_MAX];
>  
> @@ -1403,8 +1544,8 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
>  		}
>  	}
>  
> -	if (ui->ue->deleted && unlink((char *)ui->ue->name.data) < 0) {
> -		pr_perror("failed to unlink %s", ui->ue->name.data);
> +	if (keep_deleted(ui) < 0) {
> +		pr_err("Can't save socket in fdstore\n");
>  		goto done;
>  	}
>  
> @@ -1416,6 +1557,9 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
>  	exit_code = 0;
>  done:
>  	revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
> +	if (drop_deleted(ui))
> +		exit_code = -1;
> +	mutex_unlock(mutex_ghost);
>  	return exit_code;
>  }
>  
> @@ -1551,11 +1695,27 @@ static int setup_second_end(int *sks, struct fdinfo_list_entry *second_end)
>  static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd)
>  {
>  	struct unix_sk_info *queuer = ui->queuer;
> -	struct fdinfo_list_entry *fle;
> +	struct unix_sk_info *peer = ui->peer;
> +	struct fdinfo_list_entry *fle, *fle_peer;
>  	int sk;
>  
>  	fle = file_master(&ui->d);
>  	pr_info_opening("standalone", ui, fle);
> +
> +	/*
> +	 * If we're about to connect to the peer which
> +	 * has been bound to removed address we should
> +	 * wait until it is processed and put into fdstore
> +	 * engine, later we will use the engine to connect
> +	 * into it in a special way.
> +	 */
> +	if (peer && (peer->flags & USK_GHOST_FDSTORE)) {
> +		fle_peer = file_master(&peer->d);
> +		if (fle_peer->stage < FLE_OPEN) {
> +			return 1;
> +		}
> +	}
> +
>  	if (fle->stage == FLE_OPEN)
>  		return post_open_standalone(&ui->d, fle->fe->fd);
>  
> @@ -1758,15 +1918,15 @@ static struct file_desc_ops unix_desc_ops = {
>   * Make FS clean from sockets we're about to
>   * restore. See for how we bind them for details
>   */
> -static void unlink_sk(struct unix_sk_info *ui)
> +static int unlink_sk(struct unix_sk_info *ui)
>  {
> -	int ret, cwd_fd = -1, root_fd = -1, ns_fd = -1;
> +	int ret = 0, cwd_fd = -1, root_fd = -1, ns_fd = -1;
>  
>  	if (!ui->name || ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN))
> -		return;
> +		return 0;
>  
>  	if (prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, NULL))
> -		return;
> +		return -1;
>  
>  	ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0;
>  	if (ret < 0 && errno != ENOENT) {
> @@ -1774,13 +1934,55 @@ static void unlink_sk(struct unix_sk_info *ui)
>  			ui->ue->ino, ui->ue->peer,
>  			ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
>  			ui->name_dir ? ui->name_dir : "-");
> +		ret = -errno;
> +		goto out;
>  	} else if (ret == 0) {
>  		pr_debug("Unlinked socket %d peer %d (name %s dir %s)\n",
>  			 ui->ue->ino, ui->ue->peer,
>  			 ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
>  			 ui->name_dir ? ui->name_dir : "-");
>  	}
> +
> +	if (ui->ghost_dir_pos) {
> +		char path[PATH_MAX], *pos;
> +
> +		memcpy(path, ui->name, ui->ue->name.len);
> +		path[ui->ue->name.len] = '\0';
> +
> +		for (pos = strrchr(path, '/');
> +		     pos && (pos - path) > ui->ghost_dir_pos;
> +		     pos = strrchr(path, '/')) {
> +			*pos = '\0';
> +			if (rmdir(path)) {
> +				ret = - errno;
> +				pr_perror("ghost: Can't remove %s\n", path);
> +				goto out;
> +			}
> +			pr_debug("ghost: Removed %s\n", path);
> +		}
> +	}
> +
> +	/*
> +	 * If it was a ghost socket we should move original
> +	 * socket back into place.
> +	 */
> +	if (ui->flags & USK_GHOST_RENAMED) {
> +		char path[PATH_MAX];
> +
> +		snprintf(path, sizeof(path), UNIX_GHOST_FMT, ui->name);
> +		if (rename(path, ui->name)) {
> +			pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s\n",
> +				  ui->ue->id, ui->ue->ino, path, ui->name);
> +			ret = -errno;
> +		} else {
> +			pr_debug("ghost: id %#x ino %d addr %s -> %s\n",
> +				 ui->ue->id, ui->ue->ino, path, ui->name);
> +		}
> +	}
> +
> +out:
>  	revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
> +	return ret;
>  }
>  
>  static void try_resolve_unix_peer(struct unix_sk_info *ui);
> @@ -1812,6 +2014,8 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
>  	ui->name_dir = (void *)ue->name_dir;
>  
>  	ui->flags		= 0;
> +	ui->fdstore_id		= -1;
> +	ui->ghost_dir_pos	= 0;
>  	ui->peer		= NULL;
>  	ui->queuer		= NULL;
>  	ui->bound		= 0;
> @@ -1826,6 +2030,40 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
>  	INIT_LIST_HEAD(&ui->connected);
>  	INIT_LIST_HEAD(&ui->node);
>  	INIT_LIST_HEAD(&ui->scm_fles);
> +	INIT_LIST_HEAD(&ui->ghost_node);
> +
> +	return 0;
> +}
> +
> +int unix_prepare_root_shared(void)
> +{
> +	struct unix_sk_info *ui;
> +
> +	mutex_ghost = shmalloc(sizeof(*mutex_ghost));
> +	if (!mutex_ghost) {
> +		pr_err("ghost: Can't allocate mutex\n");
> +		return -ENOMEM;
> +	}
> +	mutex_init(mutex_ghost);
> +
> +	pr_debug("ghost: Resolving addresses\n");
> +
> +	list_for_each_entry(ui, &unix_ghost_addr, ghost_node) {
> +		pr_debug("ghost: id %#x type %s state %s ino %d peer %d address %s\n",
> +			 ui->ue->id, socket_type_name(ui->ue->type),
> +			 tcp_state_name(ui->ue->state),
> +			 ui->ue->ino, ui->peer ? ui->peer->ue->ino : 0,
> +			 ui->name);
> +
> +		/*
> +		 * Drop any existing trash on the FS and mark the
> +		 * peer as a ghost one, so we will put it into
> +		 * fdstore to be able to connect into it even
> +		 * when the address is removed from the FS.
> +		 */
> +		unlink_sk(ui);

Hm. If a socket is a ghost one, we don't know the owner of the socket file
on the file system, so we don't have the right to delete it, do we?

> +		ui->flags |= USK_GHOST_FDSTORE;
> +	}
>  
>  	return 0;
>  }
> @@ -1873,6 +2111,15 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>  		add_post_prepare_cb(&ui->peer_resolve);
>  	}
>  
> +	if (ui->ue->deleted) {
> +		if (!ui->name || !ui->ue->name.len || !ui->name[0]) {
> +			pr_err("No name present, ino %d\n", ui->ue->ino);
> +			return -1;
> +		}
> +
> +		list_add_tail(&ui->ghost_node, &unix_ghost_addr);
> +	}
> +
>  	list_add_tail(&ui->list, &unix_sockets);
>  	return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops);
>  }
> -- 
> 2.14.3
> 


More information about the CRIU mailing list