[CRIU] [PATCH 16/17] unix: Add support of ghost sockets

Andrei Vagin avagin at virtuozzo.com
Tue Apr 3 03:12:00 MSK 2018


On Sun, Apr 01, 2018 at 11:07:42PM +0300, Cyrill Gorcunov wrote:
> Unix sockets may be connected via deleted socket name,
> moreover the name may be reused (ie same sun_addr but
> different inodes).
> 
> To be able to handle them we do a few tricks:
> 
>  - when collecting sockets we figure out if "deleted"
>    mark is present on the socket and if such we rename
>    it into a new unique name
> 
>  - then we wait until all users are connected and
>    remove the socket from the FS
> 
> Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
> ---
>  criu/cr-restore.c      |   4 +
>  criu/include/sockets.h |   1 +
>  criu/sk-unix.c         | 290 ++++++++++++++++++++++++++++++++++++++++++-------
>  3 files changed, 255 insertions(+), 40 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index db913b2dae2e..ff1e4dcc34df 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -388,6 +388,10 @@ static int root_prepare_shared(void)
>  	if (ret)
>  		goto err;
>  
> +	ret = unix_resolve_ghost_addr();
> +	if (ret)
> +		goto err;
> +
>  	show_saved_files();
>  err:
>  	return ret;
> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
> index db330428850c..23f5b11c1b58 100644
> --- a/criu/include/sockets.h
> +++ b/criu/include/sockets.h
> @@ -60,6 +60,7 @@ extern int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg
>  
>  extern int unix_sk_id_add(unsigned int ino);
>  extern int unix_sk_ids_parse(char *optarg);
> +extern int unix_resolve_ghost_addr(void);
>  
>  extern int do_dump_opt(int sk, int level, int name, void *val, int len);
>  #define dump_opt(s, l, n, f)	do_dump_opt(s, l, n, f, sizeof(*f))
> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
> index 4feaa7722a50..c0861d17c8b1 100644
> --- a/criu/sk-unix.c
> +++ b/criu/sk-unix.c
> @@ -9,6 +9,7 @@
>  #include <sys/un.h>
>  #include <stdlib.h>
>  #include <dlfcn.h>
> +#include <libgen.h>
>  
>  #include "libnetlink.h"
>  #include "cr_options.h"
> @@ -31,6 +32,7 @@
>  #include "fdstore.h"
>  #include "fdinfo.h"
>  #include "kerndat.h"
> +#include "rst-malloc.h"
>  
>  #include "protobuf.h"
>  #include "images/sk-unix.pb-c.h"
> @@ -90,10 +92,14 @@ struct unix_sk_desc {
>  };
>  
>  static LIST_HEAD(unix_sockets);
> +static LIST_HEAD(unix_ghost_addr);
>  
>  static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d,
>  			     UnixSkEntry *ue, const struct fd_parms *p);
>  
> +struct unix_sk_info;
> +static void unlink_sk(struct unix_sk_info *ui);
> +
>  struct unix_sk_listen_icon {
>  	unsigned int			peer_ino;
>  	struct unix_sk_desc		*sk_desc;
> @@ -892,6 +898,12 @@ struct unix_sk_info {
>  	struct list_head	connected; /* List of sockets, connected to me */
>  	struct list_head	node; /* To link in peer's connected list  */
>  	struct list_head	scm_fles;
> +	struct list_head	ghost_node;
> +	struct list_head	ghost_wait_head;
> +	struct list_head	ghost_waiter;
> +	atomic_t		name_ref;
> +	atomic_t		name_rdy;

Please add descriptions (comments) for these fields.

> +	struct unix_sk_info	*ghost_master;
>  
>  	/*
>  	 * For DGRAM sockets with queues, we should only restore the queue
> @@ -916,6 +928,7 @@ struct scm_fle {
>  
>  #define USK_PAIR_MASTER		0x1
>  #define USK_PAIR_SLAVE		0x2
> +#define USK_GHOST_NAME		0x4
>  
>  static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>  {
> @@ -1077,6 +1090,17 @@ static int wake_connected_sockets(struct unix_sk_info *ui)
>  	return 0;
>  }
>  
> +static void wake_ghost_waiters(struct unix_sk_info *ui)
> +{
> +	struct fdinfo_list_entry *fle;
> +	struct unix_sk_info *tmp;
> +
> +	list_for_each_entry(tmp, &ui->ghost_wait_head, ghost_waiter) {
> +		fle = file_master(&tmp->d);
> +		set_fds_event(fle->pid);
> +	}
> +}
> +
>  static bool peer_is_not_prepared(struct unix_sk_info *peer)
>  {
>  	if (peer->ue->state != TCP_LISTEN)
> @@ -1239,6 +1263,38 @@ static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd,
>  	return -1;
>  }
>  
> +static void drop_ghost_master(struct unix_sk_info *ui)
> +{
> +	struct unix_sk_info *gm = ui->ghost_master;
> +	if (gm) {
> +		if (atomic_dec_and_test(&gm->name_ref)) {
> +			pr_debug("ghost: Unlinking ghost master %s\n", gm->ue->name.data);
> +			unlink_sk(gm);
> +		}
> +	}
> +}
> +
> +static void drop_deleted(struct unix_sk_info *ui)

Please add a description of the difference between ghost and deleted.

> +{
> +	if (ui->ue->has_deleted && ui->ue->deleted) {
> +		if (atomic_dec_and_test(&ui->name_ref)) {
> +			pr_debug("ghost: Unlinking regular %s\n", ui->ue->name.data);
> +			unlink_sk(ui);
> +		}
> +	}
> +}
> +
> +static bool wait_ghost_master(struct unix_sk_info *ui)
> +{
> +	/*
> +	 * If we're to bound to deleted wait socket,
> +	 * wait until master create it.

I don't understand this comment.

> +	 */
> +	if (ui->ghost_master)
> +		return !atomic_read(&ui->ghost_master->name_rdy);
> +	return false;
> +}
> +
>  static int post_open_standalone(struct file_desc *d, int fd)
>  {
>  	struct unix_sk_info *ui;
> @@ -1286,6 +1342,7 @@ static int post_open_standalone(struct file_desc *d, int fd)
>  	ui->is_connected = true;
>  
>  	revert_unix_sk_cwd(peer, &cwd_fd, &root_fd, &ns_fd);
> +	drop_ghost_master(ui);
>  
>  restore_queue:
>  	if (peer->queuer == ui &&
> @@ -1298,46 +1355,60 @@ static int post_open_standalone(struct file_desc *d, int fd)
>  	return restore_sk_common(fd, ui);
>  }
>  
> -static int bind_deleted_unix_sk(int sk, struct unix_sk_info *ui,
> -					struct sockaddr_un *addr)
> +/*
> + * When path where socket lives is deleted, we need to reconstruct
> + * it back up but allow caller to remove it after.
> + */
> +static int bind_on_deleted(int sk, struct unix_sk_info *ui)
>  {
> -	char temp[PATH_MAX];
> +	char path[PATH_MAX], *pos;
> +	struct sockaddr_un addr;
>  	int ret;
>  
> -	pr_info("found duplicate unix socket bound at %s\n", addr->sun_path);
> -
> -	ret = snprintf(temp, sizeof(temp),
> -			"%s-%s-%d", addr->sun_path, "criu-temp", getpid());
> -	/* this shouldn't happen, since sun_addr is only 108 chars long */
> -	if (ret < 0 || ret >= sizeof(temp)) {
> -		pr_err("snprintf of %s failed?\n", addr->sun_path);
> -		return -1;;
> +	if (ui->ue->name.len >= sizeof(path)) {
> +		pr_err("Too long name for socket\n");
> +		return -ENOSPC;
>  	}
>  
> -	ret = rename(addr->sun_path, temp);
> -	if (ret < 0) {
> -		pr_perror("couldn't move socket for binding");
> -		return -1;
> +	memcpy(path, ui->name, ui->ue->name.len);
> +	path[ui->ue->name.len] = '\0';
> +
> +	for (pos = strrchr(path, '/'); pos;
> +	     pos = strrchr(path, '/')) {
> +		*pos = '\0';
> +
> +		ret = access(path, R_OK | W_OK | X_OK);
> +		if (ret == 0)
> +			break;
> +
> +		if (errno != ENOENT) {
> +			ret = -errno;
> +			pr_perror("Can't access %s\n", path);
> +			return ret;
> +		}
>  	}
>  
> -	ret = bind(sk, (struct sockaddr *)addr,
> -			sizeof(addr->sun_family) + ui->ue->name.len);
> -	if (ret < 0) {
> -		pr_perror("Can't bind socket after move");
> -		return -1;;
> +	memcpy(path, ui->name, ui->ue->name.len);
> +	path[ui->ue->name.len] = '\0';
> +
> +	pos = dirname(path);
> +	ret = mkdirpat(AT_FDCWD, pos, 0755);
> +	if (ret) {
> +		pr_err("Can't create %s\n", pos);
> +		return ret;
>  	}
>  
> -	ret = rename(temp, addr->sun_path);
> +	memset(&addr, 0, sizeof(addr));
> +	addr.sun_family = AF_UNIX;
> +	memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
> +
> +	ret = bind(sk, (struct sockaddr *)&addr,
> +		   sizeof(addr.sun_family) + ui->ue->name.len);
>  	if (ret < 0) {
> -		pr_perror("couldn't move socket back");
> -		return -1;
> +		pr_perror("Can't bind on socket %s", (char *)ui->ue->name.data);
> +		return ret;
>  	}
>  
> -	/* we've handled the deleted-ness of this
> -	 * socket and we don't want to delete it later
> -	 * since it's not /this/ socket.
> -	 */
> -	ui->ue->deleted = false;
>  	return 0;
>  }
>  
> @@ -1347,7 +1418,7 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
>  	int cwd_fd = -1, root_fd = -1, ns_fd = -1;
>  	int ret, exit_code = -1;
>  
> -	if (ui->ue->name.len == 0)
> +	if (ui->ue->name.len == 0 || atomic_read(&ui->name_rdy))
>  		return 0;
>  
>  	if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) {
> @@ -1371,16 +1442,13 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
>  	ret = bind(sk, (struct sockaddr *)&addr,
>  			sizeof(addr.sun_family) + ui->ue->name.len);
>  	if (ret < 0) {
> -		if (ui->ue->has_deleted && ui->ue->deleted && errno == EADDRINUSE) {
> -			if (bind_deleted_unix_sk(sk, ui, &addr))
> -				goto done;
> -		} else {
> -			pr_perror("Can't bind socket");
> +		if (ui->ue->has_deleted && ui->ue->deleted)
> +			ret = bind_on_deleted(sk, ui);
> +		if (ret)
>  			goto done;
> -		}
>  	}
>  
> -	if (*ui->name && ui->ue->file_perms) {
> +	if (ui->ue->file_perms) {
>  		FilePermsEntry *perms = ui->ue->file_perms;
>  		char fname[PATH_MAX];
>  
> @@ -1403,19 +1471,20 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui)
>  		}
>  	}
>  
> -	if (ui->ue->deleted && unlink((char *)ui->ue->name.data) < 0) {
> -		pr_perror("failed to unlink %s", ui->ue->name.data);
> -		goto done;
> -	}
> +	atomic_inc(&ui->name_rdy);
> +	pr_debug("name_rdy %#x\n", ui->ue->ino);
>  
>  	if (ui->ue->state != TCP_LISTEN) {
>  		ui->bound = 1;
>  		wake_connected_sockets(ui);
>  	}
>  
> +	wake_ghost_waiters(ui);
>  	exit_code = 0;
>  done:
>  	revert_unix_sk_cwd(ui, &cwd_fd, &root_fd, &ns_fd);
> +	if (exit_code == 0)
> +		drop_deleted(ui);
>  	return exit_code;
>  }
>  
> @@ -1501,6 +1570,9 @@ static int open_unixsk_pair_master(struct unix_sk_info *ui, int *new_fd)
>  	if (bind_unix_sk(sk[1], peer))
>  		return -1;
>  
> +	drop_ghost_master(ui);
> +	drop_ghost_master(peer);
> +
>  	*new_fd = sk[0];
>  	return 1;
>  }
> @@ -1556,6 +1628,10 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd)
>  
>  	fle = file_master(&ui->d);
>  	pr_info_opening("standalone", ui, fle);
> +
> +	if (wait_ghost_master(ui))
> +		return 1;
> +
>  	if (fle->stage == FLE_OPEN)
>  		return post_open_standalone(&ui->d, fle->fe->fd);
>  
> @@ -1814,11 +1890,15 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
>  	ui->flags		= 0;
>  	ui->peer		= NULL;
>  	ui->queuer		= NULL;
> +	ui->ghost_master	= NULL;
>  	ui->bound		= 0;
>  	ui->listen		= 0;
>  	ui->is_connected	= 0;
>  	ui->peer_queue_restored = 0;
>  
> +	atomic_set(&ui->name_ref, 1);
> +	atomic_set(&ui->name_rdy, 0);
> +
>  	memzero(&ui->peer_resolve, sizeof(ui->peer_resolve));
>  	memzero(&ui->d, sizeof(ui->d));
>  
> @@ -1826,6 +1906,127 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue)
>  	INIT_LIST_HEAD(&ui->connected);
>  	INIT_LIST_HEAD(&ui->node);
>  	INIT_LIST_HEAD(&ui->scm_fles);
> +	INIT_LIST_HEAD(&ui->ghost_node);
> +	INIT_LIST_HEAD(&ui->ghost_wait_head);
> +	INIT_LIST_HEAD(&ui->ghost_waiter);
> +
> +	return 0;
> +}
> +
> +#define GHOST_NAME_FMT		"~criu-%u"
> +#define GHOST_NAME_FMT_PREFIX	6 /* num of chars before counter */
> +
> +static int ghost_new_name(char *name, size_t namelen,
> +			  char **name_new, size_t *namelen_new)
> +{
> +	char sname[64], *pos, *oldname = name;
> +	static unsigned int cnt = 0;
> +	size_t k;
> +
> +	pr_debug("\tghost: handling name %s namelen %zu\n", name, namelen);
> +
> +	for (pos = &name[namelen - 1]; pos > name; pos--) {
> +		if (*pos == GHOST_NAME_FMT[0])
> +			break;
> +	}
> +
> +	if (strncmp(pos, GHOST_NAME_FMT, GHOST_NAME_FMT_PREFIX) == 0) {
> +		unsigned int __cnt;
> +		char *__name;
> +
> +		if (sscanf(pos, GHOST_NAME_FMT, &__cnt) == 1) {
> +			pr_debug("\tghost: cnt %d detected\n", __cnt);
> +			cnt = __cnt + 1;
> +		}
> +
> +		namelen = (pos - name);
> +		__name = alloca(namelen + 1);
> +		memcpy(__name, name, namelen);
> +		__name[namelen++] = '\0';
> +		name = __name;
> +		pr_debug("\tghost: Name stipped to %s (namelen %zu)\n",
> +			 name, namelen);
> +	}
> +
> +	memzero(sname, sizeof(sname));
> +	k = snprintf(sname, sizeof(sname), GHOST_NAME_FMT, cnt++);
> +	*namelen_new = namelen + k;
> +	if (*namelen_new > UNIX_PATH_MAX) {
> +		pr_err("\tghost: New name for socket is too long\n");
> +		return -1;
> +	}
> +
> +	*name_new = shmalloc(*namelen_new);

Why do you use shmalloc here?

> +	if (!*name_new) {
> +		pr_err("\tghost: Can't allocate new name for socket\n");
> +		return -ENOMEM;
> +	}
> +
> +	k = snprintf(*name_new, *namelen_new, "%s%s", name, sname);
> +	if (k != (*namelen_new - 1)) {
> +		pr_err("\tghost: Name generation failed (%s %d %d)\n",
> +		       *name_new, (int)k, (int)*namelen_new);
> +		return -1;
> +	}
> +
> +	pr_debug("\tghost: name transition %s -> %s\n", oldname, *name_new);
> +	return 0;
> +}
> +
> +int unix_resolve_ghost_addr(void)
> +{
> +	struct unix_sk_info *ui, *t;
> +
> +	pr_debug("ghost: Resolving addresses\n");
> +
> +	/*
> +	 * Walk over ghost unix entries and find one
> +	 * which gonna be a master and won't unlink
> +	 * the name until all peers are connected to
> +	 * this designation.
> +	 */
> +
> +	list_for_each_entry(ui, &unix_ghost_addr, ghost_node) {
> +		size_t newnamelen;
> +		char *newname;
> +
> +		pr_debug("ghost: ino %#x peer %#x address %s\n",
> +			 ui->ue->ino, ui->peer ? ui->peer->ue->ino : 0,
> +			 ui->name);
> +
> +		unlink_sk(ui);
> +
> +		if (ghost_new_name(ui->name, ui->ue->name.len,
> +				   &newname, &newnamelen))
> +			return -1;
> +
> +		ui->name = newname;
> +		ui->ue->name.len = newnamelen;
> +		ui->ue->name.data = (void *)newname;
> +		ui->flags |= USK_GHOST_NAME;
> +
> +		unlink_sk(ui);

Why do we need to call unlink_sk with the new unique name?
> +
> +		/*
> +		 * Figure out who is connected to this peer,
> +		 * so the name will be removed from FS only
> +		 * when last one is connected.
> +		 */
> +		list_for_each_entry(t, &unix_sockets, list) {
> +			if (t->flags & USK_GHOST_NAME)
> +				continue;
> +			if (ui == t || t->peer != ui)
> +				continue;
> +
> +			pr_debug("\t\tghost: connected to us %#x -> %#x\n",
> +				 t->ue->ino, ui->ue->ino);
> +
> +			t->flags |= USK_GHOST_NAME;
> +			t->ghost_master = ui;
> +			atomic_inc(&ui->name_ref);
> +			list_add(&t->ghost_waiter, &ui->ghost_wait_head);
> +		}
> +	}
>  
>  	return 0;
>  }
> @@ -1873,6 +2074,15 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>  		add_post_prepare_cb(&ui->peer_resolve);
>  	}
>  
> +	if (ui->ue->deleted) {
> +		if (!ui->name || !ui->ue->name.len || !ui->name[0]) {
> +			pr_err("No name present, ino %#x\n", ui->ue->ino);
> +			return -1;
> +		}
> +
> +		list_add_tail(&ui->ghost_node, &unix_ghost_addr);
> +	}
> +
>  	list_add_tail(&ui->list, &unix_sockets);
>  	return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops);
>  }
> -- 
> 2.14.3
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


More information about the CRIU mailing list