[CRIU] [PATCH 18/18] SCM: Dump and restore SCM_RIGHTs

Kirill Tkhai ktkhai at virtuozzo.com
Tue Jul 11 15:38:32 MSK 2017


On Mon, Jul 10, 2017 at 12:42, Pavel Emelyanov wrote:
> Most of the pieces has already been described in the previous patches :)
> so here's the summary.
> 
> * Dump:
> 
> When receiving a message, also receive any SCM-s (already there) and when
> SCM_RIGHTs one is met -- go ahead and just dump received descriptors using
> regular code, but taking current as the victim task.
> 
> Few words about file paths resolution -- since we do dump path-ed files
> by receiving them from victim's parasite, such files sent via sockets
> should still work OK, as we still receive them, just from another socket.
> 
> Several problems here:
> 
> 1. Unix sockets sent via unix sockets form knots. Not supported.
> 2. Eventpolls sent via unix might themseves poll unix sockets. Knots
>    again. Not supported either.
> 
> * Restore:
> 
> On restore we need to make unix socket wait for the soon-to-be-scm-sent
> descriptors to get restored, so we need to find them, then put a dependency.
> After that, the fake fdinfo entry is attached to the respective file
> descs, when sent the respective descriptors are closed.
> 
> https://github.com/xemul/criu/issues/251
> 
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
>  criu/cr-restore.c      |   8 +++
>  criu/include/sockets.h |   2 +
>  criu/sk-queue.c        | 157 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  criu/sk-unix.c         | 127 ++++++++++++++++++++++++++++++++++++++-
>  images/sk-packet.proto |   6 ++
>  5 files changed, 295 insertions(+), 5 deletions(-)
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index e14fa06..b9ef49c 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -364,6 +364,14 @@ static int root_prepare_shared(void)
>  	if (ret)
>  		goto err;
>  
> +	/*
> +	 * This should be called with all packets collected AND all
> +	 * fdescs and fles prepared BUT post-prep-s not run.
> +	 */
> +	ret = prepare_scms();
> +	if (ret)
> +		goto err;

This should be called before add_fake_fds_masters():

1)It may add a master file there, and this master
must be resolved in add_fake_fds_masters(). Otherwise
the task, which is the owner of this just added master,
may not have rights to create the master (imagine,
scm file is a socket of a net_ns, which can't be
assigned by the task);

2)Another case -- there was not a task, which has
permittions to create a socket, and you added it
in prepare_scms(). In this case, we mustn't add
one more fle in add_fake_fds_masters() -- and
if this function is called after prepare_scms(),
it won't add anything. This will reduce number
of fake files, we add.

> +
>  	ret = run_post_prepare();
>  	if (ret)
>  		goto err;
> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
> index 3fa8017..1bd5c67 100644
> --- a/criu/include/sockets.h
> +++ b/criu/include/sockets.h
> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>  extern struct collect_image_info inet_sk_cinfo;
>  extern struct collect_image_info unix_sk_cinfo;
>  extern int fix_external_unix_sockets(void);
> +extern int prepare_scms(void);
> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>  
>  extern struct collect_image_info netlink_sk_cinfo;
>  
> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
> index 77e203e..953db66 100644
> --- a/criu/sk-queue.c
> +++ b/criu/sk-queue.c
> @@ -18,9 +18,9 @@
>  #include "util.h"
>  #include "util-pie.h"
>  #include "sockets.h"
> -
> +#include "xmalloc.h"
>  #include "sk-queue.h"
> -
> +#include "files.h"
>  #include "protobuf.h"
>  #include "images/sk-packet.pb-c.h"
>  
> @@ -28,6 +28,8 @@ struct sk_packet {
>  	struct list_head	list;
>  	SkPacketEntry		*entry;
>  	char        		*data;
> +	unsigned		scm_len;
> +	int			*scm;
>  };
>  
>  static LIST_HEAD(packets_list);
> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>  	struct sk_packet *pkt = obj;
>  
>  	pkt->entry = pb_msg(msg, SkPacketEntry);
> -
> +	pkt->scm = NULL;
>  	pkt->data = xmalloc(pkt->entry->length);
>  	if (pkt->data ==NULL)
>  		return -1;
>  
>  	/*
> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
> +	 * on a packet.
> +	 */
> +	if (pkt->entry->n_scm > 1) {
> +		pr_err("More than 1 SCM is not possible\n");
> +		return -1;
> +	}
> +
> +	/*
>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>  	 * will be broken.
>  	 */
> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>  	.collect = collect_one_packet,
>  };
>  
> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
> +{
> +	int nr_fds, *fds, i;
> +	void *buf;
> +	ScmEntry *scme;
> +
> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
> +	fds = (int *)CMSG_DATA(ch);
> +
> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
> +	if (!buf)
> +		return -1;
> +
> +	scme = xptr_pull(&buf, ScmEntry);
> +	scm_entry__init(scme);
> +	scme->type = SCM_RIGHTS;
> +	scme->n_rights = nr_fds;
> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
> +
> +	for (i = 0; i < nr_fds; i++) {
> +		int ftyp;
> +
> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
> +			return -1;
> +
> +		/*
> +		 * Unix sent over Unix or Epoll with some other sh*t
> +		 * sent over unix (maybe with this very unix polled)
> +		 * are tricky and not supported for now. (XXX -- todo)
> +		 */
> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
> +			return -1;
> +		}
> +	}
> +
> +	i = pe->n_scm++;
> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
> +		return -1;
> +
> +	pe->scm[i] = scme;
> +	return 0;
> +}
> +
>  /*
>   * Maximum size of the control messages. XXX -- is there any
>   * way to get this value out of the kernel?
> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>  {
>  	struct cmsghdr *ch;
> +	int n_rights = 0;
>  
>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
> +		if (ch->cmsg_type == SCM_RIGHTS) {
> +			if (n_rights) {
> +				/*
> +				 * Even if user is sending more than one cmsg with
> +				 * rights, kernel merges them alltogether on recv.
> +				 */
> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
> +				return -1;
> +			}
> +
> +			if (dump_scm_rights(ch, pe))
> +				return -1;
> +
> +			n_rights++;
> +			continue;
> +		}
> +
>  		pr_err("Control messages in queue, not supported\n");
>  		return -1;
>  	}
> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>  	return 0;
>  }
>  
> +static void release_cmsg(SkPacketEntry *pe)
> +{
> +	int i;
> +
> +	for (i = 0; i < pe->n_scm; i++)
> +		xfree(pe->scm[i]);
> +	xfree(pe->scm);
> +
> +	pe->n_scm = 0;
> +	pe->scm = NULL;
> +}
> +
>  int dump_sk_queue(int sock_fd, int sock_id)
>  {
>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>  			ret = -EIO;
>  			goto err_set_sock;
>  		}
> +
> +		if (pe.scm)
> +			release_cmsg(&pe);
>  	}
>  	ret = 0;
>  
> @@ -197,6 +286,20 @@ err_brk:
>  	return ret;
>  }
>  
> +static void close_scm_fds(struct sk_packet *pkt)
> +{
> +	int i, *fds, n_fds;
> +	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
> +
> +	fds = (int *)CMSG_DATA(ch);
> +	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
> +
> +	for (i = 0; i < n_fds; i++) {
> +		if (close(fds[i]))
> +			pr_warn("scm: Error closing sent fd\n");
> +	}
> +}
> +
>  static int send_one_pkt(int fd, struct sk_packet *pkt)
>  {
>  	int ret;
> @@ -209,6 +312,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>  	iov.iov_base = pkt->data;
>  	iov.iov_len = entry->length;
>  
> +	if (pkt->scm != NULL) {
> +		mh.msg_controllen = pkt->scm_len;
> +		mh.msg_control = pkt->scm;
> +	}
> +
>  	/*
>  	 * Don't try to use sendfile here, because it use sendpage() and
>  	 * all data are split on pages and a new skb is allocated for
> @@ -229,6 +337,9 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>  		return -1;
>  	}
>  
> +	if (pkt->scm != NULL)
> +		close_scm_fds(pkt);
> +
>  	return 0;
>  }
>  
> @@ -264,3 +375,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>  out:
>  	return ret;
>  }

The previous patches don't make restore_sk_queue() find a fle,
which fd is used to set next packet in queue. This series does
not support multi-queuer queues?

It seems, we don't need "unix: Split resolv and interconnect (v2)",
it's just need to select a correct fd from task fles, and to use
it in restore_sk_queue().

> +
> +int prepare_scms(void)
> +{
> +	struct sk_packet *pkt;
> +
> +	pr_info("Preparing SCMs\n");
> +	list_for_each_entry(pkt, &packets_list, list) {
> +		SkPacketEntry *pe = pkt->entry;
> +		ScmEntry *se;
> +		struct cmsghdr *ch;
> +
> +		if (!pe->n_scm)
> +			continue;
> +
> +		se = pe->scm[0]; /* Only 1 SCM is possible */
> +
> +		if (se->type == SCM_RIGHTS) {
> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
> +			pkt->scm = xmalloc(pkt->scm_len);
> +			if (!pkt->scm)
> +				return -1;
> +
> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
> +			ch->cmsg_level = SOL_SOCKET;
> +			ch->cmsg_type = SCM_RIGHTS;
> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
> +
> +			if (unix_note_scm_rights(pe->id_for, se->rights,
> +						(int *)CMSG_DATA(ch), se->n_rights))
> +				return -1;
> +
> +			continue;
> +		}
> +
> +		pr_err("Unsupported scm %d in image\n", se->type);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
> index 42ce1bb..165adc9 100644
> --- a/criu/sk-unix.c
> +++ b/criu/sk-unix.c
> @@ -798,6 +798,7 @@ struct unix_sk_info {
>  	struct file_desc d;
>  	struct list_head connected; /* List of sockets, connected to me */
>  	struct list_head node; /* To link in peer's connected list  */
> +	struct list_head scm_fles;
>  
>  	/*
>  	 * For DGRAM sockets with queues, we should only restore the queue
> @@ -809,6 +810,11 @@ struct unix_sk_info {
>  	u8 listen:1;
>  };
>  
> +struct scm_fle {
> +	struct list_head l;
> +	struct fdinfo_list_entry *fle;
> +};
> +
>  #define USK_PAIR_MASTER		0x1
>  #define USK_PAIR_SLAVE		0x2
>  
> @@ -824,6 +830,116 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>  	return NULL;
>  }
>  
> +static struct unix_sk_info *find_queuer_for(int id)
> +{
> +	struct unix_sk_info *ui;
> +
> +	list_for_each_entry(ui, &unix_sockets, list) {
> +		if (ui->queuer == id)
> +			return ui;
> +	}
> +
> +	return NULL;
> +}
> +
> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
> +{
> +	struct unix_sk_info *ui;
> +	struct pstree_item *owner;
> +	int i;
> +
> +	ui = find_queuer_for(id_for);
> +	if (!ui) {
> +		pr_err("Can't find sender for %d\n", id_for);
> +		return -1;
> +	}
> +
> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
> +	/*
> +	 * This is the task that will restore this socket
> +	 */
> +	owner = file_master(&ui->d)->task;
> +
> +	pr_info("-> will set up deps\n");
> +	/*
> +	 * The ui will send data to the rights receiver. Add a fake fle
> +	 * for the file and a dependency.
> +	 */
> +	for (i = 0; i < n_ids; i++) {
> +		struct file_desc *tgt;
> +		struct fdinfo_list_entry *fle;
> +		struct scm_fle *sfle;
> +		FdinfoEntry *e;
> +		int fd;
> +
> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
> +		if (!tgt) {
> +			pr_err("Can't find fdesc to send\n");
> +			return -1;
> +		}
> +
> +		pr_info("`- Found %d file\n", file_ids[i]);
> +		fd = find_unused_fd(owner, -1);

If I haven't missed something, I don't see, where you try
to found the fle in task's file descriptor  to use it for
queuing. It seems, we add a fake fle for every packet in
a queue. If it's so, it's not performance-good, and firstly
we should try to check, that task already owns the fle instead
of that.

> +
> +		fle = try_file_master(tgt);
> +		if (fle) {
> +			e = dup_fdinfo(fle->fe, fd, 0);
> +			if (!e) {
> +				pr_err("Can't duplicate fdinfo for scm\n");
> +				return -1;
> +			}
> +		} else {
> +			/*
> +			 * This can happen if the file in question is
> +			 * sent over the socket and closed. In this case
> +			 * we need to ... invent a new one!
> +			 */
> +
> +			e = xmalloc(sizeof(*e));
> +			if (!e)
> +				return -1;
> +
> +			fdinfo_entry__init(e);
> +			e->id = tgt->id;
> +			e->type = tgt->ops->type;
> +			e->fd = fd;
> +			e->flags = 0;

I'd add the code around fdinfo_entry__init() in separate function
as we already have 3 places like it.

> +		}
> +
> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
> +		sfle = xmalloc(sizeof(*sfle));
> +		if (!sfle)
> +			return -1;
> +
> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
> +		if (!sfle->fle) {
> +			pr_err("Can't request new fle for scm\n");
> +			return -1;
> +		}
> +
> +		list_add_tail(&sfle->l, &ui->scm_fles);
> +		fds[i] = fd;
> +	}
> +
> +	return 0;
> +}
> +
> +static int chk_restored_scms(struct unix_sk_info *ui)
> +{
> +	struct scm_fle *sf, *n;
> +
> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
> +		if (sf->fle->stage != FLE_RESTORED)
> +			return 1;

It's need to wait for sf->fle->bound here instead. I suppose,
the above introduces some circular dependencies for cases
with several sockets having epoll, which is listening them.

> +
> +		/* Optimization for the next pass */
> +		list_del(&sf->l);
> +		xfree(sf);
> +	}
> +
> +	return 0;
> +}
> +
>  static int wake_connected_sockets(struct unix_sk_info *ui)
>  {
>  	struct fdinfo_list_entry *fle;
> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>  	struct unix_sk_info *ui;
>  	int ret;
>  
> +	ui = container_of(d, struct unix_sk_info, d);
> +
> +	/* FIXME -- only queue restore may be postponed */
> +	if (chk_restored_scms(ui)) {
> +		pr_info("scm: Wait for tgt to restore\n");
> +		return 1;
> +	}

It's too strong limitation, we delay another tasks here.
We may open the socket and to serve it out, and we do
not need wait queuers for these actions.

It's better to wait for the master fle of packet right
before we restore the queue, and to use it from our
task fle list.

> +
>  	fle = file_master(d);
>  	if (fle->stage >= FLE_OPEN)
>  		return post_open_unix_sk(d, fle->fe->fd);
>  
> -	ui = container_of(d, struct unix_sk_info, d);
> -
>  	if (inherited_fd(d, new_fd)) {
>  		ui->ue->uflags |= USK_INHERIT;
>  		ret = *new_fd >= 0 ? 0 : -1;
> @@ -1440,6 +1562,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>  	ui->listen = 0;
>  	INIT_LIST_HEAD(&ui->connected);
>  	INIT_LIST_HEAD(&ui->node);
> +	INIT_LIST_HEAD(&ui->scm_fles);
>  	ui->flags = 0;
>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>  
> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
> index 27b48e4..009b461 100644
> --- a/images/sk-packet.proto
> +++ b/images/sk-packet.proto
> @@ -1,8 +1,14 @@
>  syntax = "proto2";
>  
> +message scm_entry {
> +	required uint32			type		= 1;
> +	repeated uint32			rights		= 2;
> +}
> +
>  message sk_packet_entry {
>  	required uint32		id_for		= 1;
>  	required uint32		length		= 2;
>  	// optional bytes		addr	= 3;
>  	// optional sk_ucred_entry	ucred	= 128;
> +	repeated scm_entry	scm		= 4;
>  }
> -- 
> 2.1.4
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


More information about the CRIU mailing list