[CRIU] [PATCH 18/18] SCM: Dump and restore SCM_RIGHTs

Kirill Tkhai ktkhai at virtuozzo.com
Wed Jul 12 12:41:46 MSK 2017


On 11.07.2017 18:49, Pavel Emelyanov wrote:
> 
>>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>>> index e14fa06..b9ef49c 100644
>>> --- a/criu/cr-restore.c
>>> +++ b/criu/cr-restore.c
>>> @@ -364,6 +364,14 @@ static int root_prepare_shared(void)
>>>  	if (ret)
>>>  		goto err;
>>>  
>>> +	/*
>>> +	 * This should be called with all packets collected AND all
>>> +	 * fdescs and fles prepared BUT post-prep-s not run.
>>> +	 */
>>> +	ret = prepare_scms();
>>> +	if (ret)
>>> +		goto err;
>>
>> This should be called before add_fake_fds_masters():
>>
>> 1)It may add a master file there, and this master
>> must be resolved in add_fake_fds_masters(). Otherwise
>> the task, which is the owner of this just added master,
>> may not have rights to create the master (imagine,
>> scm file is a socket of a net_ns, which can't be
>> assigned by the task);
>>
>> 2)Another case -- there was not a task, which has
>> permittions to create a socket, and you added it
>> in prepare_scms(). In this case, we mustn't add
>> one more fle in add_fake_fds_masters() -- and
>> if this function is called after prepare_scms(),
>> it won't add anything. This will reduce number
>> of fake files, we add.
> 
> OK, will move and add this text as code comment.
> 
>>> +
>>>  	ret = run_post_prepare();
>>>  	if (ret)
>>>  		goto err;
>>> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
>>> index 3fa8017..1bd5c67 100644
>>> --- a/criu/include/sockets.h
>>> +++ b/criu/include/sockets.h
>>> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>>>  extern struct collect_image_info inet_sk_cinfo;
>>>  extern struct collect_image_info unix_sk_cinfo;
>>>  extern int fix_external_unix_sockets(void);
>>> +extern int prepare_scms(void);
>>> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>>>  
>>>  extern struct collect_image_info netlink_sk_cinfo;
>>>  
>>> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
>>> index 77e203e..953db66 100644
>>> --- a/criu/sk-queue.c
>>> +++ b/criu/sk-queue.c
>>> @@ -18,9 +18,9 @@
>>>  #include "util.h"
>>>  #include "util-pie.h"
>>>  #include "sockets.h"
>>> -
>>> +#include "xmalloc.h"
>>>  #include "sk-queue.h"
>>> -
>>> +#include "files.h"
>>>  #include "protobuf.h"
>>>  #include "images/sk-packet.pb-c.h"
>>>  
>>> @@ -28,6 +28,8 @@ struct sk_packet {
>>>  	struct list_head	list;
>>>  	SkPacketEntry		*entry;
>>>  	char        		*data;
>>> +	unsigned		scm_len;
>>> +	int			*scm;
>>>  };
>>>  
>>>  static LIST_HEAD(packets_list);
>>> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>>>  	struct sk_packet *pkt = obj;
>>>  
>>>  	pkt->entry = pb_msg(msg, SkPacketEntry);
>>> -
>>> +	pkt->scm = NULL;
>>>  	pkt->data = xmalloc(pkt->entry->length);
>>>  	if (pkt->data ==NULL)
>>>  		return -1;
>>>  
>>>  	/*
>>> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
>>> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
>>> +	 * on a packet.
>>> +	 */
>>> +	if (pkt->entry->n_scm > 1) {
>>> +		pr_err("More than 1 SCM is not possible\n");
>>> +		return -1;
>>> +	}
>>> +
>>> +	/*
>>>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>>>  	 * will be broken.
>>>  	 */
>>> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>>>  	.collect = collect_one_packet,
>>>  };
>>>  
>>> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
>>> +{
>>> +	int nr_fds, *fds, i;
>>> +	void *buf;
>>> +	ScmEntry *scme;
>>> +
>>> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
>>> +	fds = (int *)CMSG_DATA(ch);
>>> +
>>> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
>>> +	if (!buf)
>>> +		return -1;
>>> +
>>> +	scme = xptr_pull(&buf, ScmEntry);
>>> +	scm_entry__init(scme);
>>> +	scme->type = SCM_RIGHTS;
>>> +	scme->n_rights = nr_fds;
>>> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
>>> +
>>> +	for (i = 0; i < nr_fds; i++) {
>>> +		int ftyp;
>>> +
>>> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
>>> +			return -1;
>>> +
>>> +		/*
>>> +		 * Unix sent over Unix or Epoll with some other sh*t
>>> +		 * sent over unix (maybe with this very unix polled)
>>> +		 * are tricky and not supported for now. (XXX -- todo)
>>> +		 */
>>> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
>>> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
>>> +			return -1;
>>> +		}
>>> +	}
>>> +
>>> +	i = pe->n_scm++;
>>> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
>>> +		return -1;
>>> +
>>> +	pe->scm[i] = scme;
>>> +	return 0;
>>> +}
>>> +
>>>  /*
>>>   * Maximum size of the control messages. XXX -- is there any
>>>   * way to get this value out of the kernel?
>>> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>>>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>>  {
>>>  	struct cmsghdr *ch;
>>> +	int n_rights = 0;
>>>  
>>>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
>>> +		if (ch->cmsg_type == SCM_RIGHTS) {
>>> +			if (n_rights) {
>>> +				/*
>>> +				 * Even if user is sending more than one cmsg with
>>> +				 * rights, kernel merges them alltogether on recv.
>>> +				 */
>>> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
>>> +				return -1;
>>> +			}
>>> +
>>> +			if (dump_scm_rights(ch, pe))
>>> +				return -1;
>>> +
>>> +			n_rights++;
>>> +			continue;
>>> +		}
>>> +
>>>  		pr_err("Control messages in queue, not supported\n");
>>>  		return -1;
>>>  	}
>>> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>>  	return 0;
>>>  }
>>>  
>>> +static void release_cmsg(SkPacketEntry *pe)
>>> +{
>>> +	int i;
>>> +
>>> +	for (i = 0; i < pe->n_scm; i++)
>>> +		xfree(pe->scm[i]);
>>> +	xfree(pe->scm);
>>> +
>>> +	pe->n_scm = 0;
>>> +	pe->scm = NULL;
>>> +}
>>> +
>>>  int dump_sk_queue(int sock_fd, int sock_id)
>>>  {
>>>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
>>> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>>>  			ret = -EIO;
>>>  			goto err_set_sock;
>>>  		}
>>> +
>>> +		if (pe.scm)
>>> +			release_cmsg(&pe);
>>>  	}
>>>  	ret = 0;
>>>  
>>> @@ -197,6 +286,20 @@ err_brk:
>>>  	return ret;
>>>  }
>>>  
>>> +static void close_scm_fds(struct sk_packet *pkt)
>>> +{
>>> +	int i, *fds, n_fds;
>>> +	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
>>> +
>>> +	fds = (int *)CMSG_DATA(ch);
>>> +	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
>>> +
>>> +	for (i = 0; i < n_fds; i++) {
>>> +		if (close(fds[i]))
>>> +			pr_warn("scm: Error closing sent fd\n");
>>> +	}
>>> +}
>>> +
>>>  static int send_one_pkt(int fd, struct sk_packet *pkt)
>>>  {
>>>  	int ret;
>>> @@ -209,6 +312,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>>  	iov.iov_base = pkt->data;
>>>  	iov.iov_len = entry->length;
>>>  
>>> +	if (pkt->scm != NULL) {
>>> +		mh.msg_controllen = pkt->scm_len;
>>> +		mh.msg_control = pkt->scm;
>>> +	}
>>> +
>>>  	/*
>>>  	 * Don't try to use sendfile here, because it use sendpage() and
>>>  	 * all data are split on pages and a new skb is allocated for
>>> @@ -229,6 +337,9 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>>  		return -1;
>>>  	}
>>>  
>>> +	if (pkt->scm != NULL)
>>> +		close_scm_fds(pkt);
>>> +
>>>  	return 0;
>>>  }
>>>  
>>> @@ -264,3 +375,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>>>  out:
>>>  	return ret;
>>>  }
>>
>> The previous patches don't make restore_sk_queue() find a fle,
>> which fd is used to set next packet in queue. This series does
>> not support multi-queuer queues?
> 
> No. You're talking about "restore sender address" patches :) I remember that,
> and that's a separate issue. Right now if we have several sockets sending
> data to one, on restore we pick the first peer and make it restore the whole
> queue.
> 
>> It seems, we don't need "unix: Split resolv and interconnect (v2)",
>> it's just need to select a correct fd from task fles, and to use
>> it in restore_sk_queue().
> 
> We do. Inside unix_note_scm_rights() I need to find a queuer for a queue
> and this lookup is only possible after the resolve and interconnect split
> I've made before.
> 
>>> +
>>> +int prepare_scms(void)
>>> +{
>>> +	struct sk_packet *pkt;
>>> +
>>> +	pr_info("Preparing SCMs\n");
>>> +	list_for_each_entry(pkt, &packets_list, list) {
>>> +		SkPacketEntry *pe = pkt->entry;
>>> +		ScmEntry *se;
>>> +		struct cmsghdr *ch;
>>> +
>>> +		if (!pe->n_scm)
>>> +			continue;
>>> +
>>> +		se = pe->scm[0]; /* Only 1 SCM is possible */
>>> +
>>> +		if (se->type == SCM_RIGHTS) {
>>> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
>>> +			pkt->scm = xmalloc(pkt->scm_len);
>>> +			if (!pkt->scm)
>>> +				return -1;
>>> +
>>> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
>>> +			ch->cmsg_level = SOL_SOCKET;
>>> +			ch->cmsg_type = SCM_RIGHTS;
>>> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
>>> +
>>> +			if (unix_note_scm_rights(pe->id_for, se->rights,
>>> +						(int *)CMSG_DATA(ch), se->n_rights))
>>> +				return -1;
>>> +
>>> +			continue;
>>> +		}
>>> +
>>> +		pr_err("Unsupported scm %d in image\n", se->type);
>>> +		return -1;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
>>> index 42ce1bb..165adc9 100644
>>> --- a/criu/sk-unix.c
>>> +++ b/criu/sk-unix.c
>>> @@ -798,6 +798,7 @@ struct unix_sk_info {
>>>  	struct file_desc d;
>>>  	struct list_head connected; /* List of sockets, connected to me */
>>>  	struct list_head node; /* To link in peer's connected list  */
>>> +	struct list_head scm_fles;
>>>  
>>>  	/*
>>>  	 * For DGRAM sockets with queues, we should only restore the queue
>>> @@ -809,6 +810,11 @@ struct unix_sk_info {
>>>  	u8 listen:1;
>>>  };
>>>  
>>> +struct scm_fle {
>>> +	struct list_head l;
>>> +	struct fdinfo_list_entry *fle;
>>> +};
>>> +
>>>  #define USK_PAIR_MASTER		0x1
>>>  #define USK_PAIR_SLAVE		0x2
>>>  
>>> @@ -824,6 +830,116 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>>>  	return NULL;
>>>  }
>>>  
>>> +static struct unix_sk_info *find_queuer_for(int id)
>>> +{
>>> +	struct unix_sk_info *ui;
>>> +
>>> +	list_for_each_entry(ui, &unix_sockets, list) {
>>> +		if (ui->queuer == id)
>>> +			return ui;
>>> +	}
>>> +
>>> +	return NULL;
>>> +}
>>> +
>>> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
>>> +{
>>> +	struct unix_sk_info *ui;
>>> +	struct pstree_item *owner;
>>> +	int i;
>>> +
>>> +	ui = find_queuer_for(id_for);
>>> +	if (!ui) {
>>> +		pr_err("Can't find sender for %d\n", id_for);
>>> +		return -1;
>>> +	}
>>> +
>>> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
>>> +	/*
>>> +	 * This is the task that will restore this socket
>>> +	 */
>>> +	owner = file_master(&ui->d)->task;
>>> +
>>> +	pr_info("-> will set up deps\n");
>>> +	/*
>>> +	 * The ui will send data to the rights receiver. Add a fake fle
>>> +	 * for the file and a dependency.
>>> +	 */
>>> +	for (i = 0; i < n_ids; i++) {
>>> +		struct file_desc *tgt;
>>> +		struct fdinfo_list_entry *fle;
>>> +		struct scm_fle *sfle;
>>> +		FdinfoEntry *e;
>>> +		int fd;
>>> +
>>> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
>>> +		if (!tgt) {
>>> +			pr_err("Can't find fdesc to send\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		pr_info("`- Found %d file\n", file_ids[i]);
>>> +		fd = find_unused_fd(owner, -1);
>>
>> If I haven't missed something, I don't see, where you try
>> to found the fle in task's file descriptor  to use it for
>> queuing. It seems, we add a fake fle for every packet in
>> a queue. If it's so, it's not performance-good, and firstly
>> we should try to check, that task already owns the fle instead
>> of that.
> 
> Yes, I made this deliberately. Adding one more descriptor for a file
> will take us several syscalls more. Fortunately queues with scms will
> not be a frequent case.

Your series restore queues with file descriptors in scm, and one of
the queuers is used to to that. Task of this queuer obtains additional
file descriptors. Right? If so, why we don't try to analyze the task's
descriptors, if it already has the one, we add? I'm asking about that,
and this seems to not require excess file descriptors or system calls
(it's possible to just analyze fd identifiers). I'm not sure we understood
each other in question %) Do you mean the same?
 
>>> +
>>> +		fle = try_file_master(tgt);
>>> +		if (fle) {
>>> +			e = dup_fdinfo(fle->fe, fd, 0);
>>> +			if (!e) {
>>> +				pr_err("Can't duplicate fdinfo for scm\n");
>>> +				return -1;
>>> +			}
>>> +		} else {
>>> +			/*
>>> +			 * This can happen if the file in question is
>>> +			 * sent over the socket and closed. In this case
>>> +			 * we need to ... invent a new one!
>>> +			 */
>>> +
>>> +			e = xmalloc(sizeof(*e));
>>> +			if (!e)
>>> +				return -1;
>>> +
>>> +			fdinfo_entry__init(e);
>>> +			e->id = tgt->id;
>>> +			e->type = tgt->ops->type;
>>> +			e->fd = fd;
>>> +			e->flags = 0;
>>
>> I'd add the code around fdinfo_entry__init() in separate function
>> as we already have 3 places like it.
> 
> OK, will do. What are the others?

I mean the places, where fdinfo_entry__init() is used:
$ git grep fdinfo_entry__init
criu/files.c:   fdinfo_entry__init(e);
criu/files.c:   fdinfo_entry__init(e);
criu/sk-unix.c:                 fdinfo_entry__init(e);

Something like

e = create_fdinfo_entry(tgt->id, tgt->ops->type, fd, 0, false);
if (!e)
	return -1;

"false" is to use shmem or not.

> 
>>> +		}
>>> +
>>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>>> +		sfle = xmalloc(sizeof(*sfle));
>>> +		if (!sfle)
>>> +			return -1;
>>> +
>>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>>> +		if (!sfle->fle) {
>>> +			pr_err("Can't request new fle for scm\n");
>>> +			return -1;
>>> +		}
>>> +
>>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>>> +		fds[i] = fd;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int chk_restored_scms(struct unix_sk_info *ui)
>>> +{
>>> +	struct scm_fle *sf, *n;
>>> +
>>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>>> +		if (sf->fle->stage != FLE_RESTORED)
>>> +			return 1;
>>
>> It's need to wait for sf->fle->bound here instead. 
> 
> Why?

Then, what is scms? If these are the files, which contain in the
queue messages, we just need to wait till they open, don't we?
 
>> I suppose, the above introduces some circular dependencies for cases
>> with several sockets having epoll, which is listening them.
> 
> Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
> sockets. So we can only wait here for some other files, that cannot wait us.
> 
>>> +
>>> +		/* Optimization for the next pass */
>>> +		list_del(&sf->l);
>>> +		xfree(sf);
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>>  {
>>>  	struct fdinfo_list_entry *fle;
>>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>>  	struct unix_sk_info *ui;
>>>  	int ret;
>>>  
>>> +	ui = container_of(d, struct unix_sk_info, d);
>>> +
>>> +	/* FIXME -- only queue restore may be postponed */
>>> +	if (chk_restored_scms(ui)) {
>>> +		pr_info("scm: Wait for tgt to restore\n");
>>> +		return 1;
>>> +	}
>>
>> It's too strong limitation, we delay another tasks here.
> 
> Another tasks? No, we just ask files restore loop to go and try to open
> some other files.

But another tasks may wait file descriptors you're serving out.
This tasks will be delayed.
 
>> We may open the socket and to serve it out, and we do
>> not need wait queuers for these actions.
> 
> Not always. There are several cases when we do socketpair() then restore
> a queue then close one of the ends immediately.

Even if the second end is closed, anither tasks may want this
slave fle, and their eventpolls may depend on it. I think,
we should wait only in the moments, when we must wait, and do
not do that in another cases.
 
>> It's better to wait for the master fle of packet right
>> before we restore the queue, and to use it from our
>> task fle list.
> 
> Master may sit in some other's list.
> 
>>> +
>>>  	fle = file_master(d);
>>>  	if (fle->stage >= FLE_OPEN)
>>>  		return post_open_unix_sk(d, fle->fe->fd);
>>>  
>>> -	ui = container_of(d, struct unix_sk_info, d);
>>> -
>>>  	if (inherited_fd(d, new_fd)) {
>>>  		ui->ue->uflags |= USK_INHERIT;
>>>  		ret = *new_fd >= 0 ? 0 : -1;
>>> @@ -1440,6 +1562,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>>>  	ui->listen = 0;
>>>  	INIT_LIST_HEAD(&ui->connected);
>>>  	INIT_LIST_HEAD(&ui->node);
>>> +	INIT_LIST_HEAD(&ui->scm_fles);
>>>  	ui->flags = 0;
>>>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>>>  
>>> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
>>> index 27b48e4..009b461 100644
>>> --- a/images/sk-packet.proto
>>> +++ b/images/sk-packet.proto
>>> @@ -1,8 +1,14 @@
>>>  syntax = "proto2";
>>>  
>>> +message scm_entry {
>>> +	required uint32			type		= 1;
>>> +	repeated uint32			rights		= 2;
>>> +}
>>> +
>>>  message sk_packet_entry {
>>>  	required uint32		id_for		= 1;
>>>  	required uint32		length		= 2;
>>>  	// optional bytes		addr	= 3;
>>>  	// optional sk_ucred_entry	ucred	= 128;
>>> +	repeated scm_entry	scm		= 4;
>>>  }
>>> -- 
>>> 2.1.4
>>>
>>> _______________________________________________
>>> CRIU mailing list
>>> CRIU at openvz.org
>>> https://lists.openvz.org/mailman/listinfo/criu
>> .
>>
> 


More information about the CRIU mailing list