[CRIU] [PATCH 18/18] SCM: Dump and restore SCM_RIGHTs

Pavel Emelyanov xemul at virtuozzo.com
Tue Jul 11 18:49:35 MSK 2017


>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index e14fa06..b9ef49c 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -364,6 +364,14 @@ static int root_prepare_shared(void)
>>  	if (ret)
>>  		goto err;
>>  
>> +	/*
>> +	 * This should be called with all packets collected AND all
>> +	 * fdescs and fles prepared BUT post-prep-s not run.
>> +	 */
>> +	ret = prepare_scms();
>> +	if (ret)
>> +		goto err;
> 
> This should be called before add_fake_fds_masters():
> 
> 1)It may add a master file there, and this master
> must be resolved in add_fake_fds_masters(). Otherwise
> the task, which is the owner of this just added master,
> may not have rights to create the master (imagine,
> scm file is a socket of a net_ns, which can't be
> assigned by the task);
> 
> 2)Another case -- there was not a task, which has
> permittions to create a socket, and you added it
> in prepare_scms(). In this case, we mustn't add
> one more fle in add_fake_fds_masters() -- and
> if this function is called after prepare_scms(),
> it won't add anything. This will reduce number
> of fake files, we add.

OK, will move and add this text as code comment.

>> +
>>  	ret = run_post_prepare();
>>  	if (ret)
>>  		goto err;
>> diff --git a/criu/include/sockets.h b/criu/include/sockets.h
>> index 3fa8017..1bd5c67 100644
>> --- a/criu/include/sockets.h
>> +++ b/criu/include/sockets.h
>> @@ -38,6 +38,8 @@ extern int collect_sockets(struct ns_id *);
>>  extern struct collect_image_info inet_sk_cinfo;
>>  extern struct collect_image_info unix_sk_cinfo;
>>  extern int fix_external_unix_sockets(void);
>> +extern int prepare_scms(void);
>> +extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids);
>>  
>>  extern struct collect_image_info netlink_sk_cinfo;
>>  
>> diff --git a/criu/sk-queue.c b/criu/sk-queue.c
>> index 77e203e..953db66 100644
>> --- a/criu/sk-queue.c
>> +++ b/criu/sk-queue.c
>> @@ -18,9 +18,9 @@
>>  #include "util.h"
>>  #include "util-pie.h"
>>  #include "sockets.h"
>> -
>> +#include "xmalloc.h"
>>  #include "sk-queue.h"
>> -
>> +#include "files.h"
>>  #include "protobuf.h"
>>  #include "images/sk-packet.pb-c.h"
>>  
>> @@ -28,6 +28,8 @@ struct sk_packet {
>>  	struct list_head	list;
>>  	SkPacketEntry		*entry;
>>  	char        		*data;
>> +	unsigned		scm_len;
>> +	int			*scm;
>>  };
>>  
>>  static LIST_HEAD(packets_list);
>> @@ -37,12 +39,22 @@ static int collect_one_packet(void *obj, ProtobufCMessage *msg, struct cr_img *i
>>  	struct sk_packet *pkt = obj;
>>  
>>  	pkt->entry = pb_msg(msg, SkPacketEntry);
>> -
>> +	pkt->scm = NULL;
>>  	pkt->data = xmalloc(pkt->entry->length);
>>  	if (pkt->data ==NULL)
>>  		return -1;
>>  
>>  	/*
>> +	 * See dump_packet_cmsg() -- only SCM_RIGHTS are supported and
>> +	 * only 1 of that kind is possible, thus not more than 1 SCMs
>> +	 * on a packet.
>> +	 */
>> +	if (pkt->entry->n_scm > 1) {
>> +		pr_err("More than 1 SCM is not possible\n");
>> +		return -1;
>> +	}
>> +
>> +	/*
>>  	 * NOTE: packet must be added to the tail. Otherwise sequence
>>  	 * will be broken.
>>  	 */
>> @@ -64,6 +76,50 @@ struct collect_image_info sk_queues_cinfo = {
>>  	.collect = collect_one_packet,
>>  };
>>  
>> +static int dump_scm_rights(struct cmsghdr *ch, SkPacketEntry *pe)
>> +{
>> +	int nr_fds, *fds, i;
>> +	void *buf;
>> +	ScmEntry *scme;
>> +
>> +	nr_fds = (ch->cmsg_len - sizeof(*ch)) / sizeof(int);
>> +	fds = (int *)CMSG_DATA(ch);
>> +
>> +	buf = xmalloc(sizeof(ScmEntry) + nr_fds * sizeof(uint32_t));
>> +	if (!buf)
>> +		return -1;
>> +
>> +	scme = xptr_pull(&buf, ScmEntry);
>> +	scm_entry__init(scme);
>> +	scme->type = SCM_RIGHTS;
>> +	scme->n_rights = nr_fds;
>> +	scme->rights = xptr_pull_s(&buf, nr_fds * sizeof(uint32_t));
>> +
>> +	for (i = 0; i < nr_fds; i++) {
>> +		int ftyp;
>> +
>> +		if (dump_my_file(fds[i], &scme->rights[i], &ftyp))
>> +			return -1;
>> +
>> +		/*
>> +		 * Unix sent over Unix or Epoll with some other sh*t
>> +		 * sent over unix (maybe with this very unix polled)
>> +		 * are tricky and not supported for now. (XXX -- todo)
>> +		 */
>> +		if (ftyp == FD_TYPES__UNIXSK || ftyp == FD_TYPES__EVENTPOLL) {
>> +			pr_err("Can't dump send %d (unix/epoll) fd\n", ftyp);
>> +			return -1;
>> +		}
>> +	}
>> +
>> +	i = pe->n_scm++;
>> +	if (xrealloc_safe(&pe->scm, pe->n_scm * sizeof(ScmEntry*)))
>> +		return -1;
>> +
>> +	pe->scm[i] = scme;
>> +	return 0;
>> +}
>> +
>>  /*
>>   * Maximum size of the control messages. XXX -- is there any
>>   * way to get this value out of the kernel?
>> @@ -73,8 +129,26 @@ struct collect_image_info sk_queues_cinfo = {
>>  static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>  {
>>  	struct cmsghdr *ch;
>> +	int n_rights = 0;
>>  
>>  	for (ch = CMSG_FIRSTHDR(mh); ch; ch = CMSG_NXTHDR(mh, ch)) {
>> +		if (ch->cmsg_type == SCM_RIGHTS) {
>> +			if (n_rights) {
>> +				/*
>> +				 * Even if user is sending more than one cmsg with
>> +				 * rights, kernel merges them alltogether on recv.
>> +				 */
>> +				pr_err("Unexpected 2nd SCM_RIGHTS from the kernel\n");
>> +				return -1;
>> +			}
>> +
>> +			if (dump_scm_rights(ch, pe))
>> +				return -1;
>> +
>> +			n_rights++;
>> +			continue;
>> +		}
>> +
>>  		pr_err("Control messages in queue, not supported\n");
>>  		return -1;
>>  	}
>> @@ -82,6 +156,18 @@ static int dump_packet_cmsg(struct msghdr *mh, SkPacketEntry *pe)
>>  	return 0;
>>  }
>>  
>> +static void release_cmsg(SkPacketEntry *pe)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < pe->n_scm; i++)
>> +		xfree(pe->scm[i]);
>> +	xfree(pe->scm);
>> +
>> +	pe->n_scm = 0;
>> +	pe->scm = NULL;
>> +}
>> +
>>  int dump_sk_queue(int sock_fd, int sock_id)
>>  {
>>  	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
>> @@ -181,6 +267,9 @@ int dump_sk_queue(int sock_fd, int sock_id)
>>  			ret = -EIO;
>>  			goto err_set_sock;
>>  		}
>> +
>> +		if (pe.scm)
>> +			release_cmsg(&pe);
>>  	}
>>  	ret = 0;
>>  
>> @@ -197,6 +286,20 @@ err_brk:
>>  	return ret;
>>  }
>>  
>> +static void close_scm_fds(struct sk_packet *pkt)
>> +{
>> +	int i, *fds, n_fds;
>> +	struct cmsghdr *ch = (struct cmsghdr *)pkt->scm;
>> +
>> +	fds = (int *)CMSG_DATA(ch);
>> +	n_fds = (ch->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
>> +
>> +	for (i = 0; i < n_fds; i++) {
>> +		if (close(fds[i]))
>> +			pr_warn("scm: Error closing sent fd\n");
>> +	}
>> +}
>> +
>>  static int send_one_pkt(int fd, struct sk_packet *pkt)
>>  {
>>  	int ret;
>> @@ -209,6 +312,11 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>  	iov.iov_base = pkt->data;
>>  	iov.iov_len = entry->length;
>>  
>> +	if (pkt->scm != NULL) {
>> +		mh.msg_controllen = pkt->scm_len;
>> +		mh.msg_control = pkt->scm;
>> +	}
>> +
>>  	/*
>>  	 * Don't try to use sendfile here, because it use sendpage() and
>>  	 * all data are split on pages and a new skb is allocated for
>> @@ -229,6 +337,9 @@ static int send_one_pkt(int fd, struct sk_packet *pkt)
>>  		return -1;
>>  	}
>>  
>> +	if (pkt->scm != NULL)
>> +		close_scm_fds(pkt);
>> +
>>  	return 0;
>>  }
>>  
>> @@ -264,3 +375,43 @@ int restore_sk_queue(int fd, unsigned int peer_id)
>>  out:
>>  	return ret;
>>  }
> 
> The previous patches don't make restore_sk_queue() find a fle,
> which fd is used to set next packet in queue. This series does
> not support multi-queuer queues?

No. You're talking about "restore sender address" patches :) I remember that,
and that's a separate issue. Right now if we have several sockets sending
data to one, on restore we pick the first peer and make it restore the whole
queue.

> It seems, we don't need "unix: Split resolv and interconnect (v2)",
> it's just need to select a correct fd from task fles, and to use
> it in restore_sk_queue().

We do. Inside unix_note_scm_rights() I need to find a queuer for a queue
and this lookup is only possible after the resolve and interconnect split
I've made before.

>> +
>> +int prepare_scms(void)
>> +{
>> +	struct sk_packet *pkt;
>> +
>> +	pr_info("Preparing SCMs\n");
>> +	list_for_each_entry(pkt, &packets_list, list) {
>> +		SkPacketEntry *pe = pkt->entry;
>> +		ScmEntry *se;
>> +		struct cmsghdr *ch;
>> +
>> +		if (!pe->n_scm)
>> +			continue;
>> +
>> +		se = pe->scm[0]; /* Only 1 SCM is possible */
>> +
>> +		if (se->type == SCM_RIGHTS) {
>> +			pkt->scm_len = CMSG_SPACE(se->n_rights * sizeof(int));
>> +			pkt->scm = xmalloc(pkt->scm_len);
>> +			if (!pkt->scm)
>> +				return -1;
>> +
>> +			ch = (struct cmsghdr *)pkt->scm; /* FIXME -- via msghdr */
>> +			ch->cmsg_level = SOL_SOCKET;
>> +			ch->cmsg_type = SCM_RIGHTS;
>> +			ch->cmsg_len = CMSG_LEN(se->n_rights * sizeof(int));
>> +
>> +			if (unix_note_scm_rights(pe->id_for, se->rights,
>> +						(int *)CMSG_DATA(ch), se->n_rights))
>> +				return -1;
>> +
>> +			continue;
>> +		}
>> +
>> +		pr_err("Unsupported scm %d in image\n", se->type);
>> +		return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> diff --git a/criu/sk-unix.c b/criu/sk-unix.c
>> index 42ce1bb..165adc9 100644
>> --- a/criu/sk-unix.c
>> +++ b/criu/sk-unix.c
>> @@ -798,6 +798,7 @@ struct unix_sk_info {
>>  	struct file_desc d;
>>  	struct list_head connected; /* List of sockets, connected to me */
>>  	struct list_head node; /* To link in peer's connected list  */
>> +	struct list_head scm_fles;
>>  
>>  	/*
>>  	 * For DGRAM sockets with queues, we should only restore the queue
>> @@ -809,6 +810,11 @@ struct unix_sk_info {
>>  	u8 listen:1;
>>  };
>>  
>> +struct scm_fle {
>> +	struct list_head l;
>> +	struct fdinfo_list_entry *fle;
>> +};
>> +
>>  #define USK_PAIR_MASTER		0x1
>>  #define USK_PAIR_SLAVE		0x2
>>  
>> @@ -824,6 +830,116 @@ static struct unix_sk_info *find_unix_sk_by_ino(int ino)
>>  	return NULL;
>>  }
>>  
>> +static struct unix_sk_info *find_queuer_for(int id)
>> +{
>> +	struct unix_sk_info *ui;
>> +
>> +	list_for_each_entry(ui, &unix_sockets, list) {
>> +		if (ui->queuer == id)
>> +			return ui;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids)
>> +{
>> +	struct unix_sk_info *ui;
>> +	struct pstree_item *owner;
>> +	int i;
>> +
>> +	ui = find_queuer_for(id_for);
>> +	if (!ui) {
>> +		pr_err("Can't find sender for %d\n", id_for);
>> +		return -1;
>> +	}
>> +
>> +	pr_info("Found queuer for %d -> %d\n", id_for, ui->ue->id);
>> +	/*
>> +	 * This is the task that will restore this socket
>> +	 */
>> +	owner = file_master(&ui->d)->task;
>> +
>> +	pr_info("-> will set up deps\n");
>> +	/*
>> +	 * The ui will send data to the rights receiver. Add a fake fle
>> +	 * for the file and a dependency.
>> +	 */
>> +	for (i = 0; i < n_ids; i++) {
>> +		struct file_desc *tgt;
>> +		struct fdinfo_list_entry *fle;
>> +		struct scm_fle *sfle;
>> +		FdinfoEntry *e;
>> +		int fd;
>> +
>> +		tgt = find_file_desc_raw(FD_TYPES__UND, file_ids[i]);
>> +		if (!tgt) {
>> +			pr_err("Can't find fdesc to send\n");
>> +			return -1;
>> +		}
>> +
>> +		pr_info("`- Found %d file\n", file_ids[i]);
>> +		fd = find_unused_fd(owner, -1);
> 
> If I haven't missed something, I don't see, where you try
> to found the fle in task's file descriptor  to use it for
> queuing. It seems, we add a fake fle for every packet in
> a queue. If it's so, it's not performance-good, and firstly
> we should try to check, that task already owns the fle instead
> of that.

Yes, I made this deliberately. Adding one more descriptor for a file
will take us several syscalls more. Fortunately queues with scms will
not be a frequent case.

>> +
>> +		fle = try_file_master(tgt);
>> +		if (fle) {
>> +			e = dup_fdinfo(fle->fe, fd, 0);
>> +			if (!e) {
>> +				pr_err("Can't duplicate fdinfo for scm\n");
>> +				return -1;
>> +			}
>> +		} else {
>> +			/*
>> +			 * This can happen if the file in question is
>> +			 * sent over the socket and closed. In this case
>> +			 * we need to ... invent a new one!
>> +			 */
>> +
>> +			e = xmalloc(sizeof(*e));
>> +			if (!e)
>> +				return -1;
>> +
>> +			fdinfo_entry__init(e);
>> +			e->id = tgt->id;
>> +			e->type = tgt->ops->type;
>> +			e->fd = fd;
>> +			e->flags = 0;
> 
> I'd add the code around fdinfo_entry__init() in separate function
> as we already have 3 places like it.

OK, will do. What are the others?

>> +		}
>> +
>> +		pr_info("scm: add %d -> %d.fd[%d]\n", tgt->id, vpid(owner), fd);
>> +		sfle = xmalloc(sizeof(*sfle));
>> +		if (!sfle)
>> +			return -1;
>> +
>> +		sfle->fle = collect_fd_to(vpid(owner), e, rsti(owner), tgt, false);
>> +		if (!sfle->fle) {
>> +			pr_err("Can't request new fle for scm\n");
>> +			return -1;
>> +		}
>> +
>> +		list_add_tail(&sfle->l, &ui->scm_fles);
>> +		fds[i] = fd;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int chk_restored_scms(struct unix_sk_info *ui)
>> +{
>> +	struct scm_fle *sf, *n;
>> +
>> +	list_for_each_entry_safe(sf, n, &ui->scm_fles, l) {
>> +		if (sf->fle->stage != FLE_RESTORED)
>> +			return 1;
> 
> It's need to wait for sf->fle->bound here instead. 

Why?

> I suppose, the above introduces some circular dependencies for cases
> with several sockets having epoll, which is listening them.

Nope :) On dump I explicitly ban unix sockets and epolls sent via unix
sockets. So we can only wait here for some other files, that cannot wait us.

>> +
>> +		/* Optimization for the next pass */
>> +		list_del(&sf->l);
>> +		xfree(sf);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  static int wake_connected_sockets(struct unix_sk_info *ui)
>>  {
>>  	struct fdinfo_list_entry *fle;
>> @@ -1322,12 +1438,18 @@ static int open_unix_sk(struct file_desc *d, int *new_fd)
>>  	struct unix_sk_info *ui;
>>  	int ret;
>>  
>> +	ui = container_of(d, struct unix_sk_info, d);
>> +
>> +	/* FIXME -- only queue restore may be postponed */
>> +	if (chk_restored_scms(ui)) {
>> +		pr_info("scm: Wait for tgt to restore\n");
>> +		return 1;
>> +	}
> 
> It's too strong limitation, we delay another tasks here.

Another tasks? No, we just ask files restore loop to go and try to open
some other files.

> We may open the socket and to serve it out, and we do
> not need wait queuers for these actions.

Not always. There are several cases when we do socketpair() then restore
a queue then close one of the ends immediately.

> It's better to wait for the master fle of packet right
> before we restore the queue, and to use it from our
> task fle list.

Master may sit in some other's list.

>> +
>>  	fle = file_master(d);
>>  	if (fle->stage >= FLE_OPEN)
>>  		return post_open_unix_sk(d, fle->fe->fd);
>>  
>> -	ui = container_of(d, struct unix_sk_info, d);
>> -
>>  	if (inherited_fd(d, new_fd)) {
>>  		ui->ue->uflags |= USK_INHERIT;
>>  		ret = *new_fd >= 0 ? 0 : -1;
>> @@ -1440,6 +1562,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i)
>>  	ui->listen = 0;
>>  	INIT_LIST_HEAD(&ui->connected);
>>  	INIT_LIST_HEAD(&ui->node);
>> +	INIT_LIST_HEAD(&ui->scm_fles);
>>  	ui->flags = 0;
>>  	fixup_sock_net_ns_id(&ui->ue->ns_id, &ui->ue->has_ns_id);
>>  
>> diff --git a/images/sk-packet.proto b/images/sk-packet.proto
>> index 27b48e4..009b461 100644
>> --- a/images/sk-packet.proto
>> +++ b/images/sk-packet.proto
>> @@ -1,8 +1,14 @@
>>  syntax = "proto2";
>>  
>> +message scm_entry {
>> +	required uint32			type		= 1;
>> +	repeated uint32			rights		= 2;
>> +}
>> +
>>  message sk_packet_entry {
>>  	required uint32		id_for		= 1;
>>  	required uint32		length		= 2;
>>  	// optional bytes		addr	= 3;
>>  	// optional sk_ucred_entry	ucred	= 128;
>> +	repeated scm_entry	scm		= 4;
>>  }
>> -- 
>> 2.1.4
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU at openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
> .
> 



More information about the CRIU mailing list