[CRIU] [PATCH 07/10] net: allow to dump and restore more than one network namespace

Andrei Vagin avagin at virtuozzo.com
Thu Oct 27 19:30:36 PDT 2016


On Mon, Oct 17, 2016 at 02:51:56PM +0300, Pavel Emelyanov wrote:
> On 09/01/2016 01:55 AM, Andrei Vagin wrote:
> > From: Andrei Vagin <avagin at virtuozzo.com>
> > 
> > Restore all network namespaces from the root task and then set
> > a proper namespace for each task after restoring sockets, because
> > we need to switch network namespaces to restore sockets.
> > 
> > Each socket has to be created in a proper network namespace.
> > 
> > Signed-off-by: Andrei Vagin <avagin at virtuozzo.com>
> > ---
> >  criu/cr-restore.c         |  10 ++++
> >  criu/include/namespaces.h |   4 +-
> >  criu/include/net.h        |   4 +-
> >  criu/namespaces.c         |   6 +--
> >  criu/net.c                | 127 +++++++++++++++++++++++++++++++++++++++++++++-
> >  criu/pstree.c             |   3 ++
> >  6 files changed, 148 insertions(+), 6 deletions(-)
> > 
> > diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> > index 0b77fb3..d6ceeb3 100644
> > --- a/criu/cr-restore.c
> > +++ b/criu/cr-restore.c
> > @@ -718,6 +718,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
> >  
> >  	close_service_fd(TRANSPORT_FD_OFF);
> >  
> > +	if (restore_task_net_ns(current))
> > +		return -1;
> 
> This happens after prepare_fds() which do restore sockets. Why do they
> happen in correct net namespaces?

The correct net namespace for a socket is the namespace where it was
created. It can differ from the task's netns, so we switch to the
socket's netns to restore each socket and then, once all sockets have
been restored, we switch into the task's netns. I will add a comment here.

> 
> > +
> >  	if (setup_uffd(pid, ta))
> >  		return -1;
> >  
> > @@ -2898,6 +2901,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> >  	if (rst_prep_creds(pid, core, &creds_pos))
> >  		goto err_nv;
> >  
> > +	if (current->parent == NULL) {
> > +		/* Wait when all tasks restored all files */
> > +		futex_wait_while_gt(&task_entries->nr_in_progress,
> > +						current->nr_threads);
> 
> I guess we need a helper for this.
> 
> > +		fini_net_namespaces();
> > +	}
> > +
> >  	/*
> >  	 * We're about to search for free VM area and inject the restorer blob
> >  	 * into it. No irrelevant mmaps/mremaps beyond this point, otherwise
> > diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> > index 2b183f2..7529495 100644
> > --- a/criu/include/namespaces.h
> > +++ b/criu/include/namespaces.h
> > @@ -36,7 +36,8 @@
> >  #define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER | CLONE_NEWCGROUP)
> >  
> >  /* Nested namespaces are supported only for these types */
> > -#define CLONE_SUBNS	(CLONE_NEWNS)
> > +#define CLONE_SUBNS	(CLONE_NEWNS | CLONE_NEWNET)
> > +
> >  #define EXTRA_SIZE	20
> >  
> >  struct ns_desc {
> > @@ -95,6 +96,7 @@ struct ns_id {
> >  		} mnt;
> >  
> >  		struct {
> > +			int ns_fd;
> 
> Document this field.
> 
> >  			int nlsk;	/* for sockets collection */
> >  			int seqsk;	/* to talk to parasite daemons */
> >  		} net;
> > diff --git a/criu/include/net.h b/criu/include/net.h
> > index ede380f..f88f876 100644
> > --- a/criu/include/net.h
> > +++ b/criu/include/net.h
> > @@ -5,8 +5,10 @@
> >  
> >  struct cr_imgset;
> >  extern int dump_net_ns(int ns_id);
> > -extern int prepare_net_ns(int pid);
> > +extern int prepare_net_namespaces(void);
> > +extern void fini_net_namespaces(void);
> >  extern int netns_keep_nsfd(void);
> > +extern int restore_task_net_ns(struct pstree_item *current);
> >  
> >  struct veth_pair {
> >  	struct list_head node;
> > diff --git a/criu/namespaces.c b/criu/namespaces.c
> > index 1d54a9f..974d1c6 100644
> > --- a/criu/namespaces.c
> > +++ b/criu/namespaces.c
> > @@ -1651,9 +1651,6 @@ int prepare_namespace(struct pstree_item *item, unsigned long clone_flags)
> >  	 * tree (i.e. -- mnt_ns restoring)
> >  	 */
> >  
> > -	id = ns_per_id ? item->ids->net_ns_id : pid;
> > -	if ((clone_flags & CLONE_NEWNET) && prepare_net_ns(id))
> > -		return -1;
> >  	id = ns_per_id ? item->ids->uts_ns_id : pid;
> >  	if ((clone_flags & CLONE_NEWUTS) && prepare_utsns(id))
> >  		return -1;
> > @@ -1661,6 +1658,9 @@ int prepare_namespace(struct pstree_item *item, unsigned long clone_flags)
> >  	if ((clone_flags & CLONE_NEWIPC) && prepare_ipc_ns(id))
> >  		return -1;
> >  
> > +	if (prepare_net_namespaces())
> > +		return -1;
> > +
> >  	/*
> >  	 * This one is special -- there can be several mount
> >  	 * namespaces and prepare_mnt_ns handles them itself.
> > diff --git a/criu/net.c b/criu/net.c
> > index 7a50640..c312961 100644
> > --- a/criu/net.c
> > +++ b/criu/net.c
> > @@ -1452,7 +1452,7 @@ int dump_net_ns(int ns_id)
> >  	return ret;
> >  }
> >  
> > -int prepare_net_ns(int pid)
> > +static int prepare_net_ns(int pid)
> >  {
> >  	int ret = 0;
> >  	NetnsEntry *netns = NULL;
> > @@ -1483,6 +1483,131 @@ int prepare_net_ns(int pid)
> >  	return ret;
> >  }
> >  
> > +static int open_net_ns(struct ns_id *nsid, struct rst_info *rst)
> > +{
> > +	int fd, tfd;
> > +
> > +	/* Pin one with a file descriptor */
> > +	fd = open_proc(PROC_SELF, "ns/net");
> > +	if (fd < 0)
> > +		return -1;
> > +	tfd = reopen_as_unused_fd(fd, rst);
> > +	if (tfd < 0) {
> > +		close(fd);
> > +		return -1;
> > +	}
> > +	nsid->net.ns_fd = tfd;
> > +
> > +	return 0;
> > +}
> > +
> > +int prepare_net_namespaces()
> > +{
> > +	struct ns_id *nsid;
> > +	int rst = -1;
> > +
> > +	if (!(root_ns_mask & CLONE_NEWNET))
> > +		return 0;
> > +
> > +	rst = open_proc(PROC_SELF, "ns/net");
> > +	if (rst < 0)
> > +		return -1;
> > +
> > +	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
> > +		if (nsid->nd != &net_ns_desc)
> > +			continue;
> > +
> > +		if (nsid->type != NS_ROOT && unshare(CLONE_NEWNS)) {
> 
> O_o If we hit the NS_ROOT thing here we ... proceed with prepare_net_ns()
> w/o any unshare? Is this correct?

It is correct, because we create the root netns when we create the root
task. The more interesting thing here is CLONE_NEWNS ;). Looks like I
need to fix a test too. Thanks a lot!

> 
> > +			pr_perror("Unable to create a new mntns");
> > +			goto err;
> > +		}
> > +
> > +		if (prepare_net_ns(nsid->id))
> > +			goto err;
> > +
> > +		if (open_net_ns(nsid, rsti(root_item)))
> > +			goto err;
> > +
> > +		/* And return back to regain the access to the roots yard */
> 
> O_o Why getting back into netns retains us access to root yard?

It is a copy-paste. Will fix.
> 
> > +		if (setns(rst, CLONE_NEWNET)) {
> > +			pr_perror("Can't restore mntns back");
> > +			goto err;
> > +		}
> > +	}
> > +
> > +	close(rst);
> > +	return 0;
> > +err:
> > +	if (rst >= 0)
> > +		restore_ns(rst, &net_ns_desc);
> > +	return -1;
> > +}
> > +
> > +void fini_net_namespaces()
> > +{
> > +	struct ns_id *nsid;
> > +
> > +	if (!(root_ns_mask & CLONE_NEWNS))
> > +		return;
> > +
> > +	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
> > +		if (nsid->nd != &net_ns_desc)
> > +			continue;
> > +		close_safe(&nsid->net.ns_fd);
> 
> This is called by root task. Who will close the ns_fd-s for other tasks?

Only the root task has these descriptors

> 
> > +	}
> > +}
> > +
> > +static int do_restore_task_net_ns(struct ns_id *nsid, struct pstree_item *current)
> > +{
> > +	int fd;
> > +
> > +	fd = open_proc(root_item->pid.virt, "fd/%d", nsid->net.ns_fd);
> > +	if (fd < 0)
> > +		return -1;
> > +
> > +	if (setns(fd, CLONE_NEWNET)) {
> > +		pr_perror("Can't restore mntns");
> > +		close(fd);
> > +		return -1;
> > +	}
> > +	close(fd);
> > +
> > +	return 0;
> > +}
> > +
> > +int restore_task_net_ns(struct pstree_item *current)
> > +{
> > +	if (current->ids && current->ids->has_net_ns_id) {
> > +		unsigned int id = current->ids->net_ns_id;
> > +		struct ns_id *nsid;
> > +
> > +		/*
> > +		 * Regardless of the namespace a task wants to
> > +		 * live in, by that point they all will live in
> > +		 * root's one (see prepare_pstree_kobj_ids() +
> > +		 * get_clone_mask()). So if the current task's
> > +		 * target namespace is the root's one -- it's
> > +		 * already there, otherwise it will have to do
> > +		 * setns().
> > +		 */
> > +		if (!current->parent || id == current->parent->ids->net_ns_id)
> > +			return 0;
> > +
> > +		nsid = lookup_ns_by_id(id, &net_ns_desc);
> > +		if (nsid == NULL) {
> > +			pr_err("Can't find mount namespace %d\n", id);
> > +			return -1;
> > +		}
> > +
> > +		BUG_ON(nsid->type == NS_CRIU);
> > +
> > +		if (do_restore_task_net_ns(nsid, current))
> > +			return -1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  int netns_keep_nsfd(void)
> >  {
> >  	int ns_fd, ret;
> > diff --git a/criu/pstree.c b/criu/pstree.c
> > index d23b1f1..0b219aa 100644
> > --- a/criu/pstree.c
> > +++ b/criu/pstree.c
> > @@ -12,6 +12,7 @@
> >  #include "tty.h"
> >  #include "mount.h"
> >  #include "asm/dump.h"
> > +#include "net.h"
> >  
> >  #include "protobuf.h"
> >  #include "images/pstree.pb-c.h"
> > @@ -466,6 +467,8 @@ static int read_pstree_ids(struct pstree_item *pi)
> >  	if (pi->ids->has_mnt_ns_id) {
> >  		if (rst_add_ns_id(pi->ids->mnt_ns_id, pi, &mnt_ns_desc))
> >  			return -1;
> > +		if (rst_add_ns_id(pi->ids->net_ns_id, pi, &net_ns_desc))
> > +			return -1;
> >  	}
> >  
> >  	return 0;
> > 
> 


More information about the CRIU mailing list