[CRIU] [PATCH v4 32/41] pid: Create pid_ns helpers

Andrei Vagin avagin at virtuozzo.com
Thu May 4 22:51:23 PDT 2017


On Thu, May 04, 2017 at 07:10:37PM +0300, Kirill Tkhai wrote:
> Task may set last_pid only for its active pid namespace,
> so if NSpid of a child contains more than one level, we
> need external help to populate the whole pid hierarchy
> (pid in parent pid_ns, pid in grandparent etc). Pid ns
> helpers are used for that.
> 
> These are children of usernsd, which listen on a
> socket and set the requested last pid in their active
> pid_ns.
> 
> v4: Move destroy_pid_ns_helpers() before CR_STATE_RESTORE_SIGCHLD
> change, as they must die before zombies.
> 
> v3: Block SIGCHLD during stopping of pid_ns helpers.
> 
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
>  criu/cr-restore.c         |    7 +
>  criu/include/namespaces.h |    3 +
>  criu/namespaces.c         |  253 +++++++++++++++++++++++++++++++++++++++++++++
>  criu/ns-common.c          |   51 +++++++++
>  criu/pie/restorer.c       |    5 +
>  5 files changed, 319 insertions(+)
>  create mode 100644 criu/ns-common.c
> 
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 4c4ca37d7..3c35e5f08 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1605,6 +1605,8 @@ static int restore_task_with_children(void *_arg)
>  			pr_err("Can't add fd to fdstore\n");
>  			return -1;
>  		}
> +		if (create_pid_ns_helper(pid_ns) < 0)
> +			goto err;
>  	}
>  
>  	if (restore_task_mnt_ns(current))
> @@ -2038,6 +2040,10 @@ static int restore_root_task(struct pstree_item *init)
>  			task_entries->nr_threads--;
>  	}
>  
> +	ret = destroy_pid_ns_helpers();
> +	if (ret < 0)
> +		goto out_kill;
> +
>  	ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
>  	if (ret < 0)
>  		goto out_kill;
> @@ -2141,6 +2147,7 @@ static int restore_root_task(struct pstree_item *init)
>  	return 0;
>  
>  out_kill:
> +	destroy_pid_ns_helpers();
>  	/*
>  	 * The processes can be killed only when all of them have been created,
>  	 * otherwise an external proccesses can be killed.
> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> index 37b65b0db..b81957668 100644
> --- a/criu/include/namespaces.h
> +++ b/criu/include/namespaces.h
> @@ -267,5 +267,8 @@ static inline int pid_ns_root_off(void)
>  	return 0;
>  }
>  extern int reserve_pid_ns_helpers(void);
> +extern int create_pid_ns_helper(struct ns_id *ns);
> +extern int destroy_pid_ns_helpers(void);
> +extern int request_set_next_pid(int pid_ns_id, pid_t pid, int sk);
>  
>  #endif /* __CR_NS_H__ */
> diff --git a/criu/namespaces.c b/criu/namespaces.c
> index 97ea2b0e6..f65f06003 100644
> --- a/criu/namespaces.c
> +++ b/criu/namespaces.c
> @@ -15,6 +15,7 @@
>  #include <errno.h>
>  #include <sys/ioctl.h>
>  #include <sys/ptrace.h>
> +#include <sys/file.h>
>  
>  #include "page.h"
>  #include "rst-malloc.h"
> @@ -38,6 +39,11 @@
>  #include "fdstore.h"
>  #include "proc_parse.h"
>  
> +#define __sys(foo)	foo
> +#define __sys_err(ret)	(-errno)
> +
> +#include "ns-common.c"
> +
>  static struct ns_desc *ns_desc_array[] = {
>  	&net_ns_desc,
>  	&uts_ns_desc,
> @@ -49,6 +55,8 @@ static struct ns_desc *ns_desc_array[] = {
>  };
>  
>  static unsigned int join_ns_flags;
> +/* Creation of every helper is synchronized by userns_sync_lock */
> +static int nr_pid_ns_helper_created = 0;
>  
>  int check_namespace_opts(void)
>  {
> @@ -2532,5 +2540,250 @@ int reserve_pid_ns_helpers(void)
>  	return walk_namespaces(&pid_ns_desc, do_reserve_pid_ns_helpers, NULL);
>  }
>  
> +static int pid_ns_helper_sock(struct ns_id *ns)
> +{
> +	struct sockaddr_un addr;
> +	socklen_t len;
> +	int sk;
> +
> +	sk = socket(AF_UNIX, SOCK_DGRAM, 0);
> +	if (sk < 0) {
> +		pr_perror("Can't create helper socket");
> +		return -1;
> +	}
> +	pid_ns_helper_socket_name(&addr, &len, ns->id);
> +
> +	if (bind(sk, (struct sockaddr *)&addr, len) < 0) {
> +		pr_perror("Can't bind pid_ns sock");
> +		return -1;
> +	}
> +
> +	return sk;
> +}
> +
> +static int pid_ns_helper(struct ns_id *ns, int sk)
> +{
> +	struct sockaddr_un addr;
> +	struct msghdr msg = {0};
> +	struct iovec iov;
> +	pid_t pid;
> +
> +	msg.msg_name = &addr;
> +	msg.msg_iov = &iov;
> +	msg.msg_iovlen = 1;
> +
> +	while (1) {
> +		int answer = 0;
> +		msg.msg_namelen = sizeof(addr);
> +		iov.iov_base = &pid;
> +		iov.iov_len = sizeof(pid);
> +
> +		if (recvmsg(sk, &msg, 0) < 0) {
> +			pr_perror("recv() failed to read pid");
> +			break;
> +		}
> +
> +		if (pid != 0) {
> +			if (__set_next_pid(pid) < 0) {
> +				pr_err("Can't set next pid\n");
> +				answer = -1;
> +			}
> +		}
> +
> +		iov.iov_base = &answer;
> +		iov.iov_len = sizeof(answer);
> +		if (sendmsg(sk, &msg, 0) < 0) {
> +			pr_perror("Can't send answer");
> +			break;
> +		}
> +
> +		if (pid == 0)
> +			return 0;
> +	}
> +
> +	return -1;
> +}
> +
> +static int do_create_pid_ns_helper(void *arg, int unused_fd, pid_t unused_pid)
> +{
> +	int pid_ns_fd, mnt_ns_fd, sk, fd, i, lock_fd, transport_fd;
> +	struct ns_id *ns, *tmp;
> +	struct pid *pid;
> +	pid_t child;
> +
> +	pid_ns_fd = open_proc(PROC_SELF, "ns/pid");
> +	if (pid_ns_fd < 0) {
> +		pr_perror("Can't open pid ns");
> +		return -1;
> +	}
> +	ns = *(struct ns_id **)arg;
> +
> +	fd = fdstore_get(ns->pid.nsfd_id);
> +	if (fd < 0) {
> +		pr_err("Can't get pid_ns fd\n");
> +		return -1;
> +	}
> +	if (setns(fd, CLONE_NEWPID) < 0) {
> +		pr_perror("Can't setns");
> +		return -1;
> +	}
> +	close(fd);
> +
> +	sk = pid_ns_helper_sock(ns);
> +	if (sk < 0)
> +		return -1;
> +
> +	pid = __pstree_pid_by_virt(ns, ns->ns_pid);
> +	if (!pid) {
> +		pr_err("Can't find helper reserved pid\n");
> +		return -1;
> +	}
> +
> +	tmp = ns->parent;
> +	if (tmp) {
> +		futex_t *f = &tmp->pid.helper_created;
> +		futex_wait_while_eq(f, 0);
> +	}
> +
> +	if (switch_ns(root_item->pid->real, &mnt_ns_desc, &mnt_ns_fd) < 0) {
> +		pr_err("Can't set mnt_ns\n");
> +		return -1;
> +	}
> +
> +	lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
> +	if (lock_fd < 0)
> +		return -1;
> +
> +	if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
> +		pr_err("Can't restore ns\n");
> +		return -1;
> +	}
> +
> +	if (flock(lock_fd, LOCK_EX)) {
> +		close(lock_fd);
> +		pr_perror("Can't lock %s", LAST_PID_PATH);
> +		return -1;
> +	}
> +
> +	transport_fd = get_service_fd(TRANSPORT_FD_OFF);
> +	/*
> +	 * Starting not from pid->level - 1, as its helper has not been created yet
> +	 * (we're creating it at the moment), and the true pid for this level
> +	 * is set by the task which does close(CLONE_NEWPID) (this task is the sender of fd).
> +	 */
> +	for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
> +		if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
> +			pr_err("Can't set next pid using helper\n");
> +			flock(lock_fd, LOCK_UN);
> +			close(lock_fd);
> +			return -1;
> +		}
> +	child = fork();
> +	if (child < 0) {
> +		flock(lock_fd, LOCK_UN);
> +		close(lock_fd);
> +		pr_perror("Can't fork");
> +		return -1;
> +	} else if (!child) {
> +		close(lock_fd);
> +		exit(pid_ns_helper(ns, sk));
> +	}
> +	close(sk);
> +	futex_set_and_wake(&ns->pid.helper_created, 1);
> +	flock(lock_fd, LOCK_UN);
> +	close(lock_fd);
> +	nr_pid_ns_helper_created++;
> +
> +	if (setns(pid_ns_fd, CLONE_NEWPID) < 0) {
> +		pr_perror("Restore ns");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * Task may set last_pid only for its active pid namespace,
> + * so if NSpid of a child contains more than one level, we
> + * need external help to populate the whole pid hierarchy
> + * (pid in parent pid_ns, pid in grandparent etc). Pid ns
> + * helpers are used for that.
> + *
> + * We need a task or tasks to be a parent of pid_ns helpers.
> + * Living in the common hierarchy as a TASK_HELPER is not
> + * possible, because it introduces circular dependencies.
> + * The same goes for being children of the criu main task, because
> + * we already have dependencies between it and root_item
> + * (NO more dependencies!). So, we choose usernsd for that:
> + * it always exists and has a command interface.
> + */
> +int create_pid_ns_helper(struct ns_id *ns)
> +{
> +	BUG_ON(getpid() != INIT_PID);
> +
> +	if (__set_next_pid(ns->ns_pid) < 0) {
> +		pr_err("Can't set next fd\n");
> +		return -1;
> +	}
> +	if (userns_call(do_create_pid_ns_helper, 0, &ns, sizeof(ns), -1) < 0) {
> +		pr_err("Can't create pid_ns helper\n");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int do_destroy_pid_ns_helper(void *arg, int fd, pid_t pid)
> +{
> +	int i, sk, status, sig_blocked = true, nr_ok = 0, ret = 0;
> +	sigset_t sig_mask;
> +	struct ns_id *ns;
> +
> +	if (!nr_pid_ns_helper_created)
> +		return 0;
> +
> +	if (block_sigmask(&sig_mask, SIGCHLD)) {
> +		sig_blocked = false;
> +		ret = -1;
> +	}
> +
> +	sk = get_service_fd(TRANSPORT_FD_OFF);
> +
> +	for (ns = ns_ids; ns; ns = ns->next) {
> +		if (ns->nd != &pid_ns_desc)
> +			continue;
> +		if (request_set_next_pid(ns->id, 0, sk) == 0)
> +			nr_ok++;
> +	}
> +
> +	if (nr_ok != nr_pid_ns_helper_created) {
> +		pr_err("Not all pid_ns helpers killed\n");
> +		ret = -1;
> +	}
> +
> +	for (i = 0; i < nr_ok; i++) {
> +		if (waitpid(-1, &status, 0) < 0) {
> +			pr_perror("Error during waiting pid_ns helper");
> +			ret = -1;
> +		}
> +	}
> +	nr_pid_ns_helper_created = 0;
> +
> +	if (sig_blocked && restore_sigmask(&sig_mask))
> +		ret = -1;
> +
> +	return ret;
> +}
> +
> +int destroy_pid_ns_helpers(void)
> +{
> +	if (!(root_ns_mask & CLONE_NEWPID))
> +		return 0;
> +
> +	if (userns_call(do_destroy_pid_ns_helper, 0, NULL, 0, -1) < 0) {
> +		pr_err("Can't create pid_ns helper\n");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>  struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
> diff --git a/criu/ns-common.c b/criu/ns-common.c
> new file mode 100644
> index 000000000..a8e28aa00
> --- /dev/null
> +++ b/criu/ns-common.c
> @@ -0,0 +1,51 @@
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +
> +void pid_ns_helper_socket_name(struct sockaddr_un *addr, socklen_t *len, unsigned int id)
> +{
> +	const char prefix[] = "0/criu-pid-ns-";
> +	const char int_max[] = "2147483647";
> +
> +	*len = sizeof(*addr) - sizeof(addr->sun_path) +
> +	       sizeof(prefix) - 1 + sizeof(int_max) - 1;
> +
> +	addr->sun_family = AF_UNIX;
> +
> +	memset(addr->sun_path + sizeof(prefix) - 1, '\0', sizeof(int_max) - 1);
> +#ifdef CR_NOGLIBC
> +	std_sprintf(addr->sun_path, "%s%d", prefix, id);
> +#else
> +	sprintf(addr->sun_path, "%s%d", prefix, id);
> +#endif
> +	addr->sun_path[0] = '\0';
> +}
> +
> +/* Send helper a request to set next pid and receive success */
> +int request_set_next_pid(int pid_ns_id, pid_t pid, int sk)
> +{
> +	struct sockaddr_un addr;
> +	int answer, ret;
> +	socklen_t len;
> +
> +	BUG_ON(pid == -1);
> +
> +	pid_ns_helper_socket_name(&addr, &len, pid_ns_id);
> +	ret = __sys(sendto)(sk, &pid, sizeof(pid), 0, (struct sockaddr *)&addr, len);
> +	if (ret	< 0) {
> +		pr_err("Can't send request: err=%d\n", __sys_err(ret));
> +		return -1;
> +	}
> +
> +	ret = __sys(recvfrom)(sk, &answer, sizeof(answer), 0, NULL, NULL);
> +	if (ret < 0) {
> +		pr_err("Can't recv answer: err=%d\n", __sys_err(ret));
> +		return -1;
> +	}

criu hangs in this function in an error case.

[root at fc24 criu]# cat test/dump/zdtm/static/pty-console/31/1/restore.log  | grep -B 5 Error
(00.324568) uns: daemon calls 0x48eae0 (51, 8, 1)
(00.324583)      1: tty: Allocating fake descriptor for 0xb (reg_d 0x7f618453bff0)
(00.324605) uns: daemon calls 0x48eae0 (51, 8, 1)
(00.324639)      1: tty: Restore session 1 by 1 tty (index 0)
(00.324641)      1: Restoring resources
(00.324662)      1: Error (criu/tty.c:663): tty: Can't set sid on terminal fd 3: Operation not permitted


18064 pts/0    T      0:00  |           \_ python test/zdtm.py run -a --keep-going
20734 pts/0    T      0:00  |           |   \_ ./zdtm_ct zdtm.py
20737 pts/0    S      0:00  |           |       \_ python2 zdtm.py
20739 pts/0    T      0:00  |           |           \_ python2 zdtm.py
20784 pts/0    t      0:00  |           |               \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-console.
20786 pts/0    S      0:00  |           |                   \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-cons
20799 pts/0    Z      0:00  |           |                   |   \_ [criu] <defunct>
20801 pts/0    Z      0:00  |           |                   |   \_ [criu] <defunct>
20787 pts/0    S      0:00  |           |                   \_ [criu]
20800 pts/0    D      0:00  |           |                       \_ [criu]

[root at fc24 criu]# gdb -p 20786
(gdb) bt
#0  0x00007f6183ced9f3 in __recvfrom_nocancel () from target:/lib64/libpthread.so.0
#1  0x0000000000461662 in request_set_next_pid (pid_ns_id=<optimized out>, pid=<optimized out>, pid at entry=0, sk=sk at entry=1012) at criu/ns-common.c:39
#2  0x00000000004618ad in do_destroy_pid_ns_helper (pid=<optimized out>, fd=<optimized out>, arg=<optimized out>) at criu/namespaces.c:2753
#3  0x000000000042167e in usernsd (sk=5) at criu/namespaces.c:1600
#4  0x0000000000465a63 in start_usernsd () at criu/namespaces.c:1760
#5  prepare_namespace_before_tasks () at criu/namespaces.c:2431
#6  0x0000000000420746 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2118
#7  0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
#8  0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728

[root at fc24 criu]# gdb -p gdb -p 20784
(gdb) bt
#0  0x00007f6183cee2c7 in recvmsg () from target:/lib64/libpthread.so.0
#1  0x0000000000463c26 in __userns_call (func_name=func_name at entry=0x4dc101 "do_destroy_pid_ns_helper", call=<optimized out>, call at entry=0x461950 <do_destroy_pid_ns_helper>, 
    flags=<optimized out>, flags at entry=0, arg=arg at entry=0x0, arg_size=arg_size at entry=0, fd=fd at entry=-1) at criu/namespaces.c:1692
#2  0x0000000000465d40 in destroy_pid_ns_helpers () at criu/namespaces.c:2781
#3  0x0000000000420dd5 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2323
#4  0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
#5  0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728

> +
> +	if (answer != 0) {
> +		pr_err("Error answer\n");
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 030c7ff42..3b0b35710 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -47,6 +47,11 @@
>  #include "restorer.h"
>  #include "namespaces.h"
>  
> +#define __sys(foo)	sys_##foo
> +#define __sys_err(ret)	ret
> +
> +#include "../ns-common.c"
> +
>  #ifndef PR_SET_PDEATHSIG
>  #define PR_SET_PDEATHSIG 1
>  #endif
> 


More information about the CRIU mailing list