[CRIU] [PATCH v4 32/41] pid: Create pid_ns helpers

Kirill Tkhai ktkhai at virtuozzo.com
Fri May 5 08:35:30 PDT 2017


On 05.05.2017 08:51, Andrei Vagin wrote:
> On Thu, May 04, 2017 at 07:10:37PM +0300, Kirill Tkhai wrote:
>> Task may set last_pid only for its active pid namespace,
>> so if NSpid of a child contains more than one level, we
>> need external help to populate the whole pid hierarchy
>> (pid in parent pid_ns, pid in grand parent etc). Pid ns
>> helpers are used for that.
>>
>> These are children of usernsd, which listen on a
>> socket and set the requested last pid in their active
>> pid_ns.
>>
>> v4: Move destroy_pid_ns_helpers() before CR_STATE_RESTORE_SIGCHLD
>> change, as they must die before zombies.
>>
>> v3: Block SIGCHLD during stopping of pid_ns helpers.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>> ---
>>  criu/cr-restore.c         |    7 +
>>  criu/include/namespaces.h |    3 +
>>  criu/namespaces.c         |  253 +++++++++++++++++++++++++++++++++++++++++++++
>>  criu/ns-common.c          |   51 +++++++++
>>  criu/pie/restorer.c       |    5 +
>>  5 files changed, 319 insertions(+)
>>  create mode 100644 criu/ns-common.c
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 4c4ca37d7..3c35e5f08 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -1605,6 +1605,8 @@ static int restore_task_with_children(void *_arg)
>>  			pr_err("Can't add fd to fdstore\n");
>>  			return -1;
>>  		}
>> +		if (create_pid_ns_helper(pid_ns) < 0)
>> +			goto err;
>>  	}
>>  
>>  	if (restore_task_mnt_ns(current))
>> @@ -2038,6 +2040,10 @@ static int restore_root_task(struct pstree_item *init)
>>  			task_entries->nr_threads--;
>>  	}
>>  
>> +	ret = destroy_pid_ns_helpers();
>> +	if (ret < 0)
>> +		goto out_kill;
>> +
>>  	ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
>>  	if (ret < 0)
>>  		goto out_kill;
>> @@ -2141,6 +2147,7 @@ static int restore_root_task(struct pstree_item *init)
>>  	return 0;
>>  
>>  out_kill:
>> +	destroy_pid_ns_helpers();
>>  	/*
>>  	 * The processes can be killed only when all of them have been created,
>>  	 * otherwise an external proccesses can be killed.
>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>> index 37b65b0db..b81957668 100644
>> --- a/criu/include/namespaces.h
>> +++ b/criu/include/namespaces.h
>> @@ -267,5 +267,8 @@ static inline int pid_ns_root_off(void)
>>  	return 0;
>>  }
>>  extern int reserve_pid_ns_helpers(void);
>> +extern int create_pid_ns_helper(struct ns_id *ns);
>> +extern int destroy_pid_ns_helpers(void);
>> +extern int request_set_next_pid(int pid_ns_id, pid_t pid, int sk);
>>  
>>  #endif /* __CR_NS_H__ */
>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>> index 97ea2b0e6..f65f06003 100644
>> --- a/criu/namespaces.c
>> +++ b/criu/namespaces.c
>> @@ -15,6 +15,7 @@
>>  #include <errno.h>
>>  #include <sys/ioctl.h>
>>  #include <sys/ptrace.h>
>> +#include <sys/file.h>
>>  
>>  #include "page.h"
>>  #include "rst-malloc.h"
>> @@ -38,6 +39,11 @@
>>  #include "fdstore.h"
>>  #include "proc_parse.h"
>>  
>> +#define __sys(foo)	foo
>> +#define __sys_err(ret)	(-errno)
>> +
>> +#include "ns-common.c"
>> +
>>  static struct ns_desc *ns_desc_array[] = {
>>  	&net_ns_desc,
>>  	&uts_ns_desc,
>> @@ -49,6 +55,8 @@ static struct ns_desc *ns_desc_array[] = {
>>  };
>>  
>>  static unsigned int join_ns_flags;
>> +/* Creation of every helper is synchronized by userns_sync_lock */
>> +static int nr_pid_ns_helper_created = 0;
>>  
>>  int check_namespace_opts(void)
>>  {
>> @@ -2532,5 +2540,250 @@ int reserve_pid_ns_helpers(void)
>>  	return walk_namespaces(&pid_ns_desc, do_reserve_pid_ns_helpers, NULL);
>>  }
>>  
>> +static int pid_ns_helper_sock(struct ns_id *ns)
>> +{
>> +	struct sockaddr_un addr;
>> +	socklen_t len;
>> +	int sk;
>> +
>> +	sk = socket(AF_UNIX, SOCK_DGRAM, 0);
>> +	if (sk < 0) {
>> +		pr_perror("Can't create helper socket");
>> +		return -1;
>> +	}
>> +	pid_ns_helper_socket_name(&addr, &len, ns->id);
>> +
>> +	if (bind(sk, (struct sockaddr *)&addr, len) < 0) {
>> +		pr_perror("Can't bind pid_ns sock");
>> +		return -1;
>> +	}
>> +
>> +	return sk;
>> +}
>> +
>> +static int pid_ns_helper(struct ns_id *ns, int sk)
>> +{
>> +	struct sockaddr_un addr;
>> +	struct msghdr msg = {0};
>> +	struct iovec iov;
>> +	pid_t pid;
>> +
>> +	msg.msg_name = &addr;
>> +	msg.msg_iov = &iov;
>> +	msg.msg_iovlen = 1;
>> +
>> +	while (1) {
>> +		int answer = 0;
>> +		msg.msg_namelen = sizeof(addr);
>> +		iov.iov_base = &pid;
>> +		iov.iov_len = sizeof(pid);
>> +
>> +		if (recvmsg(sk, &msg, 0) < 0) {
>> +			pr_perror("recv() failed to read pid");
>> +			break;
>> +		}
>> +
>> +		if (pid != 0) {
>> +			if (__set_next_pid(pid) < 0) {
>> +				pr_err("Can't set next pid\n");
>> +				answer = -1;
>> +			}
>> +		}
>> +
>> +		iov.iov_base = &answer;
>> +		iov.iov_len = sizeof(answer);
>> +		if (sendmsg(sk, &msg, 0) < 0) {
>> +			pr_perror("Can't send answer");
>> +			break;
>> +		}
>> +
>> +		if (pid == 0)
>> +			return 0;
>> +	}
>> +
>> +	return -1;
>> +}
>> +
>> +static int do_create_pid_ns_helper(void *arg, int unused_fd, pid_t unused_pid)
>> +{
>> +	int pid_ns_fd, mnt_ns_fd, sk, fd, i, lock_fd, transport_fd;
>> +	struct ns_id *ns, *tmp;
>> +	struct pid *pid;
>> +	pid_t child;
>> +
>> +	pid_ns_fd = open_proc(PROC_SELF, "ns/pid");
>> +	if (pid_ns_fd < 0) {
>> +		pr_perror("Can't open pid ns");
>> +		return -1;
>> +	}
>> +	ns = *(struct ns_id **)arg;
>> +
>> +	fd = fdstore_get(ns->pid.nsfd_id);
>> +	if (fd < 0) {
>> +		pr_err("Can't get pid_ns fd\n");
>> +		return -1;
>> +	}
>> +	if (setns(fd, CLONE_NEWPID) < 0) {
>> +		pr_perror("Can't setns");
>> +		return -1;
>> +	}
>> +	close(fd);
>> +
>> +	sk = pid_ns_helper_sock(ns);
>> +	if (sk < 0)
>> +		return -1;
>> +
>> +	pid = __pstree_pid_by_virt(ns, ns->ns_pid);
>> +	if (!pid) {
>> +		pr_err("Can't find helper reserved pid\n");
>> +		return -1;
>> +	}
>> +
>> +	tmp = ns->parent;
>> +	if (tmp) {
>> +		futex_t *f = &tmp->pid.helper_created;
>> +		futex_wait_while_eq(f, 0);
>> +	}
>> +
>> +	if (switch_ns(root_item->pid->real, &mnt_ns_desc, &mnt_ns_fd) < 0) {
>> +		pr_err("Can't set mnt_ns\n");
>> +		return -1;
>> +	}
>> +
>> +	lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
>> +	if (lock_fd < 0)
>> +		return -1;
>> +
>> +	if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
>> +		pr_err("Can't restore ns\n");
>> +		return -1;
>> +	}
>> +
>> +	if (flock(lock_fd, LOCK_EX)) {
>> +		close(lock_fd);
>> +		pr_perror("Can't lock %s", LAST_PID_PATH);
>> +		return -1;
>> +	}
>> +
>> +	transport_fd = get_service_fd(TRANSPORT_FD_OFF);
>> +	/*
>> +	 * Starting not from pid->level - 1, as it's helper has not created yet
>> +	 * (we're creating it in the moment), and the true pid for this level
>> +	 * is set by the task, who does close(CLONE_NEWPID) (this task is sender of fd).
>> +	 */
>> +	for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
>> +		if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
>> +			pr_err("Can't set next pid using helper\n");
>> +			flock(lock_fd, LOCK_UN);
>> +			close(lock_fd);
>> +			return -1;
>> +		}
>> +	child = fork();
>> +	if (child < 0) {
>> +		flock(lock_fd, LOCK_UN);
>> +		close(lock_fd);
>> +		pr_perror("Can't fork");
>> +		return -1;
>> +	} else if (!child) {
>> +		close(lock_fd);
>> +		exit(pid_ns_helper(ns, sk));
>> +	}
>> +	close(sk);
>> +	futex_set_and_wake(&ns->pid.helper_created, 1);
>> +	flock(lock_fd, LOCK_UN);
>> +	close(lock_fd);
>> +	nr_pid_ns_helper_created++;
>> +
>> +	if (setns(pid_ns_fd, CLONE_NEWPID) < 0) {
>> +		pr_perror("Restore ns");
>> +		return -1;
>> +	}
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Task may set last_pid only for its active pid namespace,
>> + * so if NSpid of a child contains more than one level, we
>> + * need external help to populate the whole pid hierarchy
>> + * (pid in parent pid_ns, pid in grand parent etc). Pid ns
>> + * helpers are used for that.
>> + *
>> + * We need a task or tasks to be the parent of pid_ns helpers.
>> + * Living in the common hierarchy as a TASK_HELPER is not
>> + * possible, because it introduces circular dependencies.
>> + * The same goes for being children of the criu main task, because
>> + * we already have dependencies between it and root_item
>> + * (NO more dependencies!). So, we choose usernsd for that:
>> + * it always exists and has a command interface.
>> + */
>> +int create_pid_ns_helper(struct ns_id *ns)
>> +{
>> +	BUG_ON(getpid() != INIT_PID);
>> +
>> +	if (__set_next_pid(ns->ns_pid) < 0) {
>> +		pr_err("Can't set next fd\n");
>> +		return -1;
>> +	}
>> +	if (userns_call(do_create_pid_ns_helper, 0, &ns, sizeof(ns), -1) < 0) {
>> +		pr_err("Can't create pid_ns helper\n");
>> +		return -1;
>> +	}
>> +	return 0;
>> +}
>> +
>> +static int do_destroy_pid_ns_helper(void *arg, int fd, pid_t pid)
>> +{
>> +	int i, sk, status, sig_blocked = true, nr_ok = 0, ret = 0;
>> +	sigset_t sig_mask;
>> +	struct ns_id *ns;
>> +
>> +	if (!nr_pid_ns_helper_created)
>> +		return 0;
>> +
>> +	if (block_sigmask(&sig_mask, SIGCHLD)) {
>> +		sig_blocked = false;
>> +		ret = -1;
>> +	}
>> +
>> +	sk = get_service_fd(TRANSPORT_FD_OFF);
>> +
>> +	for (ns = ns_ids; ns; ns = ns->next) {
>> +		if (ns->nd != &pid_ns_desc)
>> +			continue;
>> +		if (request_set_next_pid(ns->id, 0, sk) == 0)
>> +			nr_ok++;
>> +	}
>> +
>> +	if (nr_ok != nr_pid_ns_helper_created) {
>> +		pr_err("Not all pid_ns helpers killed\n");
>> +		ret = -1;
>> +	}
>> +
>> +	for (i = 0; i < nr_ok; i++) {
>> +		if (waitpid(-1, &status, 0) < 0) {
>> +			pr_perror("Error during waiting pid_ns helper");
>> +			ret = -1;
>> +		}
>> +	}
>> +	nr_pid_ns_helper_created = 0;
>> +
>> +	if (sig_blocked && restore_sigmask(&sig_mask))
>> +		ret = -1;
>> +
>> +	return ret;
>> +}
>> +
>> +int destroy_pid_ns_helpers(void)
>> +{
>> +	if (!(root_ns_mask & CLONE_NEWPID))
>> +		return 0;
>> +
>> +	if (userns_call(do_destroy_pid_ns_helper, 0, NULL, 0, -1) < 0) {
>> +		pr_err("Can't create pid_ns helper\n");
>> +		return -1;
>> +	}
>> +	return 0;
>> +}
>> +
>>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>>  struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>> diff --git a/criu/ns-common.c b/criu/ns-common.c
>> new file mode 100644
>> index 000000000..a8e28aa00
>> --- /dev/null
>> +++ b/criu/ns-common.c
>> @@ -0,0 +1,51 @@
>> +#include <sys/socket.h>
>> +#include <sys/un.h>
>> +
>> +void pid_ns_helper_socket_name(struct sockaddr_un *addr, socklen_t *len, unsigned int id)
>> +{
>> +	const char prefix[] = "0/criu-pid-ns-";
>> +	const char int_max[] = "2147483647";
>> +
>> +	*len = sizeof(*addr) - sizeof(addr->sun_path) +
>> +	       sizeof(prefix) - 1 + sizeof(int_max) - 1;
>> +
>> +	addr->sun_family = AF_UNIX;
>> +
>> +	memset(addr->sun_path + sizeof(prefix) - 1, '\0', sizeof(int_max) - 1);
>> +#ifdef CR_NOGLIBC
>> +	std_sprintf(addr->sun_path, "%s%d", prefix, id);
>> +#else
>> +	sprintf(addr->sun_path, "%s%d", prefix, id);
>> +#endif
>> +	addr->sun_path[0] = '\0';
>> +}
>> +
>> +/* Send helper a request to set next pid and receive success */
>> +int request_set_next_pid(int pid_ns_id, pid_t pid, int sk)
>> +{
>> +	struct sockaddr_un addr;
>> +	int answer, ret;
>> +	socklen_t len;
>> +
>> +	BUG_ON(pid == -1);
>> +
>> +	pid_ns_helper_socket_name(&addr, &len, pid_ns_id);
>> +	ret = __sys(sendto)(sk, &pid, sizeof(pid), 0, (struct sockaddr *)&addr, len);
>> +	if (ret	< 0) {
>> +		pr_err("Can't send request: err=%d\n", __sys_err(ret));
>> +		return -1;
>> +	}
>> +
>> +	ret = __sys(recvfrom)(sk, &answer, sizeof(answer), 0, NULL, NULL);
>> +	if (ret < 0) {
>> +		pr_err("Can't recv answer: err=%d\n", __sys_err(ret));
>> +		return -1;
>> +	}
> 
> criu hangs in this function in an error case.

I'll find a solution for this case, but not in the upcoming v5.
v5 is only meant to resolve the transport sockets problem.
 
> [root at fc24 criu]# cat test/dump/zdtm/static/pty-console/31/1/restore.log  | grep -B 5 Error
> (00.324568) uns: daemon calls 0x48eae0 (51, 8, 1)
> (00.324583)      1: tty: Allocating fake descriptor for 0xb (reg_d 0x7f618453bff0)
> (00.324605) uns: daemon calls 0x48eae0 (51, 8, 1)
> (00.324639)      1: tty: Restore session 1 by 1 tty (index 0)
> (00.324641)      1: Restoring resources
> (00.324662)      1: Error (criu/tty.c:663): tty: Can't set sid on terminal fd 3: Operation not permitted
> 
> 
> 18064 pts/0    T      0:00  |           \_ python test/zdtm.py run -a --keep-going
> 20734 pts/0    T      0:00  |           |   \_ ./zdtm_ct zdtm.py
> 20737 pts/0    S      0:00  |           |       \_ python2 zdtm.py
> 20739 pts/0    T      0:00  |           |           \_ python2 zdtm.py
> 20784 pts/0    t      0:00  |           |               \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-console.
> 20786 pts/0    S      0:00  |           |                   \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-cons
> 20799 pts/0    Z      0:00  |           |                   |   \_ [criu] <defunct>
> 20801 pts/0    Z      0:00  |           |                   |   \_ [criu] <defunct>
> 20787 pts/0    S      0:00  |           |                   \_ [criu]
> 20800 pts/0    D      0:00  |           |                       \_ [criu]
> 
> [root at fc24 criu]# gdb -p 20786
> (gdb) bt
> #0  0x00007f6183ced9f3 in __recvfrom_nocancel () from target:/lib64/libpthread.so.0
> #1  0x0000000000461662 in request_set_next_pid (pid_ns_id=<optimized out>, pid=<optimized out>, pid at entry=0, sk=sk at entry=1012) at criu/ns-common.c:39
> #2  0x00000000004618ad in do_destroy_pid_ns_helper (pid=<optimized out>, fd=<optimized out>, arg=<optimized out>) at criu/namespaces.c:2753
> #3  0x000000000042167e in usernsd (sk=5) at criu/namespaces.c:1600
> #4  0x0000000000465a63 in start_usernsd () at criu/namespaces.c:1760
> #5  prepare_namespace_before_tasks () at criu/namespaces.c:2431
> #6  0x0000000000420746 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2118
> #7  0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
> #8  0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728
> 
> [root at fc24 criu]# gdb -p gdb -p 20784
> (gdb) bt
> #0  0x00007f6183cee2c7 in recvmsg () from target:/lib64/libpthread.so.0
> #1  0x0000000000463c26 in __userns_call (func_name=func_name at entry=0x4dc101 "do_destroy_pid_ns_helper", call=<optimized out>, call at entry=0x461950 <do_destroy_pid_ns_helper>, 
>     flags=<optimized out>, flags at entry=0, arg=arg at entry=0x0, arg_size=arg_size at entry=0, fd=fd at entry=-1) at criu/namespaces.c:1692
> #2  0x0000000000465d40 in destroy_pid_ns_helpers () at criu/namespaces.c:2781
> #3  0x0000000000420dd5 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2323
> #4  0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
> #5  0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728
> 
>> +
>> +	if (answer != 0) {
>> +		pr_err("Error answer\n");
>> +		return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index 030c7ff42..3b0b35710 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -47,6 +47,11 @@
>>  #include "restorer.h"
>>  #include "namespaces.h"
>>  
>> +#define __sys(foo)	sys_##foo
>> +#define __sys_err(ret)	ret
>> +
>> +#include "../ns-common.c"
>> +
>>  #ifndef PR_SET_PDEATHSIG
>>  #define PR_SET_PDEATHSIG 1
>>  #endif
>>


More information about the CRIU mailing list