[CRIU] [PATCH v4 32/41] pid: Create pid_ns helpers
Andrei Vagin
avagin at virtuozzo.com
Thu May 4 22:51:23 PDT 2017
On Thu, May 04, 2017 at 07:10:37PM +0300, Kirill Tkhai wrote:
> Task may set last_pid only for its active pid namespace,
> so if NSpid of a child contains more than one level, we
> need external help to populate the whole pid hierarchy
> (pid in parent pid_ns, pid in grand parent etc). Pid ns
> helpers are used for that.
>
> These are children of usernsd, which are listening on a
> socket, and setting requested last pid in their active
> pid_ns.
>
> v4: Move destroy_pid_ns_helpers() before CR_STATE_RESTORE_SIGCHLD
> change, as they must die before zombies.
>
> v3: Block SIGCHLD during stopping of pid_ns helpers.
>
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> criu/cr-restore.c | 7 +
> criu/include/namespaces.h | 3 +
> criu/namespaces.c | 253 +++++++++++++++++++++++++++++++++++++++++++++
> criu/ns-common.c | 51 +++++++++
> criu/pie/restorer.c | 5 +
> 5 files changed, 319 insertions(+)
> create mode 100644 criu/ns-common.c
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 4c4ca37d7..3c35e5f08 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -1605,6 +1605,8 @@ static int restore_task_with_children(void *_arg)
> pr_err("Can't add fd to fdstore\n");
> return -1;
> }
> + if (create_pid_ns_helper(pid_ns) < 0)
> + goto err;
> }
>
> if (restore_task_mnt_ns(current))
> @@ -2038,6 +2040,10 @@ static int restore_root_task(struct pstree_item *init)
> task_entries->nr_threads--;
> }
>
> + ret = destroy_pid_ns_helpers();
> + if (ret < 0)
> + goto out_kill;
> +
> ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
> if (ret < 0)
> goto out_kill;
> @@ -2141,6 +2147,7 @@ static int restore_root_task(struct pstree_item *init)
> return 0;
>
> out_kill:
> + destroy_pid_ns_helpers();
> /*
> * The processes can be killed only when all of them have been created,
> * otherwise an external proccesses can be killed.
> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
> index 37b65b0db..b81957668 100644
> --- a/criu/include/namespaces.h
> +++ b/criu/include/namespaces.h
> @@ -267,5 +267,8 @@ static inline int pid_ns_root_off(void)
> return 0;
> }
> extern int reserve_pid_ns_helpers(void);
> +extern int create_pid_ns_helper(struct ns_id *ns);
> +extern int destroy_pid_ns_helpers(void);
> +extern int request_set_next_pid(int pid_ns_id, pid_t pid, int sk);
>
> #endif /* __CR_NS_H__ */
> diff --git a/criu/namespaces.c b/criu/namespaces.c
> index 97ea2b0e6..f65f06003 100644
> --- a/criu/namespaces.c
> +++ b/criu/namespaces.c
> @@ -15,6 +15,7 @@
> #include <errno.h>
> #include <sys/ioctl.h>
> #include <sys/ptrace.h>
> +#include <sys/file.h>
>
> #include "page.h"
> #include "rst-malloc.h"
> @@ -38,6 +39,11 @@
> #include "fdstore.h"
> #include "proc_parse.h"
>
> +#define __sys(foo) foo
> +#define __sys_err(ret) (-errno)
> +
> +#include "ns-common.c"
> +
> static struct ns_desc *ns_desc_array[] = {
> &net_ns_desc,
> &uts_ns_desc,
> @@ -49,6 +55,8 @@ static struct ns_desc *ns_desc_array[] = {
> };
>
> static unsigned int join_ns_flags;
> +/* Creation of every helper are synchronized by userns_sync_lock */
> +static int nr_pid_ns_helper_created = 0;
>
> int check_namespace_opts(void)
> {
> @@ -2532,5 +2540,250 @@ int reserve_pid_ns_helpers(void)
> return walk_namespaces(&pid_ns_desc, do_reserve_pid_ns_helpers, NULL);
> }
>
> +static int pid_ns_helper_sock(struct ns_id *ns)
> +{
> + struct sockaddr_un addr;
> + socklen_t len;
> + int sk;
> +
> + sk = socket(AF_UNIX, SOCK_DGRAM, 0);
> + if (sk < 0) {
> + pr_perror("Can't create helper socket");
> + return -1;
> + }
> + pid_ns_helper_socket_name(&addr, &len, ns->id);
> +
> + if (bind(sk, (struct sockaddr *)&addr, len) < 0) {
> + pr_perror("Can't bind pid_ns sock");
> + return -1;
> + }
> +
> + return sk;
> +}
> +
> +static int pid_ns_helper(struct ns_id *ns, int sk)
> +{
> + struct sockaddr_un addr;
> + struct msghdr msg = {0};
> + struct iovec iov;
> + pid_t pid;
> +
> + msg.msg_name = &addr;
> + msg.msg_iov = &iov;
> + msg.msg_iovlen = 1;
> +
> + while (1) {
> + int answer = 0;
> + msg.msg_namelen = sizeof(addr);
> + iov.iov_base = &pid;
> + iov.iov_len = sizeof(pid);
> +
> + if (recvmsg(sk, &msg, 0) < 0) {
> + pr_perror("recv() failed to read pid");
> + break;
> + }
> +
> + if (pid != 0) {
> + if (__set_next_pid(pid) < 0) {
> + pr_err("Can't set next pid\n");
> + answer = -1;
> + }
> + }
> +
> + iov.iov_base = &answer;
> + iov.iov_len = sizeof(answer);
> + if (sendmsg(sk, &msg, 0) < 0) {
> + pr_perror("Can't send answer");
> + break;
> + }
> +
> + if (pid == 0)
> + return 0;
> + }
> +
> + return -1;
> +}
> +
> +static int do_create_pid_ns_helper(void *arg, int unused_fd, pid_t unused_pid)
> +{
> + int pid_ns_fd, mnt_ns_fd, sk, fd, i, lock_fd, transport_fd;
> + struct ns_id *ns, *tmp;
> + struct pid *pid;
> + pid_t child;
> +
> + pid_ns_fd = open_proc(PROC_SELF, "ns/pid");
> + if (pid_ns_fd < 0) {
> + pr_perror("Can't open pid ns");
> + return -1;
> + }
> + ns = *(struct ns_id **)arg;
> +
> + fd = fdstore_get(ns->pid.nsfd_id);
> + if (fd < 0) {
> + pr_err("Can't get pid_ns fd\n");
> + return -1;
> + }
> + if (setns(fd, CLONE_NEWPID) < 0) {
> + pr_perror("Can't setns");
> + return -1;
> + }
> + close(fd);
> +
> + sk = pid_ns_helper_sock(ns);
> + if (sk < 0)
> + return -1;
> +
> + pid = __pstree_pid_by_virt(ns, ns->ns_pid);
> + if (!pid) {
> + pr_err("Can't find helper reserved pid\n");
> + return -1;
> + }
> +
> + tmp = ns->parent;
> + if (tmp) {
> + futex_t *f = &tmp->pid.helper_created;
> + futex_wait_while_eq(f, 0);
> + }
> +
> + if (switch_ns(root_item->pid->real, &mnt_ns_desc, &mnt_ns_fd) < 0) {
> + pr_err("Can't set mnt_ns\n");
> + return -1;
> + }
> +
> + lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
> + if (lock_fd < 0)
> + return -1;
> +
> + if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
> + pr_err("Can't restore ns\n");
> + return -1;
> + }
> +
> + if (flock(lock_fd, LOCK_EX)) {
> + close(lock_fd);
> + pr_perror("Can't lock %s", LAST_PID_PATH);
> + return -1;
> + }
> +
> + transport_fd = get_service_fd(TRANSPORT_FD_OFF);
> + /*
> + * Starting not from pid->level - 1, as it's helper has not created yet
> + * (we're creating it in the moment), and the true pid for this level
> + * is set by the task, who does close(CLONE_NEWPID) (this task is sender of fd).
> + */
> + for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
> + if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
> + pr_err("Can't set next pid using helper\n");
> + flock(lock_fd, LOCK_UN);
> + close(lock_fd);
> + return -1;
> + }
> + child = fork();
> + if (child < 0) {
> + flock(lock_fd, LOCK_UN);
> + close(lock_fd);
> + pr_perror("Can't fork");
> + return -1;
> + } else if (!child) {
> + close(lock_fd);
> + exit(pid_ns_helper(ns, sk));
> + }
> + close(sk);
> + futex_set_and_wake(&ns->pid.helper_created, 1);
> + flock(lock_fd, LOCK_UN);
> + close(lock_fd);
> + nr_pid_ns_helper_created++;
> +
> + if (setns(pid_ns_fd, CLONE_NEWPID) < 0) {
> + pr_perror("Restore ns");
> + return -1;
> + }
> + return 0;
> +}
> +
> +/*
> + * Task may set last_pid only for its active pid namespace,
> + * so if NSpid of a child contains more than one level, we
> + * need external help to populate the whole pid hierarchy
> + * (pid in parent pid_ns, pid in grand parent etc). Pid ns
> + * helpers are used for that.
> + *
> + * We need a task or tasks to be a parent of pid_ns helpers.
> + * To live in common hierarchy and to be a TASK_HELPER is not
> + * possible, because it introduces circular dependencies.
> + * The same is to be children of criu main task, because
> + * we already have dependencies between it and root_item
> + * (NO more dependencies!). So, we choose usernsd for that:
> + * it always exists and has a command interface.
> + */
> +int create_pid_ns_helper(struct ns_id *ns)
> +{
> + BUG_ON(getpid() != INIT_PID);
> +
> + if (__set_next_pid(ns->ns_pid) < 0) {
> + pr_err("Can't set next fd\n");
> + return -1;
> + }
> + if (userns_call(do_create_pid_ns_helper, 0, &ns, sizeof(ns), -1) < 0) {
> + pr_err("Can't create pid_ns helper\n");
> + return -1;
> + }
> + return 0;
> +}
> +
> +static int do_destroy_pid_ns_helper(void *arg, int fd, pid_t pid)
> +{
> + int i, sk, status, sig_blocked = true, nr_ok = 0, ret = 0;
> + sigset_t sig_mask;
> + struct ns_id *ns;
> +
> + if (!nr_pid_ns_helper_created)
> + return 0;
> +
> + if (block_sigmask(&sig_mask, SIGCHLD)) {
> + sig_blocked = false;
> + ret = -1;
> + }
> +
> + sk = get_service_fd(TRANSPORT_FD_OFF);
> +
> + for (ns = ns_ids; ns; ns = ns->next) {
> + if (ns->nd != &pid_ns_desc)
> + continue;
> + if (request_set_next_pid(ns->id, 0, sk) == 0)
> + nr_ok++;
> + }
> +
> + if (nr_ok != nr_pid_ns_helper_created) {
> + pr_err("Not all pid_ns helpers killed\n");
> + ret = -1;
> + }
> +
> + for (i = 0; i < nr_ok; i++) {
> + if (waitpid(-1, &status, 0) < 0) {
> + pr_perror("Error during waiting pid_ns helper");
> + ret = -1;
> + }
> + }
> + nr_pid_ns_helper_created = 0;
> +
> + if (sig_blocked && restore_sigmask(&sig_mask))
> + ret = -1;
> +
> + return ret;
> +}
> +
> +int destroy_pid_ns_helpers(void)
> +{
> + if (!(root_ns_mask & CLONE_NEWPID))
> + return 0;
> +
> + if (userns_call(do_destroy_pid_ns_helper, 0, NULL, 0, -1) < 0) {
> + pr_err("Can't create pid_ns helper\n");
> + return -1;
> + }
> + return 0;
> +}
> +
> struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
> struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
> diff --git a/criu/ns-common.c b/criu/ns-common.c
> new file mode 100644
> index 000000000..a8e28aa00
> --- /dev/null
> +++ b/criu/ns-common.c
> @@ -0,0 +1,51 @@
> +#include <sys/socket.h>
> +#include <sys/un.h>
> +
> +void pid_ns_helper_socket_name(struct sockaddr_un *addr, socklen_t *len, unsigned int id)
> +{
> + const char prefix[] = "0/criu-pid-ns-";
> + const char int_max[] = "2147483647";
> +
> + *len = sizeof(*addr) - sizeof(addr->sun_path) +
> + sizeof(prefix) - 1 + sizeof(int_max) - 1;
> +
> + addr->sun_family = AF_UNIX;
> +
> + memset(addr->sun_path + sizeof(prefix) - 1, '\0', sizeof(int_max) - 1);
> +#ifdef CR_NOGLIBC
> + std_sprintf(addr->sun_path, "%s%d", prefix, id);
> +#else
> + sprintf(addr->sun_path, "%s%d", prefix, id);
> +#endif
> + addr->sun_path[0] = '\0';
> +}
> +
> +/* Send helper a request to set next pid and receive success */
> +int request_set_next_pid(int pid_ns_id, pid_t pid, int sk)
> +{
> + struct sockaddr_un addr;
> + int answer, ret;
> + socklen_t len;
> +
> + BUG_ON(pid == -1);
> +
> + pid_ns_helper_socket_name(&addr, &len, pid_ns_id);
> + ret = __sys(sendto)(sk, &pid, sizeof(pid), 0, (struct sockaddr *)&addr, len);
> + if (ret < 0) {
> + pr_err("Can't send request: err=%d\n", __sys_err(ret));
> + return -1;
> + }
> +
> + ret = __sys(recvfrom)(sk, &answer, sizeof(answer), 0, NULL, NULL);
> + if (ret < 0) {
> + pr_err("Can't recv answer: err=%d\n", __sys_err(ret));
> + return -1;
> + }
criu hangs in this function in an error case.
[root at fc24 criu]# cat test/dump/zdtm/static/pty-console/31/1/restore.log | grep -B 5 Error
(00.324568) uns: daemon calls 0x48eae0 (51, 8, 1)
(00.324583) 1: tty: Allocating fake descriptor for 0xb (reg_d 0x7f618453bff0)
(00.324605) uns: daemon calls 0x48eae0 (51, 8, 1)
(00.324639) 1: tty: Restore session 1 by 1 tty (index 0)
(00.324641) 1: Restoring resources
(00.324662) 1: Error (criu/tty.c:663): tty: Can't set sid on terminal fd 3: Operation not permitted
18064 pts/0 T 0:00 | \_ python test/zdtm.py run -a --keep-going
20734 pts/0 T 0:00 | | \_ ./zdtm_ct zdtm.py
20737 pts/0 S 0:00 | | \_ python2 zdtm.py
20739 pts/0 T 0:00 | | \_ python2 zdtm.py
20784 pts/0 t 0:00 | | \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-console.
20786 pts/0 S 0:00 | | \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-cons
20799 pts/0 Z 0:00 | | | \_ [criu] <defunct>
20801 pts/0 Z 0:00 | | | \_ [criu] <defunct>
20787 pts/0 S 0:00 | | \_ [criu]
20800 pts/0 D 0:00 | | \_ [criu]
[root at fc24 criu]# gdb -p 20786
(gdb) bt
#0 0x00007f6183ced9f3 in __recvfrom_nocancel () from target:/lib64/libpthread.so.0
#1 0x0000000000461662 in request_set_next_pid (pid_ns_id=<optimized out>, pid=<optimized out>, pid at entry=0, sk=sk at entry=1012) at criu/ns-common.c:39
#2 0x00000000004618ad in do_destroy_pid_ns_helper (pid=<optimized out>, fd=<optimized out>, arg=<optimized out>) at criu/namespaces.c:2753
#3 0x000000000042167e in usernsd (sk=5) at criu/namespaces.c:1600
#4 0x0000000000465a63 in start_usernsd () at criu/namespaces.c:1760
#5 prepare_namespace_before_tasks () at criu/namespaces.c:2431
#6 0x0000000000420746 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2118
#7 0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
#8 0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728
[root at fc24 criu]# gdb -p gdb -p 20784
(gdb) bt
#0 0x00007f6183cee2c7 in recvmsg () from target:/lib64/libpthread.so.0
#1 0x0000000000463c26 in __userns_call (func_name=func_name at entry=0x4dc101 "do_destroy_pid_ns_helper", call=<optimized out>, call at entry=0x461950 <do_destroy_pid_ns_helper>,
flags=<optimized out>, flags at entry=0, arg=arg at entry=0x0, arg_size=arg_size at entry=0, fd=fd at entry=-1) at criu/namespaces.c:1692
#2 0x0000000000465d40 in destroy_pid_ns_helpers () at criu/namespaces.c:2781
#3 0x0000000000420dd5 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2323
#4 0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
#5 0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728
> +
> + if (answer != 0) {
> + pr_err("Error answer\n");
> + return -1;
> + }
> +
> + return 0;
> +}
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 030c7ff42..3b0b35710 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -47,6 +47,11 @@
> #include "restorer.h"
> #include "namespaces.h"
>
> +#define __sys(foo) sys_##foo
> +#define __sys_err(ret) ret
> +
> +#include "../ns-common.c"
> +
> #ifndef PR_SET_PDEATHSIG
> #define PR_SET_PDEATHSIG 1
> #endif
>
More information about the CRIU
mailing list