[CRIU] [PATCH v4 32/41] pid: Create pid_ns helpers
Kirill Tkhai
ktkhai at virtuozzo.com
Fri May 5 08:35:30 PDT 2017
On 05.05.2017 08:51, Andrei Vagin wrote:
> On Thu, May 04, 2017 at 07:10:37PM +0300, Kirill Tkhai wrote:
>> A task may set last_pid only for its active pid namespace,
>> so if NSpid of a child contains more than one level, we
>> need external help to populate the whole pid hierarchy
>> (pid in parent pid_ns, pid in grandparent, etc.). Pid ns
>> helpers are used for that.
>>
>> These are children of usernsd, which listen on a
>> socket and set the requested last pid in their active
>> pid_ns.
>>
>> v4: Move destroy_pid_ns_helpers() before CR_STATE_RESTORE_SIGCHLD
>> change, as they must die before zombies.
>>
>> v3: Block SIGCHLD while stopping pid_ns helpers.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>> ---
>> criu/cr-restore.c | 7 +
>> criu/include/namespaces.h | 3 +
>> criu/namespaces.c | 253 +++++++++++++++++++++++++++++++++++++++++++++
>> criu/ns-common.c | 51 +++++++++
>> criu/pie/restorer.c | 5 +
>> 5 files changed, 319 insertions(+)
>> create mode 100644 criu/ns-common.c
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 4c4ca37d7..3c35e5f08 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -1605,6 +1605,8 @@ static int restore_task_with_children(void *_arg)
>> pr_err("Can't add fd to fdstore\n");
>> return -1;
>> }
>> + if (create_pid_ns_helper(pid_ns) < 0)
>> + goto err;
>> }
>>
>> if (restore_task_mnt_ns(current))
>> @@ -2038,6 +2040,10 @@ static int restore_root_task(struct pstree_item *init)
>> task_entries->nr_threads--;
>> }
>>
>> + ret = destroy_pid_ns_helpers();
>> + if (ret < 0)
>> + goto out_kill;
>> +
>> ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
>> if (ret < 0)
>> goto out_kill;
>> @@ -2141,6 +2147,7 @@ static int restore_root_task(struct pstree_item *init)
>> return 0;
>>
>> out_kill:
>> + destroy_pid_ns_helpers();
>> /*
>> * The processes can be killed only when all of them have been created,
>> * otherwise an external proccesses can be killed.
>> diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
>> index 37b65b0db..b81957668 100644
>> --- a/criu/include/namespaces.h
>> +++ b/criu/include/namespaces.h
>> @@ -267,5 +267,8 @@ static inline int pid_ns_root_off(void)
>> return 0;
>> }
>> extern int reserve_pid_ns_helpers(void);
>> +extern int create_pid_ns_helper(struct ns_id *ns);
>> +extern int destroy_pid_ns_helpers(void);
>> +extern int request_set_next_pid(int pid_ns_id, pid_t pid, int sk);
>>
>> #endif /* __CR_NS_H__ */
>> diff --git a/criu/namespaces.c b/criu/namespaces.c
>> index 97ea2b0e6..f65f06003 100644
>> --- a/criu/namespaces.c
>> +++ b/criu/namespaces.c
>> @@ -15,6 +15,7 @@
>> #include <errno.h>
>> #include <sys/ioctl.h>
>> #include <sys/ptrace.h>
>> +#include <sys/file.h>
>>
>> #include "page.h"
>> #include "rst-malloc.h"
>> @@ -38,6 +39,11 @@
>> #include "fdstore.h"
>> #include "proc_parse.h"
>>
>> +#define __sys(foo) foo
>> +#define __sys_err(ret) (-errno)
>> +
>> +#include "ns-common.c"
>> +
>> static struct ns_desc *ns_desc_array[] = {
>> &net_ns_desc,
>> &uts_ns_desc,
>> @@ -49,6 +55,8 @@ static struct ns_desc *ns_desc_array[] = {
>> };
>>
>> static unsigned int join_ns_flags;
>> +/* Creation of every helper is synchronized by userns_sync_lock */
>> +static int nr_pid_ns_helper_created = 0;
>>
>> int check_namespace_opts(void)
>> {
>> @@ -2532,5 +2540,250 @@ int reserve_pid_ns_helpers(void)
>> return walk_namespaces(&pid_ns_desc, do_reserve_pid_ns_helpers, NULL);
>> }
>>
>> +static int pid_ns_helper_sock(struct ns_id *ns)
>> +{
>> + struct sockaddr_un addr;
>> + socklen_t len;
>> + int sk;
>> +
>> + sk = socket(AF_UNIX, SOCK_DGRAM, 0);
>> + if (sk < 0) {
>> + pr_perror("Can't create helper socket");
>> + return -1;
>> + }
>> + pid_ns_helper_socket_name(&addr, &len, ns->id);
>> +
>> + if (bind(sk, (struct sockaddr *)&addr, len) < 0) {
>> + pr_perror("Can't bind pid_ns sock");
>> + return -1;
>> + }
>> +
>> + return sk;
>> +}
>> +
>> +static int pid_ns_helper(struct ns_id *ns, int sk)
>> +{
>> + struct sockaddr_un addr;
>> + struct msghdr msg = {0};
>> + struct iovec iov;
>> + pid_t pid;
>> +
>> + msg.msg_name = &addr;
>> + msg.msg_iov = &iov;
>> + msg.msg_iovlen = 1;
>> +
>> + while (1) {
>> + int answer = 0;
>> + msg.msg_namelen = sizeof(addr);
>> + iov.iov_base = &pid;
>> + iov.iov_len = sizeof(pid);
>> +
>> + if (recvmsg(sk, &msg, 0) < 0) {
>> + pr_perror("recv() failed to read pid");
>> + break;
>> + }
>> +
>> + if (pid != 0) {
>> + if (__set_next_pid(pid) < 0) {
>> + pr_err("Can't set next pid\n");
>> + answer = -1;
>> + }
>> + }
>> +
>> + iov.iov_base = &answer;
>> + iov.iov_len = sizeof(answer);
>> + if (sendmsg(sk, &msg, 0) < 0) {
>> + pr_perror("Can't send answer");
>> + break;
>> + }
>> +
>> + if (pid == 0)
>> + return 0;
>> + }
>> +
>> + return -1;
>> +}
>> +
>> +static int do_create_pid_ns_helper(void *arg, int unused_fd, pid_t unused_pid)
>> +{
>> + int pid_ns_fd, mnt_ns_fd, sk, fd, i, lock_fd, transport_fd;
>> + struct ns_id *ns, *tmp;
>> + struct pid *pid;
>> + pid_t child;
>> +
>> + pid_ns_fd = open_proc(PROC_SELF, "ns/pid");
>> + if (pid_ns_fd < 0) {
>> + pr_perror("Can't open pid ns");
>> + return -1;
>> + }
>> + ns = *(struct ns_id **)arg;
>> +
>> + fd = fdstore_get(ns->pid.nsfd_id);
>> + if (fd < 0) {
>> + pr_err("Can't get pid_ns fd\n");
>> + return -1;
>> + }
>> + if (setns(fd, CLONE_NEWPID) < 0) {
>> + pr_perror("Can't setns");
>> + return -1;
>> + }
>> + close(fd);
>> +
>> + sk = pid_ns_helper_sock(ns);
>> + if (sk < 0)
>> + return -1;
>> +
>> + pid = __pstree_pid_by_virt(ns, ns->ns_pid);
>> + if (!pid) {
>> + pr_err("Can't find helper reserved pid\n");
>> + return -1;
>> + }
>> +
>> + tmp = ns->parent;
>> + if (tmp) {
>> + futex_t *f = &tmp->pid.helper_created;
>> + futex_wait_while_eq(f, 0);
>> + }
>> +
>> + if (switch_ns(root_item->pid->real, &mnt_ns_desc, &mnt_ns_fd) < 0) {
>> + pr_err("Can't set mnt_ns\n");
>> + return -1;
>> + }
>> +
>> + lock_fd = open("/proc/" LAST_PID_PATH, O_RDONLY);
>> + if (lock_fd < 0)
>> + return -1;
>> +
>> + if (restore_ns(mnt_ns_fd, &mnt_ns_desc) < 0) {
>> + pr_err("Can't restore ns\n");
>> + return -1;
>> + }
>> +
>> + if (flock(lock_fd, LOCK_EX)) {
>> + close(lock_fd);
>> + pr_perror("Can't lock %s", LAST_PID_PATH);
>> + return -1;
>> + }
>> +
>> + transport_fd = get_service_fd(TRANSPORT_FD_OFF);
>> + /*
>> + * Starting not from pid->level - 1, as it's helper has not created yet
>> + * (we're creating it in the moment), and the true pid for this level
>> + * is set by the task, who does close(CLONE_NEWPID) (this task is sender of fd).
>> + */
>> + for (i = pid->level - 2, tmp = ns->parent; i >= 0; i--, tmp = tmp->parent)
>> + if (request_set_next_pid(tmp->id, pid->ns[i].virt, transport_fd)) {
>> + pr_err("Can't set next pid using helper\n");
>> + flock(lock_fd, LOCK_UN);
>> + close(lock_fd);
>> + return -1;
>> + }
>> + child = fork();
>> + if (child < 0) {
>> + flock(lock_fd, LOCK_UN);
>> + close(lock_fd);
>> + pr_perror("Can't fork");
>> + return -1;
>> + } else if (!child) {
>> + close(lock_fd);
>> + exit(pid_ns_helper(ns, sk));
>> + }
>> + close(sk);
>> + futex_set_and_wake(&ns->pid.helper_created, 1);
>> + flock(lock_fd, LOCK_UN);
>> + close(lock_fd);
>> + nr_pid_ns_helper_created++;
>> +
>> + if (setns(pid_ns_fd, CLONE_NEWPID) < 0) {
>> + pr_perror("Restore ns");
>> + return -1;
>> + }
>> + return 0;
>> +}
>> +
>> +/*
>> + * A task may set last_pid only for its active pid namespace,
>> + * so if NSpid of a child contains more than one level, we
>> + * need external help to populate the whole pid hierarchy
>> + * (pid in parent pid_ns, pid in grandparent, etc.). Pid ns
>> + * helpers are used for that.
>> + *
>> + * We need a task or tasks to be a parent of pid_ns helpers.
>> + * Living in the common hierarchy as a TASK_HELPER is not
>> + * possible, because it introduces circular dependencies.
>> + * The same goes for being children of the criu main task, because
>> + * we already have dependencies between it and root_item
>> + * (NO more dependencies!). So, we choose usernsd for that:
>> + * it always exists and has a command interface.
>> + */
>> +int create_pid_ns_helper(struct ns_id *ns)
>> +{
>> + BUG_ON(getpid() != INIT_PID);
>> +
>> + if (__set_next_pid(ns->ns_pid) < 0) {
>> + pr_err("Can't set next fd\n");
>> + return -1;
>> + }
>> + if (userns_call(do_create_pid_ns_helper, 0, &ns, sizeof(ns), -1) < 0) {
>> + pr_err("Can't create pid_ns helper\n");
>> + return -1;
>> + }
>> + return 0;
>> +}
>> +
>> +static int do_destroy_pid_ns_helper(void *arg, int fd, pid_t pid)
>> +{
>> + int i, sk, status, sig_blocked = true, nr_ok = 0, ret = 0;
>> + sigset_t sig_mask;
>> + struct ns_id *ns;
>> +
>> + if (!nr_pid_ns_helper_created)
>> + return 0;
>> +
>> + if (block_sigmask(&sig_mask, SIGCHLD)) {
>> + sig_blocked = false;
>> + ret = -1;
>> + }
>> +
>> + sk = get_service_fd(TRANSPORT_FD_OFF);
>> +
>> + for (ns = ns_ids; ns; ns = ns->next) {
>> + if (ns->nd != &pid_ns_desc)
>> + continue;
>> + if (request_set_next_pid(ns->id, 0, sk) == 0)
>> + nr_ok++;
>> + }
>> +
>> + if (nr_ok != nr_pid_ns_helper_created) {
>> + pr_err("Not all pid_ns helpers killed\n");
>> + ret = -1;
>> + }
>> +
>> + for (i = 0; i < nr_ok; i++) {
>> + if (waitpid(-1, &status, 0) < 0) {
>> + pr_perror("Error during waiting pid_ns helper");
>> + ret = -1;
>> + }
>> + }
>> + nr_pid_ns_helper_created = 0;
>> +
>> + if (sig_blocked && restore_sigmask(&sig_mask))
>> + ret = -1;
>> +
>> + return ret;
>> +}
>> +
>> +int destroy_pid_ns_helpers(void)
>> +{
>> + if (!(root_ns_mask & CLONE_NEWPID))
>> + return 0;
>> +
>> + if (userns_call(do_destroy_pid_ns_helper, 0, NULL, 0, -1) < 0) {
>> + pr_err("Can't create pid_ns helper\n");
>> + return -1;
>> + }
>> + return 0;
>> +}
>> +
>> struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>> struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>> diff --git a/criu/ns-common.c b/criu/ns-common.c
>> new file mode 100644
>> index 000000000..a8e28aa00
>> --- /dev/null
>> +++ b/criu/ns-common.c
>> @@ -0,0 +1,51 @@
>> +#include <sys/socket.h>
>> +#include <sys/un.h>
>> +
>> +void pid_ns_helper_socket_name(struct sockaddr_un *addr, socklen_t *len, unsigned int id)
>> +{
>> + const char prefix[] = "0/criu-pid-ns-";
>> + const char int_max[] = "2147483647";
>> +
>> + *len = sizeof(*addr) - sizeof(addr->sun_path) +
>> + sizeof(prefix) - 1 + sizeof(int_max) - 1;
>> +
>> + addr->sun_family = AF_UNIX;
>> +
>> + memset(addr->sun_path + sizeof(prefix) - 1, '\0', sizeof(int_max) - 1);
>> +#ifdef CR_NOGLIBC
>> + std_sprintf(addr->sun_path, "%s%d", prefix, id);
>> +#else
>> + sprintf(addr->sun_path, "%s%d", prefix, id);
>> +#endif
>> + addr->sun_path[0] = '\0';
>> +}
>> +
>> +/* Send helper a request to set next pid and receive success */
>> +int request_set_next_pid(int pid_ns_id, pid_t pid, int sk)
>> +{
>> + struct sockaddr_un addr;
>> + int answer, ret;
>> + socklen_t len;
>> +
>> + BUG_ON(pid == -1);
>> +
>> + pid_ns_helper_socket_name(&addr, &len, pid_ns_id);
>> + ret = __sys(sendto)(sk, &pid, sizeof(pid), 0, (struct sockaddr *)&addr, len);
>> + if (ret < 0) {
>> + pr_err("Can't send request: err=%d\n", __sys_err(ret));
>> + return -1;
>> + }
>> +
>> + ret = __sys(recvfrom)(sk, &answer, sizeof(answer), 0, NULL, NULL);
>> + if (ret < 0) {
>> + pr_err("Can't recv answer: err=%d\n", __sys_err(ret));
>> + return -1;
>> + }
>
> criu hangs in this function in an error case.
I'll find a solution for this case, but not in the upcoming v5.
v5 is only meant to resolve the transport sockets problem.
> [root at fc24 criu]# cat test/dump/zdtm/static/pty-console/31/1/restore.log | grep -B 5 Error
> (00.324568) uns: daemon calls 0x48eae0 (51, 8, 1)
> (00.324583) 1: tty: Allocating fake descriptor for 0xb (reg_d 0x7f618453bff0)
> (00.324605) uns: daemon calls 0x48eae0 (51, 8, 1)
> (00.324639) 1: tty: Restore session 1 by 1 tty (index 0)
> (00.324641) 1: Restoring resources
> (00.324662) 1: Error (criu/tty.c:663): tty: Can't set sid on terminal fd 3: Operation not permitted
>
>
> 18064 pts/0 T 0:00 | \_ python test/zdtm.py run -a --keep-going
> 20734 pts/0 T 0:00 | | \_ ./zdtm_ct zdtm.py
> 20737 pts/0 S 0:00 | | \_ python2 zdtm.py
> 20739 pts/0 T 0:00 | | \_ python2 zdtm.py
> 20784 pts/0 t 0:00 | | \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-console.
> 20786 pts/0 S 0:00 | | \_ ../criu/criu restore -o restore.log -D dump/zdtm/static/pty-console/31/1 -v4 --pidfile /root/git/criu/test/zdtm/static/pty-cons
> 20799 pts/0 Z 0:00 | | | \_ [criu] <defunct>
> 20801 pts/0 Z 0:00 | | | \_ [criu] <defunct>
> 20787 pts/0 S 0:00 | | \_ [criu]
> 20800 pts/0 D 0:00 | | \_ [criu]
>
> [root at fc24 criu]# gdb -p 20786
> (gdb) bt
> #0 0x00007f6183ced9f3 in __recvfrom_nocancel () from target:/lib64/libpthread.so.0
> #1 0x0000000000461662 in request_set_next_pid (pid_ns_id=<optimized out>, pid=<optimized out>, pid at entry=0, sk=sk at entry=1012) at criu/ns-common.c:39
> #2 0x00000000004618ad in do_destroy_pid_ns_helper (pid=<optimized out>, fd=<optimized out>, arg=<optimized out>) at criu/namespaces.c:2753
> #3 0x000000000042167e in usernsd (sk=5) at criu/namespaces.c:1600
> #4 0x0000000000465a63 in start_usernsd () at criu/namespaces.c:1760
> #5 prepare_namespace_before_tasks () at criu/namespaces.c:2431
> #6 0x0000000000420746 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2118
> #7 0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
> #8 0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728
>
> [root at fc24 criu]# gdb -p gdb -p 20784
> (gdb) bt
> #0 0x00007f6183cee2c7 in recvmsg () from target:/lib64/libpthread.so.0
> #1 0x0000000000463c26 in __userns_call (func_name=func_name at entry=0x4dc101 "do_destroy_pid_ns_helper", call=<optimized out>, call at entry=0x461950 <do_destroy_pid_ns_helper>,
> flags=<optimized out>, flags at entry=0, arg=arg at entry=0x0, arg_size=arg_size at entry=0, fd=fd at entry=-1) at criu/namespaces.c:1692
> #2 0x0000000000465d40 in destroy_pid_ns_helpers () at criu/namespaces.c:2781
> #3 0x0000000000420dd5 in restore_root_task (init=0x7f618453b4d0) at criu/cr-restore.c:2323
> #4 0x000000000043da13 in cr_restore_tasks () at criu/cr-restore.c:2433
> #5 0x0000000000422928 in main (argc=<optimized out>, argv=0x7ffe63fe3d18, envp=<optimized out>) at criu/crtools.c:728
>
>> +
>> + if (answer != 0) {
>> + pr_err("Error answer\n");
>> + return -1;
>> + }
>> +
>> + return 0;
>> +}
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index 030c7ff42..3b0b35710 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -47,6 +47,11 @@
>> #include "restorer.h"
>> #include "namespaces.h"
>>
>> +#define __sys(foo) sys_##foo
>> +#define __sys_err(ret) ret
>> +
>> +#include "../ns-common.c"
>> +
>> #ifndef PR_SET_PDEATHSIG
>> #define PR_SET_PDEATHSIG 1
>> #endif
>>
More information about the CRIU
mailing list