[CRIU] [PATCH 1/4] Support for dumping/restoring user namespaces

Andrew Vagin avagin at parallels.com
Wed Aug 13 07:22:00 PDT 2014


On Tue, Aug 12, 2014 at 03:58:48PM -0700, Sophie Blee-Goldman wrote:
> Here is the patch with the fixes you pointed out. I am still looking at the
> zdtm tests at the moment.

You have done a big and good job. It's an excellent result.

Unfortunately, I have done the same work. Now we need to merge our code.

If you want to push this patch to the upstream repo, you need to split
it into a few smaller patches. It's hard to review one big patch. Then
you will need to run the zdtm test suite on it.

Once again, I suggest that we not waste time and that we take my patches
instead of this one. I have already done all this work.

We have a number of open questions, so I think it's better to
focus our efforts on these tasks.

Thanks,
Andrew.

> 
> 
> On Tue, Aug 12, 2014 at 3:56 PM, Sophie Blee-Goldman <ableegoldman at google.com>
> wrote:
> 
>     Adds basic support for user namespaces by dumping and restoring
>     the namespace itself and the uid/gid maps of the root process.
> 
>     Currently depends on a kernel patch to avoid failing on the prctl
>     syscall by checking for CAP_SYS_RESOURCE in the user namespace
>     instead of in the global one.
> 
>     Signed-off-by: Sophie Blee-Goldman <ableegoldman at google.com>
>     ---
>      Makefile.crtools        |   1 +
>      cr-restore.c            |   7 ++
>      cr-show.c               |   2 +
>      image-desc.c            |   1 +
>      include/image-desc.h    |   1 +
>      include/magic.h         |   1 +
>      include/namespaces.h    |   1 -
>      include/protobuf-desc.h |   5 +-
>      include/syscall-types.h |   6 +-
>      include/user_ns.h       |   9 ++
>      namespaces.c            |  27 +++++-
>      protobuf-desc.c         |   1 +
>      protobuf/Makefile       |   1 +
>      protobuf/core.proto     |   1 +
>      protobuf/userns.proto   |   9 ++
>      pstree.c                |   2 +
>      user_ns.c               | 227
>     ++++++++++++++++++++++++++++++++++++++++++++++++
>      17 files changed, 296 insertions(+), 6 deletions(-)
>      create mode 100644 include/user_ns.h
>      create mode 100644 protobuf/userns.proto
>      create mode 100644 user_ns.c
> 
>     diff --git a/Makefile.crtools b/Makefile.crtools
>     index 6033b2c..8e680d6 100644
>     --- a/Makefile.crtools
>     +++ b/Makefile.crtools
>     @@ -34,6 +34,7 @@ obj-y += pipes.o
>      obj-y  += fifo.o
>      obj-y  += file-ids.o
>      obj-y  += namespaces.o
>     +obj-y  += user_ns.o
>      obj-y  += uts_ns.o
>      obj-y  += ipc_ns.o
>      obj-y  += netfilter.o
>     diff --git a/cr-restore.c b/cr-restore.c
>     index 2bc98e8..a93fa74 100644
>     --- a/cr-restore.c
>     +++ b/cr-restore.c
>     @@ -52,6 +52,7 @@
>      #include "restorer-blob.h"
>      #include "crtools.h"
>      #include "namespaces.h"
>     +#include "user_ns.h"
>      #include "mem.h"
>      #include "mount.h"
>      #include "fsnotify.h"
>     @@ -1612,6 +1613,12 @@ static int restore_root_task(struct pstree_item
>     *init)
>             if (ret)
>                     goto out;
> 
>     +       if (root_ns_mask & CLONE_NEWUSER) {
>     +               ret = restore_user_ns(init->pid.real, init->ids->
>     user_ns_id);
>     +               if (ret < 0)
>     +                       goto out;
>     +       }
>     +
>             ret = run_scripts("setup-namespaces");
>             if (ret)
>                     goto out;
>     diff --git a/cr-show.c b/cr-show.c
>     index 0e1a2c6..2b28746 100644
>     --- a/cr-show.c
>     +++ b/cr-show.c
>     @@ -21,6 +21,7 @@
>      #include "util.h"
>      #include "sockets.h"
>      #include "image.h"
>     +#include "user_ns.h"
>      #include "uts_ns.h"
>      #include "ipc_ns.h"
>      #include "pstree.h"
>     @@ -291,6 +292,7 @@ static struct show_image_info show_infos[] = {
>             SHOW_VERT(CORE),
>             SHOW_VERT(IDS),
>             SHOW_VERT(CREDS),
>     +       SHOW_VERT(USERNS),
>             SHOW_VERT(UTSNS),
>             SHOW_VERT(IPC_VAR),
>             SHOW_VERT(FS),
>     diff --git a/image-desc.c b/image-desc.c
>     index 49dc29d..814c3b2 100644
>     --- a/image-desc.c
>     +++ b/image-desc.c
>     @@ -52,6 +52,7 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
>             FD_ENTRY(POSIX_TIMERS,  "posix-timers-%d"),
>             FD_ENTRY(CREDS,         "creds-%d"),
>             FD_ENTRY(UTSNS,         "utsns-%d"),
>     +       FD_ENTRY(USERNS,        "userns-%d"),
>             FD_ENTRY(IPC_VAR,       "ipcns-var-%d"),
>             FD_ENTRY(IPCNS_SHM,     "ipcns-shm-%d"),
>             FD_ENTRY(IPCNS_MSG,     "ipcns-msg-%d"),
>     diff --git a/include/image-desc.h b/include/image-desc.h
>     index 93b3392..18535e1 100644
>     --- a/include/image-desc.h
>     +++ b/include/image-desc.h
>     @@ -25,6 +25,7 @@ enum {
>             /*
>              * NS entries
>              */
>     +       CR_FD_USERNS,
>             CR_FD_UTSNS,
>             CR_FD_MNTS,
> 
>     diff --git a/include/magic.h b/include/magic.h
>     index 5192a60..06db3e3 100644
>     --- a/include/magic.h
>     +++ b/include/magic.h
>     @@ -40,6 +40,7 @@
>      #define ITIMERS_MAGIC          0x57464056 /* Kostroma */
>      #define POSIX_TIMERS_MAGIC     0x52603957 /* Lipetsk */
>      #define SK_QUEUES_MAGIC                0x56264026 /* Suzdal */
>     +#define USERNS_MAGIC           0x55474908 /* Kazan */
>      #define UTSNS_MAGIC            0x54473203 /* Smolensk */
>      #define CREDS_MAGIC            0x54023547 /* Kozelsk */
>      #define IPC_VAR_MAGIC          0x53115007 /* Samara */
>     diff --git a/include/namespaces.h b/include/namespaces.h
>     index 350b8b4..bc67519 100644
>     --- a/include/namespaces.h
>     +++ b/include/namespaces.h
>     @@ -34,7 +34,6 @@ extern struct ns_id *ns_ids;
>      extern bool check_ns_proc(struct fd_link *link);
> 
>      extern struct ns_desc pid_ns_desc;
>     -extern struct ns_desc user_ns_desc;
>      extern unsigned long root_ns_mask;
> 
>      extern const struct fdtype_ops nsfile_dump_ops;
>     diff --git a/include/protobuf-desc.h b/include/protobuf-desc.h
>     index 01c9f4c..1c8f9ce 100644
>     --- a/include/protobuf-desc.h
>     +++ b/include/protobuf-desc.h
>     @@ -52,14 +52,15 @@ enum {
>             PB_IRMAP_CACHE,
>             PB_CGROUP,
>             PB_TIMERFD,
>     +       PB_USERNS,
> 
>             /* PB_AUTOGEN_STOP */
> 
>             PB_PAGEMAP_HEAD,
>             PB_IDS,
>             PB_SIGACT,
>     -       PB_NETDEV,
>     -       PB_REMAP_FPATH,         /* 50 */
>     +       PB_NETDEV,              /* 50 */
>     +       PB_REMAP_FPATH,
>             PB_SK_QUEUES,
>             PB_IPCNS_MSG,
>             PB_IPCNS_MSG_ENT,
>     diff --git a/include/syscall-types.h b/include/syscall-types.h
>     index bab3dba..eb270b3 100644
>     --- a/include/syscall-types.h
>     +++ b/include/syscall-types.h
>     @@ -57,7 +57,11 @@ struct itimerspec;
>      #define CLONE_NEWNET   0x40000000
>      #endif
> 
>     -#define CLONE_ALLNS    (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC |
>     CLONE_NEWUTS | CLONE_NEWNS)
>     +#ifndef CLONE_NEWUSER
>     +#define CLONE_NEWUSER  0x10000000
>     +#endif
>     +
>     +#define CLONE_ALLNS    (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC |
>     CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
> 
>      /* Nested namespaces are supported only for these types */
>      #define CLONE_SUBNS    (CLONE_NEWNS)
>     diff --git a/include/user_ns.h b/include/user_ns.h
>     new file mode 100644
>     index 0000000..715b155
>     --- /dev/null
>     +++ b/include/user_ns.h
>     @@ -0,0 +1,9 @@
>     +#ifndef __CR_USER_NS_H__
>     +#define __CR_USER_NS_H__
>     +
>     +extern int dump_user_ns(int ns_pid, int ns_id);
>     +extern int restore_user_ns(int real_pid, int ns_id);
>     +
>     +extern struct ns_desc user_ns_desc;
>     +
>     +#endif /* __CR_USER_NS_H__ */
>     diff --git a/namespaces.c b/namespaces.c
>     index 6be030f..8c0d842 100644
>     --- a/namespaces.c
>     +++ b/namespaces.c
>     @@ -9,6 +9,7 @@
>      #include "uts_ns.h"
>      #include "ipc_ns.h"
>      #include "mount.h"
>     +#include "user_ns.h"
>      #include "pstree.h"
>      #include "namespaces.h"
>      #include "net.h"
>     @@ -271,7 +272,7 @@ struct ns_file_info {
>      static int open_ns_fd(struct file_desc *d)
>      {
>             struct ns_file_info *nfi = container_of(d, struct ns_file_info, d);
>     -       struct pstree_item *item, *t;
>     +       struct pstree_item *item = NULL, *t;
>             struct ns_desc *nd = NULL;
>             char path[64];
>             int fd;
>     @@ -304,6 +305,10 @@ static int open_ns_fd(struct file_desc *d)
>                             item = t;
>                             nd = &mnt_ns_desc;
>                             break;
>     +               } else if (ids->user_ns_id == nfi->nfe->ns_id) {
>     +                       item = t;
>     +                       nd = &user_ns_desc;
>     +                       break;
>                     }
>             }
> 
>     @@ -391,6 +396,13 @@ int dump_task_ns_ids(struct pstree_item *item)
>                     return -1;
>             }
> 
>     +       ids->has_user_ns_id = true;
>     +       ids->user_ns_id = get_ns_id(pid, &user_ns_desc);
>     +       if (!ids->user_ns_id) {
>     +               pr_err("Can't make userns id\n");
>     +               return -1;
>     +       }
>     +
>             return 0;
>      }
> 
>     @@ -446,6 +458,11 @@ static int do_dump_namespaces(struct ns_id *ns)
>                                     ns->id, ns->pid);
>                     ret = dump_net_ns(ns->pid, ns->id);
>                     break;
>     +       case CLONE_NEWUSER:
>     +               pr_info("Dump USER namespace info %d via %d\n",
>     +                               ns->id, ns->pid);
>     +               ret = dump_user_ns(ns->pid, ns->id);
>     +               break;
>             default:
>                     pr_err("Unknown namespace flag %x", ns->nd->cflag);
>                     break;
>     @@ -604,9 +621,15 @@ int try_show_namespaces(int ns_pid)
>                     close(fd);
>             }
> 
>     +       fd = open_image(CR_FD_USERNS, O_SHOW, ids->user_ns_id);
>     +       if (fd > 0) {
>     +               pr_msg("-------------------USERNS---------------------\n");
>     +               cr_parse_fd(fd, fdset_template[CR_FD_USERNS].magic);
>     +               close(fd);
>     +       }
>     +
>             pr_msg("---[ end of %d namespaces ]---\n", ns_pid);
>             return 0;
>      }
> 
>      struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
>     -struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>     diff --git a/protobuf-desc.c b/protobuf-desc.c
>     index b97418b..9199b09 100644
>     --- a/protobuf-desc.c
>     +++ b/protobuf-desc.c
>     @@ -38,6 +38,7 @@
>      #include "protobuf/sk-packet.pb-c.h"
>      #include "protobuf/creds.pb-c.h"
>      #include "protobuf/timer.pb-c.h"
>     +#include "protobuf/userns.pb-c.h"
>      #include "protobuf/utsns.pb-c.h"
>      #include "protobuf/ipc-var.pb-c.h"
>      #include "protobuf/ipc-shm.pb-c.h"
>     diff --git a/protobuf/Makefile b/protobuf/Makefile
>     index 7f6485b..cd2b854 100644
>     --- a/protobuf/Makefile
>     +++ b/protobuf/Makefile
>     @@ -50,6 +50,7 @@ proto-obj-y   += ipc-shm.o
>      proto-obj-y    += ipc-msg.o
>      proto-obj-y    += ipc-sem.o
>      proto-obj-y    += utsns.o
>     +proto-obj-y    += userns.o
>      proto-obj-y    += creds.o
>      proto-obj-y    += vma.o
>      proto-obj-y    += netdev.o
>     diff --git a/protobuf/core.proto b/protobuf/core.proto
>     index d850e2e..8810376 100644
>     --- a/protobuf/core.proto
>     +++ b/protobuf/core.proto
>     @@ -32,6 +32,7 @@ message task_kobj_ids_entry {
>             optional uint32                 ipc_ns_id       = 7;
>             optional uint32                 uts_ns_id       = 8;
>             optional uint32                 mnt_ns_id       = 9;
>     +       optional uint32                 user_ns_id      = 10;
>      }
> 
>      message thread_sas_entry {
>     diff --git a/protobuf/userns.proto b/protobuf/userns.proto
>     new file mode 100644
>     index 0000000..31d7718
>     --- /dev/null
>     +++ b/protobuf/userns.proto
>     @@ -0,0 +1,9 @@
>     +message userns_entry {
>     +       message map_entry {
>     +               required uint32 id_in   = 1;
>     +               required uint32 id_out  = 2;
>     +               required uint32 length  = 3;
>     +       }
>     +       repeated map_entry uid_map = 1;
>     +       repeated map_entry gid_map = 2;
>     +}
>     \ No newline at end of file
>     diff --git a/pstree.c b/pstree.c
>     index d005b64..c905317 100644
>     --- a/pstree.c
>     +++ b/pstree.c
>     @@ -603,6 +603,8 @@ static unsigned long get_clone_mask(TaskKobjIdsEntry
>     *i,
>                     mask |= CLONE_NEWUTS;
>             if (i->mnt_ns_id != p->mnt_ns_id)
>                     mask |= CLONE_NEWNS;
>     +       if (i->user_ns_id != p->user_ns_id)
>     +               mask |= CLONE_NEWUSER;
> 
>             return mask;
>      }
>     diff --git a/user_ns.c b/user_ns.c
>     new file mode 100644
>     index 0000000..e90f068
>     --- /dev/null
>     +++ b/user_ns.c
>     @@ -0,0 +1,227 @@
>     +#include <unistd.h>
>     +
>     +#include "namespaces.h"
>     +#include "user_ns.h"
>     +#include "list.h"
>     +
>     +#include "protobuf.h"
>     +#include "protobuf/userns.pb-c.h"
>     +
>     +struct map_entry {
>     +       UsernsEntry__MapEntry   entry;
>     +       struct list_head        list;
>     +};
>     +
>     +static void cleanup(int *fd, struct list_head *uid_list,
>     +                   struct list_head *gid_list)
>     +{
>     +       struct map_entry *pos, *tmp;
>     +
>     +       /* free uid entries */
>     +       list_for_each_entry_safe(pos, tmp, uid_list, list) {
>     +               list_del(&pos->list);
>     +               xfree(pos);
>     +       }
>     +
>     +       /* free gid entries */
>     +       list_for_each_entry_safe(pos, tmp, gid_list, list) {
>     +               list_del(&pos->list);
>     +               xfree(pos);
>     +       }
>     +
>     +       close_safe(fd);
>     +}
>     +
>     +static void fill_map(int n_entries, UsernsEntry__MapEntry **map,
>     +                    struct list_head *head)
>     +{
>     +       struct map_entry *tmp;
>     +
>     +       list_for_each_entry(tmp, head, list) {
>     +               map[--n_entries] = &(tmp->entry);
>     +       }
>     +}
>     +
>     +static int write_pb(int fd, int n_uid_entries, int n_gid_entries,
>     +                   struct list_head *uid_list, struct list_head *gid_list)
>     +{
>     +       UsernsEntry ue = USERNS_ENTRY__INIT;
>     +
>     +       UsernsEntry__MapEntry *uidmap[n_uid_entries];
>     +       fill_map(n_uid_entries, uidmap, uid_list);
>     +       ue.uid_map = uidmap;
>     +       ue.n_uid_map = n_uid_entries;
>     +
>     +       UsernsEntry__MapEntry *gidmap[n_gid_entries];
>     +       fill_map(n_gid_entries, gidmap, gid_list);
>     +       ue.gid_map = gidmap;
>     +       ue.n_gid_map = n_gid_entries;
>     +
>     +       return pb_write_one(fd, &ue, PB_USERNS);
>     +}
>     +
>     +static int read_map_entries(int pid, const char *id_map, struct list_head
>     *head)
>     +{
>     +       int n_read, n_entries = 0;
>     +       FILE *fp;
>     +       struct map_entry *tmp;
>     +       UsernsEntry__MapEntry entry = USERNS_ENTRY__MAP_ENTRY__INIT;
>     +
>     +       pr_debug("Reading entries from /proc/%d/%s\n", pid, id_map);
>     +
>     +       fp = fopen_proc(pid, "%s", id_map);
>     +       if (!fp) {
>     +               pr_perror("Error opening /proc/%d/%s\n", pid, id_map);
>     +               return -1;
>     +       }
>     +
>     +       while ((n_read = fscanf(fp, "%u %u %u\n", &entry.id_in,
>     +                               &entry.id_out, &entry.length)) != EOF) {
>     +               if (n_read != 3) {
>     +                       pr_perror("Error reading /proc/%d/%s, fscanf
>     returned %d",
>     +                                 pid, id_map, n_read);
>     +                       fclose(fp);
>     +                       return -1;
>     +               }
>     +
>     +               tmp = (struct map_entry *)xmalloc(sizeof(struct
>     map_entry));
>     +               if (!tmp) {
>     +                       fclose(fp);
>     +                       return -1;
>     +               }
>     +
>     +               memcpy(&tmp->entry, &entry, sizeof(UsernsEntry__MapEntry));
>     +               list_add(&(tmp->list), head);
>     +               n_entries++;
>     +       }
>     +
>     +       if (fclose(fp) != 0) {
>     +               pr_perror("fclose(/proc/%d/%s) failed", pid, id_map);
>     +               return -1;
>     +       }
>     +
>     +       return n_entries;
>     +}
>     +
>     +static int write_map_entries(int pid, const char *id_map,
>     +                             UsernsEntry__MapEntry **map,
>     +                             size_t n_entries)
>     +{
>     +       int i, fd, n_written, total = 0;
>     +       char buf[PAGE_SIZE];
>     +
>     +       pr_debug("Writing entries to /proc/%d/%s, n_entries=%lu\n",
>     +                pid, id_map, n_entries);
>     +
>     +       for (i = 0; i < n_entries; i++) {
>     +               n_written = snprintf(buf + total, sizeof(buf) - total,
>     +                                    "%u %u %u\n",
>     +                                     map[i]->id_in,
>     +                                     map[i]->id_out,
>     +                                     map[i]->length);
>     +               if (n_written < 0) {
>     +                       pr_err("snprintf failed for %s of pid: %d\n",
>     id_map, pid);
>     +                       return -1;
>     +               }
>     +               total += n_written;
>     +       }
>     +
>     +       /* id_maps can only be written to once */
>     +       fd = open_proc_rw(pid, "%s", id_map);
>     +       if (fd < 0) {
>     +               pr_perror("Unable to open /proc/%d/%s\n", pid, id_map);
>     +               return -1;
>     +       }
>     +
>     +       if (write(fd, buf, total) != total) {
>     +               pr_perror("Failed to write all %d bytes to /proc/%d/%s",
>     +                         total, pid, id_map);
>     +               close_safe(&fd);
>     +               return -1;
>     +       }
>     +
>     +       close_safe(&fd);
>     +
>     +       return 0;
>     +}
>     +
>     +int dump_user_ns(int ns_pid, int ns_id)
>     +{
>     +       int fd, ret, n_uid_entries, n_gid_entries;
>     +
>     +       LIST_HEAD(uid_list);
>     +       LIST_HEAD(gid_list);
>     +
>     +       fd = open_image(CR_FD_USERNS, O_DUMP, ns_id);
>     +       if (fd < 0) {
>     +               pr_err("Error opening userns image");
>     +               return -1;
>     +       }
>     +
>     +       /* read uid map */
>     +       n_uid_entries = read_map_entries(ns_pid, "uid_map", &uid_list);
>     +       if (n_uid_entries < 0) {
>     +               pr_err("Error reading uid_map\n");
>     +               ret = -1;
>     +               goto out;
>     +       }
>     +
>     +       /* read gid map */
>     +       n_gid_entries = read_map_entries(ns_pid, "gid_map", &gid_list);
>     +       if (n_gid_entries < 0) {
>     +               pr_err("Error reading gid_map\n");
>     +               ret = -1;
>     +               goto out;
>     +       }
>     +
>     +       ret = write_pb(fd, n_uid_entries, n_gid_entries, &uid_list, &
>     gid_list);
>     +
>     +out:
>     +       cleanup(&fd, &uid_list, &gid_list);
>     +       return ret;
>     +}
>     +
>     +/*
>     + * Restore uid_map and gid_map file for the init process. Since this is
>     called
>     + * from the parent, we access these files using the 'real_pid' of the
>     process.
>     + */
>     +int restore_user_ns(int real_pid, int ns_id)
>     +{
>     +       int fd, ret = 0;
>     +       UsernsEntry *ue;
>     +
>     +       pr_info("Restoring user namespace for real_pid:%d\n", real_pid);
>     +
>     +       fd = open_image(CR_FD_USERNS, O_RSTR, ns_id);
>     +       if (fd < 0)
>     +               return -1;
>     +
>     +       ret = pb_read_one(fd, &ue, PB_USERNS);
>     +       if (ret < 0)
>     +               return -1;
>     +
>     +       pr_info("userns restoring: n_uid_map:%lu ; n_gid_map:%lu\n",
>     +               ue->n_uid_map, ue->n_gid_map);
>     +
>     +       /* restore uid_map */
>     +       ret = write_map_entries(real_pid, "uid_map", ue->uid_map, ue->
>     n_uid_map);
>     +       if (ret < 0) {
>     +               pr_err("Failed to restore /proc/%d/uid_map\n", real_pid);
>     +               goto out;
>     +       }
>     +
>     +       /* restore gid_map */
>     +       ret = write_map_entries(real_pid, "gid_map", ue->gid_map, ue->
>     n_gid_map);
>     +       if (ret < 0) {
>     +               pr_err("Failed to restore /proc/%d/gid_map", real_pid);
>     +               goto out;
>     +       }
>     +
>     +out:
>     +       userns_entry__free_unpacked(ue, NULL);
>     +
>     +       close_safe(&fd);
>     +       return ret;
>     +}
>     +
>     +struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
>     --
>     2.1.0.rc2.206.gedb03e5
> 
> 
> 


More information about the CRIU mailing list