[CRIU] [PATCH 1/4] Support for dumping/restoring user namespaces

Andrew Vagin avagin at parallels.com
Mon Aug 11 06:17:10 PDT 2014


Hi Sophie,

On Fri, Aug 08, 2014 at 10:21:19PM -0700, Sophie Blee-Goldman wrote:
> Adds basic support for user namespaces by dumping and restoring
> the namespace itself and the uid/gid maps of the root process.

How do you test your patches? The ZDTM test suite can execute tests in
namespaces, but the current version knows nothing about userns. Have you
tried adding userns support to the ZDTM lib?

> 
> Currently depends on a kernel patch to avoid failing on the prctl
> syscall by checking for CAP_SYS_RESOURCE in the user namespace
> instead of in the global one.

It isn't that simple.
Kirill is trying to fix this issue: https://lkml.org/lkml/2014/8/4/570

We have a number of other kernel issues, which are described here:
http://criu.org/UserNamespace

Have you seen my patches for userns?
http://lists.openvz.org/pipermail/criu/2014-February/012399.html

and here is the updated version:
https://github.com/avagin/criu/tree/userns2

I suggest finding the differences between our patch sets and making a new one
that contains the best parts of both.

Thanks,
Andrew.

> 
> Signed-off-by: Sophie Blee-Goldman <ableegoldman at google.com>
> 
> diff --git a/Makefile.crtools b/Makefile.crtools
> index 6033b2c..8e680d6 100644
> --- a/Makefile.crtools
> +++ b/Makefile.crtools
> @@ -34,6 +34,7 @@ obj-y	+= pipes.o
>  obj-y	+= fifo.o
>  obj-y	+= file-ids.o
>  obj-y	+= namespaces.o
> +obj-y	+= user_ns.o
>  obj-y	+= uts_ns.o
>  obj-y	+= ipc_ns.o
>  obj-y	+= netfilter.o
> diff --git a/cr-restore.c b/cr-restore.c
> index 3c36323..3c94b93 100644
> --- a/cr-restore.c
> +++ b/cr-restore.c
> @@ -52,6 +52,7 @@
>  #include "restorer-blob.h"
>  #include "crtools.h"
>  #include "namespaces.h"
> +#include "user_ns.h"
>  #include "mem.h"
>  #include "mount.h"
>  #include "fsnotify.h"
> @@ -1630,6 +1631,12 @@ static int restore_root_task(struct pstree_item *init)
>  	if (ret)
>  		goto out;
>  
> +	if (root_ns_mask & CLONE_NEWUSER) {
> +		ret = restore_user_ns(init->pid.real, init->ids->user_ns_id);
> +		if (ret < 0)
> +			goto out;
> +	}
> +
>  	ret = run_scripts("setup-namespaces");
>  	if (ret)
>  		goto out;
> diff --git a/cr-show.c b/cr-show.c
> index 5549c8d..fd33196 100644
> --- a/cr-show.c
> +++ b/cr-show.c
> @@ -21,6 +21,7 @@
>  #include "util.h"
>  #include "sockets.h"
>  #include "image.h"
> +#include "user_ns.h"
>  #include "uts_ns.h"
>  #include "ipc_ns.h"
>  #include "pstree.h"
> @@ -291,6 +292,7 @@ static struct show_image_info show_infos[] = {
>  	SHOW_VERT(CORE),
>  	SHOW_VERT(IDS),
>  	SHOW_VERT(CREDS),
> +	SHOW_VERT(USERNS),
>  	SHOW_VERT(UTSNS),
>  	SHOW_VERT(IPC_VAR),
>  	SHOW_VERT(FS),
> diff --git a/image-desc.c b/image-desc.c
> index 1e0e3f0..a9859f3 100644
> --- a/image-desc.c
> +++ b/image-desc.c
> @@ -52,6 +52,7 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
>  	FD_ENTRY(POSIX_TIMERS,	"posix-timers-%d"),
>  	FD_ENTRY(CREDS,		"creds-%d"),
>  	FD_ENTRY(UTSNS,		"utsns-%d"),
> +	FD_ENTRY(USERNS,	"userns-%d"),
>  	FD_ENTRY(IPC_VAR,	"ipcns-var-%d"),
>  	FD_ENTRY(IPCNS_SHM,	"ipcns-shm-%d"),
>  	FD_ENTRY(IPCNS_MSG,	"ipcns-msg-%d"),
> diff --git a/include/image-desc.h b/include/image-desc.h
> index eb42990..2db5237 100644
> --- a/include/image-desc.h
> +++ b/include/image-desc.h
> @@ -26,6 +26,7 @@ enum {
>  	/*
>  	 * NS entries
>  	 */
> +	CR_FD_USERNS,
>  	CR_FD_UTSNS,
>  	CR_FD_MNTS,
>  
> diff --git a/include/magic.h b/include/magic.h
> index 5192a60..06db3e3 100644
> --- a/include/magic.h
> +++ b/include/magic.h
> @@ -40,6 +40,7 @@
>  #define ITIMERS_MAGIC		0x57464056 /* Kostroma */
>  #define POSIX_TIMERS_MAGIC	0x52603957 /* Lipetsk */
>  #define SK_QUEUES_MAGIC		0x56264026 /* Suzdal */
> +#define USERNS_MAGIC		0x55474908 /* Kazan */

You may use any town you like ;).

>  #define UTSNS_MAGIC		0x54473203 /* Smolensk */
>  #define CREDS_MAGIC		0x54023547 /* Kozelsk */
>  #define IPC_VAR_MAGIC		0x53115007 /* Samara */
> diff --git a/include/namespaces.h b/include/namespaces.h
> index 350b8b4..bc67519 100644
> --- a/include/namespaces.h
> +++ b/include/namespaces.h
> @@ -34,7 +34,6 @@ extern struct ns_id *ns_ids;
>  extern bool check_ns_proc(struct fd_link *link);
>  
>  extern struct ns_desc pid_ns_desc;
> -extern struct ns_desc user_ns_desc;
>  extern unsigned long root_ns_mask;
>  
>  extern const struct fdtype_ops nsfile_dump_ops;
> diff --git a/include/protobuf-desc.h b/include/protobuf-desc.h
> index 01c9f4c..1c8f9ce 100644
> --- a/include/protobuf-desc.h
> +++ b/include/protobuf-desc.h
> @@ -52,14 +52,15 @@ enum {
>  	PB_IRMAP_CACHE,
>  	PB_CGROUP,
>  	PB_TIMERFD,
> +	PB_USERNS,
>  
>  	/* PB_AUTOGEN_STOP */
>  
>  	PB_PAGEMAP_HEAD,
>  	PB_IDS,
>  	PB_SIGACT,
> -	PB_NETDEV,
> -	PB_REMAP_FPATH,		/* 50 */
> +	PB_NETDEV,		/* 50 */
> +	PB_REMAP_FPATH,
>  	PB_SK_QUEUES,
>  	PB_IPCNS_MSG,
>  	PB_IPCNS_MSG_ENT,
> diff --git a/include/syscall-types.h b/include/syscall-types.h
> index bab3dba..eb270b3 100644
> --- a/include/syscall-types.h
> +++ b/include/syscall-types.h
> @@ -57,7 +57,11 @@ struct itimerspec;
>  #define CLONE_NEWNET	0x40000000
>  #endif
>  
> -#define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS)
> +#ifndef CLONE_NEWUSER
> +#define CLONE_NEWUSER	0x10000000
> +#endif
> +
> +#define CLONE_ALLNS	(CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
>  
>  /* Nested namespaces are supported only for these types */
>  #define CLONE_SUBNS	(CLONE_NEWNS)
> diff --git a/include/user_ns.h b/include/user_ns.h
> new file mode 100644
> index 0000000..715b155
> --- /dev/null
> +++ b/include/user_ns.h
> @@ -0,0 +1,9 @@
> +#ifndef __CR_USER_NS_H__
> +#define __CR_USER_NS_H__
> +
> +extern int dump_user_ns(int ns_pid, int ns_id);
> +extern int restore_user_ns(int real_pid, int ns_id);
> +
> +extern struct ns_desc user_ns_desc;
> +
> +#endif /* __CR_USER_NS_H__ */
> diff --git a/namespaces.c b/namespaces.c
> index 6be030f..8c0d842 100644
> --- a/namespaces.c
> +++ b/namespaces.c
> @@ -9,6 +9,7 @@
>  #include "uts_ns.h"
>  #include "ipc_ns.h"
>  #include "mount.h"
> +#include "user_ns.h"
>  #include "pstree.h"
>  #include "namespaces.h"
>  #include "net.h"
> @@ -271,7 +272,7 @@ struct ns_file_info {
>  static int open_ns_fd(struct file_desc *d)
>  {
>  	struct ns_file_info *nfi = container_of(d, struct ns_file_info, d);
> -	struct pstree_item *item, *t;
> +	struct pstree_item *item = NULL, *t;
>  	struct ns_desc *nd = NULL;
>  	char path[64];
>  	int fd;
> @@ -304,6 +305,10 @@ static int open_ns_fd(struct file_desc *d)
>  			item = t;
>  			nd = &mnt_ns_desc;
>  			break;
> +		} else if (ids->user_ns_id == nfi->nfe->ns_id) {
> +			item = t;
> +			nd = &user_ns_desc;
> +			break;
>  		}
>  	}
>  
> @@ -391,6 +396,13 @@ int dump_task_ns_ids(struct pstree_item *item)
>  		return -1;
>  	}
>  
> +	ids->has_user_ns_id = true;
> +	ids->user_ns_id = get_ns_id(pid, &user_ns_desc);
> +	if (!ids->user_ns_id) {
> +		pr_err("Can't make userns id\n");
> +		return -1;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -446,6 +458,11 @@ static int do_dump_namespaces(struct ns_id *ns)
>  				ns->id, ns->pid);
>  		ret = dump_net_ns(ns->pid, ns->id);
>  		break;
> +	case CLONE_NEWUSER:
> +		pr_info("Dump USER namespace info %d via %d\n",
> +				ns->id, ns->pid);
> +		ret = dump_user_ns(ns->pid, ns->id);
> +		break;
>  	default:
>  		pr_err("Unknown namespace flag %x", ns->nd->cflag);
>  		break;
> @@ -604,9 +621,15 @@ int try_show_namespaces(int ns_pid)
>  		close(fd);
>  	}
>  
> +	fd = open_image(CR_FD_USERNS, O_SHOW, ids->user_ns_id);
> +	if (fd > 0) {
> +		pr_msg("-------------------USERNS---------------------\n");
> +		cr_parse_fd(fd, fdset_template[CR_FD_USERNS].magic);
> +		close(fd);
> +	}
> +
>  	pr_msg("---[ end of %d namespaces ]---\n", ns_pid);
>  	return 0;
>  }
>  
>  struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
> -struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
> diff --git a/protobuf-desc.c b/protobuf-desc.c
> index b97418b..9199b09 100644
> --- a/protobuf-desc.c
> +++ b/protobuf-desc.c
> @@ -38,6 +38,7 @@
>  #include "protobuf/sk-packet.pb-c.h"
>  #include "protobuf/creds.pb-c.h"
>  #include "protobuf/timer.pb-c.h"
> +#include "protobuf/userns.pb-c.h"
>  #include "protobuf/utsns.pb-c.h"
>  #include "protobuf/ipc-var.pb-c.h"
>  #include "protobuf/ipc-shm.pb-c.h"
> diff --git a/protobuf/Makefile b/protobuf/Makefile
> index 7f6485b..cd2b854 100644
> --- a/protobuf/Makefile
> +++ b/protobuf/Makefile
> @@ -50,6 +50,7 @@ proto-obj-y	+= ipc-shm.o
>  proto-obj-y	+= ipc-msg.o
>  proto-obj-y	+= ipc-sem.o
>  proto-obj-y	+= utsns.o
> +proto-obj-y	+= userns.o
>  proto-obj-y	+= creds.o
>  proto-obj-y	+= vma.o
>  proto-obj-y	+= netdev.o
> diff --git a/protobuf/core.proto b/protobuf/core.proto
> index d850e2e..8810376 100644
> --- a/protobuf/core.proto
> +++ b/protobuf/core.proto
> @@ -32,6 +32,7 @@ message task_kobj_ids_entry {
>  	optional uint32			ipc_ns_id	= 7;
>  	optional uint32			uts_ns_id	= 8;
>  	optional uint32			mnt_ns_id	= 9;
> +	optional uint32			user_ns_id	= 10;
>  }
>  
>  message thread_sas_entry {
> diff --git a/protobuf/userns.proto b/protobuf/userns.proto
> new file mode 100644
> index 0000000..31d7718
> --- /dev/null
> +++ b/protobuf/userns.proto
> @@ -0,0 +1,9 @@
> +message userns_entry {
> +	message map_entry {
> +		required uint32 id_in	= 1;
> +		required uint32 id_out	= 2;
> +		required uint32 length	= 3;
> +	}
> +	repeated map_entry uid_map = 1;
> +	repeated map_entry gid_map = 2;
> +}
> \ No newline at end of file
> diff --git a/pstree.c b/pstree.c
> index d005b64..c905317 100644
> --- a/pstree.c
> +++ b/pstree.c
> @@ -603,6 +603,8 @@ static unsigned long get_clone_mask(TaskKobjIdsEntry *i,
>  		mask |= CLONE_NEWUTS;
>  	if (i->mnt_ns_id != p->mnt_ns_id)
>  		mask |= CLONE_NEWNS;
> +	if (i->user_ns_id != p->user_ns_id)
> +		mask |= CLONE_NEWUSER;
>  
>  	return mask;
>  }
> diff --git a/user_ns.c b/user_ns.c
> new file mode 100644
> index 0000000..ef92f2d
> --- /dev/null
> +++ b/user_ns.c
> @@ -0,0 +1,228 @@
> +#include <unistd.h>
> +
> +#include "namespaces.h"
> +#include "user_ns.h"
> +#include "list.h"
> +
> +#include "protobuf.h"
> +#include "protobuf/userns.pb-c.h"
> +
> +struct map_entry {
> +	UsernsEntry__MapEntry   entry;
> +	struct list_head	list;
> +};
> +
> +static void cleanup(int *fd, struct list_head *uid_list,
> +		    struct list_head *gid_list)
> +{
> +	struct map_entry *pos, *tmp;
> +
> +	/* free uid entries */
> +	list_for_each_entry_safe(pos, tmp, uid_list, list) {
> +		list_del(&pos->list);
> +		xfree(pos);
> +	}
> +
> +	/* free gid entries */
> +	list_for_each_entry_safe(pos, tmp, gid_list, list) {
> +		list_del(&pos->list);
> +		xfree(pos);
> +	}
> +
> +	close_safe(fd);
> +}
> +
> +static void fill_map(int n_entries, UsernsEntry__MapEntry **map,
> +		     struct list_head *head)
> +{
> +	struct map_entry *tmp;
> +
> +	list_for_each_entry(tmp, head, list) {
> +		map[--n_entries] = &(tmp->entry);
> +	}
> +}
> +
> +static int write_pb(int fd, int n_uid_entries, int n_gid_entries,
> +		    struct list_head *uid_list, struct list_head *gid_list)
> +{
> +	UsernsEntry ue = USERNS_ENTRY__INIT;
> +
> +	UsernsEntry__MapEntry *uidmap[n_uid_entries];
> +	fill_map(n_uid_entries, uidmap, uid_list);
> +	ue.uid_map = uidmap;
> +	ue.n_uid_map = n_uid_entries;
> +
> +	UsernsEntry__MapEntry *gidmap[n_gid_entries];
> +	fill_map(n_gid_entries, gidmap, gid_list);
> +	ue.gid_map = gidmap;
> +	ue.n_gid_map = n_gid_entries;
> +
> +	return pb_write_one(fd, &ue, PB_USERNS);
> +}
> +
> +static int read_map_entries(const char *map_fname, struct list_head *head)
> +{
> +	int n_read, n_entries = 0;
> +	FILE *fp;
> +	struct map_entry *tmp;
> +	UsernsEntry__MapEntry entry = USERNS_ENTRY__MAP_ENTRY__INIT;
> +
> +	pr_debug("Reading entries from %s\n", map_fname);
> +
> +	fp = fopen(map_fname, "r");
> +	if (!fp) {
> +		pr_perror("Error opening %s\n", map_fname);
> +		return -1;
> +	}
> +
> +	while ((n_read = fscanf(fp, "%u %u %u\n", &entry.id_in,
> +				&entry.id_out, &entry.length)) != EOF) {
> +		if (n_read != 3) {
> +			pr_perror("Error reading %s, fscanf returned %d",
> +				  map_fname, n_read);
> +			fclose(fp);
> +			return -1;
> +		}
> +
> +		tmp = (struct map_entry *)xmalloc(sizeof(struct map_entry));
> +		if (!tmp) {
> +			fclose(fp);
> +			return -1;
> +		}
> +
> +		memcpy(&tmp->entry, &entry, sizeof(UsernsEntry__MapEntry));
> +		list_add(&(tmp->list), head);
> +		n_entries++;
> +	}
> +
> +	if (fclose(fp) != 0) {
> +		pr_perror("fclose(%s) failed", map_fname);
> +		return -1;
> +	}
> +
> +	return n_entries;
> +}
> +
> +static int write_map_entries(const char *map_fname,
> +                             UsernsEntry__MapEntry **map,
> +                             size_t n_entries)
> +{
> +	int i = 0, bytes_written;
> +	FILE *fp;
> +
> +	pr_debug("Writing entries to %s, n_entries=%lu\n",
> +                 map_fname, n_entries);
> +
> +	fp = fopen(map_fname, "w");
> +	if (!fp) {
> +		pr_perror("Unable to open %s\n", map_fname);
> +		return -1;
> +	}
> +
> +	while (i < n_entries) {
> +		bytes_written = fprintf(fp, "%u %u %u\n",
> +					map[i]->id_in,
> +					map[i]->id_out,
> +					map[i]->length);

Each {u,g}id_map file can be written only once. If you use fprintf, you
don't know how many times write() will be called, so the whole map has to
be written with a single write() call.

kernel/user_namespace.c:
static ssize_t map_write(struct file *file, const char __user *buf,
...
        ret = -EPERM;
        /* Only allow one successful write to the map */
        if (map->nr_extents != 0)
                goto out;

...

> +		if (bytes_written < 0) {
> +			pr_err("fprintf to %s failed.\n", map_fname);
> +			fclose(fp);
> +			return -1;
> +		}
> +		++i;
> +	}
> +
> +	if (fclose(fp) != 0) {
> +		pr_perror("fclose(%s) failed", map_fname);
> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +int dump_user_ns(int ns_pid, int ns_id)
> +{
> +	int fd, ret, n_uid_entries, n_gid_entries;
> +	char map_fname[PATH_MAX];
> +
> +	LIST_HEAD(uid_list);

Why do we need this list? Can we write the uid maps into the image without
first collecting them in a list?

> +	LIST_HEAD(gid_list);
> +
> +	fd = open_image(CR_FD_USERNS, O_DUMP, ns_id);
> +	if (fd < 0) {
> +		pr_err("Error opening userns image");
> +		return -1;
> +	}
> +
> +	/* read uid map */
> +	sprintf(map_fname, "/proc/%d/uid_map", ns_pid);

Can we use fopen_proc here?

> +	n_uid_entries = read_map_entries(map_fname, &uid_list);
> +	if (n_uid_entries < 0) {
> +		pr_err("Error reading uid_map\n");
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	/* read gid map */
> +	sprintf(map_fname, "/proc/%d/gid_map", ns_pid);
> +	n_gid_entries = read_map_entries(map_fname, &gid_list);
> +	if (n_gid_entries < 0) {
> +		pr_err("Error reading gid_map\n");
> +		ret = -1;
> +		goto out;
> +	}
> +
> +	ret = write_pb(fd, n_uid_entries, n_gid_entries, &uid_list, &gid_list);
> +
> +out:
> +	cleanup(&fd, &uid_list, &gid_list);
> +	return ret;
> +}
> +
> +/*
> + * Restore uid_map and gid_map file for the init process. Since this is called
> + * from the parent, we access these files using the 'real_pid' of the process.
> + */
> +int restore_user_ns(int real_pid, int ns_id)
> +{
> +	int fd, ret = 0;
> +	UsernsEntry *ue;
> +	char map_fname[PATH_MAX];
> +
> +	pr_info("Restoring user namespace for real_pid:%d\n", real_pid);
> +
> +	fd = open_image(CR_FD_USERNS, O_RSTR, ns_id);
> +	if (fd < 0)
> +		return -1;
> +
> +	ret = pb_read_one(fd, &ue, PB_USERNS);
> +	if (ret < 0)
> +		return -1;
> +
> +	pr_info("userns restoring: n_uid_map:%lu ; n_gid_map:%lu\n",
> +		ue->n_uid_map, ue->n_gid_map);
> +
> +	/* restore uid_map */
> +	sprintf(map_fname, "/proc/%d/uid_map", real_pid);
> +	ret = write_map_entries(map_fname, ue->uid_map, ue->n_uid_map);
> +	if (ret < 0) {
> +		pr_err("Failed to restore %s\n", map_fname);
> +		goto out;
> +	}
> +
> +	/* restore gid_map */
> +	sprintf(map_fname, "/proc/%d/gid_map", real_pid);
> +	ret = write_map_entries(map_fname, ue->gid_map, ue->n_gid_map);
> +	if (ret < 0) {
> +		pr_err("Failed to restore %s\n", map_fname);
> +		goto out;
> +	}
> +
> +out:
> +	userns_entry__free_unpacked(ue, NULL);
> +
> +	close_safe(&fd);
> +	return ret;
> +}
> +
> +struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
> -- 
> 2.0.0.526.g5318336
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


More information about the CRIU mailing list