[CRIU] Fwd: [PATCH 4/5] cr: Task creds support

Pavel Emelyanov xemul at openvz.org
Fri Jan 27 12:46:02 EST 2012


Resending from my @openvz.org account. The criu list still pre-moderates
posts from us @parallels.com guys :)

-------- Original Message --------
Subject: [PATCH 4/5] cr: Task creds support
Date: Fri, 27 Jan 2012 21:43:32 +0400
From: Pavel Emelyanov <xemul at parallels.com>
To: Pavel Emelyanov <xemul at openvz.org>
CC: Cyrill Gorcunov <gorcunov at openvz.org>,  "criu at openvz.org" <criu at openvz.org>

Dumping is simple. Everything but secbits can be read from /proc; secbits
are obtained via the parasite.

Restoring is a bit tricky -- when you change anything in the kernel's
cred struct, the kernel performs sophisticated checks and can change
more than was requested, so the creds restoration procedure
is carefully commented step by step.

Another thing to mention is that creds are restored after everything
else, i.e. right before performing the final threads sync and sigreturns.
This is done to avoid potential problems with insufficient caps for
restoring other stuff (e.g. CAP_DAC_OVERRIDE or a zero euid is most
likely required for opening any image file and the notorious
/proc/sys/kernel/ns_last_pid control file, and the latter is used right
up to the very last moment).

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>

---

-------------- next part --------------
 cr-dump.c          |   48 ++++++++++++++++++++++++++++
 cr-restore.c       |   21 ++++++++++++
 cr-show.c          |   38 ++++++++++++++++++++++
 crtools.c          |    6 +++
 include/crtools.h  |    5 ++-
 include/image.h    |   21 ++++++++++++
 include/parasite.h |    1 +
 include/restorer.h |    2 +
 parasite.c         |    2 +
 restorer.c         |   87 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 230 insertions(+), 1 deletions(-)

diff --git a/cr-dump.c b/cr-dump.c
index df12fd2..dbae3d6 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -432,6 +432,48 @@ err:
 	return ret;
 }
 
+static int dump_task_creds(pid_t pid, int pid_dir,
+		struct parasite_dump_misc *misc, struct cr_fdset *fds)
+{
+	int ret, i;
+	struct proc_status_creds cr;
+	struct creds_entry ce;
+	/* Write @pid's ids, caps and secbits to the creds image; <0 on error */
+	pr_info("\n");
+	pr_info("Dumping creds for %d\n", pid);
+	pr_info("----------------------------------------\n");
+
+	ret = parse_pid_status(pid_dir, &cr);
+	if (ret < 0)
+		return ret;
+	/* parsed id arrays are indexed 0..3: plain, effective, saved, fs */
+	ce.uid   = cr.uids[0];
+	ce.gid   = cr.gids[0];
+	ce.euid  = cr.uids[1];
+	ce.egid  = cr.gids[1];
+	ce.suid  = cr.uids[2];
+	ce.sgid  = cr.gids[2];
+	ce.fsuid = cr.uids[3];
+	ce.fsgid = cr.gids[3];
+
+	BUILD_BUG_ON(CR_CAP_SIZE != PROC_CAP_SIZE);
+
+	for (i = 0; i < CR_CAP_SIZE; i++) {
+		ce.cap_inh[i] = cr.cap_inh[i];
+		ce.cap_prm[i] = cr.cap_prm[i];
+		ce.cap_eff[i] = cr.cap_eff[i];
+		ce.cap_bnd[i] = cr.cap_bnd[i];
+	}
+	/* secbits are not part of the parsed status; take them from @misc */
+	ce.secbits = misc->secbits;
+
+	ret = write_img(fds->fds[CR_FD_CREDS], &ce);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
 #define assign_reg(dst, src, e)		dst.e = (__typeof__(dst.e))src.e
 #define assign_array(dst, src, e)	memcpy(&dst.e, &src.e, sizeof(dst.e))
 
@@ -1249,6 +1291,12 @@ static int dump_one_task(struct pstree_item *item, struct cr_fdset *cr_fdset)
 		goto err;
 	}
 
+	ret = dump_task_creds(pid, pid_dir, &misc, cr_fdset);
+	if (ret) {
+		pr_err("Dump creds (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
 	ret = finalize_core(pid, &vma_area_list, cr_fdset);
 	if (ret) {
 		pr_err("Finalizing core (pid: %d) failed with %d\n", pid, ret);
diff --git a/cr-restore.c b/cr-restore.c
index e75b605..5f98c16 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -1619,6 +1619,23 @@ static int prepare_itimers(int pid, struct task_restore_core_args *args)
 	return ret;
 }
 
+static int prepare_creds(int pid, struct task_restore_core_args *args)
+{
+	/* Read the creds image of @pid into args->creds; 0 on success. */
+	int nread, img = open_image_ro(CR_FD_CREDS, pid);
+
+	if (img < 0)
+		return img;
+
+	nread = read_img(img, &args->creds);
+	close(img);
+
+	/* XXX -- validate creds here? */
+	if (nread > 0)
+		return 0;
+	return -1;
+}
+
 static void sigreturn_restore(pid_t pstree_pid, pid_t pid)
 {
 	long restore_code_len, restore_task_vma_len;
@@ -1841,6 +1858,10 @@ static void sigreturn_restore(pid_t pstree_pid, pid_t pid)
 	if (ret < 0)
 		goto err;
 
+	ret = prepare_creds(pid, task_args);
+	if (ret < 0)
+		goto err;
+
 	cr_mutex_init(&task_args->rst_lock);
 
 	if (pstree_entry.nr_threads) {
diff --git a/cr-show.c b/cr-show.c
index c35a4a2..c012e69 100644
--- a/cr-show.c
+++ b/cr-show.c
@@ -249,6 +249,39 @@ out:
 	pr_img_tail(CR_FD_ITIMERS);
 }
 
+static void show_cap(char *name, u32 *v)
+{
+	int word = CR_CAP_SIZE;
+
+	pr_info("%s: ", name);	/* label, then the words from last to first */
+	while (word-- > 0)
+		pr_info("%08x", v[word]);
+	pr_info("\n");
+}
+
+static void show_creds(int fd)
+{
+	/* Print one creds entry read from @fd between the image head/tail. */
+	struct creds_entry creds;
+
+	pr_img_head(CR_FD_CREDS);
+	if (read_img(fd, &creds) >= 0) {
+		pr_info("uid %u  euid %u  suid %u  fsuid %u\n",
+			creds.uid, creds.euid, creds.suid, creds.fsuid);
+		pr_info("gid %u  egid %u  sgid %u  fsgid %u\n",
+			creds.gid, creds.egid, creds.sgid, creds.fsgid);
+
+		show_cap("Inh", creds.cap_inh);
+		show_cap("Eff", creds.cap_eff);
+		show_cap("Prm", creds.cap_prm);
+		show_cap("Bnd", creds.cap_bnd);
+
+		pr_info("secbits: %x\n", creds.secbits);
+	}
+
+	pr_img_tail(CR_FD_CREDS);
+}
+
 static int show_pstree(int fd_pstree, struct list_head *collect)
 {
 	struct pstree_entry e;
@@ -477,6 +510,9 @@ static int cr_parse_file(struct cr_options *opts)
 	case UTSNS_MAGIC:
 		show_utsns(fd);
 		break;
+	case CREDS_MAGIC:
+		show_creds(fd);
+		break;
 	default:
 		pr_err("Unknown magic %x on %s\n", magic, opts->show_dump_file);
 		goto err;
@@ -556,6 +592,8 @@ static int cr_show_all(unsigned long pid, struct cr_options *opts)
 
 		show_itimers(cr_fdset->fds[CR_FD_ITIMERS]);
 
+		show_creds(cr_fdset->fds[CR_FD_CREDS]);
+
 		close_cr_fdset(&cr_fdset);
 
 		if (opts->leader_only)
diff --git a/crtools.c b/crtools.c
index 964bc1f..cde561b 100644
--- a/crtools.c
+++ b/crtools.c
@@ -98,6 +98,12 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
 		.magic	= ITIMERS_MAGIC,
 	},
 
+	/* creds */
+	[CR_FD_CREDS] = {
+		.fmt	= FMT_FNAME_CREDS,
+		.magic	= CREDS_MAGIC,
+	},
+
 	/* UTS namespace */
 	[CR_FD_UTSNS] = {
 		.fmt	= FMT_FNAME_UTSNS,
diff --git a/include/crtools.h b/include/crtools.h
index 75e5d10..dd66e67 100644
--- a/include/crtools.h
+++ b/include/crtools.h
@@ -29,6 +29,7 @@ enum {
 	CR_FD_UNIXSK,
 	CR_FD_INETSK,
 	CR_FD_ITIMERS,
+	CR_FD_CREDS,
 
 	/*
 	 * Global entries
@@ -76,6 +77,7 @@ extern struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX];
 #define FMT_FNAME_UNIXSK	"unixsk-%d.img"
 #define FMT_FNAME_INETSK	"inetsk-%d.img"
 #define FMT_FNAME_ITIMERS	"itimers-%d.img"
+#define FMT_FNAME_CREDS		"creds-%d.img"
 #define FMT_FNAME_UTSNS		"utsns-%d.img"
 
 extern int get_image_path(char *path, int size, const char *fmt, int pid);
@@ -104,7 +106,8 @@ struct cr_fdset {
 	CR_FD_DESC_USE(CR_FD_SIGACT)		|\
 	CR_FD_DESC_USE(CR_FD_UNIXSK)		|\
 	CR_FD_DESC_USE(CR_FD_INETSK)		|\
-	CR_FD_DESC_USE(CR_FD_ITIMERS)		)
+	CR_FD_DESC_USE(CR_FD_ITIMERS)		|\
+	CR_FD_DESC_USE(CR_FD_CREDS)		)
 #define CR_FD_DESC_NS				(\
 	CR_FD_DESC_USE(CR_FD_UTSNS)		)
 #define CR_FD_DESC_NONE			(0)
diff --git a/include/image.h b/include/image.h
index a7a998a..2f11167 100644
--- a/include/image.h
+++ b/include/image.h
@@ -20,6 +20,7 @@
 #define INETSK_MAGIC	0x56443851 /* Pereslavl */
 #define ITIMERS_MAGIC	0x57464056 /* Kostroma */
 #define UTSNS_MAGIC	0x54473203 /* Smolensk */
+#define CREDS_MAGIC	0x54023547 /* Kozelsk */
 
 #define PIPEFS_MAGIC	0x50495045
 
@@ -140,6 +141,26 @@ struct itimer_entry {
 	u64		vusec;
 } __packed;
 
+#define CR_CAP_SIZE	2	/* u32 words per capability set in the image */
+
+struct creds_entry {	/* per-task credentials record of the creds image */
+	u32	uid;	/* first arg of setresuid on restore */
+	u32	gid;	/* first arg of setresgid on restore */
+	u32	euid;	/* second arg of setresuid */
+	u32	egid;	/* second arg of setresgid */
+	u32	suid;	/* third arg of setresuid */
+	u32	sgid;	/* third arg of setresgid */
+	u32	fsuid;	/* applied with setfsuid */
+	u32	fsgid;	/* applied with setfsgid */
+
+	u32	cap_inh[CR_CAP_SIZE];	/* goes to cap_data.inh */
+	u32	cap_prm[CR_CAP_SIZE];	/* goes to cap_data.prm */
+	u32	cap_eff[CR_CAP_SIZE];	/* goes to cap_data.eff */
+	u32	cap_bnd[CR_CAP_SIZE];	/* clear bits get PR_CAPBSET_DROP */
+
+	u32	secbits;	/* value for PR_SET_SECUREBITS */
+} __packed;
+
 #define HEADER_VERSION		1
 #define HEADER_ARCH_X86_64	1
 
diff --git a/include/parasite.h b/include/parasite.h
index dc124a7..0fa6fdc 100644
--- a/include/parasite.h
+++ b/include/parasite.h
@@ -81,6 +81,7 @@ struct parasite_dump_pages_args {
 
 struct parasite_dump_misc {
 	parasite_status_t	status;
+	unsigned int		secbits;	/* PR_GET_SECUREBITS result set by dump_misc() */
 };
 
 /*
diff --git a/include/restorer.h b/include/restorer.h
index 371e7b6..7e4fa55 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -79,6 +79,8 @@ struct task_restore_core_args {
 	rt_sigaction_t			sigchld_act;
 
 	struct itimerval		itimers[3];
+
+	struct creds_entry		creds;
 } __aligned(sizeof(long));
 
 struct pt_regs {
diff --git a/parasite.c b/parasite.c
index 583604b..e3eb254 100644
--- a/parasite.c
+++ b/parasite.c
@@ -352,6 +352,8 @@ static int dump_misc(struct parasite_dump_misc *args)
 {
 	parasite_status_t *st = &args->status;
 
+	args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
+
 	SET_PARASITE_STATUS(st, 0, 0);
 	return 0;
 }
diff --git a/restorer.c b/restorer.c
index 8ef2406..a6ce16a 100644
--- a/restorer.c
+++ b/restorer.c
@@ -41,6 +41,78 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
 	sys_exit(1);
 }
 
+static void restore_creds(struct creds_entry *ce)
+{
+	int b, i;
+	struct cap_header hdr;
+	struct cap_data data[_LINUX_CAPABILITY_U32S_3];
+
+	/*
+	 * Apply @ce to the caller (NULL is a no-op). We're still root here
+	 * and thus can do it without failures, so results go unchecked.
+	 */
+
+	if (ce == NULL)
+		return;
+
+	/*
+	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
+	 * lose caps bits when changing xids.
+	 */
+
+	sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
+
+	/*
+	 * Second -- restore xids. Since we still have the CAP_SETUID
+	 * capability nothing should fail. But call the setfsXid last
+	 * to override the setresXid settings.
+	 */
+
+	sys_setresuid(ce->uid, ce->euid, ce->suid);
+	sys_setfsuid(ce->fsuid);
+	sys_setresgid(ce->gid, ce->egid, ce->sgid);
+	sys_setfsgid(ce->fsgid);
+
+	/*
+	 * Third -- restore securebits. We don't need them in any
+	 * special state any longer.
+	 */
+
+	sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
+
+	/*
+	 * Fourth -- trim bset. This can only be done while having the
+	 * CAP_SETPCAP capability. The mask is unsigned: 1 << 31 overflows int.
+	 */
+
+	for (b = 0; b < CR_CAP_SIZE; b++) {
+		for (i = 0; i < 32; i++) {
+			if (ce->cap_bnd[b] & (1U << i))
+				/* bit is set in the image -- keep it */
+				continue;
+
+			sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
+		}
+	}
+
+	/*
+	 * Fifth -- restore caps. Nothing but cap bits are changed
+	 * at this stage, so just do it.
+	 */
+
+	hdr.version = _LINUX_CAPABILITY_VERSION_3;
+	hdr.pid = 0;
+
+	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
+
+	for (i = 0; i < CR_CAP_SIZE; i++) {
+		data[i].eff = ce->cap_eff[i];
+		data[i].prm = ce->cap_prm[i];
+		data[i].inh = ce->cap_inh[i];
+	}
+
+	sys_capset(&hdr, data);
+}
+
 /*
  * Threads restoration via sigreturn. Note it's locked
  * routine and calls for unlock at the end.
@@ -117,6 +189,14 @@ long restore_thread(struct thread_restore_args *args)
 
 		cr_mutex_unlock(args->rst_lock);
 
+		/*
+		 * FIXME -- threads do not share creds, but it looks like
+		 * nobody tries to mess with this crap. That said we should
+		 * pass the master thread creds here
+		 */
+
+		restore_creds(NULL);
+
 		new_sp = (long)rt_sigframe + 8;
 		asm volatile(
 			"movq %0, %%rax					\n"
@@ -537,6 +617,13 @@ long restore_task(struct task_restore_core_args *args)
 		sys_close(fd);
 	}
 
+	/*
+	 * Restore creds late to avoid potential problems with
+	 * insufficient caps for restoring this or that before
+	 */
+
+	restore_creds(&args->creds);
+
 	task_entry = task_get_entry(args->task_entries, my_pid);
 
 	cr_wait_dec(&args->task_entries->nr_in_progress);
-- 
1.6.5.2


More information about the CRIU mailing list