[CRIU] [PATCH 8/9] creds: restore -- Implement per-thread restore of credentials

Cyrill Gorcunov gorcunov at openvz.org
Thu Dec 17 01:14:16 PST 2015


Because the creds parameters are to be passed into the pie/restorer
code but are read before the thread_restore_args and task_restore_args
structures are allocated, we need a small trick: prepare the
creds in several stages

 - collect all creds data into separate private memory blobs
 - once all memory needed for the restorer is allocated, we relocate
   the pointers in these blobs and set
   thread_restore_args::creds_args to the appropriate
   address
 - the restorer works as usual and sets up creds parameters as before

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 cr-restore.c       | 297 ++++++++++++++++++++++++++++++++---------------------
 include/restorer.h |  29 ++++--
 pie/restorer.c     |  33 ++++--
 3 files changed, 224 insertions(+), 135 deletions(-)

diff --git a/cr-restore.c b/cr-restore.c
index aade3bc0c6a1..bd77eb3744cb 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -2460,73 +2460,6 @@ static inline int verify_cap_size(CredsEntry *ce)
 		(ce->n_cap_prm == CR_CAP_SIZE) && (ce->n_cap_bnd == CR_CAP_SIZE));
 }
 
-static CredsEntry *read_creds(int pid)
-{
-	int ret;
-	struct cr_img *img;
-	CredsEntry *ce = NULL;
-
-	img = open_image(CR_FD_CREDS, O_RSTR, pid);
-	if (!img)
-		return NULL;
-
-	ret = pb_read_one(img, &ce, PB_CREDS);
-	close_image(img);
-
-	if (ret < 0) {
-		creds_entry__free_unpacked(ce, NULL);
-		return NULL;
-	}
-
-	if (!verify_cap_size(ce)) {
-		pr_err("Caps size mismatch %d %d %d %d\n",
-		       (int)ce->n_cap_inh, (int)ce->n_cap_eff,
-		       (int)ce->n_cap_prm, (int)ce->n_cap_bnd);
-		creds_entry__free_unpacked(ce, NULL);
-		return NULL;
-	}
-
-	if (!may_restore(ce)) {
-		creds_entry__free_unpacked(ce, NULL);
-		return NULL;
-	}
-
-	return ce;
-}
-
-static int prepare_creds(CredsEntry *ce, struct task_restore_args *args)
-{
-	args->creds = *ce;
-	args->creds.cap_inh = args->cap_inh;
-	memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh));
-	args->creds.cap_eff = args->cap_eff;
-	memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff));
-	args->creds.cap_prm = args->cap_prm;
-	memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm));
-	args->creds.cap_bnd = args->cap_bnd;
-	memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd));
-
-	/*
-	 * We can set supplementary groups here. This won't affect any
-	 * permission checks for us (we're still root) and will not be
-	 * reset by subsequent creds changes in restorer.
-	 */
-
-	BUILD_BUG_ON(sizeof(*ce->groups) != sizeof(gid_t));
-	if (setgroups(ce->n_groups, ce->groups) < 0) {
-		pr_perror("Can't set supplementary groups");
-		return -1;
-	}
-
-	creds_entry__free_unpacked(ce, NULL);
-
-	args->cap_last_cap = kdat.last_cap;
-
-	/* XXX -- validate creds here? */
-
-	return 0;
-}
-
 static int prepare_mm(pid_t pid, struct task_restore_args *args)
 {
 	int exe_fd, i, ret = -1;
@@ -2823,6 +2756,175 @@ out:
 extern void __gcov_flush(void) __attribute__((weak));
 void __gcov_flush(void) {}
 
+/* Hook the creds blob at *@creds_pos_next into @thread_args and step on. */
+static void rst_reloc_creds(struct thread_restore_args *thread_args,
+			    unsigned long *creds_pos_next)
+{
+	unsigned long pos = *creds_pos_next;
+	struct thread_creds_args *ca;
+
+	if (unlikely(pos == 0))	/* 0: no (more) creds blobs to attach */
+		return;
+
+	ca = rst_mem_remap_ptr(pos, RM_PRIVATE);
+	thread_args->creds_args = ca;
+	*creds_pos_next = ca->mem_pos_next;
+	if (ca->lsm_profile)	/* stale pointer, used only as a "present" flag */
+		ca->lsm_profile = rst_mem_remap_ptr(ca->mem_lsm_profile_pos, RM_PRIVATE);
+	if (ca->groups)
+		ca->groups = rst_mem_remap_ptr(ca->mem_groups_pos, RM_PRIVATE);
+}
+
+/*
+ * Pack @ce into a new thread_creds_args blob in RM_PRIVATE memory and
+ * make @prev (if any) point to it. Returns the blob or an ERR_PTR().
+ */
+static struct thread_creds_args *
+rst_prep_creds_args(struct thread_creds_args *prev, CredsEntry *ce)
+{
+	unsigned long this_pos = rst_mem_cpos(RM_PRIVATE);
+	struct thread_creds_args *args;
+
+	if (!verify_cap_size(ce)) {
+		pr_err("Caps size mismatch %d %d %d %d\n",
+		       (int)ce->n_cap_inh, (int)ce->n_cap_eff,
+		       (int)ce->n_cap_prm, (int)ce->n_cap_bnd);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!may_restore(ce))
+		return ERR_PTR(-EINVAL);
+
+	args = rst_mem_alloc(sizeof(*args), RM_PRIVATE);
+	if (!args)
+		return ERR_PTR(-ENOMEM);
+
+	args->cap_last_cap = kdat.last_cap;
+	memcpy(&args->creds, ce, sizeof(args->creds));
+	/* Defaults, so a supplied-but-empty LSM profile leaves nothing unset. */
+	args->lsm_profile = NULL;
+	args->mem_lsm_profile_pos = 0;
+	args->groups = NULL;
+	args->mem_groups_pos = 0;
+	args->mem_pos_next = 0;
+
+	if (ce->lsm_profile || opts.lsm_supplied) {
+		char *rendered, *profile;
+
+		profile = ce->lsm_profile;
+		if (opts.lsm_supplied)
+			profile = opts.lsm_profile;
+
+		if (validate_lsm(profile) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (profile) {
+			size_t lsm_profile_len;
+
+			if (render_lsm_profile(profile, &rendered))
+				return ERR_PTR(-EINVAL);
+
+			args->mem_lsm_profile_pos = rst_mem_cpos(RM_PRIVATE);
+			lsm_profile_len = strlen(rendered);
+			args->lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
+			if (!args->lsm_profile) {
+				xfree(rendered);
+				return ERR_PTR(-ENOMEM);
+			}
+			/* Copy the NUL as well, strncpy(len) left it out. */
+			memcpy(args->lsm_profile, rendered, lsm_profile_len + 1);
+			xfree(rendered);
+		}
+	}
+
+	/* Zap fields which we cant use. */
+	args->creds.cap_inh = NULL;
+	args->creds.cap_eff = NULL;
+	args->creds.cap_prm = NULL;
+	args->creds.cap_bnd = NULL;
+	args->creds.groups = NULL;
+	args->creds.lsm_profile = NULL;
+
+	memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh));
+	memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff));
+	memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm));
+	memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd));
+
+	if (ce->n_groups) {
+		args->mem_groups_pos = rst_mem_cpos(RM_PRIVATE);
+		args->groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE);
+		if (!args->groups)
+			return ERR_PTR(-ENOMEM);
+		memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32));
+	}
+
+	if (prev)
+		prev->mem_pos_next = this_pos;
+	return args;
+}
+
+/*
+ * Old image format: read the task's only CredsEntry from its own image and
+ * pack it; -ENOENT if the image can't be opened, else 0 or negative error.
+ */
+static int rst_prep_creds_from_img(pid_t pid)
+{
+	struct cr_img *creds_img = open_image(CR_FD_CREDS, O_RSTR, pid);
+	struct thread_creds_args *blob;
+	CredsEntry *entry = NULL;
+	int res;
+
+	if (!creds_img)
+		return -ENOENT;
+
+	res = pb_read_one(creds_img, &entry, PB_CREDS);
+	close_image(creds_img);
+
+	if (res > 0) {
+		/* Single entry: nothing to link it to, hence NULL @prev. */
+		blob = rst_prep_creds_args(NULL, entry);
+		res = IS_ERR(blob) ? PTR_ERR(blob) : 0;
+	}
+	creds_entry__free_unpacked(entry, NULL);
+	return res;
+}
+
+/*
+ * Build the chain of per-thread creds blobs in RM_PRIVATE memory; the
+ * position of the first one goes to @creds_pos (0 if there are none).
+ * Returns 0 on success, a negative value on failure.
+ */
+static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
+{
+	struct thread_creds_args *args = NULL;
+	size_t i;
+
+	/* Very old image format: no @thread_core, hence no creds either. */
+	if (unlikely(!core->thread_core)) {
+		*creds_pos = 0;
+		return 0;
+	}
+
+	*creds_pos = rst_mem_cpos(RM_PRIVATE);
+
+	/* Old format: one Creds per task carried in own image file. */
+	if (!core->thread_core->creds)
+		return rst_prep_creds_from_img(pid);
+
+	/*
+	 * @args lives across iterations so each blob gets linked to its
+	 * predecessor. NOTE(review): confirm rst_mem_alloc() can't move it.
+	 */
+	for (i = 0; i < current->nr_threads; i++) {
+		args = rst_prep_creds_args(args,
+				current->core[i]->thread_core->creds);
+		if (IS_ERR(args))
+			return PTR_ERR(args);
+	}
+
+	return 0;
+}
+
 static int sigreturn_restore(pid_t pid, CoreEntry *core)
 {
 	void *mem = MAP_FAILED;
@@ -2850,10 +2952,6 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	unsigned long aio_rings;
 	MmEntry *mm = rsti(current)->mm;
 
-	char *lsm = NULL;
-	int lsm_profile_len = 0;
-	unsigned long lsm_pos = 0;
-
 	int n_seccomp_filters = 0;
 	unsigned long seccomp_filter_pos = 0;
 
@@ -2861,7 +2959,7 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	struct vm_area_list *vmas = &rsti(current)->vmas;
 	int i;
 
-	CredsEntry *creds;
+	unsigned long creds_pos = 0;
 
 	pr_info("Restore via sigreturn\n");
 
@@ -2925,6 +3023,13 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 		goto err_nv;
 
 	/*
+	 * Read creds info for every thread and allocate memory
+	 * needed so we can use this data inside restorer.
+	 */
+	if (rst_prep_creds(pid, core, &creds_pos))
+		goto err_nv;
+
+	/*
 	 * We're about to search for free VM area and inject the restorer blob
 	 * into it. No irrelevent mmaps/mremaps beyond this point, otherwise
 	 * this unwanted mapping might get overlapped by the restorer.
@@ -2934,45 +3039,9 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	if (ret < 0)
 		goto err;
 
-	creds = read_creds(pid);
-	if (!creds)
-		goto err;
-
-	if (creds->lsm_profile || opts.lsm_supplied) {
-		char *rendered, *profile;
-		int ret;
-
-		profile = creds->lsm_profile;
-		if (opts.lsm_supplied)
-			profile = opts.lsm_profile;
-
-		if (validate_lsm(profile) < 0)
-			return -1;
-
-		if (profile) {
-			ret = render_lsm_profile(profile, &rendered);
-			if (ret < 0) {
-				goto err_nv;
-			}
-
-			lsm_pos = rst_mem_cpos(RM_PRIVATE);
-			lsm_profile_len = strlen(rendered);
-			lsm = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
-			if (!lsm) {
-				xfree(rendered);
-				goto err_nv;
-			}
-
-			strncpy(lsm, rendered, lsm_profile_len);
-			xfree(rendered);
-		}
-
-	}
-
 	if (seccomp_filters_get_rst_pos(core, &n_seccomp_filters, &seccomp_filter_pos) < 0)
 		goto err;
 
-
 	rst_mem_size = rst_mem_lock();
 	restore_bootstrap_len = restorer_len + args_len + rst_mem_size;
 
@@ -3048,10 +3117,6 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 		goto err;
 	}
 
-	ret = prepare_creds(creds, task_args);
-	if (ret < 0)
-		goto err;
-
 	/*
 	 * Get a reference to shared memory area which is
 	 * used to signal if shmem restoration complete
@@ -3102,11 +3167,6 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	if (core->tc->has_seccomp_mode)
 		task_args->seccomp_mode = core->tc->seccomp_mode;
 
-	if (lsm)
-		task_args->creds.lsm_profile = rst_mem_remap_ptr(lsm_pos, RM_PRIVATE);
-	else
-		task_args->creds.lsm_profile = NULL;
-
 	/*
 	 * Arguments for task restoration.
 	 */
@@ -3124,6 +3184,7 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	 * Fill up per-thread data.
 	 */
 	for (i = 0; i < current->nr_threads; i++) {
+		unsigned long creds_pos_next = creds_pos;
 		CoreEntry *tcore;
 		struct rt_sigframe *sigframe;
 
@@ -3157,6 +3218,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 		thread_args[i].clear_tid_addr	= CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
 		core_get_tls(tcore, &thread_args[i].tls);
 
+		rst_reloc_creds(&thread_args[i], &creds_pos_next);
+
 		if (tcore->thread_core) {
 			thread_args[i].has_futex	= true;
 			thread_args[i].futex_rla	= tcore->thread_core->futex_rla;
diff --git a/include/restorer.h b/include/restorer.h
index 74be81dacb9b..4570909f10b8 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -72,6 +72,26 @@ struct task_restore_core_args;
  * simpler, force both _args alignment be 64 bytes.
  */
 
+struct thread_creds_args {	/* per-thread creds blob consumed by restore_creds() */
+	CredsEntry			creds;	/* copy of the image entry, pointer fields zapped */
+
+	unsigned int			cap_last_cap;	/* taken from kdat.last_cap */
+
+	u32				cap_inh[CR_CAP_SIZE];	/* capability words copied from the image entry */
+	u32				cap_prm[CR_CAP_SIZE];
+	u32				cap_eff[CR_CAP_SIZE];
+	u32				cap_bnd[CR_CAP_SIZE];
+
+	unsigned int			secbits;	/* NOTE(review): not set by the visible code -- confirm use */
+	char				*lsm_profile;	/* rendered LSM profile, or NULL */
+	unsigned int			*groups;	/* supplementary group IDs, or NULL */
+
+	unsigned long			mem_lsm_profile_pos;	/* RM_PRIVATE position of @lsm_profile, 0 if none */
+	unsigned long			mem_groups_pos;	/* RM_PRIVATE position of @groups, 0 if none */
+
+	unsigned long			mem_pos_next;	/* RM_PRIVATE position of the next blob, 0 if last */
+};
+
 struct thread_restore_args {
 	struct restore_mem_zone		mem_zone;
 
@@ -93,6 +113,8 @@ struct thread_restore_args {
 	unsigned int			siginfo_n;
 
 	int				pdeath_sig;
+
+	struct thread_creds_args	*creds_args;
 } __aligned(64);
 
 struct task_restore_args {
@@ -153,13 +175,6 @@ struct task_restore_args {
 
 	struct itimerval		itimers[3];
 
-	CredsEntry			creds;
-	u32				cap_inh[CR_CAP_SIZE];
-	u32				cap_prm[CR_CAP_SIZE];
-	u32				cap_eff[CR_CAP_SIZE];
-	u32				cap_bnd[CR_CAP_SIZE];
-	u32				cap_last_cap;
-
 	MmEntry				mm;
 	auxv_t				mm_saved_auxv[AT_VECTOR_SIZE];
 	u32				mm_saved_auxv_size;
diff --git a/pie/restorer.c b/pie/restorer.c
index 4665c5d78872..1cce88d4c322 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -121,8 +121,9 @@ static int lsm_set_label(char *label, int procfd)
 	return 0;
 }
 
-static int restore_creds(CredsEntry *ce, int procfd)
+static int restore_creds(struct thread_creds_args *args, int procfd)
 {
+	CredsEntry *ce = &args->creds;
 	int b, i, ret;
 	struct cap_header hdr;
 	struct cap_data data[_LINUX_CAPABILITY_U32S_3];
@@ -132,6 +133,17 @@ static int restore_creds(CredsEntry *ce, int procfd)
 	 */
 
 	/*
+	 * Setup supplementary group IDs early.
+	 */
+	if (args->groups) {
+		ret = sys_setgroups(ce->n_groups, args->groups);
+		if (ret) {
+			pr_err("Can't setup supplementary group IDs: %d\n", ret);
+			return -1;
+		}
+	}
+
+	/*
 	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
 	 * lose caps bits when changing xids.
 	 */
@@ -190,9 +202,9 @@ static int restore_creds(CredsEntry *ce, int procfd)
 
 	for (b = 0; b < CR_CAP_SIZE; b++) {
 		for (i = 0; i < 32; i++) {
-			if (b * 32 + i > cap_last_cap)
+			if (b * 32 + i > args->cap_last_cap)
 				break;
-			if (ce->cap_bnd[b] & (1 << i))
+			if (args->cap_bnd[b] & (1 << i))
 				/* already set */
 				continue;
 			ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
@@ -215,9 +227,9 @@ static int restore_creds(CredsEntry *ce, int procfd)
 	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
 
 	for (i = 0; i < CR_CAP_SIZE; i++) {
-		data[i].eff = ce->cap_eff[i];
-		data[i].prm = ce->cap_prm[i];
-		data[i].inh = ce->cap_inh[i];
+		data[i].eff = args->cap_eff[i];
+		data[i].prm = args->cap_prm[i];
+		data[i].inh = args->cap_inh[i];
 	}
 
 	ret = sys_capset(&hdr, data);
@@ -226,9 +238,8 @@ static int restore_creds(CredsEntry *ce, int procfd)
 		return -1;
 	}
 
-	if (lsm_set_label(ce->lsm_profile, procfd) < 0)
+	if (lsm_set_label(args->lsm_profile, procfd) < 0)
 		return -1;
-
 	return 0;
 }
 
@@ -443,7 +454,7 @@ long __export_restore_thread(struct thread_restore_args *args)
 	if (restore_thread_common(rt_sigframe, args))
 		goto core_restore_end;
 
-	ret = restore_creds(&args->ta->creds, args->ta->proc_fd);
+	ret = restore_creds(args->creds_args, args->ta->proc_fd);
 	if (ret)
 		goto core_restore_end;
 
@@ -884,7 +895,7 @@ long __export_restore_task(struct task_restore_args *args)
 	log_set_fd(args->logfd);
 	log_set_loglevel(args->loglevel);
 
-	cap_last_cap = args->cap_last_cap;
+	cap_last_cap = args->t->creds_args->cap_last_cap;
 
 	pr_info("Switched to the restorer %d\n", my_pid);
 
@@ -1262,7 +1273,7 @@ long __export_restore_task(struct task_restore_args *args)
 	 * thus restore* creds _after_ all of the above.
 	 */
 
-	ret = restore_creds(&args->creds, args->proc_fd);
+	ret = restore_creds(args->t->creds_args, args->proc_fd);
 	ret = ret || restore_dumpable_flag(&args->mm);
 	ret = ret || restore_pdeath_sig(args->t);
 
-- 
2.5.0



More information about the CRIU mailing list