[CRIU] [PATCH 8/9 v2] creds: restore -- Implement per-thread restore of credentials

Cyrill Gorcunov gorcunov at gmail.com
Mon Dec 21 03:08:12 PST 2015


Because the creds parameters have to be passed into the pie/restorer
code, but are read before the thread_restore_args and task_restore_args
structures are allocated, we need a small trick: prepare the
creds in several stages

 - collect all creds data into separate private memory blobs
 - once all memory needed for restorer is allocated we relocate
   pointers in these blobs and set
   thread_restore_args::creds_args to the appropriate
   address
 - restorer works as usual and sets up creds parameters as before

v2:
 - fix addressing in positioning of rst_ memory (I accidentally
   zapped pointers and, when sending the patches, forgot to merge the
   changes back; so while I had the series successfully restoring
   containers with different creds, the series as posted would not
   have worked. All changes are now merged as appropriate)

 - drop module's global @cap_last_cap from pie/restorer.c

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
 cr-restore.c       | 298 ++++++++++++++++++++++++++++++++---------------------
 include/restorer.h |  29 ++++--
 pie/restorer.c     |  34 +++---
 3 files changed, 224 insertions(+), 137 deletions(-)

diff --git a/cr-restore.c b/cr-restore.c
index aade3bc0c6a1..a368e3ca3291 100644
--- a/cr-restore.c
+++ b/cr-restore.c
@@ -2460,73 +2460,6 @@ static inline int verify_cap_size(CredsEntry *ce)
 		(ce->n_cap_prm == CR_CAP_SIZE) && (ce->n_cap_bnd == CR_CAP_SIZE));
 }
 
-static CredsEntry *read_creds(int pid)
-{
-	int ret;
-	struct cr_img *img;
-	CredsEntry *ce = NULL;
-
-	img = open_image(CR_FD_CREDS, O_RSTR, pid);
-	if (!img)
-		return NULL;
-
-	ret = pb_read_one(img, &ce, PB_CREDS);
-	close_image(img);
-
-	if (ret < 0) {
-		creds_entry__free_unpacked(ce, NULL);
-		return NULL;
-	}
-
-	if (!verify_cap_size(ce)) {
-		pr_err("Caps size mismatch %d %d %d %d\n",
-		       (int)ce->n_cap_inh, (int)ce->n_cap_eff,
-		       (int)ce->n_cap_prm, (int)ce->n_cap_bnd);
-		creds_entry__free_unpacked(ce, NULL);
-		return NULL;
-	}
-
-	if (!may_restore(ce)) {
-		creds_entry__free_unpacked(ce, NULL);
-		return NULL;
-	}
-
-	return ce;
-}
-
-static int prepare_creds(CredsEntry *ce, struct task_restore_args *args)
-{
-	args->creds = *ce;
-	args->creds.cap_inh = args->cap_inh;
-	memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh));
-	args->creds.cap_eff = args->cap_eff;
-	memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff));
-	args->creds.cap_prm = args->cap_prm;
-	memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm));
-	args->creds.cap_bnd = args->cap_bnd;
-	memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd));
-
-	/*
-	 * We can set supplementary groups here. This won't affect any
-	 * permission checks for us (we're still root) and will not be
-	 * reset by subsequent creds changes in restorer.
-	 */
-
-	BUILD_BUG_ON(sizeof(*ce->groups) != sizeof(gid_t));
-	if (setgroups(ce->n_groups, ce->groups) < 0) {
-		pr_perror("Can't set supplementary groups");
-		return -1;
-	}
-
-	creds_entry__free_unpacked(ce, NULL);
-
-	args->cap_last_cap = kdat.last_cap;
-
-	/* XXX -- validate creds here? */
-
-	return 0;
-}
-
 static int prepare_mm(pid_t pid, struct task_restore_args *args)
 {
 	int exe_fd, i, ret = -1;
@@ -2823,6 +2756,175 @@ out:
 extern void __gcov_flush(void) __attribute__((weak));
 void __gcov_flush(void) {}
 
+static void rst_reloc_creds(struct thread_restore_args *thread_args,
+			    unsigned long *creds_pos_next)
+{
+	struct thread_creds_args *args;
+
+	if (unlikely(!*creds_pos_next))
+		return;
+
+	args = rst_mem_remap_ptr(*creds_pos_next, RM_PRIVATE);
+
+	if (args->lsm_profile)
+		args->lsm_profile = rst_mem_remap_ptr(args->mem_lsm_profile_pos, RM_PRIVATE);
+	if (args->groups)
+		args->groups = rst_mem_remap_ptr(args->mem_groups_pos, RM_PRIVATE);
+
+	*creds_pos_next = args->mem_pos_next;
+	thread_args->creds_args = args;
+}
+
+static struct thread_creds_args *
+rst_prep_creds_args(struct thread_creds_args *prev, CredsEntry *ce)
+{
+	unsigned long this_pos = rst_mem_cpos(RM_PRIVATE);
+	struct thread_creds_args *args;
+
+	if (!verify_cap_size(ce)) {
+		pr_err("Caps size mismatch %d %d %d %d\n",
+		       (int)ce->n_cap_inh, (int)ce->n_cap_eff,
+		       (int)ce->n_cap_prm, (int)ce->n_cap_bnd);
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!may_restore(ce))
+		return ERR_PTR(-EINVAL);
+
+	args = rst_mem_alloc(sizeof(*args), RM_PRIVATE);
+	if (!args)
+		return ERR_PTR(-ENOMEM);
+
+	args->cap_last_cap = kdat.last_cap;
+	memcpy(&args->creds, ce, sizeof(args->creds));
+
+	if (ce->lsm_profile || opts.lsm_supplied) {
+		char *rendered, *profile;
+
+		profile = ce->lsm_profile;
+		if (opts.lsm_supplied)
+			profile = opts.lsm_profile;
+
+		if (validate_lsm(profile) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (profile) {
+			size_t lsm_profile_len;
+
+			if (render_lsm_profile(profile, &rendered))
+				return ERR_PTR(-EINVAL);
+
+			args->mem_lsm_profile_pos = rst_mem_cpos(RM_PRIVATE);
+			lsm_profile_len = strlen(rendered);
+			args->lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
+			if (!args->lsm_profile) {
+				xfree(rendered);
+				return ERR_PTR(-ENOMEM);
+			}
+
+			strncpy(args->lsm_profile, rendered, lsm_profile_len);
+			xfree(rendered);
+		}
+	} else {
+		args->lsm_profile = NULL;
+		args->mem_lsm_profile_pos = 0;
+	}
+
+	/*
+	 * Zap fields which we cant use.
+	 */
+	args->creds.cap_inh = NULL;
+	args->creds.cap_eff = NULL;
+	args->creds.cap_prm = NULL;
+	args->creds.cap_bnd = NULL;
+	args->creds.groups = NULL;
+	args->creds.lsm_profile = NULL;
+
+	memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh));
+	memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff));
+	memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm));
+	memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd));
+
+	if (ce->n_groups) {
+		args->mem_groups_pos = rst_mem_cpos(RM_PRIVATE);
+		args->groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE);
+		if (!args->groups)
+			return ERR_PTR(-ENOMEM);
+		memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32));
+	} else {
+		args->groups = NULL;
+		args->mem_groups_pos = 0;
+	}
+
+	args->mem_pos_next = 0;
+
+	if (prev)
+		prev->mem_pos_next = this_pos;
+	return args;
+}
+
+static int rst_prep_creds_from_img(pid_t pid)
+{
+	CredsEntry *ce = NULL;
+	struct cr_img *img;
+	int ret;
+
+	img = open_image(CR_FD_CREDS, O_RSTR, pid);
+	if (!img)
+		return -ENOENT;
+
+	ret = pb_read_one(img, &ce, PB_CREDS);
+	close_image(img);
+
+	if (ret > 0) {
+		struct thread_creds_args *args;
+
+		args = rst_prep_creds_args(NULL, ce);
+		if (IS_ERR(args))
+			ret = PTR_ERR(args);
+		else
+			ret = 0;
+	}
+	creds_entry__free_unpacked(ce, NULL);
+	return ret;
+}
+
+static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
+{
+	struct thread_creds_args *args = NULL;
+	size_t i;
+
+	/*
+	 * This is _really_ very old image
+	 * format where @thread_core were not
+	 * present. It means we don't have
+	 * creds either, just ignore and exit
+	 * early.
+	 */
+	if (unlikely(!core->thread_core)) {
+		*creds_pos = 0;
+		return 0;
+	}
+
+	*creds_pos = rst_mem_cpos(RM_PRIVATE);
+
+	/*
+	 * Old format: one Creds per task carried in own image file.
+	 */
+	if (!core->thread_core->creds)
+		return rst_prep_creds_from_img(pid);
+
+	for (i = 0; i < current->nr_threads; i++) {
+		CredsEntry *ce = current->core[i]->thread_core->creds;
+
+		args = rst_prep_creds_args(args, ce);
+		if (IS_ERR(args))
+			return PTR_ERR(args);
+	}
+
+	return 0;
+}
+
 static int sigreturn_restore(pid_t pid, CoreEntry *core)
 {
 	void *mem = MAP_FAILED;
@@ -2850,10 +2952,6 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	unsigned long aio_rings;
 	MmEntry *mm = rsti(current)->mm;
 
-	char *lsm = NULL;
-	int lsm_profile_len = 0;
-	unsigned long lsm_pos = 0;
-
 	int n_seccomp_filters = 0;
 	unsigned long seccomp_filter_pos = 0;
 
@@ -2861,7 +2959,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	struct vm_area_list *vmas = &rsti(current)->vmas;
 	int i;
 
-	CredsEntry *creds;
+	unsigned long creds_pos = 0;
+	unsigned long creds_pos_next;
 
 	pr_info("Restore via sigreturn\n");
 
@@ -2925,6 +3024,13 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 		goto err_nv;
 
 	/*
+	 * Read creds info for every thread and allocate memory
+	 * needed so we can use this data inside restorer.
+	 */
+	if (rst_prep_creds(pid, core, &creds_pos))
+		goto err_nv;
+
+	/*
 	 * We're about to search for free VM area and inject the restorer blob
 	 * into it. No irrelevent mmaps/mremaps beyond this point, otherwise
 	 * this unwanted mapping might get overlapped by the restorer.
@@ -2934,45 +3040,9 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	if (ret < 0)
 		goto err;
 
-	creds = read_creds(pid);
-	if (!creds)
-		goto err;
-
-	if (creds->lsm_profile || opts.lsm_supplied) {
-		char *rendered, *profile;
-		int ret;
-
-		profile = creds->lsm_profile;
-		if (opts.lsm_supplied)
-			profile = opts.lsm_profile;
-
-		if (validate_lsm(profile) < 0)
-			return -1;
-
-		if (profile) {
-			ret = render_lsm_profile(profile, &rendered);
-			if (ret < 0) {
-				goto err_nv;
-			}
-
-			lsm_pos = rst_mem_cpos(RM_PRIVATE);
-			lsm_profile_len = strlen(rendered);
-			lsm = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
-			if (!lsm) {
-				xfree(rendered);
-				goto err_nv;
-			}
-
-			strncpy(lsm, rendered, lsm_profile_len);
-			xfree(rendered);
-		}
-
-	}
-
 	if (seccomp_filters_get_rst_pos(core, &n_seccomp_filters, &seccomp_filter_pos) < 0)
 		goto err;
 
-
 	rst_mem_size = rst_mem_lock();
 	restore_bootstrap_len = restorer_len + args_len + rst_mem_size;
 
@@ -3048,10 +3118,6 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 		goto err;
 	}
 
-	ret = prepare_creds(creds, task_args);
-	if (ret < 0)
-		goto err;
-
 	/*
 	 * Get a reference to shared memory area which is
 	 * used to signal if shmem restoration complete
@@ -3102,11 +3168,6 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	if (core->tc->has_seccomp_mode)
 		task_args->seccomp_mode = core->tc->seccomp_mode;
 
-	if (lsm)
-		task_args->creds.lsm_profile = rst_mem_remap_ptr(lsm_pos, RM_PRIVATE);
-	else
-		task_args->creds.lsm_profile = NULL;
-
 	/*
 	 * Arguments for task restoration.
 	 */
@@ -3123,6 +3184,7 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 	/*
 	 * Fill up per-thread data.
 	 */
+	creds_pos_next = creds_pos;
 	for (i = 0; i < current->nr_threads; i++) {
 		CoreEntry *tcore;
 		struct rt_sigframe *sigframe;
@@ -3157,6 +3219,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
 		thread_args[i].clear_tid_addr	= CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
 		core_get_tls(tcore, &thread_args[i].tls);
 
+		rst_reloc_creds(&thread_args[i], &creds_pos_next);
+
 		if (tcore->thread_core) {
 			thread_args[i].has_futex	= true;
 			thread_args[i].futex_rla	= tcore->thread_core->futex_rla;
diff --git a/include/restorer.h b/include/restorer.h
index 74be81dacb9b..4570909f10b8 100644
--- a/include/restorer.h
+++ b/include/restorer.h
@@ -72,6 +72,26 @@ struct task_restore_core_args;
  * simpler, force both _args alignment be 64 bytes.
  */
 
+struct thread_creds_args {
+	CredsEntry			creds;
+
+	unsigned int			cap_last_cap;
+
+	u32				cap_inh[CR_CAP_SIZE];
+	u32				cap_prm[CR_CAP_SIZE];
+	u32				cap_eff[CR_CAP_SIZE];
+	u32				cap_bnd[CR_CAP_SIZE];
+
+	unsigned int			secbits;
+	char				*lsm_profile;
+	unsigned int			*groups;
+
+	unsigned long			mem_lsm_profile_pos;
+	unsigned long			mem_groups_pos;
+
+	unsigned long			mem_pos_next;
+};
+
 struct thread_restore_args {
 	struct restore_mem_zone		mem_zone;
 
@@ -93,6 +113,8 @@ struct thread_restore_args {
 	unsigned int			siginfo_n;
 
 	int				pdeath_sig;
+
+	struct thread_creds_args	*creds_args;
 } __aligned(64);
 
 struct task_restore_args {
@@ -153,13 +175,6 @@ struct task_restore_args {
 
 	struct itimerval		itimers[3];
 
-	CredsEntry			creds;
-	u32				cap_inh[CR_CAP_SIZE];
-	u32				cap_prm[CR_CAP_SIZE];
-	u32				cap_eff[CR_CAP_SIZE];
-	u32				cap_bnd[CR_CAP_SIZE];
-	u32				cap_last_cap;
-
 	MmEntry				mm;
 	auxv_t				mm_saved_auxv[AT_VECTOR_SIZE];
 	u32				mm_saved_auxv_size;
diff --git a/pie/restorer.c b/pie/restorer.c
index 4665c5d78872..d96a29b68bdb 100644
--- a/pie/restorer.c
+++ b/pie/restorer.c
@@ -51,7 +51,6 @@
 
 static struct task_entries *task_entries;
 static futex_t thread_inprogress;
-static int cap_last_cap;
 static pid_t *helpers;
 static int n_helpers;
 static pid_t *zombies;
@@ -121,8 +120,9 @@ static int lsm_set_label(char *label, int procfd)
 	return 0;
 }
 
-static int restore_creds(CredsEntry *ce, int procfd)
+static int restore_creds(struct thread_creds_args *args, int procfd)
 {
+	CredsEntry *ce = &args->creds;
 	int b, i, ret;
 	struct cap_header hdr;
 	struct cap_data data[_LINUX_CAPABILITY_U32S_3];
@@ -132,6 +132,17 @@ static int restore_creds(CredsEntry *ce, int procfd)
 	 */
 
 	/*
+	 * Setup supplementary group IDs early.
+	 */
+	if (args->groups) {
+		ret = sys_setgroups(ce->n_groups, args->groups);
+		if (ret) {
+			pr_err("Can't setup supplementary group IDs: %d\n", ret);
+			return -1;
+		}
+	}
+
+	/*
 	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
 	 * lose caps bits when changing xids.
 	 */
@@ -190,9 +201,9 @@ static int restore_creds(CredsEntry *ce, int procfd)
 
 	for (b = 0; b < CR_CAP_SIZE; b++) {
 		for (i = 0; i < 32; i++) {
-			if (b * 32 + i > cap_last_cap)
+			if (b * 32 + i > args->cap_last_cap)
 				break;
-			if (ce->cap_bnd[b] & (1 << i))
+			if (args->cap_bnd[b] & (1 << i))
 				/* already set */
 				continue;
 			ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
@@ -215,9 +226,9 @@ static int restore_creds(CredsEntry *ce, int procfd)
 	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
 
 	for (i = 0; i < CR_CAP_SIZE; i++) {
-		data[i].eff = ce->cap_eff[i];
-		data[i].prm = ce->cap_prm[i];
-		data[i].inh = ce->cap_inh[i];
+		data[i].eff = args->cap_eff[i];
+		data[i].prm = args->cap_prm[i];
+		data[i].inh = args->cap_inh[i];
 	}
 
 	ret = sys_capset(&hdr, data);
@@ -226,9 +237,8 @@ static int restore_creds(CredsEntry *ce, int procfd)
 		return -1;
 	}
 
-	if (lsm_set_label(ce->lsm_profile, procfd) < 0)
+	if (lsm_set_label(args->lsm_profile, procfd) < 0)
 		return -1;
-
 	return 0;
 }
 
@@ -443,7 +453,7 @@ long __export_restore_thread(struct thread_restore_args *args)
 	if (restore_thread_common(rt_sigframe, args))
 		goto core_restore_end;
 
-	ret = restore_creds(&args->ta->creds, args->ta->proc_fd);
+	ret = restore_creds(args->creds_args, args->ta->proc_fd);
 	if (ret)
 		goto core_restore_end;
 
@@ -884,8 +894,6 @@ long __export_restore_task(struct task_restore_args *args)
 	log_set_fd(args->logfd);
 	log_set_loglevel(args->loglevel);
 
-	cap_last_cap = args->cap_last_cap;
-
 	pr_info("Switched to the restorer %d\n", my_pid);
 
 	if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
@@ -1262,7 +1270,7 @@ long __export_restore_task(struct task_restore_args *args)
 	 * thus restore* creds _after_ all of the above.
 	 */
 
-	ret = restore_creds(&args->creds, args->proc_fd);
+	ret = restore_creds(args->t->creds_args, args->proc_fd);
 	ret = ret || restore_dumpable_flag(&args->mm);
 	ret = ret || restore_pdeath_sig(args->t);
 
-- 
2.5.0



More information about the CRIU mailing list