[Devel] [PATCH CRIU v4 2/2] dump/restore: Maintain proper start_time value from /proc/[pid]/stat for each task

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Tue Jan 28 20:19:40 MSK 2020


https://jira.sw.ru/browse/PSBM-64123

Introduce a 'start_time' field in the core image.
It is stored at dump time for each process inside the dumped
container and set back for each recreated process during the
container restore operation. After restore the container will see
this saved value instead of the real start time of the restored
processes.
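
The saved value corresponds to field 22 of /proc/[pid]/stat as seen
inside the container, i.e. the process start time in clock ticks (see
proc(5)). On restore the image value is converted to nanoseconds before
being handed to the kernel via prctl(PR_SET_TASK_CT_FIELDS). A minimal
sketch of that conversion ('start_time_ticks' is a placeholder name for
illustration, not an identifier from this patch):

	long ticks_per_sec = sysconf(_SC_CLK_TCK);
	unsigned long long start_time_ns;

	/* the image stores start_time in clock ticks */
	start_time_ns = start_time_ticks * (NSEC_PER_SEC / ticks_per_sec);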

Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 criu/cr-dump.c          | 66 ++++++++++++++++++++++++++++++++-
 criu/cr-restore.c       | 99 ++++++++++++++++++++++++++++++++++++-------------
 criu/include/crtools.h  | 25 +++++++++++++
 criu/include/prctl.h    | 10 +++++
 criu/include/restorer.h |  9 +++++
 criu/pie/restorer.c     | 25 +++++++++++++
 images/core.proto       |  1 +
 7 files changed, 209 insertions(+), 26 deletions(-)

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 45626e8..89373b1 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -996,12 +996,71 @@ static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_im
 	return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
 }
 
+struct get_internal_start_time_rq {
+	int pid;
+	unsigned long long result;
+};
+
+static int child_get_internal_start_time(void *arg)
+{
+	struct proc_pid_stat p;
+	struct get_internal_start_time_rq *r =
+		(struct get_internal_start_time_rq *)arg;
+
+	/* We need to join the ve to access the container-relative
+	 * value of the task's start_time, otherwise we would see the
+	 * start_time visible to the host.
+	 */
+	if (join_veX(r->pid)) {
+		pr_err("Failed to join the ve owning process %d\n", r->pid);
+		return -1;
+	}
+
+	if (parse_pid_stat(r->pid, &p)) {
+		pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", r->pid);
+		return -1;
+	}
+
+	r->result = p.start_time;
+	return 0;
+}
+
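+/*
+ * Fetch the container-relative start_time of @pid (in clock ticks, as
+ * shown in /proc/[pid]/stat) via a helper child and store it in
+ * @thread_core.
+ */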
+static int dump_thread_ve_start_time(int pid, ThreadCoreEntry *thread_core)
+{
+	int ret;
+	struct get_internal_start_time_rq r = {
+		.pid = pid,
+		.result = 0
+	};
+
+	ret = call_in_child_process(child_get_internal_start_time, &r);
+	if (ret) {
+		pr_err("Failed to get internal start_time of a process from ve\n");
+		return ret;
+	}
+
+	thread_core->has_start_time = 1;
+	thread_core->start_time = r.result;
+
+	pr_info("Dumped start_time for task %d is %lu\n", pid, thread_core->start_time);
+	return 0;
+}
+
 int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
 
 {
 	int ret;
 	ThreadCoreEntry *tc = core->thread_core;
 
+	ret = dump_thread_ve_start_time(pid, tc);
+	if (ret) {
+		pr_err("Failed to dump start_time for task %d\n", pid);
+		return -1;
+	}
+
+	pr_info("Dumped start_time of task %d is %lu\n", pid,
+		tc->start_time);
+
 	ret = collect_lsm_profile(pid, tc->creds);
 	if (!ret) {
 		/*
@@ -1214,12 +1273,17 @@ static int dump_one_zombie(const struct pstree_item *item,
 	CoreEntry *core;
 	int ret = -1;
 	struct cr_img *img;
+	pid_t virt_pid = item->pid->ns[0].virt;
 
-	core = core_entry_alloc(0, 1);
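+	/* Allocate a thread_core too: the zombie's start_time is stored in it */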
+	core = core_entry_alloc(1, 1);
 	if (!core)
 		return -1;
 
 	strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
+
+	if (dump_thread_ve_start_time(virt_pid, core->thread_core))
+		return -1;
+
 	core->tc->task_state = TASK_DEAD;
 	core->tc->exit_code = pps->exit_code;
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 170beab..e564d0f 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1122,6 +1122,60 @@ static int wait_on_helpers_zombies(void)
 
 static int wait_exiting_children(char *prefix);
 
+static int restore_zombie_start_time(CoreEntry *core, pid_t pid)
+{
+	/* Zombie start_time restoration differs from that of a live task.
+	 * Here is why:
+	 *
+	 * For an alive task the main codepath runs in the PIE sigreturn
+	 * restorer. Each thread in a process executes the restorer blob code
+	 * before doing a sigreturn. Within the restorer blob each thread gets
+	 * the chance to run the start_time restoration code for itself via
+	 * prctl, which identifies the calling thread and sets its start_time
+	 * field.
+	 *
+	 * A zombie process is special. We don't support multithreaded zombies
+	 * and we don't run any parasite code in them. We just create a single
+	 * single-threaded process in this very callstack, patch some of its
+	 * stats and kill it back. Right now we are in this single zombie
+	 * thread, so we can call prctl to set its start_time right here.
+	 * If we ever decide to support zombies more fully, this code may need
+	 * to change.
+	 */
+
+	int ret;
+	unsigned long flags;
+	long ticks_per_sec;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!core->thread_core) {
+		pr_info("Skipping zombie start_time restore. thread_core missing in dump.\n");
+		return 0;
+	}
+
+	if (!core->thread_core->has_start_time) {
+		pr_info("Skipping start_time restore for old image format.\n");
+		return 0;
+	}
+
+	ticks_per_sec = sysconf(_SC_CLK_TCK);
+	if (ticks_per_sec == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
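+	/*
+	 * The image stores start_time in clock ticks (as /proc/[pid]/stat
+	 * does); the prctl interface expects nanoseconds.
+	 */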
+	ct_fields.real_start_time = core->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
+	flags = PR_TASK_CT_FIELDS_START_TIME;
+
+	ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0);
+	if (ret) {
+		pr_perror("Failed to restore zombie start_time");
+		return ret;
+	}
+
+	pr_info("Restored zombie start_time for task %d is %lu\n",
+		pid,
+		core->thread_core->start_time);
+	return 0;
+}
+
 static int restore_one_zombie(CoreEntry *core)
 {
 	int exit_code = core->tc->exit_code;
@@ -1136,6 +1191,9 @@ static int restore_one_zombie(CoreEntry *core)
 
 	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
 
+	if (restore_zombie_start_time(core, vpid(current)))
+		return -1;
+
 	if (task_entries != NULL) {
 		if (wait_exiting_children("zombie"))
 			return -1;
@@ -2156,31 +2214,6 @@ static int write_restored_pid(void)
 
 extern char *get_dumpee_veid(pid_t pid_real);
 
-#define join_veX(pid)	join_ve(pid, true)
-
-/*
- * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
- * of files on CT mounts for security. All mounts created after join_veX are
- * marked as CT mounts, including all mounts of the root_yard temporary mntns.
- * So if you do join_ve0 you can be blocked from executing anything.
- *
- * https://jira.sw.ru/browse/PSBM-98702
- *
- * note: If for some reason we will desperately need to execute binaries from
- * mounts in the root_yard temporary mntns from VE0 we have an option:
- *
- * In restore_root_task before calling join_veX we can clone a helper process
- * which will create CT userns and mntns first (all mounts are marked as host
- * mounts), next after join_veX in restore_root_task we create another helper
- * process which setns'es to these user and mnt namespaces, and from these
- * helper we can clone CT init process obviousely without CLONE_NEWNS and
- * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
- * but all mounts cloned from host will be marked as host mounts, and execution
- * on them will be allowed even from VE0.
- */
-
-#define join_ve0(pid)	join_ve(pid, false)
-
 /*
  * To eliminate overhead we don't parse VE cgroup mountpoint
  * but presume to find it in known place. Otherwise simply
@@ -3455,6 +3488,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 
 	long new_sp;
 	long ret;
+	long ticks_per_sec;
 
 	long rst_mem_size;
 	long memzone_size;
@@ -3483,6 +3517,12 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
 	BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
 
+	ticks_per_sec = sysconf(_SC_CLK_TCK);
+	if (ticks_per_sec == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
 	/*
 	 * Read creds info for every thread and allocate memory
 	 * needed so we can use this data inside restorer.
@@ -3740,6 +3780,15 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 			strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1);
 		thread_args[i].comm[TASK_COMM_LEN - 1] = 0;
 
+		if (!tcore->thread_core->has_start_time) {
+			thread_args[i].restore_start_time = 0;
+			pr_warn("Skipping restore_start_time for old image format.\n");
+		} else {
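+			/* Convert the image's clock-tick value to nanoseconds
+			 * for the restorer-side prctl */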
+			thread_args[i].start_time =
+				tcore->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
+			thread_args[i].restore_start_time = 1;
+		}
+
 		if (thread_args[i].pid != pid)
 			core_entry__free_unpacked(tcore, NULL);
 
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index c3f7cb3..902d18a 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -44,6 +44,31 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
 			.actor = name##_cb,			\
 	}
 
+#define join_veX(pid)	join_ve(pid, true)
+
+/*
+ * Use join_ve0 very carefully! We have checks in the kernel to prohibit
+ * execution of files on CT mounts for security. All mounts created after
+ * join_veX are marked as CT mounts, including all mounts of the root_yard
+ * temporary mntns. So if you do join_ve0 you can be blocked from executing
+ * anything.
+ *
+ * https://jira.sw.ru/browse/PSBM-98702
+ *
+ * Note: if for some reason we desperately need to execute binaries from
+ * mounts in the root_yard temporary mntns from VE0, we have an option:
+ *
+ * In restore_root_task, before calling join_veX, we can clone a helper
+ * process which creates the CT userns and mntns first (so all mounts are
+ * marked as host mounts); next, after join_veX in restore_root_task, we
+ * create another helper process which setns's into these user and mount
+ * namespaces, and from this helper we can clone the CT init process,
+ * obviously without CLONE_NEWNS and CLONE_NEWUSER. This way userns, mntns
+ * and ve will be preserved for all tasks, but all mounts cloned from the
+ * host will be marked as host mounts, and execution on them will be
+ * allowed even from VE0.
+ */
+
+#define join_ve0(pid)	join_ve(pid, false)
+
 int join_ve(pid_t pid, bool veX);
 
 #endif /* __CR_CRTOOLS_H__ */
diff --git a/criu/include/prctl.h b/criu/include/prctl.h
index 8e7fef3..52b9a30 100644
--- a/criu/include/prctl.h
+++ b/criu/include/prctl.h
@@ -82,4 +82,14 @@ struct prctl_mm_map {
 # define PR_GET_THP_DISABLE	42
 #endif
 
+#ifndef PR_SET_TASK_CT_FIELDS
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS	1000
+#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
+
+struct prctl_task_ct_fields {
+	s64 real_start_time;
+};
+#endif
+
 #endif /* __CR_PRCTL_H__ */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 07f0439..9f43dd8 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -112,6 +112,15 @@ struct thread_restore_args {
 	bool				seccomp_force_tsync;
 
 	char				comm[TASK_COMM_LEN];
+
+	/* set to true if the start_time value is valid
+	 * and should be restored for this process
+	 */
+	bool				restore_start_time;
+	/* start_time of the restored process, in nanoseconds
+	 * (converted from the clock-tick value shown in /proc/[pid]/stat(22))
+	 */
+	unsigned long long		start_time;
 } __aligned(64);
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index 0351986..945a00a 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -113,6 +113,28 @@ void parasite_cleanup(void)
 {
 }
 
+static int restore_start_time(struct thread_restore_args *args)
+{
+	int ret;
+	unsigned long flags;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!args->restore_start_time)
+		return 0;
+
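+	/* args->start_time was already converted to nanoseconds in
+	 * sigreturn_restore()
+	 */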
+	ct_fields.real_start_time = args->start_time;
+	flags = PR_TASK_CT_FIELDS_START_TIME;
+
+	ret = sys_prctl_safe(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0);
+	if (ret) {
+		pr_err("Failed to restore start_time\n");
+		return ret;
+	}
+
+	pr_info("Restored start_time to %llu\n", args->start_time);
+	return 0;
+}
+
 extern void cr_restore_rt (void) asm ("__cr_restore_rt")
 			__attribute__ ((visibility ("hidden")));
 
@@ -523,6 +545,9 @@ static int restore_thread_common(struct thread_restore_args *args)
 
 	restore_sched_info(&args->sp);
 
+	if (restore_start_time(args))
+		return -1;
+
 	if (restore_nonsigframe_gpregs(&args->gpregs))
 		return -1;
 
diff --git a/images/core.proto b/images/core.proto
index 6ef5f50..41f412f 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -119,6 +119,7 @@ message thread_core_entry {
 	optional uint32			seccomp_filter	= 12;
 
 	optional string			comm		= 13;
+	optional uint64			start_time	= 14;
 }
 
 message task_rlimits_entry {
-- 
1.8.3.1


