[Devel] [PATCH CRIU 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Mon Jan 13 11:25:41 MSK 2020


https://jira.sw.ru/browse/PSBM-100083
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 criu/cr-dump.c         | 49 ++++++++++++++++++++++++++++++++++++
 criu/cr-restore.c      | 68 ++++++++++++++++++++++++++++++--------------------
 criu/include/crtools.h | 31 +++++++++++++++++++++++
 images/core.proto      |  2 ++
 4 files changed, 123 insertions(+), 27 deletions(-)

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 45626e8..751141f 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
 	return ret;
 }
 
+struct get_internal_start_time_rq {
+	int pid;
+	unsigned long long result;
+};
+
+static int child_get_internal_start_time(void *arg)
+{
+	struct proc_pid_stat p;
+	struct get_internal_start_time_rq *r =
+		(struct get_internal_start_time_rq *)arg;
+
+	/* We need to join ve to access container relative
+	 * value of task's start_time, otherwize we will see
+	 * start_time visible to host.
+	 */
+	join_veX(r->pid);
+
+	parse_pid_stat(r->pid, &p);
+	r->result = p.start_time;
+	return 0;
+}
+
+static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
+{
+	int ret;
+	struct get_internal_start_time_rq r = {
+		.pid = pid,
+		.result = 0
+	};
+
+	ret = call_in_child_process(child_get_internal_start_time, &r);
+	if (ret) {
+		pr_err("Failed to exec in child\n");
+		return ret;
+	}
+
+	tc->has_start_time = 1;
+	tc->start_time = r.result;
+	return 0;
+}
+
 static int dump_task_core_all(struct parasite_ctl *ctl,
 			      struct pstree_item *item,
 			      const struct proc_pid_stat *stat,
@@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
 	core->tc->task_state = item->pid->state;
 	core->tc->exit_code = 0;
 
+	ret = dump_task_internal_start_time(pid, core->tc);
+	if (ret) {
+		pr_err("Failed to dump start_time for task %d\n", pid);
+		goto err;
+	}
+
+	pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
+
 	if (stat->tty_nr) {
 		struct pstree_item *p = item;
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 170beab..c9777ae 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
 static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
 static int prepare_mm(pid_t pid, struct task_restore_args *args);
 
+static int restore_start_time(int pid, CoreEntry *core)
+{
+	unsigned long long total_nsec;
+	unsigned long flags;
+	long tps;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!core->tc->has_start_time) {
+		pr_warn("Skipping restore_start_time for old image version.\n");
+		return -1;
+	}
+
+	tps = sysconf(_SC_CLK_TCK);
+	if (tps == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
+	total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
+
+	ct_fields.real_start_time = total_nsec;
+	flags = PR_TASK_CT_FIELDS_START_TIME;
+
+	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
+		pr_perror("Can't set process start time");
+		return -1;
+	}
+
+	pr_info("Restored start_time of task %d is %lu\n",
+		pid, core->tc->start_time);
+
+	return 0;
+}
+
 static int restore_one_alive_task(int pid, CoreEntry *core)
 {
 	unsigned args_len;
@@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
 
 	rst_mem_switch_to_private();
 
+	restore_start_time(pid, core);
+
 	args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
 			current->nr_threads, page_size());
 	ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
@@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
 
 static int wait_exiting_children(char *prefix);
 
-static int restore_one_zombie(CoreEntry *core)
+static int restore_one_zombie(int pid, CoreEntry *core)
 {
 	int exit_code = core->tc->exit_code;
 
@@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
 	if (lazy_pages_setup_zombie(vpid(current)))
 		return -1;
 
+	restore_start_time(pid, core);
+
 	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
 
 	if (task_entries != NULL) {
@@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
 		zombie_prepare_signals();
 	}
 
+
 	if (exit_code & 0x7f) {
 		int signr;
 
@@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
 	if (task_alive(current))
 		ret = restore_one_alive_task(pid, core);
 	else if (current->pid->state == TASK_DEAD)
-		ret = restore_one_zombie(core);
+		ret = restore_one_zombie(pid, core);
 	else if (current->pid->state == TASK_HELPER) {
 		ret = restore_one_helper();
 	} else {
@@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
 
 extern char *get_dumpee_veid(pid_t pid_real);
 
-#define join_veX(pid)	join_ve(pid, true)
-
-/*
- * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
- * of files on CT mounts for security. All mounts created after join_veX are
- * marked as CT mounts, including all mounts of the root_yard temporary mntns.
- * So if you do join_ve0 you can be blocked from executing anything.
- *
- * https://jira.sw.ru/browse/PSBM-98702
- *
- * note: If for some reason we will desperately need to execute binaries from
- * mounts in the root_yard temporary mntns from VE0 we have an option:
- *
- * In restore_root_task before calling join_veX we can clone a helper process
- * which will create CT userns and mntns first (all mounts are marked as host
- * mounts), next after join_veX in restore_root_task we create another helper
- * process which setns'es to these user and mnt namespaces, and from these
- * helper we can clone CT init process obviousely without CLONE_NEWNS and
- * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
- * but all mounts cloned from host will be marked as host mounts, and execution
- * on them will be allowed even from VE0.
- */
-
-#define join_ve0(pid)	join_ve(pid, false)
-
 /*
  * To eliminate overhead we don't parse VE cgroup mountpoint
  * but presume to find it in known place. Otherwise simply
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index c3f7cb3..acb1faf 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
 			.actor = name##_cb,			\
 	}
 
+#define join_veX(pid)	join_ve(pid, true)
+
+/*
+ * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
+ * of files on CT mounts for security. All mounts created after join_veX are
+ * marked as CT mounts, including all mounts of the root_yard temporary mntns.
+ * So if you do join_ve0 you can be blocked from executing anything.
+ *
+ * https://jira.sw.ru/browse/PSBM-98702
+ *
+ * note: If for some reason we will desperately need to execute binaries from
+ * mounts in the root_yard temporary mntns from VE0 we have an option:
+ *
+ * In restore_root_task before calling join_veX we can clone a helper process
+ * which will create CT userns and mntns first (all mounts are marked as host
+ * mounts), next after join_veX in restore_root_task we create another helper
+ * process which setns'es to these user and mnt namespaces, and from these
+ * helper we can clone CT init process obviousely without CLONE_NEWNS and
+ * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
+ * but all mounts cloned from host will be marked as host mounts, and execution
+ * on them will be allowed even from VE0.
+ */
+
+#define join_ve0(pid)	join_ve(pid, false)
+
+/*
+ * To eliminate overhead we don't parse VE cgroup mountpoint
+ * but presume to find it in known place. Otherwise simply
+ * don't enter into veX with one warning.
+ */
+
 int join_ve(pid_t pid, bool veX);
 
 #endif /* __CR_CRTOOLS_H__ */
diff --git a/images/core.proto b/images/core.proto
index 6ef5f50..c164c7a 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
 	optional int32			tty_nr		= 15;
 	optional int32			tty_pgrp	= 16;
 	repeated sa_entry		sigactions	= 17;
+	optional uint64			start_time	= 19;
 }
 
 message task_core_entry {
@@ -79,6 +80,7 @@ message task_core_entry {
 	repeated sa_entry		sigactions	= 15;
 	optional int32			tty_nr		= 16;
 	optional int32			tty_pgrp	= 17;
+	optional uint64			start_time	= 19;
 }
 
 message task_kobj_ids_entry {
-- 
1.8.3.1



More information about the Devel mailing list