[Devel] [PATCH CRIU v3 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Tue Jan 14 21:54:50 MSK 2020


https://jira.sw.ru/browse/PSBM-100083

Introduce a 'start_time' field in the core image.
It is recorded at dump time for each process inside the dumped
container and written back to each recreated process during container
restore. After restore, the container sees this value instead of the
real start time of the restored processes.

Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
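A note for reviewers (not part of the commit message): the saved value
is the "starttime" field of /proc/[pid]/stat (field 22), expressed in
clock ticks since boot. CRIU reads it through its existing
parse_pid_stat() helper; the sketch below only illustrates where the
value comes from, and the function name and buffer sizes are invented
for the example.

	#include <stdio.h>
	#include <string.h>

	static int read_start_time(int pid, unsigned long long *start_time)
	{
		char path[64], buf[1024];
		FILE *f;
		char *p;
		int i;

		snprintf(path, sizeof(path), "/proc/%d/stat", pid);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (!fgets(buf, sizeof(buf), f)) {
			fclose(f);
			return -1;
		}
		fclose(f);

		/* comm (field 2) may contain spaces, so skip past the last ')' */
		p = strrchr(buf, ')');
		if (!p)
			return -1;
		p += 2;	/* now at field 3 (state) */

		/* starttime is field 22, i.e. 19 fields further on */
		for (i = 0; i < 19; i++) {
			p = strchr(p, ' ');
			if (!p)
				return -1;
			p++;
		}

		return sscanf(p, "%llu", start_time) == 1 ? 0 : -1;
	}
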
 criu/cr-dump.c         | 57 +++++++++++++++++++++++++++++++++++++++
 criu/cr-restore.c      | 73 +++++++++++++++++++++++++++++++-------------------
 criu/include/crtools.h | 25 +++++++++++++++++
 images/core.proto      |  2 ++
 4 files changed, 130 insertions(+), 27 deletions(-)
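
A second note, on the restore side: /proc/[pid]/stat reports start_time
in clock ticks, while the kernel keeps a task's start time in
nanoseconds, so restore_start_time() rescales the saved value before
handing it to the PR_SET_TASK_CT_FIELDS prctl (provided by the
Virtuozzo kernel; kernels without it return EINVAL, which is why that
case is tolerated). A minimal sketch of just the conversion, with the
helper name invented for the example:

	#include <unistd.h>

	#define NSEC_PER_SEC	1000000000ULL

	static int ticks_to_nsec(unsigned long long ticks, unsigned long long *nsec)
	{
		long tps = sysconf(_SC_CLK_TCK);	/* typically 100 on Linux */

		if (tps <= 0)
			return -1;

		/*
		 * With tps == 100 one tick is 10ms, so e.g. 4223 ticks
		 * become 42230000000ns, i.e. 42.23s after boot.
		 */
		*nsec = ticks * (NSEC_PER_SEC / (unsigned long long)tps);
		return 0;
	}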

diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 45626e8..75a34a9 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1037,6 +1037,54 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
 	return ret;
 }
 
+struct get_internal_start_time_rq {
+	int pid;
+	unsigned long long result;
+};
+
+static int child_get_internal_start_time(void *arg)
+{
+	struct proc_pid_stat p;
+	struct get_internal_start_time_rq *r =
+		(struct get_internal_start_time_rq *)arg;
+
+	/* We need to join the ve to read the container-relative
+	 * value of the task's start_time, otherwise we would see
+	 * the start_time as visible from the host.
+	 */
+	if (join_veX(r->pid)) {
+		pr_err("Failed to join the ve owning process %d\n", r->pid);
+		return -1;
+	}
+
+	if (parse_pid_stat(r->pid, &p)) {
+		pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", r->pid);
+		return -1;
+	}
+
+	r->result = p.start_time;
+	return 0;
+}
+
+static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
+{
+	int ret;
+	struct get_internal_start_time_rq r = {
+		.pid = pid,
+		.result = 0
+	};
+
+	ret = call_in_child_process(child_get_internal_start_time, &r);
+	if (ret) {
+		pr_err("Failed to get internal start_time of a process from ve\n");
+		return ret;
+	}
+
+	tc->has_start_time = 1;
+	tc->start_time = r.result;
+	return 0;
+}
+
 static int dump_task_core_all(struct parasite_ctl *ctl,
 			      struct pstree_item *item,
 			      const struct proc_pid_stat *stat,
@@ -1063,6 +1111,15 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
 	core->tc->task_state = item->pid->state;
 	core->tc->exit_code = 0;
 
+	ret = dump_task_internal_start_time(pid, core->tc);
+	if (ret) {
+		pr_err("Failed to dump start_time for task %d\n", pid);
+		goto err;
+	}
+
+	pr_info("Dumped start_time of task %d is %"PRIu64"\n", pid,
+		core->tc->start_time);
+
 	if (stat->tty_nr) {
 		struct pstree_item *p = item;
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 170beab..af2c3d1 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -947,6 +947,44 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
 static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
 static int prepare_mm(pid_t pid, struct task_restore_args *args);
 
+static int restore_start_time(int pid, CoreEntry *core)
+{
+	unsigned long long total_nsec;
+	unsigned long flags;
+	long tps;
+	struct prctl_task_ct_fields ct_fields;
+
+	if (!core->tc->has_start_time) {
+		pr_warn("Skipping restore_start_time for old image version.\n");
+		return 0;
+	}
+
+	tps = sysconf(_SC_CLK_TCK);
+	if (tps == -1) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
+	total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
+
+	ct_fields.real_start_time = total_nsec;
+	flags = PR_TASK_CT_FIELDS_START_TIME;
+
+	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
+		if (errno == EINVAL) {
+			pr_err("Kernel doesn't support PR_SET_TASK_CT_FIELDS.\n");
+			return 0;
+		}
+		pr_perror("Failed prctl PR_SET_TASK_CT_FIELDS");
+		return -1;
+	}
+
+	pr_info("Restored start_time of task %d is %"PRIu64"\n",
+		pid, core->tc->start_time);
+
+	return 0;
+}
+
 static int restore_one_alive_task(int pid, CoreEntry *core)
 {
 	unsigned args_len;
@@ -955,6 +993,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
 
 	rst_mem_switch_to_private();
 
+	if (restore_start_time(pid, core))
+		return -1;
+
 	args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
 			current->nr_threads, page_size());
 	ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
@@ -1122,7 +1163,7 @@ static int wait_on_helpers_zombies(void)
 
 static int wait_exiting_children(char *prefix);
 
-static int restore_one_zombie(CoreEntry *core)
+static int restore_one_zombie(int pid, CoreEntry *core)
 {
 	int exit_code = core->tc->exit_code;
 
@@ -1134,6 +1175,9 @@ static int restore_one_zombie(CoreEntry *core)
 	if (lazy_pages_setup_zombie(vpid(current)))
 		return -1;
 
+	if (restore_start_time(pid, core))
+		return -1;
+
 	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
 
 	if (task_entries != NULL) {
@@ -1344,7 +1388,7 @@ static int restore_one_task(int pid, CoreEntry *core)
 	if (task_alive(current))
 		ret = restore_one_alive_task(pid, core);
 	else if (current->pid->state == TASK_DEAD)
-		ret = restore_one_zombie(core);
+		ret = restore_one_zombie(pid, core);
 	else if (current->pid->state == TASK_HELPER) {
 		ret = restore_one_helper();
 	} else {
@@ -2156,31 +2200,6 @@ static int write_restored_pid(void)
 
 extern char *get_dumpee_veid(pid_t pid_real);
 
-#define join_veX(pid)	join_ve(pid, true)
-
-/*
- * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
- * of files on CT mounts for security. All mounts created after join_veX are
- * marked as CT mounts, including all mounts of the root_yard temporary mntns.
- * So if you do join_ve0 you can be blocked from executing anything.
- *
- * https://jira.sw.ru/browse/PSBM-98702
- *
- * note: If for some reason we will desperately need to execute binaries from
- * mounts in the root_yard temporary mntns from VE0 we have an option:
- *
- * In restore_root_task before calling join_veX we can clone a helper process
- * which will create CT userns and mntns first (all mounts are marked as host
- * mounts), next after join_veX in restore_root_task we create another helper
- * process which setns'es to these user and mnt namespaces, and from these
- * helper we can clone CT init process obviousely without CLONE_NEWNS and
- * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
- * but all mounts cloned from host will be marked as host mounts, and execution
- * on them will be allowed even from VE0.
- */
-
-#define join_ve0(pid)	join_ve(pid, false)
-
 /*
  * To eliminate overhead we don't parse VE cgroup mountpoint
  * but presume to find it in known place. Otherwise simply
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index c3f7cb3..902d18a 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -44,6 +44,31 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
 			.actor = name##_cb,			\
 	}
 
+#define join_veX(pid)	join_ve(pid, true)
+
+/*
+ * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
+ * of files on CT mounts for security. All mounts created after join_veX are
+ * marked as CT mounts, including all mounts of the root_yard temporary mntns.
+ * So if you do join_ve0 you can be blocked from executing anything.
+ *
+ * https://jira.sw.ru/browse/PSBM-98702
+ *
+ * note: If for some reason we will desperately need to execute binaries from
+ * mounts in the root_yard temporary mntns from VE0 we have an option:
+ *
+ * In restore_root_task before calling join_veX we can clone a helper process
+ * which will create CT userns and mntns first (all mounts are marked as host
+ * mounts), next after join_veX in restore_root_task we create another helper
+ * process which setns'es to these user and mnt namespaces, and from these
+ * helper we can clone CT init process obviously without CLONE_NEWNS and
+ * CLONE_NEWUSER. This way userns, mntns, ve will be preserved for all tasks
+ * but all mounts cloned from host will be marked as host mounts, and execution
+ * on them will be allowed even from VE0.
+ */
+
+#define join_ve0(pid)	join_ve(pid, false)
+
 int join_ve(pid_t pid, bool veX);
 
 #endif /* __CR_CRTOOLS_H__ */
diff --git a/images/core.proto b/images/core.proto
index 6ef5f50..c164c7a 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
 	optional int32			tty_nr		= 15;
 	optional int32			tty_pgrp	= 16;
 	repeated sa_entry		sigactions	= 17;
+	optional uint64			start_time	= 19;
 }
 
 message task_core_entry {
@@ -79,6 +80,7 @@ message task_core_entry {
 	repeated sa_entry		sigactions	= 15;
 	optional int32			tty_nr		= 16;
 	optional int32			tty_pgrp	= 17;
+	optional uint64			start_time	= 19;
 }
 
 message task_kobj_ids_entry {
-- 
1.8.3.1


