[Devel] [PATCH CRIU 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task
Valeriy Vdovin
valeriy.vdovin at virtuozzo.com
Mon Jan 13 11:25:41 MSK 2020
https://jira.sw.ru/browse/PSBM-100083
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
criu/cr-dump.c | 49 ++++++++++++++++++++++++++++++++++++
criu/cr-restore.c | 68 ++++++++++++++++++++++++++++++--------------------
criu/include/crtools.h | 31 +++++++++++++++++++++++
images/core.proto | 2 ++
4 files changed, 123 insertions(+), 27 deletions(-)
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 45626e8..751141f 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
return ret;
}
+/*
+ * Request/response pair handed to child_get_internal_start_time()
+ * through call_in_child_process().
+ */
+struct get_internal_start_time_rq {
+ /* in: pid of the task whose stat data is read */
+ int pid;
+ /* out: start_time value parsed for that task */
+ unsigned long long result;
+};
+
+/*
+ * Callback run via call_in_child_process(): reads the start_time of task
+ * r->pid from its stat data and stores it in r->result.
+ *
+ * @arg: pointer to a struct get_internal_start_time_rq.
+ *
+ * Returns 0 on success and -1 if joining the VE or parsing the stat data
+ * failed; in the failure case r->result is left untouched.
+ */
+static int child_get_internal_start_time(void *arg)
+{
+	struct proc_pid_stat p;
+	struct get_internal_start_time_rq *r = arg;
+
+	/*
+	 * We need to join the VE to access the container-relative value
+	 * of the task's start_time, otherwise we would see the start_time
+	 * as visible to the host.
+	 */
+	if (join_veX(r->pid)) {
+		pr_err("Failed to join VE of task %d\n", r->pid);
+		return -1;
+	}
+
+	/* Never publish p.start_time unless p was actually filled in. */
+	if (parse_pid_stat(r->pid, &p)) {
+		pr_err("Failed to parse stat of task %d\n", r->pid);
+		return -1;
+	}
+
+	r->result = p.start_time;
+	return 0;
+}
+
+/*
+ * Obtain the start_time of task @pid through a helper child process and
+ * record it in @tc (the value plus its has_start_time presence flag).
+ *
+ * Returns 0 on success, or the non-zero result of call_in_child_process()
+ * on failure, in which case @tc is not modified.
+ */
+static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
+{
+	struct get_internal_start_time_rq rq = {
+		.pid = pid,
+		.result = 0
+	};
+	int err = call_in_child_process(child_get_internal_start_time, &rq);
+
+	if (err) {
+		pr_err("Failed to exec in child\n");
+		return err;
+	}
+
+	tc->start_time = rq.result;
+	tc->has_start_time = 1;
+	return 0;
+}
+
static int dump_task_core_all(struct parasite_ctl *ctl,
struct pstree_item *item,
const struct proc_pid_stat *stat,
@@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
core->tc->task_state = item->pid->state;
core->tc->exit_code = 0;
+ ret = dump_task_internal_start_time(pid, core->tc);
+ if (ret) {
+ pr_err("Failed to dump start_time for task %d\n", pid);
+ goto err;
+ }
+
+ pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
+
if (stat->tty_nr) {
struct pstree_item *p = item;
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 170beab..c9777ae 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
static int prepare_mm(pid_t pid, struct task_restore_args *args);
+/*
+ * Restore the task start time recorded in the image.
+ *
+ * core->tc->start_time is treated as a count of clock ticks: it is
+ * converted to nanoseconds using sysconf(_SC_CLK_TCK) and passed to the
+ * kernel with prctl(PR_SET_TASK_CT_FIELDS) issued from the current process.
+ *
+ * @pid:  pid of the task, used for logging only.
+ * @core: core image entry of the task.
+ *
+ * Returns 0 on success and -1 on failure, including the case of an image
+ * that carries no start_time at all.
+ */
+static int restore_start_time(int pid, CoreEntry *core)
+{
+	unsigned long long ticks, tps, total_nsec;
+	unsigned long flags = PR_TASK_CT_FIELDS_START_TIME;
+	long clk_tck;
+	/* The designated initializer zeroes every field not set below. */
+	struct prctl_task_ct_fields ct_fields = { .real_start_time = 0 };
+
+	if (!core->tc->has_start_time) {
+		pr_warn("Skipping restore_start_time for old image version.\n");
+		return -1;
+	}
+
+	clk_tck = sysconf(_SC_CLK_TCK);
+	/* Reject zero as well: the value is used as a divisor below. */
+	if (clk_tck <= 0) {
+		pr_perror("Failed to get clock ticks via sysconf");
+		return -1;
+	}
+
+	ticks = core->tc->start_time;
+	tps = clk_tck;
+
+	/*
+	 * Convert whole seconds and the sub-second remainder separately.
+	 * This gives the same result as ticks * (NSEC_PER_SEC / tps) when
+	 * tps divides NSEC_PER_SEC, and stays exact when it does not.
+	 */
+	total_nsec = (ticks / tps) * NSEC_PER_SEC +
+		     (ticks % tps) * NSEC_PER_SEC / tps;
+
+	ct_fields.real_start_time = total_nsec;
+
+	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
+		pr_perror("Can't set process start time");
+		return -1;
+	}
+
+	/* ticks is unsigned long long, so %llu matches on every ABI. */
+	pr_info("Restored start_time of task %d is %llu\n", pid, ticks);
+
+	return 0;
+}
+
static int restore_one_alive_task(int pid, CoreEntry *core)
{
unsigned args_len;
@@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
rst_mem_switch_to_private();
+ restore_start_time(pid, core);
+
args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
current->nr_threads, page_size());
ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
@@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
static int wait_exiting_children(char *prefix);
-static int restore_one_zombie(CoreEntry *core)
+static int restore_one_zombie(int pid, CoreEntry *core)
{
int exit_code = core->tc->exit_code;
@@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
if (lazy_pages_setup_zombie(vpid(current)))
return -1;
+ restore_start_time(pid, core);
+
prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
if (task_entries != NULL) {
@@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
zombie_prepare_signals();
}
+
if (exit_code & 0x7f) {
int signr;
@@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
if (task_alive(current))
ret = restore_one_alive_task(pid, core);
else if (current->pid->state == TASK_DEAD)
- ret = restore_one_zombie(core);
+ ret = restore_one_zombie(pid, core);
else if (current->pid->state == TASK_HELPER) {
ret = restore_one_helper();
} else {
@@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
extern char *get_dumpee_veid(pid_t pid_real);
-#define join_veX(pid) join_ve(pid, true)
-
-/*
- * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
- * of files on CT mounts for security. All mounts created after join_veX are
- * marked as CT mounts, including all mounts of the root_yard temporary mntns.
- * So if you do join_ve0 you can be blocked from executing anything.
- *
- * https://jira.sw.ru/browse/PSBM-98702
- *
- * note: If for some reason we will desperately need to execute binaries from
- * mounts in the root_yard temporary mntns from VE0 we have an option:
- *
- * In restore_root_task before calling join_veX we can clone a helper process
- * which will create CT userns and mntns first (all mounts are marked as host
- * mounts), next after join_veX in restore_root_task we create another helper
- * process which setns'es to these user and mnt namespaces, and from these
- * helper we can clone CT init process obviousely without CLONE_NEWNS and
- * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
- * but all mounts cloned from host will be marked as host mounts, and execution
- * on them will be allowed even from VE0.
- */
-
-#define join_ve0(pid) join_ve(pid, false)
-
/*
* To eliminate overhead we don't parse VE cgroup mountpoint
* but presume to find it in known place. Otherwise simply
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
index c3f7cb3..acb1faf 100644
--- a/criu/include/crtools.h
+++ b/criu/include/crtools.h
@@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
.actor = name##_cb, \
}
+/* Shorthand for join_ve(pid, true). */
+#define join_veX(pid) join_ve(pid, true)
+
+/*
+ * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
+ * of files on CT mounts for security. All mounts created after join_veX are
+ * marked as CT mounts, including all mounts of the root_yard temporary mntns.
+ * So if you do join_ve0 you can be blocked from executing anything.
+ *
+ * https://jira.sw.ru/browse/PSBM-98702
+ *
+ * note: If for some reason we will desperately need to execute binaries from
+ * mounts in the root_yard temporary mntns from VE0 we have an option:
+ *
+ * In restore_root_task before calling join_veX we can clone a helper process
+ * which will create CT userns and mntns first (all mounts are marked as host
+ * mounts), next after join_veX in restore_root_task we create another helper
+ * process which setns'es to these user and mnt namespaces, and from this
+ * helper we can clone CT init process obviously without CLONE_NEWNS and
+ * CLONE_NEWUSER. This way userns, mntns, ve will be preserved for all tasks
+ * but all mounts cloned from host will be marked as host mounts, and execution
+ * on them will be allowed even from VE0.
+ */
+
+#define join_ve0(pid) join_ve(pid, false)
+
+/*
+ * To eliminate overhead we don't parse VE cgroup mountpoint
+ * but presume to find it in known place. Otherwise simply
+ * don't enter into veX with one warning.
+ */
+
int join_ve(pid_t pid, bool veX);
#endif /* __CR_CRTOOLS_H__ */
diff --git a/images/core.proto b/images/core.proto
index 6ef5f50..c164c7a 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
optional int32 tty_nr = 15;
optional int32 tty_pgrp = 16;
repeated sa_entry sigactions = 17;
+ optional uint64 start_time = 19;
}
message task_core_entry {
@@ -79,6 +80,7 @@ message task_core_entry {
repeated sa_entry sigactions = 15;
optional int32 tty_nr = 16;
optional int32 tty_pgrp = 17;
+ optional uint64 start_time = 19;
}
message task_kobj_ids_entry {
--
1.8.3.1
More information about the Devel
mailing list