[Devel] [PATCH CRIU 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Mon Jan 13 18:23:17 MSK 2020
Two small nits, everything else is OK.
On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
> criu/cr-dump.c | 49 ++++++++++++++++++++++++++++++++++++
> criu/cr-restore.c | 68 ++++++++++++++++++++++++++++++--------------------
> criu/include/crtools.h | 31 +++++++++++++++++++++++
> images/core.proto | 2 ++
> 4 files changed, 123 insertions(+), 27 deletions(-)
>
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..751141f 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
> return ret;
> }
>
> +struct get_internal_start_time_rq {
> + int pid;
> + unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> + struct proc_pid_stat p;
> + struct get_internal_start_time_rq *r =
> + (struct get_internal_start_time_rq *)arg;
> +
> + /* We need to join ve to access container relative
> + * value of task's start_time, otherwize we will see
> + * start_time visible to host.
> + */
> + join_veX(r->pid);
> +
> + parse_pid_stat(r->pid, &p);
> + r->result = p.start_time;
> + return 0;
> +}
> +
> +static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
> +{
> + int ret;
> + struct get_internal_start_time_rq r = {
> + .pid = pid,
> + .result = 0
> + };
> +
> + ret = call_in_child_process(child_get_internal_start_time, &r);
> + if (ret) {
> + pr_err("Failed to exec in child\n");
> + return ret;
> + }
> +
> + tc->has_start_time = 1;
> + tc->start_time = r.result;
> + return 0;
> +}
> +
> static int dump_task_core_all(struct parasite_ctl *ctl,
> struct pstree_item *item,
> const struct proc_pid_stat *stat,
> @@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
> core->tc->task_state = item->pid->state;
> core->tc->exit_code = 0;
>
> + ret = dump_task_internal_start_time(pid, core->tc);
> + if (ret) {
> + pr_err("Failed to dump start_time for task %d\n", pid);
> + goto err;
> + }
> +
> + pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
> +
> if (stat->tty_nr) {
> struct pstree_item *p = item;
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..c9777ae 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
> static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
> static int prepare_mm(pid_t pid, struct task_restore_args *args);
>
> +static int restore_start_time(int pid, CoreEntry *core)
> +{
> + unsigned long long total_nsec;
> + unsigned long flags;
> + long tps;
> + struct prctl_task_ct_fields ct_fields;
> +
> + if (!core->tc->has_start_time) {
> + pr_warn("Skipping restore_start_time for old image version.\n");
> + return -1;
> + }
> +
> + tps = sysconf(_SC_CLK_TCK);
> + if (tps == -1) {
> + pr_perror("Failed to get clock ticks via sysconf");
> + return -1;
> + }
> +
> + total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
> +
> + ct_fields.real_start_time = total_nsec;
> + flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> + if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
> + pr_perror("Can't set process start time");
> + return -1;
> + }
> +
> + pr_info("Restored start_time of task %d is %lu\n",
> + pid, core->tc->start_time);
> +
> + return 0;
> +}
> +
> static int restore_one_alive_task(int pid, CoreEntry *core)
> {
> unsigned args_len;
> @@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
>
> rst_mem_switch_to_private();
>
> + restore_start_time(pid, core);
> +
> args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
> current->nr_threads, page_size());
> ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
> @@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
>
> static int wait_exiting_children(char *prefix);
>
> -static int restore_one_zombie(CoreEntry *core)
> +static int restore_one_zombie(int pid, CoreEntry *core)
> {
> int exit_code = core->tc->exit_code;
>
> @@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
> if (lazy_pages_setup_zombie(vpid(current)))
> return -1;
>
> + restore_start_time(pid, core);
> +
> prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>
> if (task_entries != NULL) {
> @@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
> zombie_prepare_signals();
> }
>
> +
Excess empty line.
> if (exit_code & 0x7f) {
> int signr;
>
> @@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
> if (task_alive(current))
> ret = restore_one_alive_task(pid, core);
> else if (current->pid->state == TASK_DEAD)
> - ret = restore_one_zombie(core);
> + ret = restore_one_zombie(pid, core);
> else if (current->pid->state == TASK_HELPER) {
> ret = restore_one_helper();
> } else {
> @@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
>
> extern char *get_dumpee_veid(pid_t pid_real);
>
> -#define join_veX(pid) join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid) join_ve(pid, false)
> -
> /*
> * To eliminate overhead we don't parse VE cgroup mountpoint
> * but presume to find it in known place. Otherwise simply
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..acb1faf 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
> .actor = name##_cb, \
> }
>
> +#define join_veX(pid) join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid) join_ve(pid, false)
> +
> +/*
> + * To eliminate overhead we don't parse VE cgroup mountpoint
> + * but presume to find it in known place. Otherwise simply
> + * don't enter into veX with one warning.
> + */
Duplicated comment.
> +
> int join_ve(pid_t pid, bool veX);
>
> #endif /* __CR_CRTOOLS_H__ */
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..c164c7a 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
> optional int32 tty_nr = 15;
> optional int32 tty_pgrp = 16;
> repeated sa_entry sigactions = 17;
> + optional uint64 start_time = 19;
> }
>
> message task_core_entry {
> @@ -79,6 +80,7 @@ message task_core_entry {
> repeated sa_entry sigactions = 15;
> optional int32 tty_nr = 16;
> optional int32 tty_pgrp = 17;
> + optional uint64 start_time = 19;
> }
>
> message task_kobj_ids_entry {
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
More information about the Devel
mailing list