[Devel] [PATCH CRIU v4 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task
Alexander Mikhalitsyn
alexander.mikhalitsyn at virtuozzo.com
Wed Jan 29 11:43:41 MSK 2020
On Tue, 28 Jan 2020 20:19:40 +0300
Valeriy Vdovin <valeriy.vdovin at virtuozzo.com> wrote:
> https://jira.sw.ru/browse/PSBM-64123
>
> Introducing 'start_time' field into core image.
> It is stored during suspend for each process inside of a dumped
> container and set back to each new process at container restore
> operation. Container will see this value after restore instead of the
> real start time of all restored processes.
>
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
> criu/cr-dump.c | 66 ++++++++++++++++++++++++++++++++-
> criu/cr-restore.c | 99 ++++++++++++++++++++++++++++++++++++-------------
> criu/include/crtools.h | 25 +++++++++++++
> criu/include/prctl.h | 10 +++++
> criu/include/restorer.h | 9 +++++
> criu/pie/restorer.c | 25 +++++++++++++
> images/core.proto | 1 +
> 7 files changed, 209 insertions(+), 26 deletions(-)
>
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..89373b1 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -996,12 +996,71 @@ static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_im
> return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
> }
>
> +struct get_internal_start_time_rq {
> + int pid;
> + unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> + struct proc_pid_stat p;
> + struct get_internal_start_time_rq *r =
> + (struct get_internal_start_time_rq *)arg;
> +
> + /* We need to join ve to access container relative
> + * value of task's start_time, otherwise we will see
> + * start_time visible to host.
> + */
> + if (join_veX(r->pid)) {
> + pr_err("Failed to join ve, owning process %d\n", r->pid);
> + return -1;
> + }
> +
> + if (parse_pid_stat(r->pid, &p)) {
> + pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", r->pid);
> + return -1;
> + }
> +
> + r->result = p.start_time;
> + return 0;
> +}
> +
> +static int dump_thread_ve_start_time(int pid, ThreadCoreEntry *thread_core)
> +{
> + int ret;
> + struct get_internal_start_time_rq r = {
> + .pid = pid,
> + .result = 0
> + };
> +
> + ret = call_in_child_process(child_get_internal_start_time, &r);
> + if (ret) {
> + pr_err("Failed to get internal start_time of a process from ve\n");
> + return ret;
> + }
> +
> + thread_core->has_start_time = 1;
Stylistic suggestion: since has_start_time is a boolean flag, assign
true instead of 1.
> + thread_core->start_time = r.result;
> +
> + pr_info("Dumped start_time for task %d is %lu\n", pid, thread_core->start_time);
> + return 0;
> +}
> +
> int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
>
> {
> int ret;
> ThreadCoreEntry *tc = core->thread_core;
>
> + ret = dump_thread_ve_start_time(pid, tc);
> + if (ret) {
> + pr_err("Failed to dump start_time for task %d\n", pid);
> + return -1;
> + }
> +
> + pr_info("Dumped start_time of task %d is %lu\n", pid,
> + tc->start_time);
This information is already printed inside dump_thread_ve_start_time(); is the duplicate pr_info here needed?
> +
> ret = collect_lsm_profile(pid, tc->creds);
> if (!ret) {
> /*
> @@ -1214,12 +1273,17 @@ static int dump_one_zombie(const struct pstree_item *item,
> CoreEntry *core;
> int ret = -1;
> struct cr_img *img;
> + pid_t virt_pid = item->pid->ns[0].virt;
Please use the vpid(item) helper here instead of open-coding item->pid->ns[0].virt.
>
> - core = core_entry_alloc(0, 1);
> + core = core_entry_alloc(1, 1);
> if (!core)
> return -1;
>
> strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
> +
> + if (dump_thread_ve_start_time(virt_pid, core->thread_core))
Same here: replace virt_pid with vpid(item).
> + return -1;
> +
> core->tc->task_state = TASK_DEAD;
> core->tc->exit_code = pps->exit_code;
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..e564d0f 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,7 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
> static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
> static int prepare_mm(pid_t pid, struct task_restore_args *args);
>
> +
I think this additional empty line was added by mistake.
> static int restore_one_alive_task(int pid, CoreEntry *core)
> {
> unsigned args_len;
> @@ -1122,6 +1123,60 @@ static int wait_on_helpers_zombies(void)
>
> static int wait_exiting_children(char *prefix);
>
> +static int restore_zombie_start_time(CoreEntry *core, pid_t pid)
> +{
> + /* Zombie's start_time restoration differs from the one for a live task.
> + * Here is why:
> + *
> + * For alive task the main codepath is done in pie sigreturn
> + * restorer. Each thread in a process executes restorer blob code before
> + * doing a sigreturn. Within a restorer blob each thread gets the chance
> + * to run start_time restoration code for itself via prctl. Prctl identifies
> + * each thread and sets the start_time field nicely.
> + *
> + * Zombie process is special. We don't support multithreaded zombies and we don't run
> + * any parasite code. We just create one single-threaded process in this very callstack,
> + * patch some stats and kill it back. Right now we are in this single zombie thread,
> + * so we can call prctl to set start_time for it right here.
> + * If we for some reason decide to support zombies more, this code might be changed.
> + */
> +
> + int ret;
> + unsigned long flags;
> + long ticks_per_sec;
> + struct prctl_task_ct_fields ct_fields;
> +
> + if (!core->thread_core) {
> + pr_info("Skipping zombie start_time restore. thread_core missing in dump.\n");
> + return 0;
> + }
> +
> + if (!core->thread_core->has_start_time) {
> + pr_info("Skipping start_time restore for old image format.\n");
> + return 0;
> + }
> +
> + ticks_per_sec = sysconf(_SC_CLK_TCK);
> + if (ticks_per_sec == -1) {
> + pr_perror("Failed to get clock ticks via sysconf");
> + return -1;
> + }
> +
> + ct_fields.real_start_time = core->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
> + flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> + ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0);
> + if (ret) {
> + pr_perror("Failed to restore zombie start_time\n");
The trailing \n is not needed with pr_perror — it appends the errno text and newline itself.
> + return ret;
> + }
> +
> + pr_info("Restored zombie start_time for task %d is %lu\n",
> + pid,
> + core->thread_core->start_time);
> + return 0;
> +}
> +
> static int restore_one_zombie(CoreEntry *core)
> {
> int exit_code = core->tc->exit_code;
> @@ -1136,6 +1191,9 @@ static int restore_one_zombie(CoreEntry *core)
>
> prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>
> + if (restore_zombie_start_time(core, vpid(current)))
> + return -1;
> +
> if (task_entries != NULL) {
> if (wait_exiting_children("zombie"))
> return -1;
> @@ -2156,31 +2214,6 @@ static int write_restored_pid(void)
>
> extern char *get_dumpee_veid(pid_t pid_real);
>
> -#define join_veX(pid) join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviously without CLONE_NEWNS and
> - * CLONE_NEWUSER. This way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid) join_ve(pid, false)
> -
> /*
> * To eliminate overhead we don't parse VE cgroup mountpoint
> * but presume to find it in known place. Otherwise simply
> @@ -3455,6 +3488,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>
> long new_sp;
> long ret;
> + long ticks_per_sec;
>
> long rst_mem_size;
> long memzone_size;
> @@ -3483,6 +3517,12 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
> BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
>
> + ticks_per_sec = sysconf(_SC_CLK_TCK);
> + if (ticks_per_sec == -1) {
> + pr_perror("Failed to get clock ticks via sysconf");
> + return -1;
> + }
> +
> /*
> * Read creds info for every thread and allocate memory
> * needed so we can use this data inside restorer.
> @@ -3740,6 +3780,15 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1);
> thread_args[i].comm[TASK_COMM_LEN - 1] = 0;
>
> + if (!tcore->thread_core->has_start_time) {
> + thread_args[i].restore_start_time = 0;
> + pr_warn("Skipping restore_start_time for old image version.\n");
> + } else {
> + thread_args[i].start_time =
> + tcore->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
> + thread_args[i].restore_start_time = 1;
> + }
> +
> if (thread_args[i].pid != pid)
> core_entry__free_unpacked(tcore, NULL);
>
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..902d18a 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,31 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
> .actor = name##_cb, \
> }
>
> +#define join_veX(pid) join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviously without CLONE_NEWNS and
> + * CLONE_NEWUSER. This way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid) join_ve(pid, false)
> +
> int join_ve(pid_t pid, bool veX);
>
> #endif /* __CR_CRTOOLS_H__ */
> diff --git a/criu/include/prctl.h b/criu/include/prctl.h
> index 8e7fef3..52b9a30 100644
> --- a/criu/include/prctl.h
> +++ b/criu/include/prctl.h
> @@ -82,4 +82,14 @@ struct prctl_mm_map {
> # define PR_GET_THP_DISABLE 42
> #endif
>
> +#ifndef PR_SET_TASK_CT_FIELDS
> +/* Set task container related fields */
> +#define PR_SET_TASK_CT_FIELDS 1000
> +#define PR_TASK_CT_FIELDS_START_TIME (1ULL << 0)
> +
> +struct prctl_task_ct_fields {
> + s64 real_start_time;
> +};
> +#endif
> +
> #endif /* __CR_PRCTL_H__ */
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 07f0439..9f43dd8 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -112,6 +112,15 @@ struct thread_restore_args {
> bool seccomp_force_tsync;
>
> char comm[TASK_COMM_LEN];
> +
> + /* set to 1 if start_time value is valid
> + * and should be used for restore process
> + */
> + bool restore_start_time;
> + /* start_time of a recovered process
> + * in ticks format as shown in /proc/pid/stat(22)
> + */
> + unsigned long long start_time;
> } __aligned(64);
>
> typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 0351986..945a00a 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -113,6 +113,28 @@ void parasite_cleanup(void)
> {
> }
>
> +static int restore_start_time(struct thread_restore_args *args)
> +{
> + int ret;
> + unsigned long flags;
> + struct prctl_task_ct_fields ct_fields;
> +
> + if (!args->restore_start_time)
> + return 0;
> +
> + ct_fields.real_start_time = args->start_time;
> + flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> + ret = sys_prctl_safe(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0);
> + if (ret) {
> + pr_info("Failed to restore start_time\n");
> + return ret;
> + }
> +
> + pr_info("Restored start_time to %lld\n", args->start_time);
> + return 0;
> +}
> +
> extern void cr_restore_rt (void) asm ("__cr_restore_rt")
> __attribute__ ((visibility ("hidden")));
>
> @@ -523,6 +545,9 @@ static int restore_thread_common(struct thread_restore_args *args)
>
> restore_sched_info(&args->sp);
>
> + if (restore_start_time(args))
> + return -1;
> +
> if (restore_nonsigframe_gpregs(&args->gpregs))
> return -1;
>
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..41f412f 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -119,6 +119,7 @@ message thread_core_entry {
> optional uint32 seccomp_filter = 12;
>
> optional string comm = 13;
> + optional uint64 start_time = 14;
> }
>
> message task_rlimits_entry {
> --
> 1.8.3.1
>
More information about the Devel
mailing list