[Devel] [PATCH CRIU v4 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Tue Feb 4 12:59:28 MSK 2020
1) Agree with all comments from Alexander.
2) Also we still have a problem with backward compatibility (I wrote
about it previously): on an old kernel, criu with this patch gives us:
[root at silo criu]# vzctl suspend test-starttime
Setting up checkpoint...
Checkpointing finished successfully
Unmount image: /vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Container is unmounted
Checkpointing completed successfully
[root at silo criu]# vzctl resume test-starttime
Restoring the Container ...
Mount image: /vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Container is mounted
Setting permissions for
image=/vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Configure memguarantee: 0%
(01.145145) pie: 236: Error (criu/pie/restorer.c:128): prctl failed @128
with -22
(01.145154) pie: 236: Error (criu/pie/restorer.c:1995): Restorer fail 236
(01.146428) pie: 237: Error (criu/pie/restorer.c:128): prctl failed @128
with -22
(01.146435) pie: 237: Error (criu/pie/restorer.c:1995): Restorer fail 237
(01.263444) Error (criu/cr-restore.c:2612): Restoring FAILED.
The restore log was saved in
/vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/dump/Dump/restore.log
criu exited with rc=17
Unmount image: /vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Container is unmounted
Failed to restore the Container
[root at silo criu]# uname -r
3.10.0-957.21.3.vz7.106.10
I think we should deliberately ignore EINVAL from prctl and print a
warning that the kernel does not support this prctl. (The best way is to
detect it once, save the result in a new kdat.prctl_set_starttime_supported
field, and skip the prctl based on that field later.)
3) Please also add #include "prctl.h" in criu/cr-restore.c so that we
can compile it on older kernels. (I doubt our build system updates
kernel headers too often.)
On 1/29/20 11:43 AM, Alexander Mikhalitsyn wrote:
> On Tue, 28 Jan 2020 20:19:40 +0300
> Valeriy Vdovin <valeriy.vdovin at virtuozzo.com> wrote:
>
>> https://jira.sw.ru/browse/PSBM-64123
>>
>> Introducing 'start_time' field into core image.
>> It is stored during suspend for each process inside of a dumped
>> container and set back to each new process at container restore
>> operation. Container will see this value after restore instead of the
>> real start time of all restored processes.
>>
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
>> ---
>> criu/cr-dump.c | 66 ++++++++++++++++++++++++++++++++-
>> criu/cr-restore.c | 99 ++++++++++++++++++++++++++++++++++++-------------
>> criu/include/crtools.h | 25 +++++++++++++
>> criu/include/prctl.h | 10 +++++
>> criu/include/restorer.h | 9 +++++
>> criu/pie/restorer.c | 25 +++++++++++++
>> images/core.proto | 1 +
>> 7 files changed, 209 insertions(+), 26 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 45626e8..89373b1 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -996,12 +996,71 @@ static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_im
>> return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
>> }
>>
>> +struct get_internal_start_time_rq {
>> + int pid;
>> + unsigned long long result;
>> +};
>> +
>> +static int child_get_internal_start_time(void *arg)
>> +{
>> + struct proc_pid_stat p;
>> + struct get_internal_start_time_rq *r =
>> + (struct get_internal_start_time_rq *)arg;
>> +
>> + /* We need to join ve to access container relative
>> + * value of task's start_time, otherwize we will see
>> + * start_time visible to host.
>> + */
>> + if (join_veX(r->pid)) {
>> + pr_err("Failed to join ve, owning process %d\n", r->pid);
>> + return -1;
>> + }
>> +
>> + if (parse_pid_stat(r->pid, &p)) {
>> + pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", r->pid);
>> + return -1;
>> + }
>> +
>> + r->result = p.start_time;
>> + return 0;
>> +}
>> +
>> +static int dump_thread_ve_start_time(int pid, ThreadCoreEntry *thread_core)
>> +{
>> + int ret;
>> + struct get_internal_start_time_rq r = {
>> + .pid = pid,
>> + .result = 0
>> + };
>> +
>> + ret = call_in_child_process(child_get_internal_start_time, &r);
>> + if (ret) {
>> + pr_err("Failed to get internal start_time of a process from ve\n");
>> + return ret;
>> + }
>> +
>> + thread_core->has_start_time = 1;
> stylistic suggestion
> 1 -> true
>> + thread_core->start_time = r.result;
>> +
>> + pr_info("Dumped start_time for task %d is %lu\n", pid, thread_core->start_time);
>> + return 0;
>> +}
>> +
>> int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
>>
>> {
>> int ret;
>> ThreadCoreEntry *tc = core->thread_core;
>>
>> + ret = dump_thread_ve_start_time(pid, tc);
>> + if (ret) {
>> + pr_err("Failed to dump start_time for task %d\n", pid);
>> + return -1;
>> + }
>> +
>> + pr_info("Dumped start_time of task %d is %lu\n", pid,
>> + tc->start_time);
> this info already printed in dump_thread_ve_start_time. Is it needed here?
>> +
>> ret = collect_lsm_profile(pid, tc->creds);
>> if (!ret) {
>> /*
>> @@ -1214,12 +1273,17 @@ static int dump_one_zombie(const struct pstree_item *item,
>> CoreEntry *core;
>> int ret = -1;
>> struct cr_img *img;
>> + pid_t virt_pid = item->pid->ns[0].virt;
> Please, use vpid(item)
>>
>> - core = core_entry_alloc(0, 1);
>> + core = core_entry_alloc(1, 1);
>> if (!core)
>> return -1;
>>
>> strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
>> +
>> + if (dump_thread_ve_start_time(virt_pid, core->thread_core))
> virt_pid -> vpid(item)
>> + return -1;
>> +
>> core->tc->task_state = TASK_DEAD;
>> core->tc->exit_code = pps->exit_code;
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 170beab..e564d0f 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -947,6 +947,7 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
>> static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
>> static int prepare_mm(pid_t pid, struct task_restore_args *args);
>>
>> +
> I think that this additional empty line added by mistake
>> static int restore_one_alive_task(int pid, CoreEntry *core)
>> {
>> unsigned args_len;
>> @@ -1122,6 +1123,60 @@ static int wait_on_helpers_zombies(void)
>>
>> static int wait_exiting_children(char *prefix);
>>
>> +static int restore_zombie_start_time(CoreEntry *core, pid_t pid)
>> +{
>> + /* Zombie's start_time restoration differs from the one for a live task.
>> + * Here is why:
>> + *
>> + * For alive task the main codepath is done in pie sigreturn
>> + * restorer. Each thread in a process executes restorer blob code before
>> + * doing a sigreturn. Within a restorer blob each thread get's the chance
>> + * to run start_time restoration code for itself via prctl. Prctl identifies
>> + * each thread and set's start_time field nicely.
>> + *
>> + * Zombie process is special. We don't support multithreaded zombies and we don't run
>> + * any parasite code. We just create one single-threaded process in this very callstack,
>> + * patch some stats and kill it back. Right now we are in this single zombie thread,
>> + * so we can call prctl to set start_time for it right here.
>> + * If we decide to somewhy support zombies more, this code might be changed.
>> + */
>> +
>> + int ret;
>> + unsigned long flags;
>> + long ticks_per_sec;
>> + struct prctl_task_ct_fields ct_fields;
>> +
>> + if (!core->thread_core) {
>> + pr_info("Skipping zombie start_time restore. thread_core missing in dump.\n");
>> + return 0;
>> + }
>> +
>> + if (!core->thread_core->has_start_time) {
>> + pr_info("Skipping start_time restore for old image format.\n");
>> + return 0;
>> + }
>> +
>> + ticks_per_sec = sysconf(_SC_CLK_TCK);
>> + if (ticks_per_sec == -1) {
>> + pr_perror("Failed to get clock ticks via sysconf");
>> + return -1;
>> + }
>> +
>> + ct_fields.real_start_time = core->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
>> + flags = PR_TASK_CT_FIELDS_START_TIME;
>> +
>> + ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0);
>> + if (ret) {
>> + pr_perror("Failed to restore zombie start_time\n");
> \n with pr_perror is not needed
>> + return ret;
>> + }
>> +
>> + pr_info("Restored zombie start_time for task %d is %lu\n",
>> + pid,
>> + core->thread_core->start_time);
>> + return 0;
>> +}
>> +
>> static int restore_one_zombie(CoreEntry *core)
>> {
>> int exit_code = core->tc->exit_code;
>> @@ -1136,6 +1191,9 @@ static int restore_one_zombie(CoreEntry *core)
>>
>> prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>>
>> + if (restore_zombie_start_time(core, vpid(current)))
>> + return -1;
>> +
>> if (task_entries != NULL) {
>> if (wait_exiting_children("zombie"))
>> return -1;
>> @@ -2156,31 +2214,6 @@ static int write_restored_pid(void)
>>
>> extern char *get_dumpee_veid(pid_t pid_real);
>>
>> -#define join_veX(pid) join_ve(pid, true)
>> -
>> -/*
>> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
>> - * of files on CT mounts for security. All mounts created after join_veX are
>> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
>> - * So if you do join_ve0 you can be blocked from executing anything.
>> - *
>> - * https://jira.sw.ru/browse/PSBM-98702
>> - *
>> - * note: If for some reason we will desperately need to execute binaries from
>> - * mounts in the root_yard temporary mntns from VE0 we have an option:
>> - *
>> - * In restore_root_task before calling join_veX we can clone a helper process
>> - * which will create CT userns and mntns first (all mounts are marked as host
>> - * mounts), next after join_veX in restore_root_task we create another helper
>> - * process which setns'es to these user and mnt namespaces, and from these
>> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
>> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
>> - * but all mounts cloned from host will be marked as host mounts, and execution
>> - * on them will be allowed even from VE0.
>> - */
>> -
>> -#define join_ve0(pid) join_ve(pid, false)
>> -
>> /*
>> * To eliminate overhead we don't parse VE cgroup mountpoint
>> * but presume to find it in known place. Otherwise simply
>> @@ -3455,6 +3488,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>>
>> long new_sp;
>> long ret;
>> + long ticks_per_sec;
>>
>> long rst_mem_size;
>> long memzone_size;
>> @@ -3483,6 +3517,12 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>> BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
>> BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
>>
>> + ticks_per_sec = sysconf(_SC_CLK_TCK);
>> + if (ticks_per_sec == -1) {
>> + pr_perror("Failed to get clock ticks via sysconf");
>> + return -1;
>> + }
>> +
>> /*
>> * Read creds info for every thread and allocate memory
>> * needed so we can use this data inside restorer.
>> @@ -3740,6 +3780,15 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>> strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1);
>> thread_args[i].comm[TASK_COMM_LEN - 1] = 0;
>>
>> + if (!tcore->thread_core->has_start_time) {
>> + thread_args[i].restore_start_time = 0;
>> + pr_warn("Skipping restore_start_time for old image version.\n");
>> + } else {
>> + thread_args[i].start_time =
>> + tcore->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
>> + thread_args[i].restore_start_time = 1;
>> + }
>> +
>> if (thread_args[i].pid != pid)
>> core_entry__free_unpacked(tcore, NULL);
>>
>> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
>> index c3f7cb3..902d18a 100644
>> --- a/criu/include/crtools.h
>> +++ b/criu/include/crtools.h
>> @@ -44,6 +44,31 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
>> .actor = name##_cb, \
>> }
>>
>> +#define join_veX(pid) join_ve(pid, true)
>> +
>> +/*
>> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
>> + * of files on CT mounts for security. All mounts created after join_veX are
>> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
>> + * So if you do join_ve0 you can be blocked from executing anything.
>> + *
>> + * https://jira.sw.ru/browse/PSBM-98702
>> + *
>> + * note: If for some reason we will desperately need to execute binaries from
>> + * mounts in the root_yard temporary mntns from VE0 we have an option:
>> + *
>> + * In restore_root_task before calling join_veX we can clone a helper process
>> + * which will create CT userns and mntns first (all mounts are marked as host
>> + * mounts), next after join_veX in restore_root_task we create another helper
>> + * process which setns'es to these user and mnt namespaces, and from these
>> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
>> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
>> + * but all mounts cloned from host will be marked as host mounts, and execution
>> + * on them will be allowed even from VE0.
>> + */
>> +
>> +#define join_ve0(pid) join_ve(pid, false)
>> +
>> int join_ve(pid_t pid, bool veX);
>>
>> #endif /* __CR_CRTOOLS_H__ */
>> diff --git a/criu/include/prctl.h b/criu/include/prctl.h
>> index 8e7fef3..52b9a30 100644
>> --- a/criu/include/prctl.h
>> +++ b/criu/include/prctl.h
>> @@ -82,4 +82,14 @@ struct prctl_mm_map {
>> # define PR_GET_THP_DISABLE 42
>> #endif
>>
>> +#ifndef PR_SET_TASK_CT_FIELDS
>> +/* Set task container related fields */
>> +#define PR_SET_TASK_CT_FIELDS 1000
>> +#define PR_TASK_CT_FIELDS_START_TIME (1ULL << 0)
>> +
>> +struct prctl_task_ct_fields {
>> + s64 real_start_time;
>> +};
>> +#endif
>> +
>> #endif /* __CR_PRCTL_H__ */
>> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
>> index 07f0439..9f43dd8 100644
>> --- a/criu/include/restorer.h
>> +++ b/criu/include/restorer.h
>> @@ -112,6 +112,15 @@ struct thread_restore_args {
>> bool seccomp_force_tsync;
>>
>> char comm[TASK_COMM_LEN];
>> +
>> + /* set to 1 if start_time value is valid
>> + * and should be used for restore process
>> + */
>> + bool restore_start_time;
>> + /* start_time of a recovered process
>> + * in ticks format as shown in /proc/pid/stat(22)
>> + */
>> + unsigned long long start_time;
>> } __aligned(64);
>>
>> typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index 0351986..945a00a 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -113,6 +113,28 @@ void parasite_cleanup(void)
>> {
>> }
>>
>> +static int restore_start_time(struct thread_restore_args *args)
>> +{
>> + int ret;
>> + unsigned long flags;
>> + struct prctl_task_ct_fields ct_fields;
>> +
>> + if (!args->restore_start_time)
>> + return 0;
>> +
>> + ct_fields.real_start_time = args->start_time;
>> + flags = PR_TASK_CT_FIELDS_START_TIME;
>> +
>> + ret = sys_prctl_safe(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0);
>> + if (ret) {
>> + pr_info("Failed to restore start_time\n");
>> + return ret;
>> + }
>> +
>> + pr_info("Restored start_time to %lld\n", args->start_time);
>> + return 0;
>> +}
>> +
>> extern void cr_restore_rt (void) asm ("__cr_restore_rt")
>> __attribute__ ((visibility ("hidden")));
>>
>> @@ -523,6 +545,9 @@ static int restore_thread_common(struct thread_restore_args *args)
>>
>> restore_sched_info(&args->sp);
>>
>> + if (restore_start_time(args))
>> + return -1;
>> +
>> if (restore_nonsigframe_gpregs(&args->gpregs))
>> return -1;
>>
>> diff --git a/images/core.proto b/images/core.proto
>> index 6ef5f50..41f412f 100644
>> --- a/images/core.proto
>> +++ b/images/core.proto
>> @@ -119,6 +119,7 @@ message thread_core_entry {
>> optional uint32 seccomp_filter = 12;
>>
>> optional string comm = 13;
>> + optional uint64 start_time = 14;
>> }
>>
>> message task_rlimits_entry {
>> --
>> 1.8.3.1
>>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
More information about the Devel
mailing list