[Devel] [PATCH CRIU v4 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Tue Feb 4 12:59:28 MSK 2020
1) Agree with all comments from Alexander.
2) Also we still have a problem with backward compatibility (I wrote
about it previously): on an old kernel, criu with this patch gives us:
[root at silo criu]# vzctl suspend test-starttime
Setting up checkpoint...
Checkpointing finished successfully
Unmount image: /vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Container is unmounted
Checkpointing completed successfully
[root at silo criu]# vzctl resume test-starttime
Restoring the Container ...
Mount image: /vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Container is mounted
Setting permissions for
image=/vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Configure memguarantee: 0%
(01.145145) pie: 236: Error (criu/pie/restorer.c:128): prctl failed @128
with -22
(01.145154) pie: 236: Error (criu/pie/restorer.c:1995): Restorer fail 236
(01.146428) pie: 237: Error (criu/pie/restorer.c:128): prctl failed @128
with -22
(01.146435) pie: 237: Error (criu/pie/restorer.c:1995): Restorer fail 237
(01.263444) Error (criu/cr-restore.c:2612): Restoring FAILED.
The restore log was saved in
/vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/dump/Dump/restore.log
criu exited with rc=17
Unmount image: /vz/private/40808b0d-5001-49df-9b72-c5ee5b5b5659/root.hdd
Container is unmounted
Failed to restore the Container
[root at silo criu]# uname -r
3.10.0-957.21.3.vz7.106.10
I think we should deliberately ignore EINVAL from prctl and print a
warning that the kernel does not support this prctl. (The best way is to
detect it once, save the result in a new kdat.prctl_set_starttime_supported
field, and skip the prctl based on that field later.)
3) Please also add #include "prctl.h" in criu/cr-restore.c so that we
can compile it on older kernels. (I doubt our build system updates
kernel headers too often.)
On 1/29/20 11:43 AM, Alexander Mikhalitsyn wrote:
> On Tue, 28 Jan 2020 20:19:40 +0300
> Valeriy Vdovin <valeriy.vdovin at virtuozzo.com> wrote:
>
>> https://jira.sw.ru/browse/PSBM-64123
>>
>> Introducing 'start_time' field into core image.
>> It is stored during suspend for each process inside of a dumped
>> container and set back to each new process at container restore
>> operation. Container will see this value after restore instead of the
>> real start time of all restored processes.
>>
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
>> ---
>> criu/cr-dump.c | 66 ++++++++++++++++++++++++++++++++-
>> criu/cr-restore.c | 99 ++++++++++++++++++++++++++++++++++++-------------
>> criu/include/crtools.h | 25 +++++++++++++
>> criu/include/prctl.h | 10 +++++
>> criu/include/restorer.h | 9 +++++
>> criu/pie/restorer.c | 25 +++++++++++++
>> images/core.proto | 1 +
>> 7 files changed, 209 insertions(+), 26 deletions(-)
>>
>> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
>> index 45626e8..89373b1 100644
>> --- a/criu/cr-dump.c
>> +++ b/criu/cr-dump.c
>> @@ -996,12 +996,71 @@ static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_im
>> return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
>> }
>>
>> +struct get_internal_start_time_rq {
>> + int pid;
>> + unsigned long long result;
>> +};
>> +
>> +static int child_get_internal_start_time(void *arg)
>> +{
>> + struct proc_pid_stat p;
>> + struct get_internal_start_time_rq *r =
>> + (struct get_internal_start_time_rq *)arg;
>> +
>> + /* We need to join ve to access container relative
>> + * value of task's start_time, otherwize we will see
>> + * start_time visible to host.
>> + */
>> + if (join_veX(r->pid)) {
>> + pr_err("Failed to join ve, owning process %d\n", r->pid);
>> + return -1;
>> + }
>> +
>> + if (parse_pid_stat(r->pid, &p)) {
>> + pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", r->pid);
>> + return -1;
>> + }
>> +
>> + r->result = p.start_time;
>> + return 0;
>> +}
>> +
>> +static int dump_thread_ve_start_time(int pid, ThreadCoreEntry *thread_core)
>> +{
>> + int ret;
>> + struct get_internal_start_time_rq r = {
>> + .pid = pid,
>> + .result = 0
>> + };
>> +
>> + ret = call_in_child_process(child_get_internal_start_time, &r);
>> + if (ret) {
>> + pr_err("Failed to get internal start_time of a process from ve\n");
>> + return ret;
>> + }
>> +
>> + thread_core->has_start_time = 1;
> stylistic suggestion
> 1 -> true
>> + thread_core->start_time = r.result;
>> +
>> + pr_info("Dumped start_time for task %d is %lu\n", pid, thread_core->start_time);
>> + return 0;
>> +}
>> +
>> int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
>>
>> {
>> int ret;
>> ThreadCoreEntry *tc = core->thread_core;
>>
>> + ret = dump_thread_ve_start_time(pid, tc);
>> + if (ret) {
>> + pr_err("Failed to dump start_time for task %d\n", pid);
>> + return -1;
>> + }
>> +
>> + pr_info("Dumped start_time of task %d is %lu\n", pid,
>> + tc->start_time);
> this info already printed in dump_thread_ve_start_time. Is it needed here?
>> +
>> ret = collect_lsm_profile(pid, tc->creds);
>> if (!ret) {
>> /*
>> @@ -1214,12 +1273,17 @@ static int dump_one_zombie(const struct pstree_item *item,
>> CoreEntry *core;
>> int ret = -1;
>> struct cr_img *img;
>> + pid_t virt_pid = item->pid->ns[0].virt;
> Please, use vpid(item)
>>
>> - core = core_entry_alloc(0, 1);
>> + core = core_entry_alloc(1, 1);
>> if (!core)
>> return -1;
>>
>> strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
>> +
>> + if (dump_thread_ve_start_time(virt_pid, core->thread_core))
> virt_pid -> vpid(item)
>> + return -1;
>> +
>> core->tc->task_state = TASK_DEAD;
>> core->tc->exit_code = pps->exit_code;
>>
>> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
>> index 170beab..e564d0f 100644
>> --- a/criu/cr-restore.c
>> +++ b/criu/cr-restore.c
>> @@ -947,6 +947,7 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
>> static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
>> static int prepare_mm(pid_t pid, struct task_restore_args *args);
>>
>> +
> I think that this additional empty line added by mistake
>> static int restore_one_alive_task(int pid, CoreEntry *core)
>> {
>> unsigned args_len;
>> @@ -1122,6 +1123,60 @@ static int wait_on_helpers_zombies(void)
>>
>> static int wait_exiting_children(char *prefix);
>>
>> +static int restore_zombie_start_time(CoreEntry *core, pid_t pid)
>> +{
>> + /* Zombie's start_time restoration differs from the one for a live task.
>> + * Here is why:
>> + *
>> + * For alive task the main codepath is done in pie sigreturn
>> + * restorer. Each thread in a process executes restorer blob code before
>> + * doing a sigreturn. Within a restorer blob each thread get's the chance
>> + * to run start_time restoration code for itself via prctl. Prctl identifies
>> + * each thread and set's start_time field nicely.
>> + *
>> + * Zombie process is special. We don't support multithreaded zombies and we don't run
>> + * any parasite code. We just create one single-threaded process in this very callstack,
>> + * patch some stats and kill it back. Right now we are in this single zombie thread,
>> + * so we can call prctl to set start_time for it right here.
>> + * If we decide to somewhy support zombies more, this code might be changed.
>> + */
>> +
>> + int ret;
>> + unsigned long flags;
>> + long ticks_per_sec;
>> + struct prctl_task_ct_fields ct_fields;
>> +
>> + if (!core->thread_core) {
>> + pr_info("Skipping zombie start_time restore. thread_core missing in dump.\n");
>> + return 0;
>> + }
>> +
>> + if (!core->thread_core->has_start_time) {
>> + pr_info("Skipping start_time restore for old image format.\n");
>> + return 0;
>> + }
>> +
>> + ticks_per_sec = sysconf(_SC_CLK_TCK);
>> + if (ticks_per_sec == -1) {
>> + pr_perror("Failed to get clock ticks via sysconf");
>> + return -1;
>> + }
>> +
>> + ct_fields.real_start_time = core->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
>> + flags = PR_TASK_CT_FIELDS_START_TIME;
>> +
>> + ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0);
>> + if (ret) {
>> + pr_perror("Failed to restore zombie start_time\n");
> \n with pr_perror is not needed
>> + return ret;
>> + }
>> +
>> + pr_info("Restored zombie start_time for task %d is %lu\n",
>> + pid,
>> + core->thread_core->start_time);
>> + return 0;
>> +}
>> +
>> static int restore_one_zombie(CoreEntry *core)
>> {
>> int exit_code = core->tc->exit_code;
>> @@ -1136,6 +1191,9 @@ static int restore_one_zombie(CoreEntry *core)
>>
>> prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>>
>> + if (restore_zombie_start_time(core, vpid(current)))
>> + return -1;
>> +
>> if (task_entries != NULL) {
>> if (wait_exiting_children("zombie"))
>> return -1;
>> @@ -2156,31 +2214,6 @@ static int write_restored_pid(void)
>>
>> extern char *get_dumpee_veid(pid_t pid_real);
>>
>> -#define join_veX(pid) join_ve(pid, true)
>> -
>> -/*
>> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
>> - * of files on CT mounts for security. All mounts created after join_veX are
>> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
>> - * So if you do join_ve0 you can be blocked from executing anything.
>> - *
>> - * https://jira.sw.ru/browse/PSBM-98702
>> - *
>> - * note: If for some reason we will desperately need to execute binaries from
>> - * mounts in the root_yard temporary mntns from VE0 we have an option:
>> - *
>> - * In restore_root_task before calling join_veX we can clone a helper process
>> - * which will create CT userns and mntns first (all mounts are marked as host
>> - * mounts), next after join_veX in restore_root_task we create another helper
>> - * process which setns'es to these user and mnt namespaces, and from these
>> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
>> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
>> - * but all mounts cloned from host will be marked as host mounts, and execution
>> - * on them will be allowed even from VE0.
>> - */
>> -
>> -#define join_ve0(pid) join_ve(pid, false)
>> -
>> /*
>> * To eliminate overhead we don't parse VE cgroup mountpoint
>> * but presume to find it in known place. Otherwise simply
>> @@ -3455,6 +3488,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>>
>> long new_sp;
>> long ret;
>> + long ticks_per_sec;
>>
>> long rst_mem_size;
>> long memzone_size;
>> @@ -3483,6 +3517,12 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>> BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
>> BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
>>
>> + ticks_per_sec = sysconf(_SC_CLK_TCK);
>> + if (ticks_per_sec == -1) {
>> + pr_perror("Failed to get clock ticks via sysconf");
>> + return -1;
>> + }
>> +
>> /*
>> * Read creds info for every thread and allocate memory
>> * needed so we can use this data inside restorer.
>> @@ -3740,6 +3780,15 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>> strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1);
>> thread_args[i].comm[TASK_COMM_LEN - 1] = 0;
>>
>> + if (!tcore->thread_core->has_start_time) {
>> + thread_args[i].restore_start_time = 0;
>> + pr_warn("Skipping restore_start_time for old image version.\n");
>> + } else {
>> + thread_args[i].start_time =
>> + tcore->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
>> + thread_args[i].restore_start_time = 1;
>> + }
>> +
>> if (thread_args[i].pid != pid)
>> core_entry__free_unpacked(tcore, NULL);
>>
>> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
>> index c3f7cb3..902d18a 100644
>> --- a/criu/include/crtools.h
>> +++ b/criu/include/crtools.h
>> @@ -44,6 +44,31 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
>> .actor = name##_cb, \
>> }
>>
>> +#define join_veX(pid) join_ve(pid, true)
>> +
>> +/*
>> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
>> + * of files on CT mounts for security. All mounts created after join_veX are
>> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
>> + * So if you do join_ve0 you can be blocked from executing anything.
>> + *
>> + * https://jira.sw.ru/browse/PSBM-98702
>> + *
>> + * note: If for some reason we will desperately need to execute binaries from
>> + * mounts in the root_yard temporary mntns from VE0 we have an option:
>> + *
>> + * In restore_root_task before calling join_veX we can clone a helper process
>> + * which will create CT userns and mntns first (all mounts are marked as host
>> + * mounts), next after join_veX in restore_root_task we create another helper
>> + * process which setns'es to these user and mnt namespaces, and from these
>> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
>> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
>> + * but all mounts cloned from host will be marked as host mounts, and execution
>> + * on them will be allowed even from VE0.
>> + */
>> +
>> +#define join_ve0(pid) join_ve(pid, false)
>> +
>> int join_ve(pid_t pid, bool veX);
>>
>> #endif /* __CR_CRTOOLS_H__ */
>> diff --git a/criu/include/prctl.h b/criu/include/prctl.h
>> index 8e7fef3..52b9a30 100644
>> --- a/criu/include/prctl.h
>> +++ b/criu/include/prctl.h
>> @@ -82,4 +82,14 @@ struct prctl_mm_map {
>> # define PR_GET_THP_DISABLE 42
>> #endif
>>
>> +#ifndef PR_SET_TASK_CT_FIELDS
>> +/* Set task container related fields */
>> +#define PR_SET_TASK_CT_FIELDS 1000
>> +#define PR_TASK_CT_FIELDS_START_TIME (1ULL << 0)
>> +
>> +struct prctl_task_ct_fields {
>> + s64 real_start_time;
>> +};
>> +#endif
>> +
>> #endif /* __CR_PRCTL_H__ */
>> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
>> index 07f0439..9f43dd8 100644
>> --- a/criu/include/restorer.h
>> +++ b/criu/include/restorer.h
>> @@ -112,6 +112,15 @@ struct thread_restore_args {
>> bool seccomp_force_tsync;
>>
>> char comm[TASK_COMM_LEN];
>> +
>> + /* set to 1 if start_time value is valid
>> + * and should be used for restore process
>> + */
>> + bool restore_start_time;
>> + /* start_time of a recovered process
>> + * in ticks format as shown in /proc/pid/stat(22)
>> + */
>> + unsigned long long start_time;
>> } __aligned(64);
>>
>> typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
>> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
>> index 0351986..945a00a 100644
>> --- a/criu/pie/restorer.c
>> +++ b/criu/pie/restorer.c
>> @@ -113,6 +113,28 @@ void parasite_cleanup(void)
>> {
>> }
>>
>> +static int restore_start_time(struct thread_restore_args *args)
>> +{
>> + int ret;
>> + unsigned long flags;
>> + struct prctl_task_ct_fields ct_fields;
>> +
>> + if (!args->restore_start_time)
>> + return 0;
>> +
>> + ct_fields.real_start_time = args->start_time;
>> + flags = PR_TASK_CT_FIELDS_START_TIME;
>> +
>> + ret = sys_prctl_safe(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0);
>> + if (ret) {
>> + pr_info("Failed to restore start_time\n");
>> + return ret;
>> + }
>> +
>> + pr_info("Restored start_time to %lld\n", args->start_time);
>> + return 0;
>> +}
>> +
>> extern void cr_restore_rt (void) asm ("__cr_restore_rt")
>> __attribute__ ((visibility ("hidden")));
>>
>> @@ -523,6 +545,9 @@ static int restore_thread_common(struct thread_restore_args *args)
>>
>> restore_sched_info(&args->sp);
>>
>> + if (restore_start_time(args))
>> + return -1;
>> +
>> if (restore_nonsigframe_gpregs(&args->gpregs))
>> return -1;
>>
>> diff --git a/images/core.proto b/images/core.proto
>> index 6ef5f50..41f412f 100644
>> --- a/images/core.proto
>> +++ b/images/core.proto
>> @@ -119,6 +119,7 @@ message thread_core_entry {
>> optional uint32 seccomp_filter = 12;
>>
>> optional string comm = 13;
>> + optional uint64 start_time = 14;
>> }
>>
>> message task_rlimits_entry {
>> --
>> 1.8.3.1
>>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
More information about the Devel
mailing list