[Devel] [PATCH CRIU 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Tue Jan 14 11:27:45 MSK 2020


Initially I missed it, but ignoring retcode is really bad mistake.

On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>   criu/cr-dump.c         | 49 ++++++++++++++++++++++++++++++++++++
>   criu/cr-restore.c      | 68 ++++++++++++++++++++++++++++++--------------------
>   criu/include/crtools.h | 31 +++++++++++++++++++++++
>   images/core.proto      |  2 ++
>   4 files changed, 123 insertions(+), 27 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..751141f 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -1037,6 +1037,47 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
>   	return ret;
>   }
>   
> +struct get_internal_start_time_rq {
> +	int pid;
> +	unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> +	struct proc_pid_stat p;
> +	struct get_internal_start_time_rq *r =
> +		(struct get_internal_start_time_rq *)arg;
> +
> +	/* We need to join ve to access container relative
> +	 * value of task's start_time, otherwize we will see
> +	 * start_time visible to host.
> +	 */
> +	join_veX(r->pid);

Not checking return code here.

> +
> +	parse_pid_stat(r->pid, &p);

And here.

> +	r->result = p.start_time;
> +	return 0;
> +}
> +
> +static int dump_task_internal_start_time(int pid, TaskCoreEntry *tc)
> +{
> +	int ret;
> +	struct get_internal_start_time_rq r = {
> +		.pid = pid,
> +		.result = 0
> +	};
> +
> +	ret = call_in_child_process(child_get_internal_start_time, &r);
> +	if (ret) {
> +		pr_err("Failed to exec in child\n");
> +		return ret;
> +	}
> +
> +	tc->has_start_time = 1;
> +	tc->start_time = r.result;
> +	return 0;
> +}
> +
>   static int dump_task_core_all(struct parasite_ctl *ctl,
>   			      struct pstree_item *item,
>   			      const struct proc_pid_stat *stat,
> @@ -1063,6 +1104,14 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
>   	core->tc->task_state = item->pid->state;
>   	core->tc->exit_code = 0;
>   
> +	ret = dump_task_internal_start_time(pid, core->tc);
> +	if (ret) {
> +		pr_err("Failed to dump start_time for task %d\n", pid);
> +		goto err;
> +	}
> +
> +	pr_info("Dumped start_time of task %d is %lu\n", pid, core->tc->start_time);
> +
>   	if (stat->tty_nr) {
>   		struct pstree_item *p = item;
>   
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..c9777ae 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,40 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
>   static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
>   static int prepare_mm(pid_t pid, struct task_restore_args *args);
>   
> +static int restore_start_time(int pid, CoreEntry *core)
> +{
> +	unsigned long long total_nsec;
> +	unsigned long flags;
> +	long tps;
> +	struct prctl_task_ct_fields ct_fields;
> +
> +	if (!core->tc->has_start_time) {
> +		pr_warn("Skipping restore_start_time for old image version.\n");
> +		return -1;
> +	}
> +
> +	tps = sysconf(_SC_CLK_TCK);
> +	if (tps == -1) {
> +		pr_perror("Failed to get clock ticks via sysconf");
> +		return -1;
> +	}
> +
> +	total_nsec = core->tc->start_time * (NSEC_PER_SEC / tps);
> +
> +	ct_fields.real_start_time = total_nsec;
> +	flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> +	if (prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0)) {
> +		pr_perror("Can't set process start time");
> +		return -1;
> +	}
> +
> +	pr_info("Restored start_time of task %d is %lu\n",
> +		pid, core->tc->start_time);
> +
> +	return 0;
> +}
> +
>   static int restore_one_alive_task(int pid, CoreEntry *core)
>   {
>   	unsigned args_len;
> @@ -955,6 +989,8 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
>   
>   	rst_mem_switch_to_private();
>   
> +	restore_start_time(pid, core);

And here. You should explicitly return 0 in case of EINVAL returned from 
prctl and write a warning that prctl is not supported, and you should 
also return 0 in case of !has_start_time for backward compatibility. 
Sysconf errors and all other errors from prctl do matter, we can't 
ignore them.

I think we need to rework your patch via kdat to call 
prctl(PR_SET_TASK_CT_FIELDS) only once if it's not supported by kernel.

> +
>   	args_len = round_up(sizeof(*ta) + sizeof(struct thread_restore_args) *
>   			current->nr_threads, page_size());
>   	ta = mmap(NULL, args_len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
> @@ -1122,7 +1158,7 @@ static int wait_on_helpers_zombies(void)
>   
>   static int wait_exiting_children(char *prefix);
>   
> -static int restore_one_zombie(CoreEntry *core)
> +static int restore_one_zombie(int pid, CoreEntry *core)
>   {
>   	int exit_code = core->tc->exit_code;
>   
> @@ -1134,6 +1170,8 @@ static int restore_one_zombie(CoreEntry *core)
>   	if (lazy_pages_setup_zombie(vpid(current)))
>   		return -1;
>   
> +	restore_start_time(pid, core);
> +
>   	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>   
>   	if (task_entries != NULL) {
> @@ -1142,6 +1180,7 @@ static int restore_one_zombie(CoreEntry *core)
>   		zombie_prepare_signals();
>   	}
>   
> +
>   	if (exit_code & 0x7f) {
>   		int signr;
>   
> @@ -1344,7 +1383,7 @@ static int restore_one_task(int pid, CoreEntry *core)
>   	if (task_alive(current))
>   		ret = restore_one_alive_task(pid, core);
>   	else if (current->pid->state == TASK_DEAD)
> -		ret = restore_one_zombie(core);
> +		ret = restore_one_zombie(pid, core);
>   	else if (current->pid->state == TASK_HELPER) {
>   		ret = restore_one_helper();
>   	} else {
> @@ -2156,31 +2195,6 @@ static int write_restored_pid(void)
>   
>   extern char *get_dumpee_veid(pid_t pid_real);
>   
> -#define join_veX(pid)	join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid)	join_ve(pid, false)
> -
>   /*
>    * To eliminate overhead we don't parse VE cgroup mountpoint
>    * but presume to find it in known place. Otherwise simply
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..acb1faf 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,37 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
>   			.actor = name##_cb,			\
>   	}
>   
> +#define join_veX(pid)	join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviousely without CLONE_NEWNS and
> + * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid)	join_ve(pid, false)
> +
> +/*
> + * To eliminate overhead we don't parse VE cgroup mountpoint
> + * but presume to find it in known place. Otherwise simply
> + * don't enter into veX with one warning.
> + */
> +
>   int join_ve(pid_t pid, bool veX);
>   
>   #endif /* __CR_CRTOOLS_H__ */
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..c164c7a 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -50,6 +50,7 @@ message task_core_entry_VZ730 {
>   	optional int32			tty_nr		= 15;
>   	optional int32			tty_pgrp	= 16;
>   	repeated sa_entry		sigactions	= 17;
> +	optional uint64			start_time	= 19;
>   }
>   
>   message task_core_entry {
> @@ -79,6 +80,7 @@ message task_core_entry {
>   	repeated sa_entry		sigactions	= 15;
>   	optional int32			tty_nr		= 16;
>   	optional int32			tty_pgrp	= 17;
> +	optional uint64			start_time	= 19;
>   }
>   
>   message task_kobj_ids_entry {
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.



More information about the Devel mailing list