[Devel] [PATCH CRIU v4 2/2] dump/restore: Maintain proper start_time param from /proc/[pid]/stat for each task

Alexander Mikhalitsyn alexander.mikhalitsyn at virtuozzo.com
Wed Jan 29 11:43:41 MSK 2020


On Tue, 28 Jan 2020 20:19:40 +0300
Valeriy Vdovin <valeriy.vdovin at virtuozzo.com> wrote:

> https://jira.sw.ru/browse/PSBM-64123
> 
> Introducing 'start_time' field into core image.
> It is stored during suspend for each process inside of a dumped
> container and set back to each new process at container restore
> operation. Container will see this value after restore instead of the
> real start time of all restored processes.
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>  criu/cr-dump.c          | 66 ++++++++++++++++++++++++++++++++-
>  criu/cr-restore.c       | 99 ++++++++++++++++++++++++++++++++++++-------------
>  criu/include/crtools.h  | 25 +++++++++++++
>  criu/include/prctl.h    | 10 +++++
>  criu/include/restorer.h |  9 +++++
>  criu/pie/restorer.c     | 25 +++++++++++++
>  images/core.proto       |  1 +
>  7 files changed, 209 insertions(+), 26 deletions(-)
> 
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 45626e8..89373b1 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -996,12 +996,71 @@ static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_im
>  	return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
>  }
>  
> +struct get_internal_start_time_rq {
> +	int pid;
> +	unsigned long long result;
> +};
> +
> +static int child_get_internal_start_time(void *arg)
> +{
> +	struct proc_pid_stat p;
> +	struct get_internal_start_time_rq *r =
> +		(struct get_internal_start_time_rq *)arg;
> +
> +	/* We need to join ve to access container relative
> +	 * value of task's start_time, otherwise we will see
> +	 * start_time visible to host.
> +	 */
> +	if (join_veX(r->pid)) {
> +		pr_err("Failed to join ve, owning process %d\n", r->pid);
> +		return -1;
> +	}
> +
> +	if (parse_pid_stat(r->pid, &p)) {
> +		pr_err("Failed to parse /proc/[pid]/stat for process: %d\n", r->pid);
> +		return -1;
> +	}
> +
> +	r->result = p.start_time;
> +	return 0;
> +}
> +
> +static int dump_thread_ve_start_time(int pid, ThreadCoreEntry *thread_core)
> +{
> +	int ret;
> +	struct get_internal_start_time_rq r = {
> +		.pid = pid,
> +		.result = 0
> +	};
> +
> +	ret = call_in_child_process(child_get_internal_start_time, &r);
> +	if (ret) {
> +		pr_err("Failed to get internal start_time of a process from ve\n");
> +		return ret;
> +	}
> +
> +	thread_core->has_start_time = 1;
Stylistic suggestion: since has_start_time is a boolean field, use `true` instead of `1`.
> +	thread_core->start_time = r.result;
> +
> +	pr_info("Dumped start_time for task %d is %lu\n", pid, thread_core->start_time);
> +	return 0;
> +}
> +
>  int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
>  
>  {
>  	int ret;
>  	ThreadCoreEntry *tc = core->thread_core;
>  
> +	ret = dump_thread_ve_start_time(pid, tc);
> +	if (ret) {
> +		pr_err("Failed to dump start_time for task %d\n", pid);
> +		return -1;
> +	}
> +
> +	pr_info("Dumped start_time of task %d is %lu\n", pid,
> +		tc->start_time);
This info is already printed in dump_thread_ve_start_time. Is it needed here as well?
> +
>  	ret = collect_lsm_profile(pid, tc->creds);
>  	if (!ret) {
>  		/*
> @@ -1214,12 +1273,17 @@ static int dump_one_zombie(const struct pstree_item *item,
>  	CoreEntry *core;
>  	int ret = -1;
>  	struct cr_img *img;
> +	pid_t virt_pid = item->pid->ns[0].virt;
Please use vpid(item) instead of open-coding item->pid->ns[0].virt.
>  
> -	core = core_entry_alloc(0, 1);
> +	core = core_entry_alloc(1, 1);
>  	if (!core)
>  		return -1;
>  
>  	strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
> +
> +	if (dump_thread_ve_start_time(virt_pid, core->thread_core))
Same here: replace virt_pid with vpid(item).
> +		return -1;
> +
>  	core->tc->task_state = TASK_DEAD;
>  	core->tc->exit_code = pps->exit_code;
>  
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 170beab..e564d0f 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -947,6 +947,7 @@ static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
>  static int prepare_itimers(int pid, struct task_restore_args *args, CoreEntry *core);
>  static int prepare_mm(pid_t pid, struct task_restore_args *args);
>  
> +
I think this additional empty line was added by mistake.
>  static int restore_one_alive_task(int pid, CoreEntry *core)
>  {
>  	unsigned args_len;
> @@ -1122,6 +1123,60 @@ static int wait_on_helpers_zombies(void)
>  
>  static int wait_exiting_children(char *prefix);
>  
> +static int restore_zombie_start_time(CoreEntry *core, pid_t pid)
> +{
> +	/* Zombie's start_time restoration differs from the one for a live task.
> +	 * Here is why:
> +	 *
> +	 * For alive task the main codepath is done in pie sigreturn
> +	 * restorer. Each thread in a process executes restorer blob code before
> +	 * doing a sigreturn. Within a restorer blob each thread gets the chance
> +	 * to run start_time restoration code for itself via prctl. Prctl identifies
> +	 * each thread and sets the start_time field nicely.
> +	 *
> +	 * Zombie process is special. We don't support multithreaded zombies and we don't run
> +	 * any parasite code. We just create one single-threaded process in this very callstack,
> +	 * patch some stats and kill it back. Right now we are in this single zombie thread,
> +	 * so we can call prctl to set start_time for it right here.
> +	 * If for some reason we decide to support zombies more fully, this code might be changed.
> +	 */
> +
> +	int ret;
> +	unsigned long flags;
> +	long ticks_per_sec;
> +	struct prctl_task_ct_fields ct_fields;
> +
> +	if (!core->thread_core) {
> +		pr_info("Skipping zombie start_time restore. thread_core missing in dump.\n");
> +		return 0;
> +	}
> +
> +	if (!core->thread_core->has_start_time) {
> +		pr_info("Skipping start_time restore for old image format.\n");
> +		return 0;
> +	}
> +
> +	ticks_per_sec = sysconf(_SC_CLK_TCK);
> +	if (ticks_per_sec == -1) {
> +		pr_perror("Failed to get clock ticks via sysconf");
> +		return -1;
> +	}
> +
> +	ct_fields.real_start_time = core->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
> +	flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> +	ret = prctl(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0, 0);
> +	if (ret) {
> +		pr_perror("Failed to restore zombie start_time\n");
The trailing \n is not needed with pr_perror — it appends the newline (and errno string) itself.
> +		return ret;
> +	}
> +
> +	pr_info("Restored zombie start_time for task %d is %lu\n",
> +		pid,
> +		core->thread_core->start_time);
> +	return 0;
> +}
> +
>  static int restore_one_zombie(CoreEntry *core)
>  {
>  	int exit_code = core->tc->exit_code;
> @@ -1136,6 +1191,9 @@ static int restore_one_zombie(CoreEntry *core)
>  
>  	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
>  
> +	if (restore_zombie_start_time(core, vpid(current)))
> +		return -1;
> +
>  	if (task_entries != NULL) {
>  		if (wait_exiting_children("zombie"))
>  			return -1;
> @@ -2156,31 +2214,6 @@ static int write_restored_pid(void)
>  
>  extern char *get_dumpee_veid(pid_t pid_real);
>  
> -#define join_veX(pid)	join_ve(pid, true)
> -
> -/*
> - * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> - * of files on CT mounts for security. All mounts created after join_veX are
> - * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> - * So if you do join_ve0 you can be blocked from executing anything.
> - *
> - * https://jira.sw.ru/browse/PSBM-98702
> - *
> - * note: If for some reason we will desperately need to execute binaries from
> - * mounts in the root_yard temporary mntns from VE0 we have an option:
> - *
> - * In restore_root_task before calling join_veX we can clone a helper process
> - * which will create CT userns and mntns first (all mounts are marked as host
> - * mounts), next after join_veX in restore_root_task we create another helper
> - * process which setns'es to these user and mnt namespaces, and from these
> - * helper we can clone CT init process obviousely without CLONE_NEWNS and
> - * CLONE_NEWUSER. These way userns, mntns, ve will be preserved for all tasks
> - * but all mounts cloned from host will be marked as host mounts, and execution
> - * on them will be allowed even from VE0.
> - */
> -
> -#define join_ve0(pid)	join_ve(pid, false)
> -
>  /*
>   * To eliminate overhead we don't parse VE cgroup mountpoint
>   * but presume to find it in known place. Otherwise simply
> @@ -3455,6 +3488,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  
>  	long new_sp;
>  	long ret;
> +	long ticks_per_sec;
>  
>  	long rst_mem_size;
>  	long memzone_size;
> @@ -3483,6 +3517,12 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  	BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
>  	BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
>  
> +	ticks_per_sec = sysconf(_SC_CLK_TCK);
> +	if (ticks_per_sec == -1) {
> +		pr_perror("Failed to get clock ticks via sysconf");
> +		return -1;
> +	}
> +
>  	/*
>  	 * Read creds info for every thread and allocate memory
>  	 * needed so we can use this data inside restorer.
> @@ -3740,6 +3780,15 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
>  			strncpy(thread_args[i].comm, core->tc->comm, TASK_COMM_LEN - 1);
>  		thread_args[i].comm[TASK_COMM_LEN - 1] = 0;
>  
> +		if (!tcore->thread_core->has_start_time) {
> +			thread_args[i].restore_start_time = 0;
> +			pr_warn("Skipping restore_start_time for old image version.\n");
> +		} else {
> +			thread_args[i].start_time =
> +				tcore->thread_core->start_time * (NSEC_PER_SEC / ticks_per_sec);
> +			thread_args[i].restore_start_time = 1;
> +		}
> +
>  		if (thread_args[i].pid != pid)
>  			core_entry__free_unpacked(tcore, NULL);
>  
> diff --git a/criu/include/crtools.h b/criu/include/crtools.h
> index c3f7cb3..902d18a 100644
> --- a/criu/include/crtools.h
> +++ b/criu/include/crtools.h
> @@ -44,6 +44,31 @@ extern void pr_check_features(const char *offset, const char *sep, int width);
>  			.actor = name##_cb,			\
>  	}
>  
> +#define join_veX(pid)	join_ve(pid, true)
> +
> +/*
> + * Use join_ve0 very carefully! We have checks in kernel to prohibit execution
> + * of files on CT mounts for security. All mounts created after join_veX are
> + * marked as CT mounts, including all mounts of the root_yard temporary mntns.
> + * So if you do join_ve0 you can be blocked from executing anything.
> + *
> + * https://jira.sw.ru/browse/PSBM-98702
> + *
> + * note: If for some reason we will desperately need to execute binaries from
> + * mounts in the root_yard temporary mntns from VE0 we have an option:
> + *
> + * In restore_root_task before calling join_veX we can clone a helper process
> + * which will create CT userns and mntns first (all mounts are marked as host
> + * mounts), next after join_veX in restore_root_task we create another helper
> + * process which setns'es to these user and mnt namespaces, and from these
> + * helper we can clone CT init process obviously without CLONE_NEWNS and
> + * CLONE_NEWUSER. This way userns, mntns, ve will be preserved for all tasks
> + * but all mounts cloned from host will be marked as host mounts, and execution
> + * on them will be allowed even from VE0.
> + */
> +
> +#define join_ve0(pid)	join_ve(pid, false)
> +
>  int join_ve(pid_t pid, bool veX);
>  
>  #endif /* __CR_CRTOOLS_H__ */
> diff --git a/criu/include/prctl.h b/criu/include/prctl.h
> index 8e7fef3..52b9a30 100644
> --- a/criu/include/prctl.h
> +++ b/criu/include/prctl.h
> @@ -82,4 +82,14 @@ struct prctl_mm_map {
>  # define PR_GET_THP_DISABLE	42
>  #endif
>  
> +#ifndef PR_SET_TASK_CT_FIELDS
> +/* Set task container related fields */
> +#define PR_SET_TASK_CT_FIELDS	1000
> +#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
> +
> +struct prctl_task_ct_fields {
> +	s64 real_start_time;
> +};
> +#endif
> +
>  #endif /* __CR_PRCTL_H__ */
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index 07f0439..9f43dd8 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -112,6 +112,15 @@ struct thread_restore_args {
>  	bool				seccomp_force_tsync;
>  
>  	char				comm[TASK_COMM_LEN];
> +
> +	/* set to 1 if start_time value is valid
> +	 * and should be used for restore process
> +	 */
> +	bool				restore_start_time;
> +	/* start_time of a recovered process
> +	 * in ticks format as shown in /proc/pid/stat(22)
> +	 */
> +	unsigned long long		start_time;
>  } __aligned(64);
>  
>  typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index 0351986..945a00a 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -113,6 +113,28 @@ void parasite_cleanup(void)
>  {
>  }
>  
> +static int restore_start_time(struct thread_restore_args *args)
> +{
> +	int ret;
> +	unsigned long flags;
> +	struct prctl_task_ct_fields ct_fields;
> +
> +	if (!args->restore_start_time)
> +		return 0;
> +
> +	ct_fields.real_start_time = args->start_time;
> +	flags = PR_TASK_CT_FIELDS_START_TIME;
> +
> +	ret = sys_prctl_safe(PR_SET_TASK_CT_FIELDS, (unsigned long)&ct_fields, flags, 0);
> +	if (ret) {
> +		pr_info("Failed to restore start_time\n");
> +		return ret;
> +	}
> +
> +	pr_info("Restored start_time to %lld\n", args->start_time);
> +	return 0;
> +}
> +
>  extern void cr_restore_rt (void) asm ("__cr_restore_rt")
>  			__attribute__ ((visibility ("hidden")));
>  
> @@ -523,6 +545,9 @@ static int restore_thread_common(struct thread_restore_args *args)
>  
>  	restore_sched_info(&args->sp);
>  
> +	if (restore_start_time(args))
> +		return -1;
> +
>  	if (restore_nonsigframe_gpregs(&args->gpregs))
>  		return -1;
>  
> diff --git a/images/core.proto b/images/core.proto
> index 6ef5f50..41f412f 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -119,6 +119,7 @@ message thread_core_entry {
>  	optional uint32			seccomp_filter	= 12;
>  
>  	optional string			comm		= 13;
> +	optional uint64			start_time	= 14;
>  }
>  
>  message task_rlimits_entry {
> -- 
> 1.8.3.1
> 


More information about the Devel mailing list