[Devel] [PATCH RH7 v3 1/1] ve/proc: Added separate start time field to task_struct to show in container

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Fri Jan 17 18:46:12 MSK 2020


Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

On 1/14/20 9:54 PM, Valeriy Vdovin wrote:
> https://jira.sw.ru/browse/PSBM-100083
> 
> Introduced 'real_start_time_ct' field in task_struct.
> 
> The value is READ:
> 1. When the process lives inside of a ve group and any process
> inside of the same ve group wants to know its start time by reading
> its /proc/[pid]/stat file.
> 2. At container suspend operation to store this value to a dump image.
> 
> The value is WRITTEN:
> 1. At creation time (copy_process function)
> 1.1. If a process is being created outside of ve group / on host, then
> this value is initialized to 0
> 1.2. If a process is being created by a process already living in a ve
> group, this value is calculated as host_uptime - ve_start_time, i.e.
> the container's uptime at the moment the process is created.
> 
> 2. During attach to ve (ve_attach function). The process can be created on
> a host and later attached to ve. Its container start_time value has
> already been initialized to 0 at creation time. After the process enters the
> domain of a ve, the value should be initialized. Note that the process
> can be attached to a non-running container, in which case its
> start_time value should not be calculated and is left initialized to 0.
> 
> 3. At container restore via prctl (prctl_set_task_ct_fields function).
> In this case the value is only settable outside of a container.
> During restore the processes would be created from the dump image.
> At restore step each process will execute prctl to set its start_time
> value, read from the dump. This would only be permitted during
> pseudosuper ve mode. The value is set as is (read from the dump), without
> any calculations.
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>   fs/proc/array.c            | 16 ++++------------
>   include/linux/sched.h      |  5 +++++
>   include/linux/ve.h         | 23 +++++++++++++++++++++++
>   include/uapi/linux/prctl.h |  7 +++++++
>   kernel/fork.c              | 13 +++++++++++++
>   kernel/sys.c               | 23 +++++++++++++++++++++++
>   kernel/ve/ve.c             |  2 ++
>   7 files changed, 77 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/proc/array.c b/fs/proc/array.c
> index 3aa8a7d..fb611b1 100644
> --- a/fs/proc/array.c
> +++ b/fs/proc/array.c
> @@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
>   		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
>   				+ task->real_start_time.tv_nsec;
>   #ifdef CONFIG_VE
> -	if (!is_super) {
> -		struct timespec *ve_start_ts =
> -				&get_exec_env()->real_start_timespec;
> -		start_time -=
> -			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
> -				+ ve_start_ts->tv_nsec;
> -	}
> -	/* tasks inside a CT can have negative start time e.g. if the CT was
> -	 * migrated from another hw node, in which case we will report 0 in
> -	 * order not to confuse userspace */
> -	if ((s64)start_time < 0)
> -		start_time = 0;
> +	if (!is_super)
> +		start_time = (unsigned long long)
> +			timespec_to_ns(&task->real_start_time_ct);
>   #endif
> +
>   	/* convert nsec -> ticks */
>   	start_time = nsec_to_clock_t(start_time);
>   
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 07f9954..0832904 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1934,6 +1934,11 @@ struct task_struct {
>   	struct wake_q_node wake_q;
>   	struct prev_cputime prev_cputime;
>   	struct vtime vtime;
> +	/*
> +	 * this is a container-side copy of 'real_start_time' field
> +	 * shown from inside of a container and modified by host.
> +	 */
> +	struct timespec real_start_time_ct;
>   #endif /* __GENKSYMS__ */
>   };
>   
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 9d60838..088e274 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -199,6 +199,29 @@ static inline struct ve_struct *cgroup_ve(struct cgroup *cgroup)
>   			struct ve_struct, css);
>   }
>   
> +static inline void ve_try_set_task_start_time(struct ve_struct *ve,
> +	struct task_struct *t)
> +{
> +	struct timespec host_uptime;
> +
> +	/*
> +	 * mitigate memory access reordering risks by doing double check,
> +	 * 'is_running' could be read as 1 before we see
> +	 * 'real_start_timespec' updated here. If it's still 0,
> +	 * we know 'is_running' is being modified right NOW in
> +	 * parallel so it's safe to say that start time is also 0
> +	 */
> +	if (!ve->is_running || !timespec_to_ns(&ve->real_start_timespec)) {
> +		t->real_start_time_ct.tv_sec = 0;
> +		t->real_start_time_ct.tv_nsec = 0;
> +	} else {
> +		do_posix_clock_monotonic_gettime(&host_uptime);
> +		monotonic_to_bootbased(&host_uptime);
> +		t->real_start_time_ct = timespec_sub(host_uptime,
> +			ve->real_start_timespec);
> +	}
> +}
> +
>   extern unsigned long long ve_relative_clock(struct timespec * ts);
>   extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp);
>   extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp);
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 02376de..b185113 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -204,5 +204,12 @@ struct prctl_mm_map {
>   # define PR_SPEC_DISABLE		(1UL << 2)
>   # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
>   # define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
> +/* Set task container related fields */
> +#define PR_SET_TASK_CT_FIELDS	1000
> +#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
> +
> +struct prctl_task_ct_fields {
> +	__s64 real_start_time;
> +};
>   
>   #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3d74228..2314eb8 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1350,6 +1350,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   	struct task_struct *p;
>   	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
>   
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = get_exec_env();
> +#endif
> +
>   	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
>   		return ERR_PTR(-EINVAL);
>   
> @@ -1472,6 +1476,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>   	do_posix_clock_monotonic_gettime(&p->start_time);
>   	p->real_start_time = p->start_time;
>   	monotonic_to_bootbased(&p->real_start_time);
> +
> +	p->real_start_time_ct.tv_sec = 0;
> +	p->real_start_time_ct.tv_nsec = 0;
> +
> +#ifdef CONFIG_VE
> +	if (!ve_is_super(ve))
> +		ve_try_set_task_start_time(ve, p);
> +#endif
> +
>   	p->io_context = NULL;
>   	p->audit_context = NULL;
>   	if (clone_flags & CLONE_THREAD)
> diff --git a/kernel/sys.c b/kernel/sys.c
> index c4d633ef..2ce16c7 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2457,6 +2457,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
>   }
>   #endif
>   
> +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
> +		unsigned long flags)
> +{
> +	struct prctl_task_ct_fields params;
> +#ifdef CONFIG_VE
> +	struct ve_struct *ve = t->task_ve;
> +
> +	if (!ve_is_super(ve) && !ve->is_pseudosuper)
> +		return -EPERM;
> +#endif
> +
> +	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
> +		return -EFAULT;
> +
> +	if (flags & PR_TASK_CT_FIELDS_START_TIME)
> +		t->real_start_time_ct = ns_to_timespec(params.real_start_time);
> +
> +	return 0;
> +}
> +
>   int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
>   {
>   	return -EINVAL;
> @@ -2684,6 +2704,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>   			return -EINVAL;
>   		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>   		break;
> +	case PR_SET_TASK_CT_FIELDS:
> +		error = prctl_set_task_ct_fields(me, arg2, arg3);
> +		break;
>   	default:
>   		error = -EINVAL;
>   		break;
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index ad3a698..f3970e8 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -850,6 +850,8 @@ static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
>   		/* Leave parent exec domain */
>   		task->parent_exec_id--;
>   
> +		ve_try_set_task_start_time(ve, task);
> +
>   		task->task_ve = ve;
>   	}
>   
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.



More information about the Devel mailing list