[Devel] [PATCH RH7 1/1] ve/proc: Added separate start time field to task_struct to show in container
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Mon Jan 13 17:59:54 MSK 2020
On 1/13/20 5:36 PM, Pavel Tikhomirov wrote:
> Empty commit message body, please add here some small explanation about
> when (copy_process, ve_attach and prctl_set_task_ct_fields) we set it
> and with what value (0, starttime-ve.start_time, gettime-ve.start_time)
> and why.
>
> On 1/13/20 11:25 AM, Valeriy Vdovin wrote:
>> https://jira.sw.ru/browse/PSBM-100083
>> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
>> ---
>> fs/proc/array.c | 16 ++++------------
>> include/linux/sched.h | 5 +++++
>> include/uapi/linux/prctl.h | 7 +++++++
>> kernel/fork.c | 15 +++++++++++++++
>> kernel/sys.c | 23 +++++++++++++++++++++++
>> kernel/ve/ve.c | 22 ++++++++++++++++++++++
>> 6 files changed, 76 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/proc/array.c b/fs/proc/array.c
>> index 3aa8a7d..fb611b1 100644
>> --- a/fs/proc/array.c
>> +++ b/fs/proc/array.c
>> @@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m,
>> struct pid_namespace *ns,
>> (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
>> + task->real_start_time.tv_nsec;
>> #ifdef CONFIG_VE
>> - if (!is_super) {
>> - struct timespec *ve_start_ts =
>> - &get_exec_env()->real_start_timespec;
>> - start_time -=
>> - (unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
>> - + ve_start_ts->tv_nsec;
>> - }
>> - /* tasks inside a CT can have negative start time e.g. if the CT was
>> - * migrated from another hw node, in which case we will report 0 in
>> - * order not to confuse userspace */
>> - if ((s64)start_time < 0)
>> - start_time = 0;
>> + if (!is_super)
>> + start_time = (unsigned long long)
>> + timespec_to_ns(&task->real_start_time_ct);
>> #endif
>> +
>> /* convert nsec -> ticks */
>> start_time = nsec_to_clock_t(start_time);
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 07f9954..0832904 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1934,6 +1934,11 @@ struct task_struct {
>> struct wake_q_node wake_q;
>> struct prev_cputime prev_cputime;
>> struct vtime vtime;
>> + /*
>> + * this is a container-side copy of 'real_start_time' field
>> + * shown from inside of a container and modified by host.
>> + */
>> + struct timespec real_start_time_ct;
>> #endif /* __GENKSYMS__ */
>> };
>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>> index 02376de..b185113 100644
>> --- a/include/uapi/linux/prctl.h
>> +++ b/include/uapi/linux/prctl.h
>> @@ -204,5 +204,12 @@ struct prctl_mm_map {
>> # define PR_SPEC_DISABLE (1UL << 2)
>> # define PR_SPEC_FORCE_DISABLE (1UL << 3)
>> # define PR_SPEC_DISABLE_NOEXEC (1UL << 4)
>> +/* Set task container related fields */
>> +#define PR_SET_TASK_CT_FIELDS 1000
>> +#define PR_TASK_CT_FIELDS_START_TIME (1ULL << 0)
>> +
>> +struct prctl_task_ct_fields {
>> + __s64 real_start_time;
>> +};
>> #endif /* _LINUX_PRCTL_H */
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 3d74228..dbc584c 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1350,6 +1350,10 @@ static struct task_struct
>> *copy_process(unsigned long clone_flags,
>> struct task_struct *p;
>> void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
>> +#ifdef CONFIG_VE
>> + struct ve_struct *ve = get_exec_env();
>> +#endif
>> +
>> if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) ==
>> (CLONE_NEWNS|CLONE_FS))
>> return ERR_PTR(-EINVAL);
>> @@ -1472,6 +1476,17 @@ static struct task_struct
>> *copy_process(unsigned long clone_flags,
>> do_posix_clock_monotonic_gettime(&p->start_time);
>> p->real_start_time = p->start_time;
>> monotonic_to_bootbased(&p->real_start_time);
>> +
>> + p->real_start_time_ct.tv_sec = 0;
>> + p->real_start_time_ct.tv_nsec = 0;
>
>
>
>> +
>> +#ifdef CONFIG_VE
>
> We might need smp_rmb between reading is_running and real_start_timespec
> (and corresponding smp_wmb in ve_start_container), or use ve->op_sem to
> be 100% safe here in case of cpu instruction reordering. These can
> decrease performance of process creation. >
>> + if (!ve_is_super(ve) && ve->is_running) {
Discussed with Kostya, we can just check that ve->real_start_timespec is
not zero here explicitly and we won't need any lock and barrier here.
Please write a small comment about it. Actually we still leave here a
small chance of failure in case we would really have a CT with zero
real_start_timespec, but we have the same issue in ve_start_container
already =)
>> + p->real_start_time_ct = timespec_sub(p->real_start_time,
>> + ve->real_start_timespec);
>> + }
>> +#endif
>> +
>> p->io_context = NULL;
>> p->audit_context = NULL;
>> if (clone_flags & CLONE_THREAD)
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index c4d633ef..2ce16c7 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -2457,6 +2457,26 @@ static int prctl_get_tid_address(struct
>> task_struct *me, int __user **tid_addr)
>> }
>> #endif
>> +static int prctl_set_task_ct_fields(struct task_struct *t, unsigned
>> long arg,
>> + unsigned long flags)
>> +{
>> + struct prctl_task_ct_fields params;
>> +#ifdef CONFIG_VE
>> + struct ve_struct *ve = t->task_ve;
>> +
>> + if (!ve_is_super(ve) && !ve->is_pseudosuper)
>> + return -EPERM;
>> +#endif
>> +
>> + if (copy_from_user(¶ms, (const void __user *)arg,
>> sizeof(params)))
>> + return -EFAULT;
>> +
>> + if (flags & PR_TASK_CT_FIELDS_START_TIME)
>> + t->real_start_time_ct = ns_to_timespec(params.real_start_time);
>> +
>> + return 0;
>> +}
>> +
>> int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned
>> long which)
>> {
>> return -EINVAL;
>> @@ -2684,6 +2704,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned
>> long, arg2, unsigned long, arg3,
>> return -EINVAL;
>> error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
>> break;
>> + case PR_SET_TASK_CT_FIELDS:
>> + error = prctl_set_task_ct_fields(me, arg2, arg3);
>> + break;
>> default:
>> error = -EINVAL;
>> break;
>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>> index ad3a698..110c37e 100644
>> --- a/kernel/ve/ve.c
>> +++ b/kernel/ve/ve.c
>> @@ -339,6 +339,7 @@ static void kthread_attach_fn(struct kthread_work *w)
>> err = cgroup_attach_task_all(target, current);
>> if (err)
>> goto out;
>> +
>
> Excess empty line.
>
>> out:
>> work->result = err;
>> complete(&work->done);
>> @@ -506,6 +507,15 @@ static int ve_start_container(struct ve_struct *ve)
>> ve->real_start_timespec = tsk->real_start_time;
>> }
>> + /*
>> + * 'current' belongs to this ve, but it's ct-relative start time
>> hasn't
>> + * been initialized correctly at copy_process, because ve's start
>> time
>> + * was zero. On the other hand, we know this task is this ve's
>> + * init (pid 1), * so we set it's relative start time to 0.
>> + */
>> + tsk->real_start_time_ct.tv_sec = 0;
>> + tsk->real_start_time_ct.tv_nsec = 0;
>
> Please move to ve_attach. I think real_start_time_ct should be set on
> ve_attach for current process and all others even if ve is not running.
>
>> +
>> /* The value is wrong, but it is never compared to process
>> * start times */
>> ve->start_jiffies = get_jiffies_64();
>> @@ -837,6 +847,7 @@ static void ve_attach(struct cgroup *cg, struct
>> cgroup_taskset *tset)
>> {
>> struct ve_struct *ve = cgroup_ve(cg);
>> struct task_struct *task;
>> + struct timespec host_uptime;
>> cgroup_taskset_for_each(task, cg, tset) {
>> /* this probihibts ptracing of task entered to VE from host
>> system */
>> @@ -850,6 +861,17 @@ static void ve_attach(struct cgroup *cg, struct
>> cgroup_taskset *tset)
>> /* Leave parent exec domain */
>> task->parent_exec_id--;
>> + if (ve->is_running) {
>
> Same thing with lack of lock/memory barrier here.
Same here.
>
>> + /* Viewed from inside of a container these processes
>> + * are newly created, so their actual container start
>> + * times should be set now.
>> + */
>> + do_posix_clock_monotonic_gettime(&host_uptime);
>> + monotonic_to_bootbased(&host_uptime);
>> + task->real_start_time_ct = timespec_sub(host_uptime,
>> + ve->real_start_timespec);
>> + }
>
> } else {
> tsk->real_start_time_ct.tv_sec = 0;
> tsk->real_start_time_ct.tv_nsec = 0;
> }
>
> These looks more generic for me.
>
>> +
>> task->task_ve = ve;
>> }
>>
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
More information about the Devel
mailing list