[Devel] [PATCH 1/2] Added separate start time field to task_struct to show in container

Valeriy Vdovin valeriy.vdovin at virtuozzo.com
Tue Dec 31 14:41:12 MSK 2019


https://jira.sw.ru/browse/PSBM-100083
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
---
 fs/proc/array.c            | 16 ++++------------
 include/linux/sched.h      |  5 +++++
 include/uapi/linux/prctl.h |  9 +++++++++
 kernel/fork.c              | 20 ++++++++++++++++++++
 kernel/sys.c               | 22 ++++++++++++++++++++++
 kernel/ve/ve.c             | 27 +++++++++++++++++++++++++++
 6 files changed, 87 insertions(+), 12 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3aa8a7d..a6d2834 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -611,19 +611,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
 				+ task->real_start_time.tv_nsec;
 #ifdef CONFIG_VE
-	if (!is_super) {
-		struct timespec *ve_start_ts =
-				&get_exec_env()->real_start_timespec;
-		start_time -=
-			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
-				+ ve_start_ts->tv_nsec;
-	}
-	/* tasks inside a CT can have negative start time e.g. if the CT was
-	 * migrated from another hw node, in which case we will report 0 in
-	 * order not to confuse userspace */
-	if ((s64)start_time < 0)
-		start_time = 0;
+	if (!is_super && !task->task_ve->is_pseudosuper)
+		start_time = (unsigned long long)
+			timespec_to_ns(&task->real_start_time_ct);
 #endif
+
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07f9954..0832904 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1934,6 +1934,11 @@ struct task_struct {
 	struct wake_q_node wake_q;
 	struct prev_cputime prev_cputime;
 	struct vtime vtime;
+	/*
+	 * Container-relative counterpart of the 'real_start_time' field:
+	 * shown from inside of a container and modified by the host.
+	 */
+	struct timespec real_start_time_ct;
 #endif /* __GENKSYMS__ */
 };
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 02376de..6de7b22 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -195,6 +195,7 @@ struct prctl_mm_map {
 /* Per task speculation control */
 #define PR_GET_SPECULATION_CTRL		52
 #define PR_SET_SPECULATION_CTRL		53
+
 /* Speculation control variants */
 # define PR_SPEC_STORE_BYPASS		0
 /* Return and control values for PR_SET/GET_SPECULATION_CTRL */
@@ -205,4 +206,12 @@ struct prctl_mm_map {
 # define PR_SPEC_FORCE_DISABLE		(1UL << 3)
 # define PR_SPEC_DISABLE_NOEXEC		(1UL << 4)
 
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS	54
+#define PR_TASK_CT_FIELDS_START_TIME	(1ULL << 0)
+
+struct prctl_task_ct_fields {
+	__s64 real_start_time;
+};
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 3d74228..7bc1957 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1348,8 +1348,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p;
+
 	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
+
+
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
@@ -1472,6 +1478,20 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	do_posix_clock_monotonic_gettime(&p->start_time);
 	p->real_start_time = p->start_time;
 	monotonic_to_bootbased(&p->real_start_time);
+
+	/*
+	 * Initially set container-relative start time for newly started
+	 * processes. For init process this value is invalid and will be
+	 * changed at container start.
+	 */
+	p->real_start_time_ct = p->real_start_time;
+
+#ifdef CONFIG_VE
+	set_normalized_timespec(&p->real_start_time_ct,
+		p->real_start_time.tv_sec - ve->real_start_timespec.tv_sec,
+		p->real_start_time.tv_nsec - ve->real_start_timespec.tv_nsec);
+#endif
+
 	p->io_context = NULL;
 	p->audit_context = NULL;
 	if (clone_flags & CLONE_THREAD)
diff --git a/kernel/sys.c b/kernel/sys.c
index c4d633ef..2c9b3e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2457,6 +2457,25 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
 }
 #endif
 
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+		unsigned long flags)
+{
+	struct prctl_task_ct_fields params;
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()) && !t->task_ve->is_pseudosuper)
+		return -EPERM;
+#endif
+
+	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+		return -EFAULT;
+
+	if (flags & PR_TASK_CT_FIELDS_START_TIME)
+		t->real_start_time_ct = ns_to_timespec(params.real_start_time);
+
+	return 0;
+}
+
 int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
 {
 	return -EINVAL;
@@ -2684,6 +2703,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
 		break;
+	case PR_SET_TASK_CT_FIELDS:
+		error = prctl_set_task_ct_fields(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index ad3a698..662fffe 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -308,6 +308,21 @@ struct kthread_attach_work {
 	int result;
 };
 
+static void adj_start_time_at_ve_change(struct task_struct *t,
+	struct ve_struct *old_ve, struct ve_struct *new_ve)
+{
+	int64_t cur_stime_ns;
+	int64_t cur_ve_stime_ns;
+	int64_t tgt_ve_stime_ns;
+
+	cur_stime_ns = timespec_to_ns(&t->real_start_time);
+	cur_ve_stime_ns = timespec_to_ns(&old_ve->real_start_timespec);
+	tgt_ve_stime_ns = timespec_to_ns(&new_ve->real_start_timespec);
+
+	cur_stime_ns += cur_ve_stime_ns - tgt_ve_stime_ns;
+	t->real_start_time_ct = ns_to_timespec(cur_stime_ns);
+}
+
 static void kthread_attach_fn(struct kthread_work *w)
 {
 	struct kthread_attach_work *work = container_of(w,
@@ -336,9 +351,12 @@ static void kthread_attach_fn(struct kthread_work *w)
 	if (err)
 		goto out;
 
+	adj_start_time_at_ve_change(current, current->task_ve, target->task_ve);
+
 	err = cgroup_attach_task_all(target, current);
 	if (err)
 		goto out;
+
 out:
 	work->result = err;
 	complete(&work->done);
@@ -506,6 +524,15 @@ static int ve_start_container(struct ve_struct *ve)
 		ve->real_start_timespec = tsk->real_start_time;
 	}
 
+	/*
+	 * 'current' belongs to this ve, but its ct-relative start time hasn't
+	 * been initialized correctly at copy_process, because ve's start time
+	 * was zero. On the other hand, we know this task is this ve's
+	 * init (pid 1), so we set its relative start time to 0.
+	 */
+	tsk->real_start_time_ct.tv_sec = 0;
+	tsk->real_start_time_ct.tv_nsec = 0;
+
 	/* The value is wrong, but it is never compared to process
 	 * start times */
 	ve->start_jiffies = get_jiffies_64();
-- 
1.8.3.1



More information about the Devel mailing list