[Devel] [PATCH RH9 2/5] ve/proc: Added separate start time field to task_struct to show in container

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Mon Oct 4 18:00:00 MSK 2021


From: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>

Introduced 'real_start_time_ct' field in task_struct.

The value is READ:
1. When the process lives inside of a ve group and any process
inside of the same ve group wants to know its start time by reading
its /proc/[pid]/stat file.
2. At container suspend operation to store this value to a dump image.

The value is WRITTEN:
1. At creation time (copy_process function)
1.1. If a process is being created outside of ve group / on host, then
this value is initialized to 0
1.2. If a process is being created by process already living in ve
group, this value is calculated as host_uptime - ve_uptime.

2. During attach to ve (ve_attach function). The process can be created on
a host and later attached to ve. Its container start_time value has
already been initialized to 0 at creation time. After the process enters the
domain of a ve, the value should be initialized.
Note that the process can be attached to a non-running container, in which
case its start_time value should not be calculated and is left initialized
to 0.

3. At container restore via prctl (prctl_set_task_ct_fields function).
In this case the value is only settable outside of a container.
During restore the processes are created from the dump image.
At the restore step each process executes prctl to set its start_time
value, read from the dump. This is only permitted in
pseudosuper ve mode. The value is set as is (read from the dump), without
any calculations.

https://jira.sw.ru/browse/PSBM-64123

Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>

(cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
Reviewed-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>

Changes vz9:
- separate from unrelated sys_times hunks
- switch to time namespaces
- rename real_start_time -> start_boottime

(cherry picked from vz8 commit 222870c58a3b4a284698e8cf7a692f7fea577b13)
Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 fs/proc/array.c            |  5 +++++
 include/linux/sched.h      |  6 ++++++
 include/linux/ve.h         | 30 ++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h |  8 ++++++++
 kernel/fork.c              | 11 +++++++++++
 kernel/sys.c               | 23 +++++++++++++++++++++++
 kernel/ve/ve.c             |  1 +
 7 files changed, 84 insertions(+)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 24e5c06c2ed0..a5e02af46f31 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -588,6 +588,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	start_time =
 		nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
 
+#ifdef CONFIG_VE
+	if (!is_super)
+		start_time = nsec_to_clock_t(task->start_boottime_ct);
+#endif
+
 	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
 	seq_puts(m, " (");
 	proc_task_name(m, task, false);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b4ef0e90c05..332c36a8f4c4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -957,6 +957,12 @@ struct task_struct {
 	/* Boot based time in nsecs: */
 	u64				start_boottime;
 
+	/*
+	 * This is a Container-side copy of 'start_boottime' field
+	 * shown from inside of a Container and modified by host.
+	 */
+	u64				start_boottime_ct;
+
 	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
 	unsigned long			min_flt;
 	unsigned long			maj_flt;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 4de91c86a084..c2ff0602cb25 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -17,6 +17,7 @@
 #include <linux/kthread.h>
 #include <linux/vzstat.h>
 #include <asm/vdso.h>
+#include <linux/time_namespace.h>
 
 struct nsproxy;
 struct veip_struct;
@@ -110,6 +111,35 @@ static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css)
 
 extern struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id);
 
+/* Uptime of @ve in ns: host boottime + time-ns offset; 0 if ve_ns is unset. */
+static inline u64 ve_get_uptime(struct ve_struct *ve)
+{
+	struct time_namespace *time_ns;
+	struct timespec64 tp;
+	struct nsproxy *ve_ns;
+
+	rcu_read_lock();
+	ve_ns = rcu_dereference(ve->ve_ns);
+	/* Pin time_ns: it is dereferenced after the RCU section ends. */
+	time_ns = ve_ns ? get_time_ns(ve_ns->time_ns) : NULL;
+	rcu_read_unlock();
+	if (!time_ns)
+		return 0;
+
+	/* Host boottime shifted by the container's boottime offset. */
+	ktime_get_boottime_ts64(&tp);
+	tp = timespec64_add(tp, time_ns->offsets.boottime);
+	put_time_ns(time_ns);
+	return timespec64_to_ns(&tp);
+}
+
+/* Stamp @t's container-side start time with @ve's current uptime. */
+static inline void ve_set_task_start_time(struct ve_struct *ve,
+					  struct task_struct *t)
+{
+	t->start_boottime_ct = ve_get_uptime(ve);
+}
+
 #define ve_feature_set(ve, f)			\
 	!!((ve)->features & VE_FEATURE_##f)
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 967d9c55323d..709fd88ede27 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -267,4 +267,12 @@ struct prctl_mm_map {
 # define PR_SCHED_CORE_SHARE_FROM	3 /* pull core_sched cookie to pid */
 # define PR_SCHED_CORE_MAX		4
 
+/* Set task container related fields; arg2: params struct, arg3: flag mask */
+#define PR_SET_TASK_CT_FIELDS			1000
+#define PR_TASK_CT_FIELDS_START_BOOTTIME	(1UL << 0) /* apply start_boottime */
+
+struct prctl_task_ct_fields {
+	__s64 start_boottime;	/* copied into the task's start_boottime_ct */
+};
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index fa42bf77ddef..61adb6409f0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/scs.h>
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
+#include <linux/ve.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -1867,6 +1868,9 @@ static __latent_entropy struct task_struct *copy_process(
 	struct file *pidfile = NULL;
 	u64 clone_flags = args->flags;
 	struct nsproxy *nsp = current->nsproxy;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -2233,6 +2237,13 @@ static __latent_entropy struct task_struct *copy_process(
 	p->start_time = ktime_get_ns();
 	p->start_boottime = ktime_get_boottime_ns();
 
+	p->start_boottime_ct = 0;
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve))
+		ve_set_task_start_time(ve, p);
+#endif
+
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
diff --git a/kernel/sys.c b/kernel/sys.c
index 8378cb0f5434..ae566d26ab6e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2268,6 +2268,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti
 }
 #endif
 
+/* Set CT fields of @t from user @arg per @flags; super/pseudosuper ve only. */
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+				    unsigned long flags)
+{
+	const void __user *uparams = (const void __user *)arg;
+	struct prctl_task_ct_fields params;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = t->task_ve;
+
+	if (!(ve_is_super(ve) || ve->is_pseudosuper))
+		return -EPERM;
+#endif
+	if (copy_from_user(&params, uparams, sizeof(params)))
+		return -EFAULT;
+	if (flags & PR_TASK_CT_FIELDS_START_BOOTTIME)
+		t->start_boottime_ct = (u64)params.start_boottime;
+
+	return 0;
+}
+
 static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 {
 	/*
@@ -2568,6 +2588,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
 		break;
 #endif
+	case PR_SET_TASK_CT_FIELDS:
+		error = prctl_set_task_ct_fields(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index d45a10c02493..c93518fe4a33 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -809,6 +809,7 @@ static void ve_attach(struct cgroup_taskset *tset)
 		/* Leave parent exec domain */
 		task->parent_exec_id--;
 
+		ve_set_task_start_time(ve, task);
 		task->task_ve = ve;
 	}
 }
-- 
2.31.1



More information about the Devel mailing list