[Devel] [PATCH rh8] ve/proc: Added separate start time field to task_struct to show in container
Konstantin Khorenko
khorenko at virtuozzo.com
Tue Nov 10 19:09:27 MSK 2020
From: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Introduced 'real_start_time_ct' field in task_struct.
The value is READ:
1. When the process lives inside of a ve group and any process
inside of the same ve group wants to know its start time by reading
its /proc/[pid]/stat file.
2. At container suspend operation to store this value to a dump image.
The value is WRITTEN:
1. At creation time (copy_process function)
1.1. If a process is being created outside of ve group / on host, then
this value is initialized to 0
1.2. If a process is being created by a process already living in a ve
group, this value is set to the current ve uptime (host_uptime - ve_start_time).
2. During attach to ve. (ve_attach function). The process can be created on
a host and later attached to ve. Its container start_time value has been
already initialized to 0 at creation time. After the process enters the
domain of a ve, the value should be initialized.
Note that the process can be attached to a non-running container, in which
case its start_time value should not be calculated and should be left
initialized to 0.
3. At container restore via prctl (prctl_set_task_ct_fields function).
In this case the value is only settable outside of a container.
During restore the processes would be created from the dump image.
At restore step each process will execute prctl to set its start_time
value, read from the dump. This would only be permitted during
pseudosuper ve mode. The value is set as is (read from the dump), without
any calculations.
https://jira.sw.ru/browse/PSBM-64123
Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
(cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
---
fs/proc/array.c | 12 +++---------
include/linux/sched.h | 7 ++++++-
include/linux/ve.h | 16 ++++++++++++++++
include/uapi/linux/prctl.h | 7 +++++++
kernel/fork.c | 11 +++++++++++
kernel/sys.c | 23 +++++++++++++++++++++++
kernel/ve/ve.c | 2 ++
7 files changed, 68 insertions(+), 10 deletions(-)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 5e7152d21a9d..2a292d54e804 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -544,16 +544,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
start_time = task->real_start_time;
#ifdef CONFIG_VE
- if (!is_super) {
- u64 offset = get_exec_env()->real_start_time;
- start_time -= (unsigned long long)offset;
- }
- /* tasks inside a CT can have negative start time e.g. if the CT was
- * migrated from another hw node, in which case we will report 0 in
- * order not to confuse userspace */
- if ((s64)start_time < 0)
- start_time = 0;
+ if (!is_super)
+ start_time = task->real_start_time_ct;
#endif
+
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cabed6a47a70..a0616888a5ca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -847,7 +847,6 @@ struct task_struct {
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime vtime;
#endif
-
#ifdef CONFIG_NO_HZ_FULL
atomic_t tick_dep_mask;
#endif
@@ -861,6 +860,12 @@ struct task_struct {
/* Boot based time in nsecs: */
u64 real_start_time;
+ /*
+ * This is a Container-side copy of 'real_start_time' field
+ * shown from inside of a Container and modified by host.
+ */
+ u64 real_start_time_ct;
+
/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long min_flt;
unsigned long maj_flt;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 3aa0ea0b1bab..ab8da4dceec1 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve)
return ktime_get_boot_ns() - ve->real_start_time;
}
+static inline void ve_set_task_start_time(struct ve_struct *ve,
+ struct task_struct *t)
+{
+ /*
+ * Store the current uptime of 've' in t->real_start_time_ct, or 0 if
+ * 've' is not running. Both fields are checked to mitigate memory
+ * access reordering: 'is_running' could be read as 1 before the
+ * updated 'real_start_time' is visible. If it is still 0,
+ * 'is_running' is being modified right now in parallel, so 0 is safe.
+ */
+ if (!ve->is_running || !ve->real_start_time)
+ t->real_start_time_ct = 0;
+ else
+ t->real_start_time_ct = ve_get_uptime(ve);
+}
+
extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 *tp);
extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 *tp);
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 094bb03b9cc2..e9fca052e6cf 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -220,6 +220,13 @@ struct prctl_mm_map {
# define PR_SPEC_DISABLE (1UL << 2)
# define PR_SPEC_FORCE_DISABLE (1UL << 3)
# define PR_SPEC_DISABLE_NOEXEC (1UL << 4)
+/* Set task container related fields (arg2: struct prctl_task_ct_fields *, arg3: PR_TASK_CT_FIELDS_* mask) */
+#define PR_SET_TASK_CT_FIELDS 1000
+#define PR_TASK_CT_FIELDS_START_TIME (1UL << 0)
+
+struct prctl_task_ct_fields {
+ __s64 real_start_time; /* copied as is into the task's real_start_time_ct */
+};
/* Reset arm64 pointer authentication keys */
#define PR_PAC_RESET_KEYS 54
diff --git a/kernel/fork.c b/kernel/fork.c
index afb6929e5081..e3c8510b211c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1711,6 +1711,9 @@ static __latent_entropy struct task_struct *copy_process(
int retval;
struct task_struct *p;
struct multiprocess_signals delayed;
+#ifdef CONFIG_VE
+ struct ve_struct *ve = get_exec_env();
+#endif
/*
* Don't allow sharing the root directory with processes in a different
@@ -1863,6 +1866,14 @@ static __latent_entropy struct task_struct *copy_process(
p->start_time = ktime_get_ns();
p->real_start_time = ktime_get_boot_ns();
+
+ p->real_start_time_ct = 0;
+
+#ifdef CONFIG_VE
+ if (!ve_is_super(ve))
+ ve_set_task_start_time(ve, p);
+#endif
+
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 8560e5bcb6c2..7088fada2018 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2270,6 +2270,26 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
}
#endif
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+ unsigned long flags)
+{
+ struct prctl_task_ct_fields params;
+#ifdef CONFIG_VE
+ struct ve_struct *ve = t->task_ve;
+
+ if (!ve_is_super(ve) && !ve->is_pseudosuper)
+ return -EPERM; /* only the host or a pseudosuper ve may set these */
+#endif
+
+ if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+ return -EFAULT;
+
+ if (flags & PR_TASK_CT_FIELDS_START_TIME)
+ t->real_start_time_ct = (u64)params.real_start_time; /* stored as is */
+
+ return 0;
+}
+
static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
/*
@@ -2522,6 +2542,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = PAC_RESET_KEYS(me, arg2);
break;
+ case PR_SET_TASK_CT_FIELDS:
+ error = prctl_set_task_ct_fields(me, arg2, arg3);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 29e98e6396dc..1a5c69a254d4 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -820,6 +820,8 @@ static void ve_attach(struct cgroup_taskset *tset)
if (cpuid_override_on())
set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
+
+ ve_set_task_start_time(ve, task);
task->task_ve = ve;
}
}
--
2.28.0
More information about the Devel
mailing list