[Devel] [PATCH RHEL COMMIT] ve/proc: Added separate start time field to task_struct to show in container

Konstantin Khorenko khorenko at virtuozzo.com
Mon Oct 4 21:50:32 MSK 2021


The commit is pushed to "branch-rh9-5.14.vz9.1.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after ark-5.14
------>
commit 37a6fde98ad41a8430f897c1574df1edb09bb321
Author: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
Date:   Mon Oct 4 21:50:32 2021 +0300

    ve/proc: Added separate start time field to task_struct to show in container
    
    Introduced 'real_start_time_ct' field in task_struct.
    
    The value is READ:
    1. When the process lives inside of a ve group and any process
    inside of the same ve group wants to know its start time by reading
    its /proc/[pid]/stat file.
    2. At container suspend operation to store this value to a dump image.
    
    The value is WRITTEN:
    1. At creation time (copy_process function)
    1.1. If a process is being created outside of ve group / on host, then
    this value is initialized to 0
    1.2. If a process is being created by process already living in ve
    group, this value is calculated as host_uptime - ve_uptime.
    
    2. During attach to ve (ve_attach function). The process can be created on
    a host and later attached to ve. Its container start_time value has
    already been initialized to 0 at creation time. After the process enters the
    domain of a ve, the value should be initialized.
    Note that the process can be attached to a non-running container, in which
    case its start_time value should not be calculated and is left initialized
    to 0.
    
    3. At container restore via prctl (prctl_set_task_ct_fields function).
    In this case the value is only settable outside of a container.
    During restore the processes would be created from the dump image.
    At restore step each process will execute prctl to set its start_time
    value, read from the dump. This would only be permitted during
    pseudosuper ve mode. The value is set as is (read from the dump), without
    any calculations.
    
    https://jira.sw.ru/browse/PSBM-64123
    
    Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
    
    (cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    Reviewed-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
    
    Changes vz9:
    - separate from unrelated sys_times hunks
    - switch to time namespaces
    - rename real_start_time -> start_boottime
    
    (cherry picked from vz8 commit 222870c58a3b4a284698e8cf7a692f7fea577b13)
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    ====================
    Patchset description:
    
    ve/time: switch from our ve-time to native timenamespace
    
    https://jira.sw.ru/browse/PSBM-134393
    
    As time-namespaces are a new and mainstreamed version of ve-time, it's
    time to switch to it.
    
    Notes:
    1) ve-time does not need configuration on start, though time namespace
       needs configuration (offset == -now).
    
    2) ve-time saved container start time but time namespaces save offset
       between host start time and container start time
       (offset == ve_start_time - now).
    
    3) criu already knows how to handle time namespaces, though we need to
       do a compatibility layer to convert our ve.clock_* to offsets in time
       namespace for pre-vz9 to vz9 migration.
    
    4) vdso time is already handled by time namespaces, though time
       namespace only virtualizes vvar page, so it should not intersect with
       our vdso virtualization for ve.os_release.
    
    https://jira.sw.ru/browse/PSBM-134393
    
    Cyrill Gorcunov (1):
      ve: Add interface for ve::clock_[monotonic|bootbased] adjustment
    
    Kirill Tkhai (2):
      ve/time: Use ve_relative_clock in times() syscall and /proc/[pid]/stat
      ve: Virtualize sysinfo
    
    Pavel Tikhomirov (1):
      ve/time: remove our per-ve times in favor of mainstream
        time-namespaces
    
    Valeriy Vdovin (1):
      ve/proc: Added separate start time field to task_struct to show in
        container
---
 fs/proc/array.c            |  5 +++++
 include/linux/sched.h      |  6 ++++++
 include/linux/ve.h         | 30 ++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h |  8 ++++++++
 kernel/fork.c              | 11 +++++++++++
 kernel/sys.c               | 23 +++++++++++++++++++++++
 kernel/ve/ve.c             |  1 +
 7 files changed, 84 insertions(+)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 24e5c06c2ed0..a5e02af46f31 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -588,6 +588,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	start_time =
 		nsec_to_clock_t(timens_add_boottime_ns(task->start_boottime));
 
+#ifdef CONFIG_VE
+	if (!is_super)
+		start_time = nsec_to_clock_t(task->start_boottime_ct);
+#endif
+
 	seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
 	seq_puts(m, " (");
 	proc_task_name(m, task, false);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b4ef0e90c05..332c36a8f4c4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -957,6 +957,12 @@ struct task_struct {
 	/* Boot based time in nsecs: */
 	u64				start_boottime;
 
+	/*
+	 * This is a Container-side copy of 'start_boottime' field
+	 * shown from inside of a Container and modified by host.
+	 */
+	u64				start_boottime_ct;
+
 	/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
 	unsigned long			min_flt;
 	unsigned long			maj_flt;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index dccf2108f79b..65439ab1302e 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -17,6 +17,7 @@
 #include <linux/kthread.h>
 #include <linux/vzstat.h>
 #include <asm/vdso.h>
+#include <linux/time_namespace.h>
 
 struct nsproxy;
 struct veip_struct;
@@ -121,6 +122,55 @@ static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css)
 
 extern struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id);
 
+/*
+ * ve_get_uptime - boot-based clock as seen from inside a container, in ns.
+ * @ve: container whose time namespace boottime offset is applied
+ *
+ * Reads the host boot-based clock and adds the boottime offset of the time
+ * namespace referenced by @ve's nsproxy.  Returns 0 when @ve has no nsproxy
+ * (ve->ve_ns is NULL).
+ *
+ * Marked 'static inline', as is conventional for functions defined in a
+ * header, so that files including <linux/ve.h> do not each get their own
+ * out-of-line copy.
+ */
+static inline u64 ve_get_uptime(struct ve_struct *ve)
+{
+	struct timespec64 tp = ns_to_timespec64(0);
+	struct time_namespace *time_ns;
+	struct nsproxy *ve_ns;
+
+	rcu_read_lock();
+	ve_ns = rcu_dereference(ve->ve_ns);
+	if (!ve_ns) {
+		rcu_read_unlock();
+		goto out;
+	}
+
+	/* Take a reference so time_ns stays usable after rcu_read_unlock(). */
+	time_ns = get_time_ns(ve_ns->time_ns);
+	rcu_read_unlock();
+
+	ktime_get_boottime_ts64(&tp);
+	tp = timespec64_add(tp, time_ns->offsets.boottime);
+	put_time_ns(time_ns);
+out:
+	return timespec64_to_ns(&tp);
+}
+
+/*
+ * ve_set_task_start_time - record a task's start time in container time.
+ * @ve: container supplying the time base
+ * @t: task whose start_boottime_ct field is overwritten
+ *
+ * Stores the current ve_get_uptime(@ve) value in @t->start_boottime_ct.
+ */
+static inline void ve_set_task_start_time(struct ve_struct *ve,
+					  struct task_struct *t)
+{
+	t->start_boottime_ct = ve_get_uptime(ve);
+}
+
 #define ve_feature_set(ve, f)			\
 	!!((ve)->features & VE_FEATURE_##f)
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 967d9c55323d..709fd88ede27 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -267,4 +267,12 @@ struct prctl_mm_map {
 # define PR_SCHED_CORE_SHARE_FROM	3 /* pull core_sched cookie to pid */
 # define PR_SCHED_CORE_MAX		4
 
+/* Set task container related fields */
+#define PR_SET_TASK_CT_FIELDS			1000
+#define PR_TASK_CT_FIELDS_START_BOOTTIME	(1UL << 0)
+
+struct prctl_task_ct_fields {
+	__s64 start_boottime;
+};
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index fa42bf77ddef..61adb6409f0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/scs.h>
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
+#include <linux/ve.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -1867,6 +1868,9 @@ static __latent_entropy struct task_struct *copy_process(
 	struct file *pidfile = NULL;
 	u64 clone_flags = args->flags;
 	struct nsproxy *nsp = current->nsproxy;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+#endif
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -2233,6 +2237,13 @@ static __latent_entropy struct task_struct *copy_process(
 	p->start_time = ktime_get_ns();
 	p->start_boottime = ktime_get_boottime_ns();
 
+	p->start_boottime_ct = 0;
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve))
+		ve_set_task_start_time(ve, p);
+#endif
+
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
diff --git a/kernel/sys.c b/kernel/sys.c
index 8378cb0f5434..ae566d26ab6e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2268,6 +2268,42 @@ static int prctl_get_tid_address(struct task_struct *me, int __user * __user *ti
 }
 #endif
 
+/*
+ * prctl_set_task_ct_fields - handle PR_SET_TASK_CT_FIELDS.
+ * @t: task whose container-visible fields are updated
+ * @arg: user pointer to a struct prctl_task_ct_fields
+ * @flags: PR_TASK_CT_FIELDS_* bits selecting which fields to apply
+ *
+ * With CONFIG_VE the call is refused with -EPERM unless @t's ve is the
+ * super ve or has is_pseudosuper set.  Unknown @flags bits yield -EINVAL
+ * so that new bits can be introduced later without ambiguity; a bad @arg
+ * pointer yields -EFAULT.  Returns 0 on success.
+ */
+static int prctl_set_task_ct_fields(struct task_struct *t, unsigned long arg,
+				    unsigned long flags)
+{
+	struct prctl_task_ct_fields params;
+#ifdef CONFIG_VE
+	struct ve_struct *ve = t->task_ve;
+
+	if (!ve_is_super(ve) && !ve->is_pseudosuper)
+		return -EPERM;
+#endif
+
+	/* Reject flag bits this kernel does not know about. */
+	if (flags & ~PR_TASK_CT_FIELDS_START_BOOTTIME)
+		return -EINVAL;
+
+	if (copy_from_user(&params, (const void __user *)arg, sizeof(params)))
+		return -EFAULT;
+
+	/* The value is stored exactly as supplied by the caller. */
+	if (flags & PR_TASK_CT_FIELDS_START_BOOTTIME)
+		t->start_boottime_ct = (u64)params.start_boottime;
+
+	return 0;
+}
+
 static int propagate_has_child_subreaper(struct task_struct *p, void *data)
 {
 	/*
@@ -2568,6 +2604,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		error = sched_core_share_pid(arg2, arg3, arg4, arg5);
 		break;
 #endif
+	case PR_SET_TASK_CT_FIELDS:
+		error = prctl_set_task_ct_fields(me, arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index e510acf027f5..b77429f10df2 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -821,6 +821,7 @@ static void ve_attach(struct cgroup_taskset *tset)
 		/* Leave parent exec domain */
 		task->parent_exec_id--;
 
+		ve_set_task_start_time(ve, task);
 		task->task_ve = ve;
 	}
 }


More information about the Devel mailing list