[Devel] [PATCH RHEL7 COMMIT] ve/vdso: virtualized monotonic gettime through vdso
Konstantin Khorenko
khorenko at virtuozzo.com
Wed May 31 05:38:11 PDT 2017
The commit is pushed to "branch-rh7-3.10.0-514.16.1.vz7.32.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.16.1.vz7.32.5
------>
commit e955ddcd0d2cb91a9f80b14ab9973b250f1c3d16
Author: Andrey Ryabinin <aryabinin at virtuozzo.com>
Date: Wed May 31 16:38:10 2017 +0400
ve/vdso: virtualized monotonic gettime through vdso
We already have the infrastructure for a virtualized vdso, but we use it
only to change LINUX_VERSION_CODE in a container. Simply store the
container's start time (ve->start_timespec) in the vdso variable
VDSO64_ve_start_timespec, and use it in __vdso_clock_gettime() to
calculate the container's monotonic time.
Make uts_arch_setup_additional_pages()/uts_prep_vdso_pages_locked()
always set up a new vdso, since the previous policy of setting one up
only when uts_ns->name.release differs from the host's would not work
for the virtualized __vdso_clock_gettime().
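With this change, CLOCK_MONOTONIC inside a container counts from the
container's start rather than the host's boot. A minimal userspace check
(illustrative only, not part of the patch), assuming glibc's
clock_gettime() takes the vdso fast path:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* In a container this resolves via __vdso_clock_gettime(), which
	 * now subtracts ve->start_timespec from the host's monotonic time.
	 */
	if (clock_gettime(CLOCK_MONOTONIC, &ts))
		return 1;
	/* Shortly after container start this should print a small value. */
	printf("monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}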
https://jira.sw.ru/browse/PSBM-66451
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
Reviewed-by: Dmitry Safonov <dsafonov at virtuozzo.com>
---
arch/x86/vdso/vclock_gettime.c | 45 +++++++++++++++++++++++++++++++++++++++++-
arch/x86/vdso/vdso32-setup.c | 10 ++++------
arch/x86/vdso/vma.c | 7 +++----
3 files changed, 51 insertions(+), 11 deletions(-)
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index f079e1f..3a5b319 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -27,6 +27,10 @@
#define gtod (&VVAR(vsyscall_gtod_data))
+struct timespec VDSO64_ve_start_timespec;
+extern struct timespec VDSO32_ve_start_timespec
+ __attribute__((weak, alias("VDSO64_ve_start_timespec")));
+
notrace static cycle_t vread_tsc(void)
{
cycle_t ret = (cycle_t)rdtsc_ordered();
@@ -175,6 +179,43 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
return mode;
}
+notrace static struct timespec *get_ve_timespec(void)
+{
+	struct timespec *ret;
+	asm volatile ("lea VDSO64_ve_start_timespec(%%rip), %0\n" : "=r"(ret));
+	return ret;
+}
+
+notrace static void vdso_set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		/*
+		 * The following asm() prevents the compiler from
+		 * optimising this loop into a modulo operation. See
+		 * also __iter_div_u64_rem() in include/linux/time.h
+		 */
+		asm("" : "+rm"(nsec));
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		asm("" : "+rm"(nsec));
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
+static void monotonic_time_to_ve(struct timespec *ts)
+{
+	struct timespec *ve_timespec = get_ve_timespec();
+
+	vdso_set_normalized_timespec(ts,
+				     ts->tv_sec - ve_timespec->tv_sec,
+				     ts->tv_nsec - ve_timespec->tv_nsec);
+}
+
notrace static int do_monotonic(struct timespec *ts)
{
unsigned long seq;
@@ -190,8 +231,9 @@ notrace static int do_monotonic(struct timespec *ts)
		ns += vgetsns(&mode);
		ns >>= gtod->clock.shift;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-	timespec_add_ns(ts, ns);
+	timespec_add_ns(ts, ns);
+	monotonic_time_to_ve(ts);
	return mode;
}
@@ -215,6 +257,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	monotonic_time_to_ve(ts);
	return 0;
}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 81a16c8..32c9a29 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -344,10 +344,8 @@ static struct page **uts_prep_vdso_pages_locked(int map)
		 * preallocated one.
		 */
		new_version = KERNEL_VERSION(n1, n2, n3);
-		if (new_version == LINUX_VERSION_CODE)
-			goto out;
#ifdef CONFIG_X86_32
-		else {
+		{
			/*
			 * Native x86-32 mode requires vDSO runtime
			 * relocations applied which is not supported
@@ -370,8 +368,8 @@ static struct page **uts_prep_vdso_pages_locked(int map)
		 * better than walk out with error.
		 */
		pr_warn_once("Wrong release uts name format detected."
-			     " Ignoring vDSO virtualization.\n");
-		goto out;
+			     " Using host's uts name.\n");
+		new_version = LINUX_VERSION_CODE;
	}
	mutex_lock(&vdso32_mutex);
@@ -402,6 +400,7 @@ static struct page **uts_prep_vdso_pages_locked(int map)
	addr = page_address(new_pages[0]);
	*((int *)(addr + uts_ns->vdso32.version_off)) = new_version;
+	*((struct timespec *)(VDSO32_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
	smp_wmb();
	pages = uts_ns->vdso32.pages = new_pages;
@@ -411,7 +410,6 @@ static struct page **uts_prep_vdso_pages_locked(int map)
out_unlock:
	mutex_unlock(&vdso32_mutex);
-out:
	down_write(&mm->mmap_sem);
	return pages;
}
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index a862c79..ad0e0ac 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -244,8 +244,6 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
		 * preallocated one.
		 */
		new_version = KERNEL_VERSION(n1, n2, n3);
-		if (new_version == LINUX_VERSION_CODE)
-			goto map_init_uts;
	} else {
		/*
		 * If admin is passed malformed string here
@@ -254,8 +252,8 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
		 * better than walk out with error.
		 */
		pr_warn_once("Wrong release uts name format detected."
-			     " Ignoring vDSO virtualization.\n");
-		goto map_init_uts;
+			     " Using host's uts name.\n");
+		new_version = LINUX_VERSION_CODE;
	}
	mutex_lock(&vdso_mutex);
@@ -296,6 +294,7 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
	}
	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
+	*((struct timespec *)(VDSO64_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
	smp_wmb();
	uts_ns->vdso.pages = new_pages;
	mutex_unlock(&vdso_mutex);
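For reference, the conversion the new vdso helpers implement, redone as a
plain userspace sketch: container monotonic time is host monotonic time
minus ve->start_timespec, with tv_nsec renormalized into
[0, NSEC_PER_SEC). This is illustrative only; monotonic_to_ve() and the
sample values below are hypothetical, not code from the patch.

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Mirrors monotonic_time_to_ve()/vdso_set_normalized_timespec() from the
 * patch. A single borrow suffices here because both inputs are already
 * normalized timespecs; the kernel helper loops to handle arbitrary
 * nanosecond values.
 */
static void monotonic_to_ve(struct timespec *ts, const struct timespec *start)
{
	long sec = ts->tv_sec - start->tv_sec;
	long nsec = ts->tv_nsec - start->tv_nsec;

	if (nsec < 0) {
		nsec += NSEC_PER_SEC;
		sec--;
	}
	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}

int main(void)
{
	/* Hypothetical VE start time: the host monotonic clock read at
	 * container start (what the kernel stores in ve->start_timespec).
	 */
	struct timespec start = { .tv_sec = 100, .tv_nsec = 900000000 };
	struct timespec now = { .tv_sec = 103, .tv_nsec = 250000000 };

	monotonic_to_ve(&now, &start);
	printf("in-VE monotonic: %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);	/* 2.350000000 */
	return 0;
}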