[Devel] [PATCH rh7 1/2] vdso: virtualized monotonic gettime through vdso

Andrey Ryabinin aryabinin at virtuozzo.com
Wed May 24 08:21:42 PDT 2017


We already have infrastructure for virtualized vdso, however we use
it only to change LINUX_VERSION_NAME in container. Simply store container's
start time - ve->start_timespec in vdso variable - VDSO64_ve_start_timespec,
and use it in __vdso_clock_gettime() to calculate container's monotonic time.

Make uts_arch_setup_additional_pages()/uts_prep_vdso_pages_locked() to always
setup new vdso, since previous policy to setup vdso only if uts_ns->name.release
wouldn't work for virtualized __vdso_clock_gettime()

https://jira.sw.ru/browse/PSBM-66451
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 arch/x86/vdso/vclock_gettime.c | 45 +++++++++++++++++++++++++++++++++++++++++-
 arch/x86/vdso/vdso32-setup.c   | 10 ++++------
 arch/x86/vdso/vma.c            |  7 +++----
 3 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index f079e1fd5633..3a5b319984c7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -27,6 +27,10 @@
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
+struct timespec VDSO64_ve_start_timespec;
+extern struct timespec VDSO32_ve_start_timespec
+	__attribute__((weak, alias("VDSO64_ve_start_timespec")));
+
 notrace static cycle_t vread_tsc(void)
 {
 	cycle_t ret = (cycle_t)rdtsc_ordered();
@@ -175,6 +179,43 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 	return mode;
 }
 
+notrace static struct timespec *get_ve_timespec(void)
+{
+	struct timespec *ret;
+	asm volatile ("lea VDSO64_ve_start_timespec(%%rip),%0\n": "=r"(ret));
+	return ret;
+}
+
+notrace static void vdso_set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		/*
+		 * The following asm() prevents the compiler from
+		 * optimising this loop into a modulo operation. See
+		 * also __iter_div_u64_rem() in include/linux/time.h
+		 */
+		asm("" : "+rm"(nsec));
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		asm("" : "+rm"(nsec));
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
+static void monotonic_time_to_ve(struct timespec *ts)
+{
+	struct timespec *ve_timespec = get_ve_timespec();
+
+	vdso_set_normalized_timespec(ts,
+		ts->tv_sec - ve_timespec->tv_sec,
+		ts->tv_nsec - ve_timespec->tv_nsec);
+}
+
 notrace static int do_monotonic(struct timespec *ts)
 {
 	unsigned long seq;
@@ -190,8 +231,9 @@ notrace static int do_monotonic(struct timespec *ts)
 		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
-	timespec_add_ns(ts, ns);
 
+	timespec_add_ns(ts, ns);
+	monotonic_time_to_ve(ts);
 	return mode;
 }
 
@@ -215,6 +257,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
 		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
+	monotonic_time_to_ve(ts);
 	return 0;
 }
 
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 5056d0ec9ab7..a082f1541f3c 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -344,10 +344,8 @@ static struct page **uts_prep_vdso_pages_locked(int map)
 		 * preallocated one.
 		 */
 		new_version = KERNEL_VERSION(n1, n2, n3);
-		if (new_version == LINUX_VERSION_CODE)
-			goto out;
 #ifdef CONFIG_X86_32
-		else {
+		{
 			/*
 			 * Native x86-32 mode requires vDSO runtime
 			 * relocations applied which is not supported
@@ -370,8 +368,8 @@ static struct page **uts_prep_vdso_pages_locked(int map)
 		 * better than walk out with error.
 		 */
 		pr_warn_once("Wrong release uts name format detected."
-			     " Ignoring vDSO virtualization.\n");
-		goto out;
+			     " Using host's uts name.\n");
+		new_version = LINUX_VERSION_CODE;
 	}
 
 	mutex_lock(&vdso32_mutex);
@@ -402,6 +400,7 @@ static struct page **uts_prep_vdso_pages_locked(int map)
 
 	addr = page_address(new_pages[0]);
 	*((int *)(addr + uts_ns->vdso32.version_off)) = new_version;
+	*((struct timespec*)(VDSO32_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
 	smp_wmb();
 
 	pages = uts_ns->vdso32.pages = new_pages;
@@ -411,7 +410,6 @@ static struct page **uts_prep_vdso_pages_locked(int map)
 
 out_unlock:
 	mutex_unlock(&vdso32_mutex);
-out:
 	down_write(&mm->mmap_sem);
 	return pages;
 }
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index a862c79e2e91..ad0e0ac14f83 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -244,8 +244,6 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
 		 * preallocated one.
 		 */
 		new_version = KERNEL_VERSION(n1, n2, n3);
-		if (new_version == LINUX_VERSION_CODE)
-			goto map_init_uts;
 	} else {
 		/*
 		 * If admin is passed malformed string here
@@ -254,8 +252,8 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
 		 * better than walk out with error.
 		 */
 		pr_warn_once("Wrong release uts name format detected."
-			     " Ignoring vDSO virtualization.\n");
-		goto map_init_uts;
+			     " Using host's uts name.\n");
+		new_version = LINUX_VERSION_CODE;
 	}
 
 	mutex_lock(&vdso_mutex);
@@ -296,6 +294,7 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
 	}
 
 	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
+	*((struct timespec*)(VDSO64_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
 	smp_wmb();
 	uts_ns->vdso.pages = new_pages;
 	mutex_unlock(&vdso_mutex);
-- 
2.13.0



More information about the Devel mailing list