[Devel] [PATCH rh7 1/2] vdso: virtualized monotonic gettime through vdso

Dmitry Safonov dsafonov at virtuozzo.com
Wed May 24 09:29:03 PDT 2017


On 05/24/2017 06:21 PM, Andrey Ryabinin wrote:
> We already have infrastructure for a virtualized vdso, however we use
> it only to change LINUX_VERSION_NAME in a container. Simply store the
> container's start time - ve->start_timespec - in the vdso variable
> VDSO64_ve_start_timespec, and use it in __vdso_clock_gettime() to calculate
> the container's monotonic time.
> 
> Make uts_arch_setup_additional_pages()/uts_prep_vdso_pages_locked() always
> set up a new vdso, since the previous policy of setting up the vdso only when
> uts_ns->name.release differs from the host's wouldn't work for the
> virtualized __vdso_clock_gettime().
> 
> https://jira.sw.ru/browse/PSBM-66451
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
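
(For the archives, my understanding of the mechanism - please correct me if
I'm wrong: ve->start_timespec is assumed to be the CT's start time expressed
in the host's CLOCK_MONOTONIC, so e.g. with the host clock at 1000.300000000
and ve->start_timespec == 990.500000000 the CT should read back 9.800000000,
after vdso_set_normalized_timespec() fixes up the negative nsec.)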

Well, that all looks good, but I have two questions:

1. Where is VDSO64_ve_start_timespec set? I do not see that part.

2. The part with the unconditional creation of the vdso pages array in
    uts_arch_setup_additional_pages()/uts_prep_vdso_pages_locked()
    will result in each exec() creating a new vdso for all tasks in a CT.
    So that will cost n*8kB of additional memory, where n is the
    number of exec()'ed tasks in CTs (e.g. 10000 such tasks would pin
    roughly 80MB). I'm not sure how large n can get, so that may be OK.
    But can we find a way to make it p*8kB, where p is the number of CTs?

    As far as I can see, the vdso pages array was also copied for UTS
    virtualization, where it was needed previously, so question (2) may
    not be related to this patch set but to the way it is already done.
    FWIW, what I mean here is that it would be worth having one copy of
    the vdso pages per CT.

    Maybe I'm trying to rearrange the deck chairs on the Titanic and
    that increase is just insignificant.
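
As a quick sanity check from inside a CT (just a sketch of mine, not part of
the patch): with glibc on x86_64 the call below goes through
__vdso_clock_gettime(), so with this series it should report time since the
CT started rather than since host boot:

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct timespec ts;

		/* resolved to the vDSO by glibc, no syscall on the fast path */
		if (clock_gettime(CLOCK_MONOTONIC, &ts))
			return 1;
		printf("CLOCK_MONOTONIC: %ld.%09ld\n",
		       (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}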

> ---
>   arch/x86/vdso/vclock_gettime.c | 45 +++++++++++++++++++++++++++++++++++++++++-
>   arch/x86/vdso/vdso32-setup.c   | 10 ++++------
>   arch/x86/vdso/vma.c            |  7 +++----
>   3 files changed, 51 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> index f079e1fd5633..3a5b319984c7 100644
> --- a/arch/x86/vdso/vclock_gettime.c
> +++ b/arch/x86/vdso/vclock_gettime.c
> @@ -27,6 +27,10 @@
>   
>   #define gtod (&VVAR(vsyscall_gtod_data))
>   
> +struct timespec VDSO64_ve_start_timespec;
> +extern struct timespec VDSO32_ve_start_timespec
> +	__attribute__((weak, alias("VDSO64_ve_start_timespec")));
> +
>   notrace static cycle_t vread_tsc(void)
>   {
>   	cycle_t ret = (cycle_t)rdtsc_ordered();
> @@ -175,6 +179,43 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
>   	return mode;
>   }
>   
> +notrace static struct timespec *get_ve_timespec(void)
> +{
> +	struct timespec *ret;
> +	asm volatile ("lea VDSO64_ve_start_timespec(%%rip),%0\n": "=r"(ret));
> +	return ret;
> +}
> +
> +notrace static void vdso_set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
> +{
> +	while (nsec >= NSEC_PER_SEC) {
> +		/*
> +		 * The following asm() prevents the compiler from
> +		 * optimising this loop into a modulo operation. See
> +		 * also __iter_div_u64_rem() in include/linux/time.h
> +		 */
> +		asm("" : "+rm"(nsec));
> +		nsec -= NSEC_PER_SEC;
> +		++sec;
> +	}
> +	while (nsec < 0) {
> +		asm("" : "+rm"(nsec));
> +		nsec += NSEC_PER_SEC;
> +		--sec;
> +	}
> +	ts->tv_sec = sec;
> +	ts->tv_nsec = nsec;
> +}
> +
> +static void monotonic_time_to_ve(struct timespec *ts)
> +{
> +	struct timespec *ve_timespec = get_ve_timespec();
> +
> +	vdso_set_normalized_timespec(ts,
> +		ts->tv_sec - ve_timespec->tv_sec,
> +		ts->tv_nsec - ve_timespec->tv_nsec);
> +}
> +
>   notrace static int do_monotonic(struct timespec *ts)
>   {
>   	unsigned long seq;
> @@ -190,8 +231,9 @@ notrace static int do_monotonic(struct timespec *ts)
>   		ns += vgetsns(&mode);
>   		ns >>= gtod->clock.shift;
>   	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
> -	timespec_add_ns(ts, ns);
>   
> +	timespec_add_ns(ts, ns);
> +	monotonic_time_to_ve(ts);
>   	return mode;
>   }
>   
> @@ -215,6 +257,7 @@ notrace static int do_monotonic_coarse(struct timespec *ts)
>   		ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec;
>   	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
>   
> +	monotonic_time_to_ve(ts);
>   	return 0;
>   }
>   
> diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
> index 5056d0ec9ab7..a082f1541f3c 100644
> --- a/arch/x86/vdso/vdso32-setup.c
> +++ b/arch/x86/vdso/vdso32-setup.c
> @@ -344,10 +344,8 @@ static struct page **uts_prep_vdso_pages_locked(int map)
>   		 * preallocated one.
>   		 */
>   		new_version = KERNEL_VERSION(n1, n2, n3);
> -		if (new_version == LINUX_VERSION_CODE)
> -			goto out;
>   #ifdef CONFIG_X86_32
> -		else {
> +		{
>   			/*
>   			 * Native x86-32 mode requires vDSO runtime
>   			 * relocations applied which is not supported
> @@ -370,8 +368,8 @@ static struct page **uts_prep_vdso_pages_locked(int map)
>   		 * better than walk out with error.
>   		 */
>   		pr_warn_once("Wrong release uts name format detected."
> -			     " Ignoring vDSO virtualization.\n");
> -		goto out;
> +			     " Using host's uts name.\n");
> +		new_version = LINUX_VERSION_CODE;
>   	}
>   
>   	mutex_lock(&vdso32_mutex);
> @@ -402,6 +400,7 @@ static struct page **uts_prep_vdso_pages_locked(int map)
>   
>   	addr = page_address(new_pages[0]);
>   	*((int *)(addr + uts_ns->vdso32.version_off)) = new_version;
> +	*((struct timespec*)(VDSO32_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
>   	smp_wmb();
>   
>   	pages = uts_ns->vdso32.pages = new_pages;
> @@ -411,7 +410,6 @@ static struct page **uts_prep_vdso_pages_locked(int map)
>   
>   out_unlock:
>   	mutex_unlock(&vdso32_mutex);
> -out:
>   	down_write(&mm->mmap_sem);
>   	return pages;
>   }
> diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
> index a862c79e2e91..ad0e0ac14f83 100644
> --- a/arch/x86/vdso/vma.c
> +++ b/arch/x86/vdso/vma.c
> @@ -244,8 +244,6 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
>   		 * preallocated one.
>   		 */
>   		new_version = KERNEL_VERSION(n1, n2, n3);
> -		if (new_version == LINUX_VERSION_CODE)
> -			goto map_init_uts;
>   	} else {
>   		/*
>   		 * If admin is passed malformed string here
> @@ -254,8 +252,8 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
>   		 * better than walk out with error.
>   		 */
>   		pr_warn_once("Wrong release uts name format detected."
> -			     " Ignoring vDSO virtualization.\n");
> -		goto map_init_uts;
> +			     " Using host's uts name.\n");
> +		new_version = LINUX_VERSION_CODE;
>   	}
>   
>   	mutex_lock(&vdso_mutex);
> @@ -296,6 +294,7 @@ static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_i
>   	}
>   
>   	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
> +	*((struct timespec*)(VDSO64_SYMBOL(uts_ns->vdso.addr, ve_start_timespec))) = ve->start_timespec;
>   	smp_wmb();
>   	uts_ns->vdso.pages = new_pages;
>   	mutex_unlock(&vdso_mutex);
> 


-- 
              Dmitry

