[Devel] [PATCH RHEL7 COMMIT] ve/vdso: x86-64 -- Implement kernel version virtualization

Konstantin Khorenko khorenko at virtuozzo.com
Wed Jun 24 06:15:11 PDT 2015


The commit is pushed to "branch-rh7-3.10.0-123.1.2-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-123.1.2.vz7.5.17
------>
commit 61f67be77dc3cc3de62925fe3972c33ea653cd5a
Author: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Date:   Wed Jun 24 17:15:11 2015 +0400

    ve/vdso: x86-64 -- Implement kernel version virtualization
    
    The kernel carries its version inside the .note section of the vDSO object,
    which is mapped into every process upon ELF execution. Starting with some
    version of libc, it checks which kernel version the container is running on
    by parsing the vDSO ELF image, so our virtualization via
    /proc/sys/kernel/virt_osrelease no longer works (in particular, the OpenSUSE
    13.2 template refuses to start if it meets the old 2.6.32 version inside vDSO).
    
    To fix this, we keep a per-uts-namespace copy of the vDSO if a virtualization
    request has been detected. The request is detected this way: upon the
    first mapping of the vDSO we check whether @virt_osrelease contains a version
    string different from the one compiled in. If so, we create a copy
    of the vDSO pages and patch the version entry.
    
    Note that this patch handles only pure x86-64 mode; compat mode will be
    handled separately in another patch.
    
    https://jira.sw.ru/browse/PSBM-30093
    https://bugzilla.openvz.org/show_bug.cgi?id=2768
    
    Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    CC: Vladimir Davydov <vdavydov at virtuozzo.com>
    CC: Konstantin Khorenko <khorenko at virtuozzo.com>
    
    How to test it:
    - run a Container (based on OpenSUSE 13.2 template)
    - echo "3.1.2" > /sys/fs/cgroup/ve/$ctid/ve.os_release
    - check https://bugzilla.openvz.org/show_bug.cgi?id=2768
      and use http://2768.bugzilla.openvz.org/attachment.cgi?id=2204
      to check kernel version in VDSO inside the Container
---
 arch/x86/include/asm/vdso.h | 14 +++++++
 arch/x86/vdso/vdso-note.S   |  2 +
 arch/x86/vdso/vma.c         | 94 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/utsname.h     | 13 +++++++
 kernel/utsname.c            | 16 +++++++-
 5 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index fddb53d..d15a3dd 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,6 +1,20 @@
 #ifndef _ASM_X86_VDSO_H
 #define _ASM_X86_VDSO_H
 
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given the base address of a vDSO image, return a pointer to the symbol
+ * VDSO64_<name>, located at its link-time offset from VDSO64_PRELINK.
+ */
+#define VDSO64_SYMBOL(base, name)					\
+({									\
+	extern const char VDSO64_##name[];				\
+	(void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
+})
+#endif
+
 #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
 extern const char VDSO32_PRELINK[];
 
diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/vdso/vdso-note.S
index 79a071e..b3b202b 100644
--- a/arch/x86/vdso/vdso-note.S
+++ b/arch/x86/vdso/vdso-note.S
@@ -7,6 +7,8 @@
 #include <linux/version.h>
 #include <linux/elfnote.h>
 
+	.globl VDSO64_linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+VDSO64_linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 431e875..4b86291 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -16,6 +16,10 @@
 #include <asm/vdso.h>
 #include <asm/page.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 unsigned int __read_mostly vdso_enabled = 1;
 
 extern char vdso_start[], vdso_end[];
@@ -111,6 +115,12 @@ static int __init init_vdso(void)
 		vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE);
 #endif
 
+	init_uts_ns.vdso.addr		= vdso_start;
+	init_uts_ns.vdso.pages		= vdso_pages;
+	init_uts_ns.vdso.nr_pages	= npages;
+	init_uts_ns.vdso.size		= vdso_size;
+	init_uts_ns.vdso.version_off	= (unsigned long)VDSO64_SYMBOL(0, linux_version_code);
+
 	return 0;
 }
 subsys_initcall(init_vdso);
@@ -184,10 +194,90 @@ up_fail:
 	return ret;
 }
 
+static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct ve_struct *ve = get_exec_env();
+	int i, n1, n2, n3, new_version;
+
+	/*
+	 * Pick the vDSO image to map for this exec. Tasks in init_uts_ns
+	 * (the node itself) always get the preallocated original vDSO.
+	 *
+	 * If this UTS namespace already owns a patched copy, reuse it
+	 * instead of building a new one; it improves speed significantly.
+	 */
+	if (uts_ns == &init_uts_ns)
+		goto map_init_uts;
+	else if (uts_ns->vdso.pages)
+		goto map_uts;
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If the requested version equals the compiled-in one,
+		 * simply reuse the preallocated original vDSO.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+		if (new_version == LINUX_VERSION_CODE)
+			goto map_init_uts;
+	} else {
+		/*
+		 * If the admin has set a malformed release string,
+		 * warn once but keep working without any vDSO
+		 * virtualization at all. That is better than
+		 * bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto map_init_uts;
+	}
+
+	uts_ns->vdso.nr_pages	= init_uts_ns.vdso.nr_pages;
+	uts_ns->vdso.size	= init_uts_ns.vdso.size;
+	uts_ns->vdso.version_off= init_uts_ns.vdso.version_off;
+	uts_ns->vdso.pages	= kmalloc(sizeof(struct page *) * init_uts_ns.vdso.nr_pages, GFP_KERNEL);
+	if (!uts_ns->vdso.pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < uts_ns->vdso.nr_pages; i++) {
+		struct page *p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			pr_err("Can't allocate page for VE %d\n", ve->veid);
+			for (; i > 0; i--)
+				put_page(uts_ns->vdso.pages[i - 1]);
+			kfree(uts_ns->vdso.pages);
+			uts_ns->vdso.pages = NULL;
+			return -ENOMEM;
+		}
+		uts_ns->vdso.pages[i] = p;
+		copy_page(page_address(p), page_address(init_uts_ns.vdso.pages[i]));
+	}
+
+	uts_ns->vdso.addr = vmap(uts_ns->vdso.pages, uts_ns->vdso.nr_pages, 0, PAGE_KERNEL);
+	if (!uts_ns->vdso.addr) {
+		pr_err("Can't map vDSO pages for VE %d\n", ve->veid);
+		for (i = 0; i < uts_ns->vdso.nr_pages; i++)
+			put_page(uts_ns->vdso.pages[i]);
+		kfree(uts_ns->vdso.pages);
+		uts_ns->vdso.pages = NULL;
+		return -ENOMEM;
+	}
+
+	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+map_uts:
+	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages, uts_ns->vdso.size);
+map_init_uts:
+	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages, init_uts_ns.vdso.size);
+}
+
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return setup_additional_pages(bprm, uses_interp, vdso_pages,
-				      vdso_size);
+	return uts_arch_setup_additional_pages(bprm, uses_interp);
 }
 
 #ifdef CONFIG_X86_X32_ABI
diff --git a/include/linux/utsname.h b/include/linux/utsname.h
index 2b680bf..ccd270f 100644
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -19,11 +19,24 @@ enum uts_proc {
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+#ifdef CONFIG_X86
+struct uts_vdso {
+	void			*addr;		/* kernel address of the vDSO image */
+	struct page		**pages;	/* pages backing the image; NULL until set up */
+	unsigned int		nr_pages;	/* number of entries in @pages */
+	unsigned int		size;		/* image size handed to setup_additional_pages() */
+	unsigned long		version_off;	/* offset of the version word from @addr */
+};
+#endif
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 	struct user_namespace *user_ns;
 	unsigned int proc_inum;
+#ifdef CONFIG_X86
+	struct uts_vdso vdso;
+#endif
 };
 extern struct uts_namespace init_uts_ns;
 extern struct new_utsname virt_utsname;
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 2fc8576..1980ddb 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 
@@ -22,8 +23,12 @@ static struct uts_namespace *create_uts_ns(void)
 	struct uts_namespace *uts_ns;
 
 	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
-	if (uts_ns)
+	if (uts_ns) {
+#ifdef CONFIG_X86
+		memset(&uts_ns->vdso, 0, sizeof(uts_ns->vdso));
+#endif
 		kref_init(&uts_ns->kref);
+	}
 	return uts_ns;
 }
 
@@ -85,6 +90,15 @@ void free_uts_ns(struct kref *kref)
 	ns = container_of(kref, struct uts_namespace, kref);
 	put_user_ns(ns->user_ns);
 	proc_free_inum(ns->proc_inum);
+#ifdef CONFIG_X86
+	if (ns->vdso.pages) {
+		int i;
+		vunmap(ns->vdso.addr);
+		for (i = 0; i < ns->vdso.nr_pages; i++)
+			put_page(ns->vdso.pages[i]);
+		kfree(ns->vdso.pages);
+	}
+#endif
 	kfree(ns);
 }
 



More information about the Devel mailing list