[Devel] [patch rh7 1/2] vdso: x86-64 -- Implement kernel version virtualization

Cyrill Gorcunov gorcunov at virtuozzo.com
Tue Jun 23 15:11:15 PDT 2015


The kernel carries its version inside the .note section of the vDSO object,
which is mapped into every process upon ELF execution. Starting with some
version, libc checks which kernel version the container is running on by
parsing the vDSO ELF image, so our virtualization via
/proc/sys/kernel/virt_osrelease no longer works (in particular, the OpenSUSE
13.2 template refuses to start if it meets the old 2.6.32 version inside the vDSO).

To fix this we carry a per-UTS-namespace copy of the vDSO once a virtualization
request has been detected. The request itself is detected this way: upon the
first mapping of the vDSO we check whether @virt_osrelease contains a version
string different from the one compiled in. If so, we create a copy
of the vDSO pages and patch the version entry.

Note that the patch handles only pure x86-64 mode; compat mode will be
handled separately in another patch.

https://jira.sw.ru/browse/PSBM-30093
https://bugzilla.openvz.org/show_bug.cgi?id=2768

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
CC: Vladimir Davydov <vdavydov at virtuozzo.com>
CC: Konstantin Khorenko <khorenko at virtuozzo.com>
---
 arch/x86/include/asm/vdso.h |   14 ++++++
 arch/x86/vdso/vdso-note.S   |    2 
 arch/x86/vdso/vma.c         |   94 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/utsname.h     |   13 ++++++
 kernel/utsname.c            |   16 +++++++
 5 files changed, 136 insertions(+), 3 deletions(-)

Index: linux-pcs7.git/arch/x86/include/asm/vdso.h
===================================================================
--- linux-pcs7.git.orig/arch/x86/include/asm/vdso.h
+++ linux-pcs7.git/arch/x86/include/asm/vdso.h
@@ -1,6 +1,20 @@
 #ifndef _ASM_X86_VDSO_H
 #define _ASM_X86_VDSO_H
 
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, return a pointer to symbol VDSO64_<name>
+ * (from the vDSO sources or linker script): base + (symbol - VDSO64_PRELINK).
+ */
+#define VDSO64_SYMBOL(base, name)					\
+({									\
+	extern const char VDSO64_##name[];				\
+	(void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
+})
+#endif
+
 #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
 extern const char VDSO32_PRELINK[];
 
Index: linux-pcs7.git/arch/x86/vdso/vdso-note.S
===================================================================
--- linux-pcs7.git.orig/arch/x86/vdso/vdso-note.S
+++ linux-pcs7.git/arch/x86/vdso/vdso-note.S
@@ -7,6 +7,8 @@
 #include <linux/version.h>
 #include <linux/elfnote.h>
 
+	.globl VDSO64_linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+VDSO64_linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
Index: linux-pcs7.git/arch/x86/vdso/vma.c
===================================================================
--- linux-pcs7.git.orig/arch/x86/vdso/vma.c
+++ linux-pcs7.git/arch/x86/vdso/vma.c
@@ -16,6 +16,10 @@
 #include <asm/vdso.h>
 #include <asm/page.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 unsigned int __read_mostly vdso_enabled = 1;
 
 extern char vdso_start[], vdso_end[];
@@ -111,6 +115,12 @@ static int __init init_vdso(void)
 		vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE);
 #endif
 
+	init_uts_ns.vdso.addr		= vdso_start;
+	init_uts_ns.vdso.pages		= vdso_pages;
+	init_uts_ns.vdso.nr_pages	= npages;
+	init_uts_ns.vdso.size		= vdso_size;
+	init_uts_ns.vdso.version_off	= (unsigned long)VDSO64_SYMBOL(0, linux_version_code);
+
 	return 0;
 }
 subsys_initcall(init_vdso);
@@ -184,10 +194,90 @@ up_fail:
 	return ret;
 }
 
+/*
+ * Map the vDSO for a new exec image; a UTS namespace whose release string
+ * encodes a version other than LINUX_VERSION_CODE gets its own patched copy.
+ * The copy is published with cmpxchg() only once complete, so racing execs
+ * never see a half-built one. Returns the mapping result or -ENOMEM.
+ */
+static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct ve_struct *ve = get_exec_env();
+	unsigned int i, nr = init_uts_ns.vdso.nr_pages;
+	int n1, n2, n3, new_version;
+	struct page **pages;
+	void *addr;
+
+	/* Initial namespace, or a copy is already published: nothing to build. */
+	if (uts_ns == &init_uts_ns)
+		goto map_init_uts;
+	if (ACCESS_ONCE(uts_ns->vdso.pages))
+		goto map_uts;
+
+	/* Malformed string or components outside 0..255: warn once, no copy. */
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) != 3 ||
+	    ((n1 | n2 | n3) & ~0xff)) {
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto map_init_uts;
+	}
+
+	/* Same version as compiled in: the original image is already right. */
+	new_version = KERNEL_VERSION(n1, n2, n3);
+	if (new_version == LINUX_VERSION_CODE)
+		goto map_init_uts;
+
+	pages = kmalloc_array(nr, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		return -ENOMEM;
+	}
+	for (i = 0; i < nr; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i]) {
+			pr_err("Can't allocate page for VE %d\n", ve->veid);
+			goto free_pages;
+		}
+		copy_page(page_address(pages[i]), page_address(init_uts_ns.vdso.pages[i]));
+	}
+	addr = vmap(pages, nr, 0, PAGE_KERNEL);
+	if (!addr) {
+		pr_err("Can't map vDSO pages for VE %d\n", ve->veid);
+		goto free_pages;
+	}
+	*(int *)(addr + init_uts_ns.vdso.version_off) = new_version;
+
+	/* Racers store identical values here; only ->pages needs cmpxchg(). */
+	uts_ns->vdso.nr_pages	= nr;
+	uts_ns->vdso.size	= init_uts_ns.vdso.size;
+	uts_ns->vdso.version_off= init_uts_ns.vdso.version_off;
+	if (cmpxchg(&uts_ns->vdso.pages, NULL, pages)) {
+		/* Lost the race: drop our copy and map the published one. */
+		vunmap(addr);
+		while (i--)
+			put_page(pages[i]);
+		kfree(pages);
+		goto map_uts;
+	}
+	uts_ns->vdso.addr = addr;	/* the winner's mapping, for free_uts_ns() */
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+map_uts:
+	smp_rmb();	/* read ->size only after ->pages was seen non-NULL */
+	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages, uts_ns->vdso.size);
+map_init_uts:
+	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages, init_uts_ns.vdso.size);
+free_pages:
+	while (i--)
+		put_page(pages[i]);
+	kfree(pages);
+	return -ENOMEM;
+}
+
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return setup_additional_pages(bprm, uses_interp, vdso_pages,
-				      vdso_size);
+	return uts_arch_setup_additional_pages(bprm, uses_interp);	/* per-UTS-ns vDSO */
 }
 
 #ifdef CONFIG_X86_X32_ABI
Index: linux-pcs7.git/include/linux/utsname.h
===================================================================
--- linux-pcs7.git.orig/include/linux/utsname.h
+++ linux-pcs7.git/include/linux/utsname.h
@@ -19,11 +19,24 @@ enum uts_proc {
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+#ifdef CONFIG_X86
+struct uts_vdso {
+	void			*addr;		/* kernel address of the vDSO image */
+	struct page		**pages;	/* pages backing the image */
+	unsigned int		nr_pages;	/* number of entries in @pages */
+	unsigned int		size;		/* image size passed when mapping */
+	unsigned long		version_off;	/* offset of the version code in the image */
+};
+#endif
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 	struct user_namespace *user_ns;
 	unsigned int proc_inum;
+#ifdef CONFIG_X86
+	struct uts_vdso vdso;	/* vDSO image for this namespace, valid once .pages is set */
+#endif
 };
 extern struct uts_namespace init_uts_ns;
 extern struct new_utsname virt_utsname;
Index: linux-pcs7.git/kernel/utsname.c
===================================================================
--- linux-pcs7.git.orig/kernel/utsname.c
+++ linux-pcs7.git/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 
@@ -22,8 +23,12 @@ static struct uts_namespace *create_uts_
 	struct uts_namespace *uts_ns;
 
 	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
-	if (uts_ns)
+	if (uts_ns) {
+#ifdef CONFIG_X86
+		memset(&uts_ns->vdso, 0, sizeof(uts_ns->vdso));
+#endif
 		kref_init(&uts_ns->kref);
+	}
 	return uts_ns;
 }
 
@@ -85,6 +90,15 @@ void free_uts_ns(struct kref *kref)
 	ns = container_of(kref, struct uts_namespace, kref);
 	put_user_ns(ns->user_ns);
 	proc_free_inum(ns->proc_inum);
+#ifdef CONFIG_X86
+	if (ns->vdso.pages) {
+		int i;
+		vunmap(ns->vdso.addr);
+		for (i = 0; i < ns->vdso.nr_pages; i++)
+			put_page(ns->vdso.pages[i]);
+		kfree(ns->vdso.pages);
+	}
+#endif
 	kfree(ns);
 }
 




More information about the Devel mailing list