[Devel] [PATCH vz9 18/23] ve, x86_64: add per-ve vdso mapping.

Nikita Yushchenko nikita.yushchenko at virtuozzo.com
Fri Oct 1 18:53:26 MSK 2021


From: Andrey Ryabinin <aryabinin at virtuozzo.com>

Make the vdso mapping per-ve. This will allow per-container modification
of the Linux version reported in the .note section of the vdso, and of monotonic time.

https://jira.sw.ru/browse/PSBM-121668
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

+++
ve: fix copy_vdso error handling

Otherwise we would return a NULL pointer (e.g. to cgroup_init_subsys),
IS_ERR() would report that it is not an error, and the caller would
incorrectly treat ve_start as successful when it is not.

https://jira.sw.ru/browse/PSBM-131158

Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

+++
vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap

vdso_data is mapped into userspace, which means we can't use
kmalloc() to allocate it. kmalloc() doesn't even guarantee
that we will get page-aligned memory.

 kernel BUG at include/linux/mm.h:693!
 RIP: 0010:unmap_page_range+0x15f2/0x2630
 Call Trace:
  unmap_vmas+0x11e/0x1d0
  exit_mmap+0x215/0x420
  mmput+0x10a/0x400
  do_exit+0x98f/0x2d00
  do_group_exit+0xec/0x2b0
  __x64_sys_exit_group+0x3a/0x50
  do_syscall_64+0xa5/0x4d0
  entry_SYSCALL_64_after_hwframe+0x6a/0xdf

Use alloc_pages_exact() to allocate it. We can't use
alloc_pages() or __get_free_pages() here, since vdso_fault()
needs to perform get_page() on individual sub-pages and alloc_pages()
doesn't initialize sub-pages.

https://jira.sw.ru/browse/PSBM-123551
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

(cherry-picked from vz8 commit 12c3967a0009 ("ve, x86_64: add per-ve vdso
mapping."))

Signed-off-by: Nikita Yushchenko <nikita.yushchenko at virtuozzo.com>
---
 arch/x86/entry/vdso/vma.c    |  4 +++-
 arch/x86/kernel/process_64.c |  2 +-
 include/linux/ve.h           |  3 +++
 kernel/ve/ve.c               | 44 ++++++++++++++++++++++++++++++++++++
 4 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 235a5794296a..e58417321af2 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -15,6 +15,7 @@
 #include <linux/cpu.h>
 #include <linux/ptrace.h>
 #include <linux/time_namespace.h>
+#include <linux/ve.h>
 
 #include <asm/pvclock.h>
 #include <asm/vgtod.h>
@@ -391,7 +392,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (!vdso64_enabled)
 		return 0;
 
-	return map_vdso_randomized(&vdso_image_64);
+
+	return map_vdso_randomized(get_exec_env()->vdso_64);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4811c8669f92..206cdb4793f5 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -829,7 +829,7 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 		return prctl_map_vdso(&vdso_image_32, arg2);
 # endif
 	case ARCH_MAP_VDSO_64:
-		return prctl_map_vdso(&vdso_image_64, arg2);
+		return prctl_map_vdso(get_exec_env()->vdso_64, arg2);
 #endif
 
 	default:
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 95dcd99267df..741867427f57 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -16,6 +16,7 @@
 #include <linux/kmapset.h>
 #include <linux/kthread.h>
 #include <linux/vzstat.h>
+#include <asm/vdso.h>
 
 struct nsproxy;
 struct veip_struct;
@@ -71,6 +72,8 @@ struct ve_struct {
 
 	struct kthread_worker	umh_worker;
 	struct task_struct	*umh_task;
+
+	struct vdso_image	*vdso_64;
 };
 
 #define VE_MEMINFO_DEFAULT	1	/* default behaviour */
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 178aa658b50b..6a3248efaf07 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -55,6 +55,7 @@ struct ve_struct ve0 = {
 					2,
 #endif
 	.meminfo_val		= VE_MEMINFO_SYSTEM,
+	.vdso_64		= (struct vdso_image*)&vdso_image_64,
 };
 EXPORT_SYMBOL(ve0);
 
@@ -562,6 +563,33 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 	up_write(&ve->op_sem);
 }
 
+static int copy_vdso(struct ve_struct *ve)
+{
+	const struct vdso_image *vdso_src = &vdso_image_64;
+	struct vdso_image *vdso;
+	void *vdso_data;
+
+	if (ve->vdso_64)
+		return 0;
+
+	vdso = kmemdup(vdso_src, sizeof(*vdso), GFP_KERNEL);
+	if (!vdso)
+		return -ENOMEM;
+
+	vdso_data = alloc_pages_exact(vdso_src->size, GFP_KERNEL);
+	if (!vdso_data) {
+		kfree(vdso);
+		return -ENOMEM;
+	}
+
+	memcpy(vdso_data, vdso_src->data, vdso_src->size);
+
+	vdso->data = vdso_data;
+
+	ve->vdso_64 = vdso;
+	return 0;
+}
+
 static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_css)
 {
 	struct ve_struct *ve = &ve0;
@@ -595,12 +623,18 @@ static struct cgroup_subsys_state *ve_create(struct cgroup_subsys_state *parent_
 	if (err)
 		goto err_log;
 
+	err = copy_vdso(ve);
+	if (err)
+		goto err_vdso;
+
 do_init:
 	init_rwsem(&ve->op_sem);
 	INIT_LIST_HEAD(&ve->ve_list);
 	kmapset_init_key(&ve->sysfs_perms_key);
 	return &ve->css;
 
+err_vdso:
+	ve_log_destroy(ve);
 err_log:
 	free_percpu(ve->sched_lat_ve.cur);
 err_lat:
@@ -639,12 +673,22 @@ static void ve_offline(struct cgroup_subsys_state *css)
 	ve->ve_name = NULL;
 }
 
+static void ve_free_vdso(struct ve_struct *ve)
+{
+	if (ve->vdso_64 == &vdso_image_64)
+		return;
+
+	free_pages_exact(ve->vdso_64->data, ve->vdso_64->size);
+	kfree(ve->vdso_64);
+}
+
 static void ve_destroy(struct cgroup_subsys_state *css)
 {
 	struct ve_struct *ve = css_to_ve(css);
 
 	kmapset_unlink(&ve->sysfs_perms_key, &sysfs_ve_perms_set);
 	ve_log_destroy(ve);
+	ve_free_vdso(ve);
 	free_percpu(ve->sched_lat_ve.cur);
 	kmem_cache_free(ve_cachep, ve);
 }
-- 
2.30.2



More information about the Devel mailing list