[CRIU] [PATCH 2/4] arch/x86: implement the process_vm_exec syscall

Andrei Vagin avagin at gmail.com
Wed Apr 14 08:52:15 MSK 2021


This change introduces the new system call:
process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
		siginfo_t *uinfo, sigset_t *sigmask, size_t sizemask)

process_vm_exec allows the current process to execute in the address
space of another process.

process_vm_exec swaps the current address space with the address space of
the specified process, sets the state from sigcontext, and resumes
execution. When the process receives a signal or makes a system call,
process_vm_exec saves the current state back to sigcontext, restores the
original address space and the original process state, and returns to
userspace.

If process_vm_exec is interrupted by a signal that is in sigmask, the
signal is dequeued and information about it is saved in uinfo.
If process_vm_exec is interrupted by a system call, a synthetic siginfo
for the SIGSYS signal is generated and saved in uinfo.

The behavior of this system call is similar to PTRACE_SYSEMU, but
everything happens in the context of a single process, so
process_vm_exec performs much better.

PTRACE_SYSEMU is primarily used to implement sandboxes (application
kernels) like User-mode Linux or gVisor. These kinds of sandboxes
intercept application system calls and act as the guest kernel.
A simple benchmark, in which a "tracee" process executes system calls in
a loop and a "tracer" process traps the syscalls and handles them by just
incrementing the tracee's instruction pointer to skip the syscall
instruction, shows that process_vm_exec is more than 5 times faster
than PTRACE_SYSEMU.
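
For comparison, the PTRACE_SYSEMU side of such a benchmark could look
roughly like the sketch below (an illustration only, not the benchmark
that produced the numbers above; it merely fakes a successful return
value instead of emulating anything):

#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>

/* Baseline tracer loop: stop the tracee at every syscall, suppress the
 * syscall, and "handle" it by forcing a zero return value. */
static void sysemu_loop(pid_t pid)
{
	struct user_regs_struct regs;
	int status;

	for (;;) {
		if (ptrace(PTRACE_SYSEMU, pid, 0, 0) < 0)
			break;
		if (waitpid(pid, &status, 0) < 0 || WIFEXITED(status))
			break;

		ptrace(PTRACE_GETREGS, pid, 0, &regs);
		regs.rax = 0;	/* pretend the syscall succeeded */
		ptrace(PTRACE_SETREGS, pid, 0, &regs);
	}
}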

Signed-off-by: Andrei Vagin <avagin at gmail.com>
---
 arch/Kconfig                           |  15 +++
 arch/x86/Kconfig                       |   1 +
 arch/x86/entry/common.c                |  16 +++
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/include/asm/sigcontext.h      |   2 +
 arch/x86/kernel/Makefile               |   1 +
 arch/x86/kernel/process_vm_exec.c      | 133 +++++++++++++++++++++++++
 arch/x86/kernel/signal.c               |  47 +++++++++
 include/linux/process_vm_exec.h        |  15 +++
 include/linux/sched.h                  |   7 ++
 include/linux/syscalls.h               |   6 ++
 include/uapi/asm-generic/unistd.h      |   4 +-
 kernel/fork.c                          |   9 ++
 kernel/sys_ni.c                        |   2 +
 14 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/process_vm_exec.c
 create mode 100644 include/linux/process_vm_exec.h

diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..3ed9b8fb1727 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -514,6 +514,21 @@ config SECCOMP_FILTER
 
 	  See Documentation/userspace-api/seccomp_filter.rst for details.
 
+config HAVE_ARCH_PROCESS_VM_EXEC
+	bool
+	help
+	  An arch should select this symbol to support the process_vm_exec system call.
+
+config PROCESS_VM_EXEC
+	prompt "Enable the process_vm_exec syscall"
+	def_bool y
+	depends on HAVE_ARCH_PROCESS_VM_EXEC
+	help
+	  process_vm_exec allows executing code and system calls in a specified
+	  address space.
+
+	  If unsure, say Y.
+
 config HAVE_ARCH_STACKLEAK
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fbf26e0f7a6a..1c7ebb58865e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_SOFT_DIRTY
+	select HAVE_ARCH_PROCESS_VM_EXEC
 	select MODULES_USE_ELF_RELA
 	select NEED_DMA_MAP_STATE
 	select SWIOTLB
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..42eac459b25b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@
 #include <linux/nospec.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/process_vm_exec.h>
 
 #ifdef CONFIG_XEN_PV
 #include <xen/xen-ops.h>
@@ -38,6 +39,21 @@
 #ifdef CONFIG_X86_64
 __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
 {
+#ifdef CONFIG_PROCESS_VM_EXEC
+	if (current->exec_mm && current->exec_mm->ctx) {
+		kernel_siginfo_t info = {
+			.si_signo = SIGSYS,
+			.si_call_addr = (void __user *)KSTK_EIP(current),
+			.si_arch = syscall_get_arch(current),
+			.si_syscall = nr,
+		};
+		restore_vm_exec_context(regs);
+		regs->ax = copy_siginfo_to_user(current->exec_mm->siginfo, &info);
+		syscall_exit_to_user_mode(regs);
+		return;
+	}
+#endif
+
 	nr = syscall_enter_from_user_mode(regs, nr);
 
 	instrumentation_begin();
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 379819244b91..2a8e27b2d87e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -362,6 +362,7 @@
 438	common	pidfd_getfd		sys_pidfd_getfd
 439	common	faccessat2		sys_faccessat2
 440	common	process_madvise		sys_process_madvise
+441	64	process_vm_exec		sys_process_vm_exec
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 140d890c2c98..e390410cc3e9 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -6,4 +6,6 @@
 
 #include <uapi/asm/sigcontext.h>
 
+extern long swap_vm_exec_context(struct sigcontext __user *uctx);
+
 #endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..d053289fd19e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -163,3 +163,4 @@ ifeq ($(CONFIG_X86_64),y)
 endif
 
 obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)	+= ima_arch.o
+obj-$(CONFIG_PROCESS_VM_EXEC)	+= process_vm_exec.o
diff --git a/arch/x86/kernel/process_vm_exec.c b/arch/x86/kernel/process_vm_exec.c
new file mode 100644
index 000000000000..28b32330f744
--- /dev/null
+++ b/arch/x86/kernel/process_vm_exec.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+#include <asm/mmu_context.h>
+#include <asm/sigcontext.h>
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
+#include <linux/vmacache.h>
+#include <linux/process_vm_exec.h>
+
+static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *active_mm;
+
+	task_lock(tsk);
+	/* Hold off tlb flush IPIs while switching mm's */
+	local_irq_disable();
+
+	sync_mm_rss(prev_mm);
+
+	vmacache_flush(tsk);
+
+	active_mm = tsk->active_mm;
+	if (active_mm != target_mm) {
+		mmgrab(target_mm);
+		tsk->active_mm = target_mm;
+	}
+	tsk->mm = target_mm;
+	switch_mm_irqs_off(active_mm, target_mm, tsk);
+	local_irq_enable();
+	task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+	finish_arch_post_lock_switch();
+#endif
+
+	if (active_mm != target_mm)
+		mmdrop(active_mm);
+}
+
+void restore_vm_exec_context(struct pt_regs *regs)
+{
+	struct sigcontext __user *uctx;
+	struct mm_struct *prev_mm, *target_mm;
+
+	uctx = current->exec_mm->ctx;
+	current->exec_mm->ctx = NULL;
+
+	target_mm = current->exec_mm->mm;
+	current->exec_mm->mm = NULL;
+	prev_mm = current->mm;
+
+	swap_mm(prev_mm, target_mm);
+
+	mmput(prev_mm);
+	mmdrop(target_mm);
+
+	swap_vm_exec_context(uctx);
+}
+
+SYSCALL_DEFINE6(process_vm_exec, pid_t, pid, struct sigcontext __user *, uctx,
+		unsigned long, flags, siginfo_t __user *, uinfo,
+		sigset_t __user *, user_mask, size_t, sizemask)
+{
+	struct mm_struct *prev_mm, *mm;
+	struct task_struct *tsk;
+	long ret = -ESRCH;
+
+	sigset_t mask;
+
+	if (flags)
+		return -EINVAL;
+
+	if (sizemask != sizeof(sigset_t))
+		return -EINVAL;
+	if (copy_from_user(&mask, user_mask, sizeof(mask)))
+		return -EFAULT;
+
+	sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+	signotset(&mask);
+
+	tsk = find_get_task_by_vpid(pid);
+	if (!tsk) {
+		ret = -ESRCH;
+		goto err;
+	}
+	mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
+	put_task_struct(tsk);
+	if (!mm || IS_ERR(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto err;
+	}
+
+	current_pt_regs()->ax = 0;
+	ret = swap_vm_exec_context(uctx);
+	if (ret < 0)
+		goto err_mm_put;
+
+	if (!current->exec_mm) {
+		ret = -ENOMEM;
+		current->exec_mm = kmalloc(sizeof(*current->exec_mm), GFP_KERNEL);
+		if (current->exec_mm == NULL)
+			goto err_mm_put;
+	}
+	current->exec_mm->ctx = uctx;
+	current->exec_mm->mm = current->mm;
+	current->exec_mm->flags = flags;
+	current->exec_mm->sigmask = mask;
+	current->exec_mm->siginfo = uinfo;
+	prev_mm = current->mm;
+
+	mmgrab(prev_mm);
+	swap_mm(prev_mm, mm);
+
+	ret = current_pt_regs()->ax;
+
+	return ret;
+err_mm_put:
+	mmput(mm);
+err:
+	return ret;
+}
+
+void free_exec_mm_struct(struct task_struct *p)
+{
+	kfree(p->exec_mm);
+	p->exec_mm = NULL;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc269a20dd5f..51286c79062b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
 #include <linux/context_tracking.h>
 #include <linux/entry-common.h>
 #include <linux/syscalls.h>
+#include <linux/process_vm_exec.h>
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
@@ -816,6 +817,23 @@ void arch_do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
+#ifdef CONFIG_PROCESS_VM_EXEC
+	if (current->exec_mm && current->exec_mm->ctx) {
+		kernel_siginfo_t info;
+		int ret;
+
+		restore_vm_exec_context(current_pt_regs());
+
+		spin_lock_irq(&current->sighand->siglock);
+		ret = dequeue_signal(current, &current->exec_mm->sigmask, &info);
+		spin_unlock_irq(&current->sighand->siglock);
+
+		if (ret > 0)
+			ret = copy_siginfo_to_user(current->exec_mm->siginfo, &info);
+		regs->ax = ret;
+	}
+#endif
+
 	if (get_signal(&ksig)) {
 		/* Whee! Actually deliver the signal.  */
 		handle_signal(&ksig, regs);
@@ -896,3 +914,32 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
 	return 0;
 }
 #endif
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+long swap_vm_exec_context(struct sigcontext __user *uctx)
+{
+	struct sigcontext ctx = {};
+	sigset_t set = {};
+
+
+	if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
+		return -EFAULT;
+	/* A floating point state is managed from user-space. */
+	if (ctx.fpstate != 0)
+		return -EINVAL;
+	if (!user_access_begin(uctx, sizeof(*uctx)))
+		return -EFAULT;
+	unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
+	user_access_end();
+
+	if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
+		goto badframe;
+
+	return 0;
+Efault:
+	user_access_end();
+badframe:
+	signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
+	return -EFAULT;
+}
+#endif
diff --git a/include/linux/process_vm_exec.h b/include/linux/process_vm_exec.h
new file mode 100644
index 000000000000..a02535fbd5c8
--- /dev/null
+++ b/include/linux/process_vm_exec.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PROCESS_VM_EXEC_H
+#define _LINUX_PROCESS_VM_EXEC_H
+
+struct exec_mm {
+	struct sigcontext *ctx;
+	struct mm_struct *mm;
+	unsigned long flags;
+	sigset_t sigmask;
+	siginfo_t __user *siginfo;
+};
+
+void free_exec_mm_struct(struct task_struct *tsk);
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76cd21fa5501..864a8fdd0ed7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -64,6 +64,7 @@ struct signal_struct;
 struct task_delay_info;
 struct task_group;
 struct io_uring_task;
+struct exec_mm;
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -637,6 +638,8 @@ struct wake_q_node {
 	struct wake_q_node *next;
 };
 
+struct exec_mm;
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -757,6 +760,10 @@ struct task_struct {
 	struct mm_struct		*mm;
 	struct mm_struct		*active_mm;
 
+#ifdef CONFIG_PROCESS_VM_EXEC
+	struct exec_mm			*exec_mm;
+#endif
+
 	/* Per-thread vma caching: */
 	struct vmacache			vmacache;
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 37bea07c12f2..bdea75a14975 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1347,4 +1347,10 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
 		int __user *optlen);
 int __sys_setsockopt(int fd, int level, int optname, char __user *optval,
 		int optlen);
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+void restore_vm_exec_context(struct pt_regs *regs);
+#else
+static inline void restore_vm_exec_context(struct pt_regs *regs) {}
+#endif
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 2056318988f7..60acbd9cf511 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
 #define __NR_process_madvise 440
 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_process_vm_exec 441
+__SYSCALL(__NR_process_vm_exec, sys_process_vm_exec)
 
 #undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..61ca7a4a1130 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
 #include <linux/kasan.h>
 #include <linux/scs.h>
 #include <linux/io_uring.h>
+#include <linux/process_vm_exec.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -461,6 +462,9 @@ void free_task(struct task_struct *tsk)
 	arch_release_task_struct(tsk);
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
+#ifdef CONFIG_PROCESS_VM_EXEC
+	free_exec_mm_struct(tsk);
+#endif
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -943,6 +947,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 #ifdef CONFIG_MEMCG
 	tsk->active_memcg = NULL;
 #endif
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+	tsk->exec_mm = NULL;
+#endif
+
 	return tsk;
 
 free_stack:
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f27ac94d5fa7..2545a409bb07 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -350,6 +350,8 @@ COND_SYSCALL(pkey_mprotect);
 COND_SYSCALL(pkey_alloc);
 COND_SYSCALL(pkey_free);
 
+/* execute in another address space */
+COND_SYSCALL(process_vm_exec);
 
 /*
  * Architecture specific weak syscall entries.
-- 
2.29.2


