[CRIU] [PATCH 2/4] arch/x86: implement the process_vm_exec syscall
Andrei Vagin
avagin at gmail.com
Wed Apr 14 08:52:15 MSK 2021
This change introduces the new system call:
process_vm_exec(pid_t pid, struct sigcontext *uctx, unsigned long flags,
siginfo_t * uinfo, sigset_t *sigmask, size_t sizemask)
process_vm_exec allows executing the current process in the address
space of another process.
process_vm_exec swaps the current address space with the address space
of a specified process, sets a state from sigcontext, and resumes the
process. When the process receives a signal or calls a system call,
process_vm_exec saves the process state back to sigcontext, restores
the original address space, restores the original process state, and
returns to userspace.
If it was interrupted by a signal and the signal is in the user_mask,
the signal is dequeued and information about it is saved in uinfo.
If process_vm_exec is interrupted by a system call, a synthetic siginfo
for the SIGSYS signal is generated.
The behavior of this system call is similar to PTRACE_SYSEMU, but
everything happens in the context of one process, so
process_vm_exec shows better performance.
PTRACE_SYSEMU is primarily used to implement sandboxes (application
kernels) like User-mode Linux or gVisor. These types of sandboxes
intercept application system calls and act as the guest kernel.
A simple benchmark, in which a "tracee" process executes system calls
in a loop and a "tracer" process traps the syscalls and handles them by
just incrementing the tracee's instruction pointer to skip the syscall
instruction, shows that process_vm_exec works more than 5 times faster
than PTRACE_SYSEMU.
Signed-off-by: Andrei Vagin <avagin at gmail.com>
---
arch/Kconfig | 15 +++
arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 16 +++
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
arch/x86/include/asm/sigcontext.h | 2 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/process_vm_exec.c | 133 +++++++++++++++++++++++++
arch/x86/kernel/signal.c | 47 +++++++++
include/linux/process_vm_exec.h | 15 +++
include/linux/sched.h | 7 ++
include/linux/syscalls.h | 6 ++
include/uapi/asm-generic/unistd.h | 4 +-
kernel/fork.c | 9 ++
kernel/sys_ni.c | 2 +
14 files changed, 258 insertions(+), 1 deletion(-)
create mode 100644 arch/x86/kernel/process_vm_exec.c
create mode 100644 include/linux/process_vm_exec.h
diff --git a/arch/Kconfig b/arch/Kconfig
index ba4e966484ab..3ed9b8fb1727 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -514,6 +514,21 @@ config SECCOMP_FILTER
See Documentation/userspace-api/seccomp_filter.rst for details.
+config HAVE_ARCH_PROCESS_VM_EXEC
+ bool
+ help
+ An arch should select this symbol to support the process_vm_exec system call.
+
+config PROCESS_VM_EXEC
+ prompt "Enable the process_vm_exec syscall"
+ def_bool y
+ depends on HAVE_ARCH_PROCESS_VM_EXEC
+ help
+ process_vm_exec allows executing code and system calls in a specified
+ address space.
+
+ If unsure, say Y.
+
config HAVE_ARCH_STACKLEAK
bool
help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fbf26e0f7a6a..1c7ebb58865e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -27,6 +27,7 @@ config X86_64
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_SOFT_DIRTY
+ select HAVE_ARCH_PROCESS_VM_EXEC
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..42eac459b25b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@
#include <linux/nospec.h>
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#include <linux/process_vm_exec.h>
#ifdef CONFIG_XEN_PV
#include <xen/xen-ops.h>
@@ -38,6 +39,21 @@
#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
+#ifdef CONFIG_PROCESS_VM_EXEC
+ if (current->exec_mm && current->exec_mm->ctx) {
+ kernel_siginfo_t info = {
+ .si_signo = SIGSYS,
+ .si_call_addr = (void __user *)KSTK_EIP(current),
+ .si_arch = syscall_get_arch(current),
+ .si_syscall = nr,
+ };
+ restore_vm_exec_context(regs);
+ regs->ax = copy_siginfo_to_user(current->exec_mm->siginfo, &info);
+ syscall_exit_to_user_mode(regs);
+ return;
+ }
+#endif
+
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 379819244b91..2a8e27b2d87e 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -362,6 +362,7 @@
438 common pidfd_getfd sys_pidfd_getfd
439 common faccessat2 sys_faccessat2
440 common process_madvise sys_process_madvise
+441 64 process_vm_exec sys_process_vm_exec
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 140d890c2c98..e390410cc3e9 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -6,4 +6,6 @@
#include <uapi/asm/sigcontext.h>
+extern long swap_vm_exec_context(struct sigcontext __user *uctx);
+
#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..d053289fd19e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -163,3 +163,4 @@ ifeq ($(CONFIG_X86_64),y)
endif
obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
+obj-$(CONFIG_PROCESS_VM_EXEC) += process_vm_exec.o
diff --git a/arch/x86/kernel/process_vm_exec.c b/arch/x86/kernel/process_vm_exec.c
new file mode 100644
index 000000000000..28b32330f744
--- /dev/null
+++ b/arch/x86/kernel/process_vm_exec.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <asm/syscall.h>
+#include <asm/sigframe.h>
+#include <asm/signal.h>
+#include <asm/mmu_context.h>
+#include <asm/sigcontext.h>
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched/mm.h>
+#include <linux/syscalls.h>
+#include <linux/vmacache.h>
+#include <linux/process_vm_exec.h>
+
+/*
+ * Switch the calling task from prev_mm to target_mm in place.
+ * RSS counters are synced, the per-task VMA cache is invalidated, and
+ * the actual mm switch runs with IRQs disabled so a TLB-flush IPI
+ * cannot observe a half-updated task state.
+ *
+ * NOTE(review): this takes no reference on target_mm and drops none on
+ * prev_mm (other than the active_mm mmgrab/mmdrop pairing below) —
+ * callers own the mm references; verify both call sites stay balanced.
+ */
+static void swap_mm(struct mm_struct *prev_mm, struct mm_struct *target_mm)
+{
+ struct task_struct *tsk = current;
+ struct mm_struct *active_mm;
+
+ task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
+
+ sync_mm_rss(prev_mm);
+
+ vmacache_flush(tsk);
+
+ active_mm = tsk->active_mm;
+ if (active_mm != target_mm) {
+ /* Keep the new active_mm pinned until it stops being active. */
+ mmgrab(target_mm);
+ tsk->active_mm = target_mm;
+ }
+ tsk->mm = target_mm;
+ switch_mm_irqs_off(active_mm, target_mm, tsk);
+ local_irq_enable();
+ task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+ finish_arch_post_lock_switch();
+#endif
+
+ /* Drop the grab that kept the previous active_mm alive. */
+ if (active_mm != target_mm)
+ mmdrop(active_mm);
+}
+
+/*
+ * Undo a process_vm_exec() address-space swap: detach the saved
+ * sigcontext and mm from current->exec_mm, switch back to the task's
+ * original mm, drop the references taken by the syscall, and exchange
+ * the register state with the user's sigcontext again via
+ * swap_vm_exec_context().
+ *
+ * NOTE(review): the @regs argument is unused — state is taken from
+ * current_pt_regs() inside swap_vm_exec_context(); confirm whether the
+ * parameter should be dropped or used.
+ */
+void restore_vm_exec_context(struct pt_regs *regs)
+{
+ struct sigcontext __user *uctx;
+ struct mm_struct *prev_mm, *target_mm;
+
+ /* Clearing ->ctx marks the exec_mm slot as idle for reuse. */
+ uctx = current->exec_mm->ctx;
+ current->exec_mm->ctx = NULL;
+
+ target_mm = current->exec_mm->mm;
+ current->exec_mm->mm = NULL;
+ prev_mm = current->mm;
+
+ swap_mm(prev_mm, target_mm);
+
+ /* mmput pairs with mm_access(); mmdrop pairs with the syscall's mmgrab. */
+ mmput(prev_mm);
+ mmdrop(target_mm);
+
+ /* NOTE(review): return value of swap_vm_exec_context() is ignored here;
+  * a failing uctx write would go unnoticed — confirm intended. */
+ swap_vm_exec_context(uctx);
+}
+
+/*
+ * process_vm_exec - resume the calling task in another task's address
+ * space.
+ * @pid:       target task whose mm is entered; access is gated by
+ *             mm_access(PTRACE_MODE_ATTACH_REALCREDS).
+ * @uctx:      user sigcontext; the caller's registers are swapped with it.
+ * @flags:     must be 0.
+ * @uinfo:     where siginfo is written when execution is interrupted.
+ * @user_mask: signals allowed to interrupt execution (stored inverted).
+ * @sizemask:  must equal sizeof(sigset_t).
+ *
+ * On success the task returns to userspace running on the target mm
+ * with the state loaded from @uctx, until a syscall or signal triggers
+ * restore_vm_exec_context().
+ */
+SYSCALL_DEFINE6(process_vm_exec, pid_t, pid, struct sigcontext __user *, uctx,
+ unsigned long, flags, siginfo_t __user *, uinfo,
+ sigset_t __user *, user_mask, size_t, sizemask)
+{
+ struct mm_struct *prev_mm, *mm;
+ struct task_struct *tsk;
+ long ret = -ESRCH;
+
+ sigset_t mask;
+
+ if (flags)
+ return -EINVAL;
+
+ if (sizemask != sizeof(sigset_t))
+ return -EINVAL;
+ if (copy_from_user(&mask, user_mask, sizeof(mask)))
+ return -EFAULT;
+
+ /* SIGKILL/SIGSTOP can never be blocked from interrupting execution. */
+ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ /* Invert: dequeue_signal() later takes the set of *blocked* signals. */
+ signotset(&mask);
+
+ tsk = find_get_task_by_vpid(pid);
+ if (!tsk) {
+ ret = -ESRCH;
+ goto err;
+ }
+ mm = mm_access(tsk, PTRACE_MODE_ATTACH_REALCREDS);
+ put_task_struct(tsk);
+ if (!mm || IS_ERR(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto err;
+ }
+
+ /* Pre-set ax so the value saved into @uctx makes this syscall appear
+  * to return 0 when the original context is later restored. */
+ current_pt_regs()->ax = 0;
+ ret = swap_vm_exec_context(uctx);
+ if (ret < 0)
+ goto err_mm_put;
+ /* NOTE(review): if __restore_sigcontext() failed inside
+  * swap_vm_exec_context(), the caller's registers may already be
+  * partially overwritten on this error path — confirm acceptable. */
+
+ /* exec_mm is allocated once per task, reused across calls, and freed
+  * only in free_task() via free_exec_mm_struct(). */
+ if (!current->exec_mm) {
+ ret = -ENOMEM;
+ current->exec_mm = kmalloc(sizeof(*current->exec_mm), GFP_KERNEL);
+ if (current->exec_mm == NULL)
+ goto err_mm_put;
+ }
+ current->exec_mm->ctx = uctx;
+ current->exec_mm->mm = current->mm;
+ current->exec_mm->flags = flags;
+ current->exec_mm->sigmask = mask;
+ current->exec_mm->siginfo = uinfo;
+ prev_mm = current->mm;
+
+ /* Pin the original mm until restore_vm_exec_context() mmdrop()s it. */
+ mmgrab(prev_mm);
+ swap_mm(prev_mm, mm);
+
+ /* ax now holds the value loaded from @uctx by swap_vm_exec_context(). */
+ ret = current_pt_regs()->ax;
+
+ return ret;
+err_mm_put:
+ mmput(mm);
+err:
+ return ret;
+}
+
+/*
+ * Release the per-task process_vm_exec bookkeeping; called from
+ * free_task(). kfree(NULL) is a no-op, so this is safe for tasks that
+ * never used process_vm_exec().
+ */
+void free_exec_mm_struct(struct task_struct *p)
+{
+ kfree(p->exec_mm);
+ p->exec_mm = NULL;
+}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cc269a20dd5f..51286c79062b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -27,6 +27,7 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/syscalls.h>
+#include <linux/process_vm_exec.h>
#include <asm/processor.h>
#include <asm/ucontext.h>
@@ -816,6 +817,23 @@ void arch_do_signal(struct pt_regs *regs)
{
struct ksignal ksig;
+#ifdef CONFIG_PROCESS_VM_EXEC
+ if (current->exec_mm && current->exec_mm->ctx) {
+ kernel_siginfo_t info;
+ int ret;
+
+ restore_vm_exec_context(current_pt_regs());
+
+ spin_lock_irq(&current->sighand->siglock);
+ ret = dequeue_signal(current, &current->exec_mm->sigmask, &info);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ if (ret > 0)
+ ret = copy_siginfo_to_user(current->exec_mm->siginfo, &info);
+ regs->ax = ret;
+ }
+#endif
+
if (get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
@@ -896,3 +914,32 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
return 0;
}
#endif
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+/*
+ * Exchange register state with the user-supplied sigcontext: the
+ * registers currently in pt_regs are written into @uctx (with an empty
+ * signal mask), and the state previously stored in @uctx is loaded
+ * into pt_regs.
+ *
+ * Returns 0 on success, -EINVAL if @uctx carries an fpstate pointer,
+ * -EFAULT on a bad user pointer or restore failure.
+ *
+ * NOTE(review): on the badframe path @uctx has already been partially
+ * overwritten with the current registers — confirm callers tolerate
+ * that.
+ */
+long swap_vm_exec_context(struct sigcontext __user *uctx)
+{
+ struct sigcontext ctx = {};
+ sigset_t set = {};
+
+
+ if (copy_from_user(&ctx, uctx, CONTEXT_COPY_SIZE))
+ return -EFAULT;
+ /* A floating point state is managed from user-space. */
+ if (ctx.fpstate != 0)
+ return -EINVAL;
+ if (!user_access_begin(uctx, sizeof(*uctx)))
+ return -EFAULT;
+ /* Save the current registers into @uctx before loading ctx below. */
+ unsafe_put_sigcontext(uctx, NULL, current_pt_regs(), (&set), Efault);
+ user_access_end();
+
+ if (__restore_sigcontext(current_pt_regs(), &ctx, 0))
+ goto badframe;
+
+ return 0;
+Efault:
+ user_access_end();
+badframe:
+ signal_fault(current_pt_regs(), uctx, "swap_vm_exec_context");
+ return -EFAULT;
+}
+#endif
diff --git a/include/linux/process_vm_exec.h b/include/linux/process_vm_exec.h
new file mode 100644
index 000000000000..a02535fbd5c8
--- /dev/null
+++ b/include/linux/process_vm_exec.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PROCESS_VM_EXEC_H
+#define _LINUX_PROCESS_VM_EXEC_H
+
+struct exec_mm {
+ struct sigcontext *ctx;
+ struct mm_struct *mm;
+ unsigned long flags;
+ sigset_t sigmask;
+ siginfo_t __user *siginfo;
+};
+
+void free_exec_mm_struct(struct task_struct *tsk);
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76cd21fa5501..864a8fdd0ed7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -64,6 +64,7 @@ struct signal_struct;
struct task_delay_info;
struct task_group;
struct io_uring_task;
+struct exec_mm;
/*
* Task state bitmask. NOTE! These bits are also
@@ -637,6 +638,8 @@ struct wake_q_node {
struct wake_q_node *next;
};
+struct exec_mm;
+
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
@@ -757,6 +760,10 @@ struct task_struct {
struct mm_struct *mm;
struct mm_struct *active_mm;
+#ifdef CONFIG_PROCESS_VM_EXEC
+ struct exec_mm *exec_mm;
+#endif
+
/* Per-thread vma caching: */
struct vmacache vmacache;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 37bea07c12f2..bdea75a14975 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1347,4 +1347,10 @@ int __sys_getsockopt(int fd, int level, int optname, char __user *optval,
int __user *optlen);
int __sys_setsockopt(int fd, int level, int optname, char __user *optval,
int optlen);
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+void restore_vm_exec_context(struct pt_regs *regs);
+#else
+static inline void restore_vm_exec_context(struct pt_regs *regs) {}
+#endif
#endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 2056318988f7..60acbd9cf511 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
__SYSCALL(__NR_faccessat2, sys_faccessat2)
#define __NR_process_madvise 440
__SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_process_vm_exec 441
+__SYSCALL(__NR_process_vm_exec, sys_process_vm_exec)
#undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
/*
* 32 bit systems traditionally used different
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..61ca7a4a1130 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,6 +96,7 @@
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
+#include <linux/process_vm_exec.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -461,6 +462,9 @@ void free_task(struct task_struct *tsk)
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
+#ifdef CONFIG_PROCESS_VM_EXEC
+ free_exec_mm_struct(tsk);
+#endif
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -943,6 +947,11 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
+
+#ifdef CONFIG_PROCESS_VM_EXEC
+ tsk->exec_mm = NULL;
+#endif
+
return tsk;
free_stack:
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index f27ac94d5fa7..2545a409bb07 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -350,6 +350,8 @@ COND_SYSCALL(pkey_mprotect);
COND_SYSCALL(pkey_alloc);
COND_SYSCALL(pkey_free);
+/* execute in another address space */
+COND_SYSCALL(process_vm_exec);
/*
* Architecture specific weak syscall entries.
--
2.29.2
More information about the CRIU
mailing list