[CRIU] Cleanup do_full_int80()

Nicolas Viennot Nicolas.Viennot at twosigma.com
Mon Sep 30 23:57:08 MSK 2019


1) Instead of overwriting the nr field of its argument with the result,
do_full_int80() now returns the return value of the system call. It also
no longer copies all registers back into the syscall_args32 argument
after the syscall.

2) Additionally, the registers r12-r15 were added to the list of
clobbers, as kernels older than v4.4 do not preserve them across int80.

3) Further, GCC uses the 128-byte red-zone defined in the x86_64 ABI and
may therefore leave the %rsp register unadjusted in leaf functions, so
pushing %rbp onto the stack could overwrite local variables stored in the
red-zone. We now avoid tampering with the red-zone, fixing a SIGSEGV when
running mmap_bug_test() in debug mode (DEBUG=1).

Signed-off-by: Nicolas Viennot <Nicolas.Viennot at twosigma.com>
---
 criu/arch/x86/crtools.c            |  6 ++--
 criu/arch/x86/include/asm/compat.h | 51 ++++++++++++++++++++----------
 criu/arch/x86/kerndat.c            |  4 +--
 criu/arch/x86/restorer.c           |  3 +-
 criu/arch/x86/sigaction_compat.c   |  6 +---
 5 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
index efc23e5f..e4073c27 100644
--- a/criu/arch/x86/crtools.c
+++ b/criu/arch/x86/crtools.c
@@ -590,8 +590,7 @@ static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len)
 		.arg2	= (uint32_t)len,
 	};
 
-	do_full_int80(&s);
-	return (int)s.nr;
+	return do_full_int80(&s);
 }
 
 static int set_robust_list32(uint32_t head, uint32_t len)
@@ -602,8 +601,7 @@ static int set_robust_list32(uint32_t head, uint32_t len)
 		.arg1	= len,
 	};
 
-	do_full_int80(&s);
-	return (int)s.nr;
+	return do_full_int80(&s);
 }
 
 int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info)
diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h
index cd1ae472..acd552fb 100644
--- a/criu/arch/x86/include/asm/compat.h
+++ b/criu/arch/x86/include/asm/compat.h
@@ -38,26 +38,45 @@ struct syscall_args32 {
 	uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
 };
 
-static inline void do_full_int80(struct syscall_args32 *args)
+static inline uint32_t do_full_int80(struct syscall_args32 *args)
 {
 	/*
-	 * r8-r11 registers are cleared during returning to userspace
-	 * from syscall - that's x86_64 ABI to avoid leaking kernel
-	 * pointers.
+	 * Kernel older than v4.4 do not preserve r8-r15 registers when
+	 * invoking int80, so we need to preserve them.
 	 *
-	 * Other than that - we can't use %rbp in clobbers as GCC's inline
-	 * assembly doesn't allow to do so. So, here is explicitly saving
-	 * %rbp before syscall and restoring it's value afterward.
+	 * Additionally, %rbp is used as the 6th syscall argument, and we need
+	 * to preserve its value when returning from the syscall to avoid
+	 * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers
+	 * due to a GCC limitation. Instead, we explicitly save %rbp on the
+	 * stack before invoking the syscall and restore its value afterward.
+	 *
+	 * Further, GCC may not adjust the %rsp pointer when allocating the
+	 * args and ret variables because 1) do_full_int80() is a leaf
+	 * function, and 2) the local variables (args and ret) are in the
+	 * 128-byte red-zone as defined in the x86_64 ABI. To use the stack
+	 * when preserving %rbp, we must either tell GCC to a) mark the
+	 * function as non-leaf, or b) move away from the red-zone when using
+	 * the stack. It seems that there is no easy way to do a), so we'll go
+	 * with b).
+	 * Note 1: Another workaround would have been to add %rsp in the list
+	 * of clobbers, but this was deprecated in GCC 9.
+	 * Note 2: This red-zone bug only manifests when compiling CRIU with
+	 * DEBUG=1.
 	 */
-	asm volatile ("pushq %%rbp\n\t"
-			"mov %6, %%ebp\n\t"
-			"int $0x80\n\t"
-			"mov %%ebp, %6\n\t"
-			"popq %%rbp\n\t"
-		      : "+a" (args->nr),
-			"+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
-			"+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5)
-			: : "r8", "r9", "r10", "r11");
+	uint32_t ret;
+
+	asm volatile ("sub $128, %%rsp\n\t"
+		      "pushq %%rbp\n\t"
+		      "mov %7, %%ebp\n\t"
+		      "int $0x80\n\t"
+		      "popq %%rbp\n\t"
+		      "add $128, %%rsp\n\t"
+		      : "=a" (ret)
+		      : "a" (args->nr),
+			"b" (args->arg0), "c" (args->arg1), "d" (args->arg2),
+			"S" (args->arg3), "D" (args->arg4), "g" (args->arg5)
+		      : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15");
+	return ret;
 }
 
 #ifndef CR_NOGLIBC
diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c
index f7593251..94c954e1 100644
--- a/criu/arch/x86/kerndat.c
+++ b/criu/arch/x86/kerndat.c
@@ -75,9 +75,7 @@ void *mmap_ia32(void *addr, size_t len, int prot,
 	s.arg4  = fildes;
 	s.arg5  = (uint32_t)off;
 
-	do_full_int80(&s);
-
-	return (void *)(uintptr_t)s.nr;
+	return (void *)(uintptr_t)do_full_int80(&s);
 }
 
 /*
diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c
index 2d335d5e..b2c3b366 100644
--- a/criu/arch/x86/restorer.c
+++ b/criu/arch/x86/restorer.c
@@ -54,8 +54,7 @@ int set_compat_robust_list(uint32_t head_ptr, uint32_t len)
 		.arg1	= len,
 	};
 
-	do_full_int80(&s);
-	return (int)s.nr;
+	return do_full_int80(&s);
 }
 
 static int prepare_stack32(void **stack32)
diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c
index b38ba801..f467da49 100644
--- a/criu/arch/x86/sigaction_compat.c
+++ b/criu/arch/x86/sigaction_compat.c
@@ -28,7 +28,6 @@ extern char restore_rt_sigaction;
  */
 int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
 {
-	int ret;
 	struct syscall_args32 arg = {};
 	unsigned long act_stack = (unsigned long)stack32;
 
@@ -49,8 +48,5 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act)
 	arg.arg2	= 0;					/* oldact */
 	arg.arg3	= (uint32_t)sizeof(act->rt_sa_mask);	/* sigsetsize */
 
-	do_full_int80(&arg);
-	asm volatile ("\t movl %%eax,%0\n" : "=r"(ret));
-	return ret;
+	return do_full_int80(&arg);
 }
-
-- 
2.19.1




More information about the CRIU mailing list