[Devel] [PATCH RHEL7 COMMIT] ms/x86_64, traps: Rework bad_iret

Tue Oct 16 15:29:42 MSK 2018

The commit is pushed to "branch-rh7-3.10.0-862.14.4.vz7.72.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-862.14.4.vz7.72.8
------>
commit 6672fb25dd422a40bb377708a2b0bf464cb771fd
Author: Andy Lutomirski <luto at amacapital.net>
Date:   Tue Oct 16 15:29:38 2018 +0300

    ms/x86_64, traps: Rework bad_iret
    
    It's possible for iretq to userspace to fail.  This can happen because
    of a bad CS, SS, or RIP.
    
    Historically, we've handled it by fixing up an exception from iretq to
    land at bad_iret, which pretends that the failed iret frame was really
    the hardware part of #GP(0) from userspace.  To make this work, there's
    an extra fixup to fudge the gs base into a usable state.
    
    This is suboptimal because it loses the original exception.  It's also
    buggy because there's no guarantee that we were on the kernel stack to
    begin with.  For example, if the failing iret happened on return from an
    NMI, then we'll end up executing general_protection on the NMI stack.
    This is bad for several reasons, the most immediate of which is that
    general_protection, as a non-paranoid idtentry, will try to deliver
    signals and/or schedule from the wrong stack.
    
    This patch throws out bad_iret entirely.  As a replacement, it augments
    the existing swapgs fudge into a full-blown iret fixup, mostly written
    in C.  It's should be clearer and more correct.
    
    Signed-off-by: Andy Lutomirski <luto at amacapital.net>
    Reviewed-by: Thomas Gleixner <tglx at linutronix.de>
    Cc: stable at vger.kernel.org
    Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
    
    https://jira.sw.ru/browse/PSBM-89221
    (cherry picked from commit b645af2d5905c4e32399005b867987919cbfc3ae)
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 arch/x86/kernel/entry_64.S | 49 ++++++++++++----------------------------------
 arch/x86/kernel/traps.c    | 29 +++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index edca3b75d1ef..173b765483d3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -935,42 +935,12 @@ retint_restore_args:	/* return to kernel space */
 
 irq_return:
 	INTERRUPT_RETURN
-	_ASM_EXTABLE(irq_return, bad_iret)
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iretq
-	_ASM_EXTABLE(native_iret, bad_iret)
 #endif
 
-	.section .fixup,"ax"
-bad_iret:
-	/*
-	 * The iret traps when the %cs or %ss being restored is bogus.
-	 * We've lost the original trap vector and error code.
-	 * #GPF is the most likely one to get for an invalid selector.
-	 * So pretend we completed the iret and took the #GPF in user mode.
-	 *
-	 * We are now running with the kernel GS after exception recovery.
-	 * But error_entry expects us to have user GS to match the user %cs,
-	 * so swap back.
-	 */
-	pushq $0
-
-	/*
-	 * If a kernel bug clears user CS bit and in turn we'll skip SWAPGS in
-	 * general_protection, skip the SWAPGS here as well so we won't hard reboot.
-	 * This increases robustness of bad_iret to kernel bugs as well.
-	 */
-	testl $3, 8*2(%rsp)
-	je 1f
-	SWAPGS
-1:
-
-	jmp general_protection
-
-	.previous
-
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
@@ -1554,16 +1524,15 @@ ENTRY(error_entry)
 
 /*
  * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. The exception handlers after iret run with
- * kernel gs again, so don't set the user space flag. B stepping K8s
- * sometimes report an truncated RIP for IRET exceptions returning to
- * compat mode. Check for these here too.
+ * usergs. Handle them here.  B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
  */
 error_kernelspace:
 	incl %ebx
 	leaq irq_return(%rip),%rcx
 	cmpq %rcx,RIP+8(%rsp)
-	je error_swapgs
+	je error_bad_iret
 	movl %ecx,%eax	/* zero extend */
 	cmpq %rax,RIP+8(%rsp)
 	je bstep_iret
@@ -1574,7 +1543,15 @@ ENTRY(error_entry)
 bstep_iret:
 	/* Fix truncated RIP */
 	movq %rcx,RIP+8(%rsp)
-	jmp error_swapgs
+	/* fall through */
+
+error_bad_iret:
+	SWAPGS
+	mov %rsp,%rdi
+	call fixup_bad_iret
+	mov %rax,%rsp
+	decl %ebx	/* Return to usergs */
+	jmp error_sti
 	CFI_ENDPROC
 END(error_entry)
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 6b4dd819ef72..ec2bafb12afa 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -463,6 +463,35 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
 		*regs = *eregs;
 	return regs;
 }
+
+struct bad_iret_stack {
+	void *error_entry_ret;
+	struct pt_regs regs;
+};
+
+asmlinkage __visible
+struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
+{
+	/*
+	 * This is called from entry_64.S early in handling a fault
+	 * caused by a bad iret to user mode.  To handle the fault
+	 * correctly, we want move our stack frame to task_pt_regs
+	 * and we want to pretend that the exception came from the
+	 * iret target.
+	 */
+	struct bad_iret_stack *new_stack =
+		container_of(task_pt_regs(current),
+			     struct bad_iret_stack, regs);
+
+	/* Copy the IRET target to the new stack. */
+	memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
+
+	/* Copy the remainder of the stack from the current stack. */
+	memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
+
+	BUG_ON(!user_mode_vm(&new_stack->regs));
+	return new_stack;
+}
 #endif
 
 /*