@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
 	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
 	jnz int_careful
-	andl $~TS_COMPAT,TI_status(%rcx)
-	jmp retint_swapgs
+	andl $~TS_COMPAT,TI_status(%rcx)
+	jmp syscall_return
 
 	/* Either reschedule or signal or syscall exit tracking needed. */
 	/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
+
+syscall_return:
+	/* The IRETQ could re-enable interrupts: */
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq RCX(%rsp),%rcx
+	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space. This essentially lets the user take over
+	 * the kernel, since userspace controls RSP. It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses. (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- SYSRET checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq R11(%rsp),%r11
+	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET. This would cause an infinite loop whenever #DB happens
+	 * with register state that satisfies the opportunistic SYSRET
+	 * conditions. For example, single-stepping this user code:
+	 *
+	 * movq $stuck_here,%rcx
+	 * pushfq
+	 * popq %r11
+	 * stuck_here:
+	 *
+	 * would never get past 'stuck_here'.
+	 */
+	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win! This label is here just for ease of understanding
+	 * perf profiles. Nothing jumps here.
+	 */
+syscall_return_via_sysret:
+	CFI_REMEMBER_STATE
+	/* r11 is already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_R11
+	movq RSP(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+	SWAPGS
+	jmp restore_c_regs_and_iret
 	CFI_ENDPROC
 END(system_call)
 
+
 	.macro FORK_LIKE func
ENTRY(stub_\func)
 	CFI_STARTPROC
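The canonical-address test added above relies on one arithmetic fact: shifting the saved RIP right by __VIRTUAL_MASK_SHIFT (47 here) leaves a non-zero value exactly when any of the 17 high bits is set, which covers both non-canonical and kernel addresses. As a rough illustration only (ordinary user-space C, not kernel code; the helper name and test values are made up for this sketch), the check behaves like this:

#include <stdint.h>
#include <stdio.h>

#define VIRTUAL_MASK_SHIFT 47	/* stand-in for __VIRTUAL_MASK_SHIFT */

/* Mirrors "shr $__VIRTUAL_MASK_SHIFT, %rcx; jnz opportunistic_sysret_failed":
 * the shifted value is zero only when bits 63..47 of the return RIP are clear. */
static int rip_ok_for_sysret(uint64_t rip)
{
	return (rip >> VIRTUAL_MASK_SHIFT) == 0;
}

int main(void)
{
	printf("%d\n", rip_ok_for_sysret(0x00007fffffffe000ULL)); /* user address:   1 */
	printf("%d\n", rip_ok_for_sysret(0x0000800000000000ULL)); /* non-canonical:  0 */
	printf("%d\n", rip_ok_for_sysret(0xffffffff81000000ULL)); /* kernel address: 0 */
	return 0;
}

Any return address that fails this test simply takes the opportunistic_sysret_failed path and goes back through IRET as before.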
@@ -673,76 +750,8 @@ retint_swapgs: /* return to user-space */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
 
-	/*
-	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
-	 */
-	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
-	 * in kernel space. This essentially lets the user take over
-	 * the kernel, since userspace controls RSP. It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses. (This will pessimize vsyscall=native.
-	 * Big deal.)
-	 *
-	 * If virtual addresses ever become wider, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 */
-	.ifne __VIRTUAL_MASK_SHIFT - 47
-	.error "virtual address width changed -- sysret checks need update"
-	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
-
-	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	movq R11(%rsp),%r11
-	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET. This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions. For example, single-stepping this user code:
-	 *
-	 * movq $stuck_here,%rcx
-	 * pushfq
-	 * popq %r11
-	 * stuck_here:
-	 *
-	 * would never get past 'stuck_here'.
-	 */
-	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz opportunistic_sysret_failed
-
-	/* nothing to check for RSP */
-
-	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * We win! This label is here just for ease of understanding
-	 * perf profiles. Nothing jumps here.
-	 */
-irq_return_via_sysret:
-	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
-	movq RSP(%rsp),%rsp
-	USERGS_SYSRET64
-	CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
 	SWAPGS
-	jmp restore_args
+	jmp restore_c_regs_and_iret
 
 /* Returning to kernel space */
 retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
 	 * The iretq could re-enable interrupts:
 	 */
 	TRACE_IRQS_IRETQ
-restore_args:
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 
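Taken together, the new syscall_return path boils down to a single eligibility decision: use SYSRET only when the saved user state is exactly what SYSRET would reproduce anyway. The following user-space C sketch models that decision under stated assumptions; the struct layout, constants and function name are illustrative stand-ins, not the kernel's pt_regs or segment definitions:

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stand-ins only. */
struct saved_regs {
	uint64_t rip, rcx, r11, rflags, cs, ss;
};

#define USER_CS_SEL		0x33		/* placeholder for __USER_CS */
#define USER_DS_SEL		0x2b		/* placeholder for __USER_DS */
#define FLAG_TF			(1ULL << 8)	/* X86_EFLAGS_TF */
#define FLAG_RF			(1ULL << 16)	/* X86_EFLAGS_RF */
#define VIRTUAL_MASK_SHIFT	47

/* True when the saved state is "clean" enough for SYSRET;
 * otherwise the kernel falls back to the fully general IRET path. */
bool can_use_sysret(const struct saved_regs *r)
{
	if (r->rcx != r->rip)			/* SYSRET reloads RIP from RCX */
		return false;
	if (r->rip >> VIRTUAL_MASK_SHIFT)	/* non-canonical or kernel RIP */
		return false;
	if (r->cs != USER_CS_SEL)		/* CS must match what SYSRET sets */
		return false;
	if (r->r11 != r->rflags)		/* SYSRET reloads RFLAGS from R11 */
		return false;
	if (r->rflags & (FLAG_TF | FLAG_RF))	/* avoid the #DB single-step loop */
		return false;
	if (r->ss != USER_DS_SEL)		/* SS must match what SYSRET sets */
		return false;
	return true;				/* RSP needs no check */
}

Whenever this predicate fails, the assembly falls through to opportunistic_sysret_failed and returns via IRET, so the fast path changes only speed, not observable behaviour.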