
Merge commit 'upstream-x86-entry' into WIP.x86/mm

Pull in a minimal set of v4.15 entry code changes, for a base for the MM isolation patches.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar, 7 years ago
Parent
Current commit 0fd2e9c53d
55 changed files with 579 additions and 361 deletions
  1. Documentation/x86/orc-unwinder.txt (+1 -1)
  2. Makefile (+2 -2)
  3. arch/x86/Kconfig (+1 -1)
  4. arch/x86/Kconfig.debug (+20 -19)
  5. arch/x86/configs/tiny.config (+2 -2)
  6. arch/x86/configs/x86_64_defconfig (+1 -0)
  7. arch/x86/entry/calling.h (+19 -50)
  8. arch/x86/entry/entry_64.S (+83 -58)
  9. arch/x86/entry/entry_64_compat.S (+1 -2)
  10. arch/x86/include/asm/archrandom.h (+4 -4)
  11. arch/x86/include/asm/bitops.h (+5 -5)
  12. arch/x86/include/asm/compat.h (+1 -0)
  13. arch/x86/include/asm/cpufeature.h (+4 -5)
  14. arch/x86/include/asm/cpufeatures.h (+11 -0)
  15. arch/x86/include/asm/module.h (+1 -1)
  16. arch/x86/include/asm/paravirt.h (+2 -3)
  17. arch/x86/include/asm/paravirt_types.h (+1 -1)
  18. arch/x86/include/asm/percpu.h (+1 -1)
  19. arch/x86/include/asm/processor.h (+21 -31)
  20. arch/x86/include/asm/ptrace.h (+5 -1)
  21. arch/x86/include/asm/rmwcc.h (+1 -1)
  22. arch/x86/include/asm/switch_to.h (+24 -0)
  23. arch/x86/include/asm/syscalls.h (+1 -1)
  24. arch/x86/include/asm/trace/fpu.h (+0 -10)
  25. arch/x86/include/asm/traps.h (+19 -1)
  26. arch/x86/include/asm/unwind.h (+4 -4)
  27. arch/x86/include/uapi/asm/processor-flags.h (+3 -0)
  28. arch/x86/kernel/Makefile (+3 -4)
  29. arch/x86/kernel/cpu/Makefile (+1 -0)
  30. arch/x86/kernel/cpu/common.c (+17 -12)
  31. arch/x86/kernel/cpu/cpuid-deps.c (+125 -0)
  32. arch/x86/kernel/fpu/init.c (+11 -0)
  33. arch/x86/kernel/fpu/xstate.c (+23 -20)
  34. arch/x86/kernel/head_32.S (+1 -4)
  35. arch/x86/kernel/head_64.S (+17 -17)
  36. arch/x86/kernel/ldt.c (+13 -3)
  37. arch/x86/kernel/process.c (+7 -1)
  38. arch/x86/kernel/process_32.c (+4 -2)
  39. arch/x86/kernel/process_64.c (+2 -3)
  40. arch/x86/kernel/smpboot.c (+1 -2)
  41. arch/x86/kernel/traps.c (+1 -2)
  42. arch/x86/kernel/verify_cpu.S (+2 -1)
  43. arch/x86/kernel/vm86_32.c (+11 -9)
  44. arch/x86/mm/fault.c (+34 -54)
  45. arch/x86/um/ldt.c (+5 -2)
  46. arch/x86/xen/enlighten_pv.c (+4 -5)
  47. arch/x86/xen/smp_pv.c (+14 -3)
  48. arch/x86/xen/xen-asm_64.S (+1 -1)
  49. arch/x86/xen/xen-head.S (+8 -3)
  50. include/asm-generic/vmlinux.lds.h (+1 -1)
  51. include/linux/bitops.h (+26 -0)
  52. lib/Kconfig.debug (+1 -1)
  53. scripts/Makefile.build (+1 -1)
  54. tools/objtool/check.c (+5 -2)
  55. tools/objtool/objtool.c (+2 -4)

+ 1 - 1
Documentation/x86/orc-unwinder.txt

@@ -4,7 +4,7 @@ ORC unwinder
 Overview
 --------
 
-The kernel CONFIG_ORC_UNWINDER option enables the ORC unwinder, which is
+The kernel CONFIG_UNWINDER_ORC option enables the ORC unwinder, which is
 similar in concept to a DWARF unwinder.  The difference is that the
 format of the ORC data is much simpler than DWARF, which in turn allows
 the ORC unwinder to be much simpler and faster.

+ 2 - 2
Makefile

@@ -934,8 +934,8 @@ ifdef CONFIG_STACK_VALIDATION
   ifeq ($(has_libelf),1)
     objtool_target := tools/objtool FORCE
   else
-    ifdef CONFIG_ORC_UNWINDER
-      $(error "Cannot generate ORC metadata for CONFIG_ORC_UNWINDER=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
+    ifdef CONFIG_UNWINDER_ORC
+      $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
     else
       $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel")
     endif

+ 1 - 1
arch/x86/Kconfig

@@ -171,7 +171,7 @@ config X86
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
+	select HAVE_RELIABLE_STACKTRACE		if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK

+ 20 - 19
arch/x86/Kconfig.debug

@@ -359,28 +359,14 @@ config PUNIT_ATOM_DEBUG
 
 choice
 	prompt "Choose kernel unwinder"
-	default FRAME_POINTER_UNWINDER
+	default UNWINDER_ORC if X86_64
+	default UNWINDER_FRAME_POINTER if X86_32
 	---help---
 	  This determines which method will be used for unwinding kernel stack
 	  traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
 	  livepatch, lockdep, and more.
 
-config FRAME_POINTER_UNWINDER
-	bool "Frame pointer unwinder"
-	select FRAME_POINTER
-	---help---
-	  This option enables the frame pointer unwinder for unwinding kernel
-	  stack traces.
-
-	  The unwinder itself is fast and it uses less RAM than the ORC
-	  unwinder, but the kernel text size will grow by ~3% and the kernel's
-	  overall performance will degrade by roughly 5-10%.
-
-	  This option is recommended if you want to use the livepatch
-	  consistency model, as this is currently the only way to get a
-	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
-
-config ORC_UNWINDER
+config UNWINDER_ORC
 	bool "ORC unwinder"
 	depends on X86_64
 	select STACK_VALIDATION
@@ -396,7 +382,22 @@ config ORC_UNWINDER
 	  Enabling this option will increase the kernel's runtime memory usage
 	  by roughly 2-4MB, depending on your kernel config.
 
-config GUESS_UNWINDER
+config UNWINDER_FRAME_POINTER
+	bool "Frame pointer unwinder"
+	select FRAME_POINTER
+	---help---
+	  This option enables the frame pointer unwinder for unwinding kernel
+	  stack traces.
+
+	  The unwinder itself is fast and it uses less RAM than the ORC
+	  unwinder, but the kernel text size will grow by ~3% and the kernel's
+	  overall performance will degrade by roughly 5-10%.
+
+	  This option is recommended if you want to use the livepatch
+	  consistency model, as this is currently the only way to get a
+	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config UNWINDER_GUESS
 	bool "Guess unwinder"
 	depends on EXPERT
 	---help---
@@ -411,7 +412,7 @@ config GUESS_UNWINDER
 endchoice
 
 config FRAME_POINTER
-	depends on !ORC_UNWINDER && !GUESS_UNWINDER
+	depends on !UNWINDER_ORC && !UNWINDER_GUESS
 	bool
 
 endmenu

+ 2 - 2
arch/x86/configs/tiny.config

@@ -1,5 +1,5 @@
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
 # CONFIG_HIGHMEM64G is not set
-CONFIG_GUESS_UNWINDER=y
-# CONFIG_FRAME_POINTER_UNWINDER is not set
+CONFIG_UNWINDER_GUESS=y
+# CONFIG_UNWINDER_FRAME_POINTER is not set

+ 1 - 0
arch/x86/configs/x86_64_defconfig

@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_BOOT_PARAMS=y
 CONFIG_OPTIMIZE_INLINING=y
+CONFIG_UNWINDER_ORC=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_SELINUX=y

+ 19 - 50
arch/x86/entry/calling.h

@@ -142,56 +142,25 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
-	.macro RESTORE_EXTRA_REGS offset=0
-	movq 0*8+\offset(%rsp), %r15
-	movq 1*8+\offset(%rsp), %r14
-	movq 2*8+\offset(%rsp), %r13
-	movq 3*8+\offset(%rsp), %r12
-	movq 4*8+\offset(%rsp), %rbp
-	movq 5*8+\offset(%rsp), %rbx
-	UNWIND_HINT_REGS offset=\offset extra=0
-	.endm
-
-	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
-	.if \rstor_r11
-	movq 6*8(%rsp), %r11
-	.endif
-	.if \rstor_r8910
-	movq 7*8(%rsp), %r10
-	movq 8*8(%rsp), %r9
-	movq 9*8(%rsp), %r8
-	.endif
-	.if \rstor_rax
-	movq 10*8(%rsp), %rax
-	.endif
-	.if \rstor_rcx
-	movq 11*8(%rsp), %rcx
-	.endif
-	.if \rstor_rdx
-	movq 12*8(%rsp), %rdx
-	.endif
-	movq 13*8(%rsp), %rsi
-	movq 14*8(%rsp), %rdi
-	UNWIND_HINT_IRET_REGS offset=16*8
-	.endm
-	.macro RESTORE_C_REGS
-	RESTORE_C_REGS_HELPER 1,1,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RAX
-	RESTORE_C_REGS_HELPER 0,1,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RCX
-	RESTORE_C_REGS_HELPER 1,0,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_R11
-	RESTORE_C_REGS_HELPER 1,1,0,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
-	RESTORE_C_REGS_HELPER 1,0,0,1,1
-	.endm
-
-	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
-	subq $-(15*8+\addskip), %rsp
+	.macro POP_EXTRA_REGS
+	popq %r15
+	popq %r14
+	popq %r13
+	popq %r12
+	popq %rbp
+	popq %rbx
+	.endm
+
+	.macro POP_C_REGS
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rax
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
 	.endm
 	.endm
 
 	.macro icebp

+ 83 - 58
arch/x86/entry/entry_64.S

@@ -221,10 +221,9 @@ entry_SYSCALL_64_fastpath:
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
 	movq	EFLAGS(%rsp), %r11
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RSP(%rsp), %rsp
+	addq	$6*8, %rsp	/* skip extra regs -- they were preserved */
 	UNWIND_HINT_EMPTY
 	UNWIND_HINT_EMPTY
-	USERGS_SYSRET64
+	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret
 
 
 1:
 1:
 	/*
 	/*
@@ -246,17 +245,18 @@ entry_SYSCALL64_slow_path:
 	call	do_syscall_64		/* returns with IRQs disabled */
 	call	do_syscall_64		/* returns with IRQs disabled */
 
 
 return_from_SYSCALL_64:
 return_from_SYSCALL_64:
-	RESTORE_EXTRA_REGS
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 
 	/*
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
+	 * a completely clean 64-bit userspace context.  If we're not,
+	 * go to the slow exit path.
 	 */
 	 */
 	movq	RCX(%rsp), %rcx
 	movq	RCX(%rsp), %rcx
 	movq	RIP(%rsp), %r11
 	movq	RIP(%rsp), %r11
-	cmpq	%rcx, %r11			/* RCX == RIP */
-	jne	opportunistic_sysret_failed
+
+	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 
 	/*
 	/*
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -274,14 +274,14 @@ return_from_SYSCALL_64:
 
 
 	/* If this changed %rcx, it was not canonical */
 	/* If this changed %rcx, it was not canonical */
 	cmpq	%rcx, %r11
 	cmpq	%rcx, %r11
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 
 	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
 	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 
 	movq	R11(%rsp), %r11
 	movq	R11(%rsp), %r11
 	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
 	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 
 	/*
 	/*
 	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
 	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
@@ -302,12 +302,12 @@ return_from_SYSCALL_64:
 	 * would never get past 'stuck_here'.
 	 * would never get past 'stuck_here'.
 	 */
 	 */
 	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
 	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz	opportunistic_sysret_failed
+	jnz	swapgs_restore_regs_and_return_to_usermode
 
 
 	/* nothing to check for RSP */
 	/* nothing to check for RSP */
 
 
 	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
 	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 
 	/*
 	/*
 	 * We win! This label is here just for ease of understanding
 	 * We win! This label is here just for ease of understanding
@@ -315,14 +315,20 @@ return_from_SYSCALL_64:
 	 */
 	 */
 syscall_return_via_sysret:
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	/* rcx and r11 are already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RSP(%rsp), %rsp
 	UNWIND_HINT_EMPTY
 	UNWIND_HINT_EMPTY
+	POP_EXTRA_REGS
+.Lpop_c_regs_except_rcx_r11_and_sysret:
+	popq	%rsi	/* skip r11 */
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rsi	/* skip rcx */
+	popq	%rdx
+	popq	%rsi
+	popq	%rdi
+	movq	RSP-ORIG_RAX(%rsp), %rsp
 	USERGS_SYSRET64
 	USERGS_SYSRET64
-
-opportunistic_sysret_failed:
-	SWAPGS
-	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
 END(entry_SYSCALL_64)
 
 
 ENTRY(stub_ptregs_64)
 ENTRY(stub_ptregs_64)
@@ -423,8 +429,7 @@ ENTRY(ret_from_fork)
 	movq	%rsp, %rdi
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 
 
 1:
 1:
 	/* kernel thread */
 	/* kernel thread */
@@ -612,8 +617,21 @@ GLOBAL(retint_user)
 	mov	%rsp,%rdi
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
+
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+#ifdef CONFIG_DEBUG_ENTRY
+	/* Assert that pt_regs indicates user mode. */
+	testb	$3, CS(%rsp)
+	jnz	1f
+	ud2
+1:
+#endif
 	SWAPGS
 	SWAPGS
-	jmp	restore_regs_and_iret
+	POP_EXTRA_REGS
+	POP_C_REGS
+	addq	$8, %rsp	/* skip regs->orig_ax */
+	INTERRUPT_RETURN
+
 
 
 /* Returning to kernel space */
 /* Returning to kernel space */
 retint_kernel:
 retint_kernel:
@@ -633,15 +651,17 @@ retint_kernel:
 	 */
 	 */
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
 
 
-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-GLOBAL(restore_regs_and_iret)
-	RESTORE_EXTRA_REGS
-restore_c_regs_and_iret:
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
+GLOBAL(restore_regs_and_return_to_kernel)
+#ifdef CONFIG_DEBUG_ENTRY
+	/* Assert that pt_regs indicates kernel mode. */
+	testb	$3, CS(%rsp)
+	jz	1f
+	ud2
+1:
+#endif
+	POP_EXTRA_REGS
+	POP_C_REGS
+	addq	$8, %rsp	/* skip regs->orig_ax */
 	INTERRUPT_RETURN
 	INTERRUPT_RETURN
 
 
 ENTRY(native_iret)
 ENTRY(native_iret)
@@ -818,7 +838,7 @@ ENTRY(\sym)
 
 
 	ASM_CLAC
 	ASM_CLAC
 
 
-	.ifeq \has_error_code
+	.if \has_error_code == 0
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 	.endif
 
 
@@ -1059,6 +1079,7 @@ idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
 idtentry stack_segment		do_stack_segment	has_error_code=1
 idtentry stack_segment		do_stack_segment	has_error_code=1
 
 
 #ifdef CONFIG_XEN
 #ifdef CONFIG_XEN
+idtentry xennmi			do_nmi			has_error_code=0
 idtentry xendebug		do_debug		has_error_code=0
 idtentry xendebug		do_debug		has_error_code=0
 idtentry xenint3		do_int3			has_error_code=0
 idtentry xenint3		do_int3			has_error_code=0
 #endif
 #endif
@@ -1112,17 +1133,14 @@ ENTRY(paranoid_exit)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
 	TRACE_IRQS_OFF_DEBUG
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
-	jnz	paranoid_exit_no_swapgs
+	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
 	SWAPGS_UNSAFE_STACK
-	jmp	paranoid_exit_restore
-paranoid_exit_no_swapgs:
+	jmp	.Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
 	TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
-	INTERRUPT_RETURN
+.Lparanoid_exit_restore:
+	jmp restore_regs_and_return_to_kernel
 END(paranoid_exit)
 END(paranoid_exit)
 
 
 /*
 /*
@@ -1223,10 +1241,13 @@ ENTRY(error_exit)
 	jmp	retint_user
 	jmp	retint_user
 END(error_exit)
 END(error_exit)
 
 
-/* Runs on exception stack */
-/* XXX: broken on Xen PV */
+/*
+ * Runs on exception stack.  Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ */
 ENTRY(nmi)
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
 	UNWIND_HINT_IRET_REGS
+
 	/*
 	/*
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
 	 * the iretq it performs will take us out of NMI context.
 	 * the iretq it performs will take us out of NMI context.
@@ -1284,7 +1305,7 @@ ENTRY(nmi)
 	 * stacks lest we corrupt the "NMI executing" variable.
 	 * stacks lest we corrupt the "NMI executing" variable.
 	 */
 	 */
 
 
-	SWAPGS_UNSAFE_STACK
+	swapgs
 	cld
 	cld
 	movq	%rsp, %rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1328,8 +1349,7 @@ ENTRY(nmi)
 	 * Return back to user mode.  We must *not* do the normal exit
 	 * Return back to user mode.  We must *not* do the normal exit
 	 * work, because we don't want to enable interrupts.
 	 * work, because we don't want to enable interrupts.
 	 */
 	 */
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 
 
 .Lnmi_from_kernel:
 .Lnmi_from_kernel:
 	/*
 	/*
@@ -1450,7 +1470,7 @@ nested_nmi_out:
 	popq	%rdx
 	popq	%rdx
 
 
 	/* We are returning to kernel mode, so this cannot result in a fault. */
 	/* We are returning to kernel mode, so this cannot result in a fault. */
-	INTERRUPT_RETURN
+	iretq
 
 
 first_nmi:
 first_nmi:
 	/* Restore rdx. */
 	/* Restore rdx. */
@@ -1481,7 +1501,7 @@ first_nmi:
 	pushfq			/* RFLAGS */
 	pushfq			/* RFLAGS */
 	pushq	$__KERNEL_CS	/* CS */
 	pushq	$__KERNEL_CS	/* CS */
 	pushq	$1f		/* RIP */
 	pushq	$1f		/* RIP */
-	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+	iretq			/* continues at repeat_nmi below */
 	UNWIND_HINT_IRET_REGS
 	UNWIND_HINT_IRET_REGS
 1:
 1:
 #endif
 #endif
@@ -1544,29 +1564,34 @@ end_repeat_nmi:
 nmi_swapgs:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 nmi_restore:
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
+	POP_EXTRA_REGS
+	POP_C_REGS
 
 
-	/* Point RSP at the "iret" frame. */
-	REMOVE_PT_GPREGS_FROM_STACK 6*8
+	/*
+	 * Skip orig_ax and the "outermost" frame to point RSP at the "iret"
+	 * at the "iret" frame.
+	 */
+	addq	$6*8, %rsp
 
 
 	/*
 	/*
 	 * Clear "NMI executing".  Set DF first so that we can easily
 	 * Clear "NMI executing".  Set DF first so that we can easily
 	 * distinguish the remaining code between here and IRET from
 	 * distinguish the remaining code between here and IRET from
-	 * the SYSCALL entry and exit paths.  On a native kernel, we
-	 * could just inspect RIP, but, on paravirt kernels,
-	 * INTERRUPT_RETURN can translate into a jump into a
-	 * hypercall page.
+	 * the SYSCALL entry and exit paths.
+	 *
+	 * We arguably should just inspect RIP instead, but I (Andy) wrote
+	 * this code when I had the misapprehension that Xen PV supported
+	 * NMIs, and Xen PV would break that approach.
 	 */
 	 */
 	std
 	std
 	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
 	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
 
 
 	/*
 	/*
-	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
-	 * stack in a single instruction.  We are returning to kernel
-	 * mode, so this cannot result in a fault.
+	 * iretq reads the "iret" frame and exits the NMI stack in a
+	 * single instruction.  We are returning to kernel mode, so this
+	 * cannot result in a fault.  Similarly, we don't need to worry
+	 * about espfix64 on the way back to kernel mode.
 	 */
 	 */
-	INTERRUPT_RETURN
+	iretq
 END(nmi)
 END(nmi)
 
 
 ENTRY(ignore_sysret)
 ENTRY(ignore_sysret)

+ 1 - 2
arch/x86/entry/entry_64_compat.S

@@ -337,8 +337,7 @@ ENTRY(entry_INT80_compat)
 
 
 	/* Go back to user mode. */
 	/* Go back to user mode. */
 	TRACE_IRQS_ON
 	TRACE_IRQS_ON
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 END(entry_INT80_compat)
 END(entry_INT80_compat)
 
 
 ENTRY(stub32_clone)
 ENTRY(stub32_clone)

+ 4 - 4
arch/x86/include/asm/archrandom.h

@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v)
 	bool ok;
 	bool ok;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	do {
 	do {
-		asm volatile(RDRAND_LONG "\n\t"
+		asm volatile(RDRAND_LONG
 			     CC_SET(c)
 			     CC_SET(c)
 			     : CC_OUT(c) (ok), "=a" (*v));
 			     : CC_OUT(c) (ok), "=a" (*v));
 		if (ok)
 		if (ok)
@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v)
 	bool ok;
 	bool ok;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	do {
 	do {
-		asm volatile(RDRAND_INT "\n\t"
+		asm volatile(RDRAND_INT
 			     CC_SET(c)
 			     CC_SET(c)
 			     : CC_OUT(c) (ok), "=a" (*v));
 			     : CC_OUT(c) (ok), "=a" (*v));
 		if (ok)
 		if (ok)
@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v)
 static inline bool rdseed_long(unsigned long *v)
 static inline bool rdseed_long(unsigned long *v)
 {
 {
 	bool ok;
 	bool ok;
-	asm volatile(RDSEED_LONG "\n\t"
+	asm volatile(RDSEED_LONG
 		     CC_SET(c)
 		     CC_SET(c)
 		     : CC_OUT(c) (ok), "=a" (*v));
 		     : CC_OUT(c) (ok), "=a" (*v));
 	return ok;
 	return ok;
@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v)
 static inline bool rdseed_int(unsigned int *v)
 static inline bool rdseed_int(unsigned int *v)
 {
 {
 	bool ok;
 	bool ok;
-	asm volatile(RDSEED_INT "\n\t"
+	asm volatile(RDSEED_INT
 		     CC_SET(c)
 		     CC_SET(c)
 		     : CC_OUT(c) (ok), "=a" (*v));
 		     : CC_OUT(c) (ok), "=a" (*v));
 	return ok;
 	return ok;

+ 5 - 5
arch/x86/include/asm/bitops.h

@@ -143,7 +143,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
 static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
 {
 {
 	bool negative;
 	bool negative;
-	asm volatile(LOCK_PREFIX "andb %2,%1\n\t"
+	asm volatile(LOCK_PREFIX "andb %2,%1"
 		CC_SET(s)
 		CC_SET(s)
 		: CC_OUT(s) (negative), ADDR
 		: CC_OUT(s) (negative), ADDR
 		: "ir" ((char) ~(1 << nr)) : "memory");
 		: "ir" ((char) ~(1 << nr)) : "memory");
@@ -246,7 +246,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
 {
 {
 	bool oldbit;
 	bool oldbit;
 
 
-	asm("bts %2,%1\n\t"
+	asm("bts %2,%1"
 	    CC_SET(c)
 	    CC_SET(c)
 	    : CC_OUT(c) (oldbit), ADDR
 	    : CC_OUT(c) (oldbit), ADDR
 	    : "Ir" (nr));
 	    : "Ir" (nr));
@@ -286,7 +286,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
 {
 {
 	bool oldbit;
 	bool oldbit;
 
 
-	asm volatile("btr %2,%1\n\t"
+	asm volatile("btr %2,%1"
 		     CC_SET(c)
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit), ADDR
 		     : CC_OUT(c) (oldbit), ADDR
 		     : "Ir" (nr));
 		     : "Ir" (nr));
@@ -298,7 +298,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
 {
 {
 	bool oldbit;
 	bool oldbit;
 
 
-	asm volatile("btc %2,%1\n\t"
+	asm volatile("btc %2,%1"
 		     CC_SET(c)
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit), ADDR
 		     : CC_OUT(c) (oldbit), ADDR
 		     : "Ir" (nr) : "memory");
 		     : "Ir" (nr) : "memory");
@@ -329,7 +329,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
 {
 {
 	bool oldbit;
 	bool oldbit;
 
 
-	asm volatile("bt %2,%1\n\t"
+	asm volatile("bt %2,%1"
 		     CC_SET(c)
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit)
 		     : CC_OUT(c) (oldbit)
 		     : "m" (*(unsigned long *)addr), "Ir" (nr));
 		     : "m" (*(unsigned long *)addr), "Ir" (nr));

+ 1 - 0
arch/x86/include/asm/compat.h

@@ -7,6 +7,7 @@
  */
  */
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <asm/processor.h>
 #include <asm/processor.h>
 #include <asm/user32.h>
 #include <asm/user32.h>
 #include <asm/unistd.h>
 #include <asm/unistd.h>

+ 4 - 5
arch/x86/include/asm/cpufeature.h

@@ -126,11 +126,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
 #define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
 
 
 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
-	clear_cpu_cap(&boot_cpu_data, bit);	\
-	set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
 #define setup_force_cpu_cap(bit) do { \
 #define setup_force_cpu_cap(bit) do { \
 	set_cpu_cap(&boot_cpu_data, bit);	\
 	set_cpu_cap(&boot_cpu_data, bit);	\
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\

+ 11 - 0
arch/x86/include/asm/cpufeatures.h

@@ -22,6 +22,11 @@
  * this feature bit is not displayed in /proc/cpuinfo at all.
  * this feature bit is not displayed in /proc/cpuinfo at all.
  */
  */
 
 
+/*
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c
+ */
+
 /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
 /* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
 #define X86_FEATURE_FPU		( 0*32+ 0) /* Onboard FPU */
 #define X86_FEATURE_FPU		( 0*32+ 0) /* Onboard FPU */
 #define X86_FEATURE_VME		( 0*32+ 1) /* Virtual Mode Extensions */
 #define X86_FEATURE_VME		( 0*32+ 1) /* Virtual Mode Extensions */
@@ -295,6 +300,12 @@
 #define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
 #define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
 #define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
 #define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI	(16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES	(16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ	(16*32+ 10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+ 11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB */
 #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_LA57	(16*32+16) /* 5-level page tables */
 #define X86_FEATURE_LA57	(16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID	(16*32+22) /* RDPID instruction */
 #define X86_FEATURE_RDPID	(16*32+22) /* RDPID instruction */

+ 1 - 1
arch/x86/include/asm/module.h

@@ -6,7 +6,7 @@
 #include <asm/orc_types.h>
 #include <asm/orc_types.h>
 
 
 struct mod_arch_specific {
 struct mod_arch_specific {
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;
 	struct orc_entry *orc_unwind;

+ 2 - 3
arch/x86/include/asm/paravirt.h

@@ -16,10 +16,9 @@
 #include <linux/cpumask.h>
 #include <linux/cpumask.h>
 #include <asm/frame.h>
 #include <asm/frame.h>
 
 
-static inline void load_sp0(struct tss_struct *tss,
-			     struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
 {
 {
-	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+	PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
 }
 }
 
 
 /* The paravirtualized CPUID instruction. */
 /* The paravirtualized CPUID instruction. */

+ 1 - 1
arch/x86/include/asm/paravirt_types.h

@@ -134,7 +134,7 @@ struct pv_cpu_ops {
 	void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
 	void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
 	void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
 	void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
 
 
-	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+	void (*load_sp0)(unsigned long sp0);
 
 
 	void (*set_iopl_mask)(unsigned mask);
 	void (*set_iopl_mask)(unsigned mask);
 
 

+ 1 - 1
arch/x86/include/asm/percpu.h

@@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
 {
 {
 	bool oldbit;
 	bool oldbit;
 
 
-	asm volatile("bt "__percpu_arg(2)",%1\n\t"
+	asm volatile("bt "__percpu_arg(2)",%1"
 			CC_SET(c)
 			CC_SET(c)
 			: CC_OUT(c) (oldbit)
 			: CC_OUT(c) (oldbit)
 			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
 			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));

+ 21 - 31
arch/x86/include/asm/processor.h

@@ -431,7 +431,9 @@ typedef struct {
 struct thread_struct {
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
 	unsigned long		sp0;
 	unsigned long		sp0;
+#endif
 	unsigned long		sp;
 	unsigned long		sp;
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 	unsigned long		sysenter_cs;
 	unsigned long		sysenter_cs;
@@ -518,16 +520,9 @@ static inline void native_set_iopl_mask(unsigned mask)
 }
 }
 
 
 static inline void
 static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+native_load_sp0(unsigned long sp0)
 {
 {
-	tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
-	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-		tss->x86_tss.ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-#endif
+	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
 }
 }
 
 
 static inline void native_swapgs(void)
 static inline void native_swapgs(void)
@@ -547,15 +542,20 @@ static inline unsigned long current_top_of_stack(void)
 #endif
 #endif
 }
 }
 
 
+static inline bool on_thread_stack(void)
+{
+	return (unsigned long)(current_top_of_stack() -
+			       current_stack_pointer) < THREAD_SIZE;
+}
+
 #ifdef CONFIG_PARAVIRT
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #include <asm/paravirt.h>
 #else
 #else
 #define __cpuid			native_cpuid
 #define __cpuid			native_cpuid
 
 
-static inline void load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
 {
 {
-	native_load_sp0(tss, thread);
+	native_load_sp0(sp0);
 }
 }
 
 
 #define set_iopl_mask native_set_iopl_mask
 #define set_iopl_mask native_set_iopl_mask
@@ -804,6 +804,15 @@ static inline void spin_lock_prefetch(const void *x)
 #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
 #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
 			   TOP_OF_KERNEL_STACK_PADDING)
 			   TOP_OF_KERNEL_STACK_PADDING)
 
 
+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
+#define task_pt_regs(task) \
+({									\
+	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
+	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
+	((struct pt_regs *)__ptr) - 1;					\
+})
+
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 /*
 /*
  * User space process size: 3GB (default).
  * User space process size: 3GB (default).
@@ -823,23 +832,6 @@ static inline void spin_lock_prefetch(const void *x)
 	.addr_limit		= KERNEL_DS,				  \
 	.addr_limit		= KERNEL_DS,				  \
 }
 }
 
 
-/*
- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessible even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task) \
-({									\
-	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
-	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
-	((struct pt_regs *)__ptr) - 1;					\
-})
-
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 
 #else
 #else
@@ -873,11 +865,9 @@ static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
 
 #define INIT_THREAD  {						\
 #define INIT_THREAD  {						\
-	.sp0			= TOP_OF_INIT_STACK,		\
 	.addr_limit		= KERNEL_DS,			\
 	.addr_limit		= KERNEL_DS,			\
 }
 }
 
 
-#define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
 
 #endif /* CONFIG_X86_64 */
 #endif /* CONFIG_X86_64 */

+ 5 - 1
arch/x86/include/asm/ptrace.h

@@ -136,9 +136,9 @@ static inline int v8086_mode(struct pt_regs *regs)
 #endif
 #endif
 }
 }
 
 
-#ifdef CONFIG_X86_64
 static inline bool user_64bit_mode(struct pt_regs *regs)
 static inline bool user_64bit_mode(struct pt_regs *regs)
 {
 {
+#ifdef CONFIG_X86_64
 #ifndef CONFIG_PARAVIRT
 #ifndef CONFIG_PARAVIRT
 	/*
 	/*
 	 * On non-paravirt systems, this is the only long mode CPL 3
 	 * On non-paravirt systems, this is the only long mode CPL 3
@@ -149,8 +149,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 	/* Headers are too twisted for this to go in paravirt.h. */
 	/* Headers are too twisted for this to go in paravirt.h. */
 	return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
 	return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
 #endif
 #endif
+#else /* !CONFIG_X86_64 */
+	return false;
+#endif
 }
 }
 
 
+#ifdef CONFIG_X86_64
 #define current_user_stack_pointer()	current_pt_regs()->sp
 #define current_user_stack_pointer()	current_pt_regs()->sp
 #define compat_user_stack_pointer()	current_pt_regs()->sp
 #define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif
 #endif

+ 1 - 1
arch/x86/include/asm/rmwcc.h

@@ -29,7 +29,7 @@ cc_label:								\
 #define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 #define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 do {									\
 	bool c;								\
 	bool c;								\
-	asm volatile (fullop ";" CC_SET(cc)				\
+	asm volatile (fullop CC_SET(cc)					\
 			: [counter] "+m" (var), CC_OUT(cc) (c)		\
 			: [counter] "+m" (var), CC_OUT(cc) (c)		\
 			: __VA_ARGS__ : clobbers);			\
 			: __VA_ARGS__ : clobbers);			\
 	return c;							\
 	return c;							\

+ 24 - 0
arch/x86/include/asm/switch_to.h

@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_SWITCH_TO_H
 #ifndef _ASM_X86_SWITCH_TO_H
 #define _ASM_X86_SWITCH_TO_H
 #define _ASM_X86_SWITCH_TO_H
 
 
+#include <linux/sched/task_stack.h>
+
 struct task_struct; /* one of the stranger aspects of C forward declarations */
 struct task_struct; /* one of the stranger aspects of C forward declarations */
 
 
 struct task_struct *__switch_to_asm(struct task_struct *prev,
 struct task_struct *__switch_to_asm(struct task_struct *prev,
@@ -73,4 +75,26 @@ do {									\
 	((last) = __switch_to_asm((prev), (next)));			\
 	((last) = __switch_to_asm((prev), (next)));			\
 } while (0)
 } while (0)
 
 
+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
+	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+		return;
+
+	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+}
+#endif
+
+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_sp0(struct task_struct *task)
+{
+#ifdef CONFIG_X86_32
+	load_sp0(task->thread.sp0);
+#else
+	load_sp0(task_top_of_stack(task));
+#endif
+}
+
 #endif /* _ASM_X86_SWITCH_TO_H */
 #endif /* _ASM_X86_SWITCH_TO_H */

+ 1 - 1
arch/x86/include/asm/syscalls.h

@@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
 asmlinkage long sys_iopl(unsigned int);
 asmlinkage long sys_iopl(unsigned int);
 
 
 /* kernel/ldt.c */
 /* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long);
 
 
 /* kernel/signal.c */
 /* kernel/signal.c */
 asmlinkage long sys_rt_sigreturn(void);
 asmlinkage long sys_rt_sigreturn(void);

+ 0 - 10
arch/x86/include/asm/trace/fpu.h

@@ -34,11 +34,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
 	)
 	)
 );
 );
 
 
-DEFINE_EVENT(x86_fpu, x86_fpu_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
 DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
 	TP_PROTO(struct fpu *fpu),
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
 	TP_ARGS(fpu)
@@ -74,11 +69,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
 	TP_ARGS(fpu)
 	TP_ARGS(fpu)
 );
 );
 
 
-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
 DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
 	TP_PROTO(struct fpu *fpu),
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
 	TP_ARGS(fpu)

+ 19 - 1
arch/x86/include/asm/traps.h

@@ -38,9 +38,9 @@ asmlinkage void simd_coprocessor_error(void);
 
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
 #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
 asmlinkage void xen_divide_error(void);
 asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xennmi(void);
 asmlinkage void xen_xendebug(void);
 asmlinkage void xen_xendebug(void);
 asmlinkage void xen_xenint3(void);
 asmlinkage void xen_xenint3(void);
-asmlinkage void xen_nmi(void);
 asmlinkage void xen_overflow(void);
 asmlinkage void xen_overflow(void);
 asmlinkage void xen_bounds(void);
 asmlinkage void xen_bounds(void);
 asmlinkage void xen_invalid_op(void);
 asmlinkage void xen_invalid_op(void);
@@ -145,4 +145,22 @@ enum {
 	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
 	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
 };
 };
 
 
+/*
+ * Page fault error code bits:
+ *
+ *   bit 0 ==	 0: no page found	1: protection fault
+ *   bit 1 ==	 0: read access		1: write access
+ *   bit 2 ==	 0: kernel-mode access	1: user-mode access
+ *   bit 3 ==				1: use of reserved bit detected
+ *   bit 4 ==				1: fault was an instruction fetch
+ *   bit 5 ==				1: protection keys block access
+ */
+enum x86_pf_error_code {
+	X86_PF_PROT	=		1 << 0,
+	X86_PF_WRITE	=		1 << 1,
+	X86_PF_USER	=		1 << 2,
+	X86_PF_RSVD	=		1 << 3,
+	X86_PF_INSTR	=		1 << 4,
+	X86_PF_PK	=		1 << 5,
+};
 #endif /* _ASM_X86_TRAPS_H */
 #endif /* _ASM_X86_TRAPS_H */

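The X86_PF_* values added above are plain bit masks within the page fault error code. A minimal decoding sketch, for illustration only (the describe_page_fault() helper below is hypothetical and not part of this commit):

#include <linux/printk.h>
#include <asm/traps.h>

/* Hypothetical helper: decode a page fault error code via the X86_PF_* bits. */
static void describe_page_fault(unsigned long error_code)
{
	pr_info("page fault: %s access from %s mode, %s\n",
		(error_code & X86_PF_WRITE) ? "write" : "read",
		(error_code & X86_PF_USER)  ? "user"  : "kernel",
		(error_code & X86_PF_PROT)  ? "protection violation"
					    : "page not present");

	if (error_code & X86_PF_INSTR)
		pr_info("  the fault was an instruction fetch\n");
	if (error_code & X86_PF_RSVD)
		pr_info("  a reserved page-table bit was set\n");
	if (error_code & X86_PF_PK)
		pr_info("  access was blocked by a protection key\n");
}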
+ 4 - 4
arch/x86/include/asm/unwind.h

@@ -13,11 +13,11 @@ struct unwind_state {
 	struct task_struct *task;
 	struct task_struct *task;
 	int graph_idx;
 	int graph_idx;
 	bool error;
 	bool error;
-#if defined(CONFIG_ORC_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC)
 	bool signal, full_regs;
 	bool signal, full_regs;
 	unsigned long sp, bp, ip;
 	unsigned long sp, bp, ip;
 	struct pt_regs *regs;
 	struct pt_regs *regs;
-#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
 	bool got_irq;
 	bool got_irq;
 	unsigned long *bp, *orig_sp, ip;
 	unsigned long *bp, *orig_sp, ip;
 	struct pt_regs *regs;
 	struct pt_regs *regs;
@@ -51,7 +51,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 	__unwind_start(state, task, regs, first_frame);
 	__unwind_start(state, task, regs, first_frame);
 }
 }
 
 
-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 {
 	if (unwind_done(state))
 	if (unwind_done(state))
@@ -66,7 +66,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 }
 }
 #endif
 #endif
 
 
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 void unwind_init(void);
 void unwind_init(void);
 void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
 void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
 			void *orc, size_t orc_size);
 			void *orc, size_t orc_size);

+ 3 - 0
arch/x86/include/uapi/asm/processor-flags.h

@@ -152,5 +152,8 @@
 #define CX86_ARR_BASE	0xc4
 #define CX86_ARR_BASE	0xc4
 #define CX86_RCR_BASE	0xdc
 #define CX86_RCR_BASE	0xdc
 
 
+#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+			 X86_CR0_PG)
 
 
 #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
 #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */

+ 3 - 4
arch/x86/kernel/Makefile

@@ -27,7 +27,6 @@ KASAN_SANITIZE_dumpstack.o				:= n
 KASAN_SANITIZE_dumpstack_$(BITS).o			:= n
 KASAN_SANITIZE_dumpstack_$(BITS).o			:= n
 KASAN_SANITIZE_stacktrace.o := n
 KASAN_SANITIZE_stacktrace.o := n
 
 
-OBJECT_FILES_NON_STANDARD_head_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
@@ -128,9 +127,9 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 
 
-obj-$(CONFIG_ORC_UNWINDER)		+= unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER)	+= unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER)		+= unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC)		+= unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
 
 ###
 ###
 # 64 bit specific files
 # 64 bit specific files

+ 1 - 0
arch/x86/kernel/cpu/Makefile

@@ -23,6 +23,7 @@ obj-y			+= rdrand.o
 obj-y			+= match.o
 obj-y			+= match.o
 obj-y			+= bugs.o
 obj-y			+= bugs.o
 obj-$(CONFIG_CPU_FREQ)	+= aperfmperf.o
 obj-$(CONFIG_CPU_FREQ)	+= aperfmperf.o
+obj-y			+= cpuid-deps.o
 
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o

+ 17 - 12
arch/x86/kernel/cpu/common.c

@@ -1301,18 +1301,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		pr_cont(")\n");
 		pr_cont(")\n");
 }
 }
 
 
-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
 {
 {
-	int bit;
-
-	if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
-		setup_clear_cpu_cap(bit);
-	else
-		return 0;
-
 	return 1;
 	return 1;
 }
 }
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1572,9 +1570,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, me);
 	enter_lazy_tlb(&init_mm, me);
 
 
-	load_sp0(t, &current->thread);
+	/*
+	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
+	 * task never enters user mode.
+	 */
 	set_tss_desc(cpu, t);
 	set_tss_desc(cpu, t);
 	load_TR_desc();
 	load_TR_desc();
+
 	load_mm_ldt(&init_mm);
 	load_mm_ldt(&init_mm);
 
 
 	clear_all_debug_regs();
 	clear_all_debug_regs();
@@ -1596,7 +1598,6 @@ void cpu_init(void)
 	int cpu = smp_processor_id();
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
 	struct task_struct *curr = current;
 	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
-	struct thread_struct *thread = &curr->thread;
 
 
 	wait_for_master_cpu(cpu);
 	wait_for_master_cpu(cpu);
 
 
@@ -1627,9 +1628,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, curr);
 	enter_lazy_tlb(&init_mm, curr);
 
 
-	load_sp0(t, thread);
+	/*
+	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
+	 * task never enters user mode.
+	 */
 	set_tss_desc(cpu, t);
 	set_tss_desc(cpu, t);
 	load_TR_desc();
 	load_TR_desc();
+
 	load_mm_ldt(&init_mm);
 	load_mm_ldt(&init_mm);
 
 
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);

+ 125 - 0
arch/x86/kernel/cpu/cpuid-deps.c

@@ -0,0 +1,125 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+	unsigned int	feature;
+	unsigned int	depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+	{ X86_FEATURE_XSAVEOPT,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XSAVEC,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XSAVES,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_AVX,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_PKU,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_MPX,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XGETBV1,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_FXSR_OPT,		X86_FEATURE_FXSR      },
+	{ X86_FEATURE_XMM,		X86_FEATURE_FXSR      },
+	{ X86_FEATURE_XMM2,		X86_FEATURE_XMM       },
+	{ X86_FEATURE_XMM3,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM4_1,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM4_2,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM3,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_PCLMULQDQ,	X86_FEATURE_XMM2      },
+	{ X86_FEATURE_SSSE3,		X86_FEATURE_XMM2,     },
+	{ X86_FEATURE_F16C,		X86_FEATURE_XMM2,     },
+	{ X86_FEATURE_AES,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_SHA_NI,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_FMA,		X86_FEATURE_AVX       },
+	{ X86_FEATURE_AVX2,		X86_FEATURE_AVX,      },
+	{ X86_FEATURE_AVX512F,		X86_FEATURE_AVX,      },
+	{ X86_FEATURE_AVX512IFMA,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512PF,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512ER,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512CD,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512DQ,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512BW,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512VL,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512VBMI,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_VBMI2,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_GFNI,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_VAES,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_VPCLMULQDQ,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_VNNI,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_BITALG,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_4VNNIW,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_4FMAPS,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F   },
+	{}
+};
+
+static inline void __clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit)
+{
+	clear_bit32(bit, c->x86_capability);
+}
+
+static inline void __setup_clear_cpu_cap(unsigned int bit)
+{
+	clear_cpu_cap(&boot_cpu_data, bit);
+	set_bit32(bit, cpu_caps_cleared);
+}
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	if (!c)
+		__setup_clear_cpu_cap(feature);
+	else
+		__clear_cpu_cap(c, feature);
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+	const struct cpuid_dep *d;
+	bool changed;
+
+	if (WARN_ON(feature >= MAX_FEATURE_BITS))
+		return;
+
+	clear_feature(c, feature);
+
+	/* Collect all features to disable, handling dependencies */
+	memset(disable, 0, sizeof(disable));
+	__set_bit(feature, disable);
+
+	/* Loop until we get a stable state. */
+	do {
+		changed = false;
+		for (d = cpuid_deps; d->feature; d++) {
+			if (!test_bit(d->depends, disable))
+				continue;
+			if (__test_and_set_bit(d->feature, disable))
+				continue;
+
+			changed = true;
+			clear_feature(c, d->feature);
+		}
+	} while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+	do_clear_cpu_cap(NULL, feature);
+}

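The effect of the dependency table is that clearing one capability now cascades to every feature that depends on it, because do_clear_cpu_cap() loops over cpuid_deps[] until the disable mask stops changing. A minimal sketch of that behaviour, assuming early boot context (the example_disable_xsave() wrapper is hypothetical, for illustration only):

#include <asm/cpufeature.h>

/*
 * Hypothetical example: clearing XSAVE walks cpuid_deps[] until no more
 * bits change, so XSAVEOPT, XSAVEC, XSAVES, AVX, PKU, MPX and XGETBV1 are
 * cleared as well, and AVX in turn drags in FMA, AVX2 and the AVX512 bits.
 */
static void example_disable_xsave(void)
{
	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
}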
+ 11 - 0
arch/x86/kernel/fpu/init.c

@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
  */
  */
 static void __init fpu__init_parse_early_param(void)
 static void __init fpu__init_parse_early_param(void)
 {
 {
+	char arg[32];
+	char *argptr = arg;
+	int bit;
+
 	if (cmdline_find_option_bool(boot_command_line, "no387"))
 	if (cmdline_find_option_bool(boot_command_line, "no387"))
 		setup_clear_cpu_cap(X86_FEATURE_FPU);
 		setup_clear_cpu_cap(X86_FEATURE_FPU);
 
 
@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)
 
 
 	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
 	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
 		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
 		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+	if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+				sizeof(arg)) &&
+	    get_option(&argptr, &bit) &&
+	    bit >= 0 &&
+	    bit < NCAPINTS * 32)
+		setup_clear_cpu_cap(bit);
 }
 }
 
 
 /*
 /*

+ 23 - 20
arch/x86/kernel/fpu/xstate.c

@@ -15,6 +15,7 @@
 #include <asm/fpu/xstate.h>
 #include <asm/fpu/xstate.h>
 
 
 #include <asm/tlbflush.h>
 #include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
 
 
 /*
 /*
  * Although we spell it out in here, the Processor Trace
  * Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
 	"unknown xstate feature"	,
 	"unknown xstate feature"	,
 };
 };
 
 
+static short xsave_cpuid_features[] __initdata = {
+	X86_FEATURE_FPU,
+	X86_FEATURE_XMM,
+	X86_FEATURE_AVX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_INTEL_PT,
+	X86_FEATURE_PKU,
+};
+
 /*
 /*
  * Mask of xstate features supported by the CPU and the kernel:
  * Mask of xstate features supported by the CPU and the kernel:
  */
  */
@@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size;
 void fpu__xstate_clear_all_cpu_caps(void)
 void fpu__xstate_clear_all_cpu_caps(void)
 {
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
-	setup_clear_cpu_cap(X86_FEATURE_MPX);
-	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
-	setup_clear_cpu_cap(X86_FEATURE_PKU);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
 }
 }
 
 
 /*
 /*
@@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void)
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int eax, ebx, ecx, edx;
 	static int on_boot_cpu __initdata = 1;
 	static int on_boot_cpu __initdata = 1;
 	int err;
 	int err;
+	int i;
 
 
 	WARN_ON_FPU(!on_boot_cpu);
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
 	on_boot_cpu = 0;
@@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void)
 		goto out_disable;
 		goto out_disable;
 	}
 	}
 
 
+	/*
+	 * Clear XSAVE features that are disabled in the normal CPUID.
+	 */
+	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+		if (!boot_cpu_has(xsave_cpuid_features[i]))
+			xfeatures_mask &= ~BIT(i);
+	}
+
 	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	/* Enable xstate instructions to be able to continue with initialization: */

+ 1 - 4
arch/x86/kernel/head_32.S

@@ -212,9 +212,6 @@ ENTRY(startup_32_smp)
 #endif
 #endif
 
 
 .Ldefault_entry:
 .Ldefault_entry:
-#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
-			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
-			 X86_CR0_PG)
 	movl $(CR0_STATE & ~X86_CR0_PG),%eax
 	movl $(CR0_STATE & ~X86_CR0_PG),%eax
 	movl %eax,%cr0
 	movl %eax,%cr0
 
 
@@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array)
 	# 24(%rsp) error code
 	# 24(%rsp) error code
 	i = 0
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
 	.rept NUM_EXCEPTION_VECTORS
-	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
 	pushl $0		# Dummy error code, to make stack frame uniform
 	pushl $0		# Dummy error code, to make stack frame uniform
 	.endif
 	.endif
 	pushl $i		# 20(%esp) Vector number
 	pushl $i		# 20(%esp) Vector number

+ 17 - 17
arch/x86/kernel/head_64.S

@@ -50,6 +50,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.code64
 	.globl startup_64
 	.globl startup_64
 startup_64:
 startup_64:
+	UNWIND_HINT_EMPTY
 	/*
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded an identity mapped page table
 	 * and someone has loaded an identity mapped page table
@@ -89,6 +90,7 @@ startup_64:
 	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	jmp 1f
 	jmp 1f
 ENTRY(secondary_startup_64)
 ENTRY(secondary_startup_64)
+	UNWIND_HINT_EMPTY
 	/*
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded a mapped page table.
 	 * and someone has loaded a mapped page table.
@@ -133,6 +135,7 @@ ENTRY(secondary_startup_64)
 	movq	$1f, %rax
 	movq	$1f, %rax
 	jmp	*%rax
 	jmp	*%rax
 1:
 1:
+	UNWIND_HINT_EMPTY
 
 
 	/* Check if nx is implemented */
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
 	movl	$0x80000001, %eax
@@ -150,9 +153,6 @@ ENTRY(secondary_startup_64)
 1:	wrmsr				/* Make changes effective */
 1:	wrmsr				/* Make changes effective */
 
 
 	/* Setup cr0 */
 	/* Setup cr0 */
-#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
-			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
-			 X86_CR0_PG)
 	movl	$CR0_STATE, %eax
 	movl	$CR0_STATE, %eax
 	/* Make changes effective */
 	/* Make changes effective */
 	movq	%rax, %cr0
 	movq	%rax, %cr0
@@ -235,7 +235,7 @@ ENTRY(secondary_startup_64)
 	pushq	%rax		# target address in negative space
 	pushq	%rax		# target address in negative space
 	lretq
 	lretq
 .Lafter_lret:
 .Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)
 
 
 #include "verify_cpu.S"
 #include "verify_cpu.S"
 
 
@@ -247,6 +247,7 @@ ENDPROC(secondary_startup_64)
  */
  */
 ENTRY(start_cpu0)
 ENTRY(start_cpu0)
 	movq	initial_stack(%rip), %rsp
 	movq	initial_stack(%rip), %rsp
+	UNWIND_HINT_EMPTY
 	jmp	.Ljump_to_C_code
 	jmp	.Ljump_to_C_code
 ENDPROC(start_cpu0)
 ENDPROC(start_cpu0)
 #endif
 #endif
@@ -266,26 +267,24 @@ ENDPROC(start_cpu0)
 	.quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
 	.quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
 	__FINITDATA
 	__FINITDATA
 
 
-bad_address:
-	jmp bad_address
-
 	__INIT
 	__INIT
 ENTRY(early_idt_handler_array)
 ENTRY(early_idt_handler_array)
-	# 104(%rsp) %rflags
-	#  96(%rsp) %cs
-	#  88(%rsp) %rip
-	#  80(%rsp) error code
 	i = 0
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
 	.rept NUM_EXCEPTION_VECTORS
-	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
-	pushq $0		# Dummy error code, to make stack frame uniform
+	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+		UNWIND_HINT_IRET_REGS
+		pushq $0	# Dummy error code, to make stack frame uniform
+	.else
+		UNWIND_HINT_IRET_REGS offset=8
 	.endif
 	.endif
 	pushq $i		# 72(%rsp) Vector number
 	pushq $i		# 72(%rsp) Vector number
 	jmp early_idt_handler_common
 	jmp early_idt_handler_common
+	UNWIND_HINT_IRET_REGS
 	i = i + 1
 	i = i + 1
 	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
 	.endr
-ENDPROC(early_idt_handler_array)
+	UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
 
 
 early_idt_handler_common:
 early_idt_handler_common:
 	/*
 	/*
@@ -313,6 +312,7 @@ early_idt_handler_common:
 	pushq %r13				/* pt_regs->r13 */
 	pushq %r13				/* pt_regs->r13 */
 	pushq %r14				/* pt_regs->r14 */
 	pushq %r14				/* pt_regs->r14 */
 	pushq %r15				/* pt_regs->r15 */
 	pushq %r15				/* pt_regs->r15 */
+	UNWIND_HINT_REGS
 
 
 	cmpq $14,%rsi		/* Page fault? */
 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
 	jnz 10f
@@ -327,8 +327,8 @@ early_idt_handler_common:
 
 
 20:
 20:
 	decl early_recursion_flag(%rip)
 	decl early_recursion_flag(%rip)
-	jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+	jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
 
 
 	__INITDATA
 	__INITDATA
 
 
@@ -435,7 +435,7 @@ ENTRY(phys_base)
 EXPORT_SYMBOL(phys_base)
 EXPORT_SYMBOL(phys_base)
 
 
 #include "../../x86/xen/xen-head.S"
 #include "../../x86/xen/xen-head.S"
-	
+
 	__PAGE_ALIGNED_BSS
 	__PAGE_ALIGNED_BSS
 NEXT_PAGE(empty_zero_page)
 NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
 	.skip PAGE_SIZE

+ 13 - 3
arch/x86/kernel/ldt.c

@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/smp.h>
+#include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
@@ -295,8 +296,8 @@ out:
 	return error;
 	return error;
 }
 }
 
 
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
-			      unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+		unsigned long , bytecount)
 {
 {
 	int ret = -ENOSYS;
 	int ret = -ENOSYS;
 
 
@@ -314,5 +315,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
 		ret = write_ldt(ptr, bytecount, 0);
 		ret = write_ldt(ptr, bytecount, 0);
 		break;
 		break;
 	}
 	}
-	return ret;
+	/*
+	 * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+	 * return type, but the ABI for sys_modify_ldt() expects
+	 * 'int'.  This cast gives us an int-sized value in %rax
+	 * for the return code.  The 'unsigned' is necessary so
+	 * the compiler does not try to sign-extend the negative
+	 * return codes into the high half of the register when
+	 * taking the value from int->long.
+	 */
+	return (unsigned int)ret;
 }
 }
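The cast discussed in the comment above is easy to check in isolation: a negative int returned through a long-sized return value gets sign-extended into the upper half unless it is first converted to unsigned int. A userspace sketch (64-bit build assumed):

#include <stdio.h>

static long ret_signed(int v)   { return v; }			/* sign-extends */
static long ret_unsigned(int v) { return (unsigned int)v; }	/* zero-extends */

int main(void)
{
	/* -38 stands in for an errno-style return code such as -ENOSYS */
	printf("%#lx\n", (unsigned long)ret_signed(-38));	/* 0xffffffffffffffda */
	printf("%#lx\n", (unsigned long)ret_unsigned(-38));	/* 0xffffffda: int-sized */
	return 0;
}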

+ 7 - 1
arch/x86/kernel/process.c

@@ -49,7 +49,13 @@
  */
  */
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
 	.x86_tss = {
-		.sp0 = TOP_OF_INIT_STACK,
+		/*
+		 * .sp0 is only used when entering ring 0 from a lower
+		 * privilege level.  Since the init task never runs anything
+		 * but ring 0 code, there is no need for a valid value here.
+		 * Poison it.
+		 */
+		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
 		.ss1 = __KERNEL_CS,

+ 4 - 2
arch/x86/kernel/process_32.c

@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 
 	/*
 	/*
 	 * Reload esp0 and cpu_current_top_of_stack.  This changes
 	 * Reload esp0 and cpu_current_top_of_stack.  This changes
-	 * current_thread_info().
+	 * current_thread_info().  Refresh the SYSENTER configuration in
+	 * case prev or next is vm86.
 	 */
 	 */
-	load_sp0(tss, next);
+	update_sp0(next_p);
+	refresh_sysenter_cs(next);
 	this_cpu_write(cpu_current_top_of_stack,
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);
 		       THREAD_SIZE);
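__switch_to() now calls two helpers instead of load_sp0(tss, next). Their definitions are added elsewhere in this series, presumably in the new <asm/switch_to.h> material that vm86_32.c starts including below; a hedged sketch of their likely shape, which may differ in detail from the real code:

/* Sketch only: push the task's entry stack pointer into the per-CPU TSS. */
static inline void update_sp0(struct task_struct *task)
{
#ifdef CONFIG_X86_32
	load_sp0(task->thread.sp0);
#else
	/* 64-bit stops tracking sp0 in thread_struct; use the stack top. */
	load_sp0(task_top_of_stack(task));
#endif
}

/* Sketch only: keep MSR_IA32_SYSENTER_CS in sync with the (vm86-toggled) value. */
static inline void refresh_sysenter_cs(struct thread_struct *thread)
{
	if (!boot_cpu_has(X86_FEATURE_SEP))
		return;

	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}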

+ 2 - 3
arch/x86/kernel/process_64.c

@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	struct inactive_task_frame *frame;
 	struct inactive_task_frame *frame;
 	struct task_struct *me = current;
 	struct task_struct *me = current;
 
 
-	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	childregs = task_pt_regs(p);
 	childregs = task_pt_regs(p);
 	fork_frame = container_of(childregs, struct fork_frame, regs);
 	fork_frame = container_of(childregs, struct fork_frame, regs);
 	frame = &fork_frame->frame;
 	frame = &fork_frame->frame;
@@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	 */
 	this_cpu_write(current_task, next_p);
 	this_cpu_write(current_task, next_p);
 
 
-	/* Reload esp0 and ss1.  This changes current_thread_info(). */
-	load_sp0(tss, next);
+	/* Reload sp0. */
+	update_sp0(next_p);
 
 
 	/*
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
 	 * Now maybe reload the debug registers and handle I/O bitmaps

+ 1 - 2
arch/x86/kernel/smpboot.c

@@ -962,8 +962,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
 	irq_ctx_init(cpu);
-	per_cpu(cpu_current_top_of_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
+	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
 #else
 	initial_gs = per_cpu_offset(cpu);
 	initial_gs = per_cpu_offset(cpu);
 #endif
 #endif

+ 1 - 2
arch/x86/kernel/traps.c

@@ -141,8 +141,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * from double_fault.
 	 * from double_fault.
 	 */
 	 */
-	BUG_ON((unsigned long)(current_top_of_stack() -
-			       current_stack_pointer) >= THREAD_SIZE);
+	BUG_ON(!on_thread_stack());
 
 
 	preempt_enable_no_resched();
 	preempt_enable_no_resched();
 }
 }
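The BUG_ON() above is replaced by on_thread_stack(), which wraps exactly the check that was open-coded here. A hedged sketch of the helper, reconstructed from the removed expression (the real definition is added to <asm/processor.h> in this series and may differ in detail):

static inline bool on_thread_stack(void)
{
	/* True when the current SP lies within the task's THREAD_SIZE stack. */
	return (unsigned long)(current_top_of_stack() -
			       current_stack_pointer) < THREAD_SIZE;
}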

+ 2 - 1
arch/x86/kernel/verify_cpu.S

@@ -33,7 +33,7 @@
 #include <asm/cpufeatures.h>
 #include <asm/cpufeatures.h>
 #include <asm/msr-index.h>
 #include <asm/msr-index.h>
 
 
-verify_cpu:
+ENTRY(verify_cpu)
 	pushf				# Save caller passed flags
 	pushf				# Save caller passed flags
 	push	$0			# Kill any dangerous flags
 	push	$0			# Kill any dangerous flags
 	popf
 	popf
@@ -139,3 +139,4 @@ verify_cpu:
 	popf				# Restore caller passed flags
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	xorl %eax, %eax
 	ret
 	ret
+ENDPROC(verify_cpu)

+ 11 - 9
arch/x86/kernel/vm86_32.c

@@ -55,6 +55,7 @@
 #include <asm/irq.h>
 #include <asm/irq.h>
 #include <asm/traps.h>
 #include <asm/traps.h>
 #include <asm/vm86.h>
 #include <asm/vm86.h>
+#include <asm/switch_to.h>
 
 
 /*
 /*
  * Known problems:
  * Known problems:
@@ -94,7 +95,6 @@
 
 
 void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
 {
-	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
 	struct vm86plus_struct __user *user;
 	struct vm86 *vm86 = current->thread.vm86;
 	struct vm86 *vm86 = current->thread.vm86;
@@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		do_exit(SIGSEGV);
 		do_exit(SIGSEGV);
 	}
 	}
 
 
-	tss = &per_cpu(cpu_tss, get_cpu());
+	preempt_disable();
 	tsk->thread.sp0 = vm86->saved_sp0;
 	tsk->thread.sp0 = vm86->saved_sp0;
 	tsk->thread.sysenter_cs = __KERNEL_CS;
 	tsk->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &tsk->thread);
+	update_sp0(tsk);
+	refresh_sysenter_cs(&tsk->thread);
 	vm86->saved_sp0 = 0;
 	vm86->saved_sp0 = 0;
-	put_cpu();
+	preempt_enable();
 
 
 	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 
 
@@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 
 
 static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
 {
-	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
 	struct vm86 *vm86 = tsk->thread.vm86;
 	struct kernel_vm86_regs vm86regs;
 	struct kernel_vm86_regs vm86regs;
@@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 	vm86->saved_sp0 = tsk->thread.sp0;
 	vm86->saved_sp0 = tsk->thread.sp0;
 	lazy_save_gs(vm86->regs32.gs);
 	lazy_save_gs(vm86->regs32.gs);
 
 
-	tss = &per_cpu(cpu_tss, get_cpu());
 	/* make room for real-mode segments */
 	/* make room for real-mode segments */
+	preempt_disable();
 	tsk->thread.sp0 += 16;
 	tsk->thread.sp0 += 16;
 
 
-	if (static_cpu_has(X86_FEATURE_SEP))
+	if (static_cpu_has(X86_FEATURE_SEP)) {
 		tsk->thread.sysenter_cs = 0;
 		tsk->thread.sysenter_cs = 0;
+		refresh_sysenter_cs(&tsk->thread);
+	}
 
 
-	load_sp0(tss, &tsk->thread);
-	put_cpu();
+	update_sp0(tsk);
+	preempt_enable();
 
 
 	if (vm86->flags & VM86_SCREEN_BITMAP)
 	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 		mark_screen_rdonly(tsk->mm);

+ 34 - 54
arch/x86/mm/fault.c

@@ -29,26 +29,6 @@
 #define CREATE_TRACE_POINTS
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
 #include <asm/trace/exceptions.h>
 
 
-/*
- * Page fault error code bits:
- *
- *   bit 0 ==	 0: no page found	1: protection fault
- *   bit 1 ==	 0: read access		1: write access
- *   bit 2 ==	 0: kernel-mode access	1: user-mode access
- *   bit 3 ==				1: use of reserved bit detected
- *   bit 4 ==				1: fault was an instruction fetch
- *   bit 5 ==				1: protection keys block access
- */
-enum x86_pf_error_code {
-
-	PF_PROT		=		1 << 0,
-	PF_WRITE	=		1 << 1,
-	PF_USER		=		1 << 2,
-	PF_RSVD		=		1 << 3,
-	PF_INSTR	=		1 << 4,
-	PF_PK		=		1 << 5,
-};
-
 /*
 /*
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  * handled by mmiotrace:
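The enum removed above does not simply disappear: the same bit layout survives as shared X86_PF_* constants in an x86 header elsewhere in this series (not shown in this excerpt), so other files can test page-fault error-code bits without redefining them. A sketch of the equivalent definitions, with the bit positions taken from the removed comment; the real header may spell them differently:

#define X86_PF_PROT	(1 << 0)	/* 0: no page found, 1: protection fault */
#define X86_PF_WRITE	(1 << 1)	/* 0: read access,   1: write access */
#define X86_PF_USER	(1 << 2)	/* 0: kernel-mode,   1: user-mode access */
#define X86_PF_RSVD	(1 << 3)	/* use of reserved bit detected */
#define X86_PF_INSTR	(1 << 4)	/* fault was an instruction fetch */
#define X86_PF_PK	(1 << 5)	/* protection keys block access */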
@@ -150,7 +130,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 	 * If it was a exec (instruction fetch) fault on NX page, then
 	 * If it was a exec (instruction fetch) fault on NX page, then
 	 * do not ignore the fault:
 	 * do not ignore the fault:
 	 */
 	 */
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		return 0;
 		return 0;
 
 
 	instr = (void *)convert_ip_to_linear(current, regs);
 	instr = (void *)convert_ip_to_linear(current, regs);
@@ -180,7 +160,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * siginfo so userspace can discover which protection key was set
  * siginfo so userspace can discover which protection key was set
  * on the PTE.
  * on the PTE.
  *
  *
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
  * fault and that there was a VMA once we got in the fault
  * fault and that there was a VMA once we got in the fault
  * handler.  It does *not* guarantee that the VMA we find here
  * handler.  It does *not* guarantee that the VMA we find here
  * was the one that we faulted on.
  * was the one that we faulted on.
@@ -205,7 +185,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 	/*
 	/*
 	 * force_sig_info_fault() is called from a number of
 	 * force_sig_info_fault() is called from a number of
 	 * contexts, some of which have a VMA and some of which
 	 * contexts, some of which have a VMA and some of which
-	 * do not.  The PF_PK handing happens after we have a
+	 * do not.  The X86_PF_PK handling happens after we have a
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 * valid VMA.
 	 */
 	 */
@@ -698,7 +678,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (!oops_may_print())
 	if (!oops_may_print())
 		return;
 		return;
 
 
-	if (error_code & PF_INSTR) {
+	if (error_code & X86_PF_INSTR) {
 		unsigned int level;
 		unsigned int level;
 		pgd_t *pgd;
 		pgd_t *pgd;
 		pte_t *pte;
 		pte_t *pte;
@@ -780,7 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		 */
 		 */
 		if (current->thread.sig_on_uaccess_err && signal) {
 		if (current->thread.sig_on_uaccess_err && signal) {
 			tsk->thread.trap_nr = X86_TRAP_PF;
 			tsk->thread.trap_nr = X86_TRAP_PF;
-			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.error_code = error_code | X86_PF_USER;
 			tsk->thread.cr2 = address;
 			tsk->thread.cr2 = address;
 
 
 			/* XXX: hwpoison faults will set the wrong code. */
 			/* XXX: hwpoison faults will set the wrong code. */
@@ -898,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 	struct task_struct *tsk = current;
 
 
 	/* User mode accesses just cause a SIGSEGV */
 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		/*
 		 * It's possible to have interrupts off here:
 		 * It's possible to have interrupts off here:
 		 */
 		 */
@@ -919,7 +899,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * Instruction fetch faults in the vsyscall page might need
 		 * Instruction fetch faults in the vsyscall page might need
 		 * emulation.
 		 * emulation.
 		 */
 		 */
-		if (unlikely((error_code & PF_INSTR) &&
+		if (unlikely((error_code & X86_PF_INSTR) &&
 			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
 			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
 			if (emulate_vsyscall(regs, address))
 			if (emulate_vsyscall(regs, address))
 				return;
 				return;
@@ -932,7 +912,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * are always protection faults.
 		 * are always protection faults.
 		 */
 		 */
 		if (address >= TASK_SIZE_MAX)
 		if (address >= TASK_SIZE_MAX)
-			error_code |= PF_PROT;
+			error_code |= X86_PF_PROT;
 
 
 		if (likely(show_unhandled_signals))
 		if (likely(show_unhandled_signals))
 			show_signal_msg(regs, error_code, address, tsk);
 			show_signal_msg(regs, error_code, address, tsk);
@@ -993,11 +973,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
 
 
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return false;
 		return false;
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return true;
 		return true;
 	/* this checks permission keys on the VMA: */
 	/* this checks permission keys on the VMA: */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return true;
 		return true;
 	return false;
 	return false;
 }
 }
@@ -1025,7 +1005,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	int code = BUS_ADRERR;
 	int code = BUS_ADRERR;
 
 
 	/* Kernel mode? Handle exceptions or die: */
 	/* Kernel mode? Handle exceptions or die: */
-	if (!(error_code & PF_USER)) {
+	if (!(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 		return;
 	}
 	}
@@ -1053,14 +1033,14 @@ static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, u32 *pkey, unsigned int fault)
 	       unsigned long address, u32 *pkey, unsigned int fault)
 {
 {
-	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
 		no_context(regs, error_code, address, 0, 0);
 		return;
 		return;
 	}
 	}
 
 
 	if (fault & VM_FAULT_OOM) {
 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
 		/* Kernel mode? Handle exceptions or die: */
-		if (!(error_code & PF_USER)) {
+		if (!(error_code & X86_PF_USER)) {
 			no_context(regs, error_code, address,
 			no_context(regs, error_code, address,
 				   SIGSEGV, SEGV_MAPERR);
 				   SIGSEGV, SEGV_MAPERR);
 			return;
 			return;
@@ -1085,16 +1065,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
 {
-	if ((error_code & PF_WRITE) && !pte_write(*pte))
+	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
 		return 0;
 
 
-	if ((error_code & PF_INSTR) && !pte_exec(*pte))
+	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
 		return 0;
 	/*
 	/*
 	 * Note: We do not do lazy flushing on protection key
 	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set PF_PK.
+	 * changes, so no spurious fault will ever set X86_PF_PK.
 	 */
 	 */
-	if ((error_code & PF_PK))
+	if ((error_code & X86_PF_PK))
 		return 1;
 		return 1;
 
 
 	return 1;
 	return 1;
@@ -1140,8 +1120,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * change, so user accesses are not expected to cause spurious
 	 * change, so user accesses are not expected to cause spurious
 	 * faults.
 	 * faults.
 	 */
 	 */
-	if (error_code != (PF_WRITE | PF_PROT)
-	    && error_code != (PF_INSTR | PF_PROT))
+	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+	    error_code != (X86_PF_INSTR | X86_PF_PROT))
 		return 0;
 		return 0;
 
 
 	pgd = init_mm.pgd + pgd_index(address);
 	pgd = init_mm.pgd + pgd_index(address);
@@ -1201,19 +1181,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	 * always an unconditional error and can never result in
 	 * always an unconditional error and can never result in
 	 * a follow-up action to resolve the fault, like a COW.
 	 * a follow-up action to resolve the fault, like a COW.
 	 */
 	 */
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return 1;
 		return 1;
 
 
 	/*
 	/*
 	 * Make sure to check the VMA so that we do not perform
 	 * Make sure to check the VMA so that we do not perform
-	 * faults just to hit a PF_PK as soon as we fill in a
+	 * faults just to hit a X86_PF_PK as soon as we fill in a
 	 * page.
 	 * page.
 	 */
 	 */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return 1;
 		return 1;
 
 
-	if (error_code & PF_WRITE) {
+	if (error_code & X86_PF_WRITE) {
 		/* write, present and write, not present: */
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			return 1;
 			return 1;
@@ -1221,7 +1201,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	}
 	}
 
 
 	/* read, present: */
 	/* read, present: */
-	if (unlikely(error_code & PF_PROT))
+	if (unlikely(error_code & X86_PF_PROT))
 		return 1;
 		return 1;
 
 
 	/* read, not present: */
 	/* read, not present: */
@@ -1244,7 +1224,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 	if (!static_cpu_has(X86_FEATURE_SMAP))
 	if (!static_cpu_has(X86_FEATURE_SMAP))
 		return false;
 		return false;
 
 
-	if (error_code & PF_USER)
+	if (error_code & X86_PF_USER)
 		return false;
 		return false;
 
 
 	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
 	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1297,7 +1277,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * protection error (error_code & 9) == 0.
 	 * protection error (error_code & 9) == 0.
 	 */
 	 */
 	if (unlikely(fault_in_kernel_space(address))) {
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 			if (vmalloc_fault(address) >= 0)
 				return;
 				return;
 
 
@@ -1325,7 +1305,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (unlikely(kprobes_fault(regs)))
 	if (unlikely(kprobes_fault(regs)))
 		return;
 		return;
 
 
-	if (unlikely(error_code & PF_RSVD))
+	if (unlikely(error_code & X86_PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 		pgtable_bad(regs, error_code, address);
 
 
 	if (unlikely(smap_violation(error_code, regs))) {
 	if (unlikely(smap_violation(error_code, regs))) {
@@ -1351,7 +1331,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	 */
 	if (user_mode(regs)) {
 	if (user_mode(regs)) {
 		local_irq_enable();
 		local_irq_enable();
-		error_code |= PF_USER;
+		error_code |= X86_PF_USER;
 		flags |= FAULT_FLAG_USER;
 		flags |= FAULT_FLAG_USER;
 	} else {
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1360,9 +1340,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 
-	if (error_code & PF_WRITE)
+	if (error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 		flags |= FAULT_FLAG_INSTRUCTION;
 
 
 	/*
 	/*
@@ -1382,7 +1362,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * space check, thus avoiding the deadlock:
 	 * space check, thus avoiding the deadlock:
 	 */
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if ((error_code & PF_USER) == 0 &&
+		if (!(error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
 		    !search_exception_tables(regs->ip)) {
 			bad_area_nosemaphore(regs, error_code, address, NULL);
 			bad_area_nosemaphore(regs, error_code, address, NULL);
 			return;
 			return;
@@ -1409,7 +1389,7 @@ retry:
 		bad_area(regs, error_code, address);
 		bad_area(regs, error_code, address);
 		return;
 		return;
 	}
 	}
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
 		 * The large cushion allows instructions like enter

+ 5 - 2
arch/x86/um/ldt.c

@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
+#include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/unistd.h>
 #include <os.h>
 #include <os.h>
@@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm)
 	mm->arch.ldt.entry_count = 0;
 	mm->arch.ldt.entry_count = 0;
 }
 }
 
 
-int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+		unsigned long , bytecount)
 {
 {
-	return do_modify_ldt_skas(func, ptr, bytecount);
+	/* See non-um modify_ldt() for why we do this cast */
+	return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
 }
 }

+ 4 - 5
arch/x86/xen/enlighten_pv.c

@@ -601,7 +601,7 @@ static struct trap_array_entry trap_array[] = {
 #ifdef CONFIG_X86_MCE
 #ifdef CONFIG_X86_MCE
 	{ machine_check,               xen_machine_check,               true },
 	{ machine_check,               xen_machine_check,               true },
 #endif
 #endif
-	{ nmi,                         xen_nmi,                         true },
+	{ nmi,                         xen_xennmi,                      true },
 	{ overflow,                    xen_overflow,                    false },
 	{ overflow,                    xen_overflow,                    false },
 #ifdef CONFIG_IA32_EMULATION
 #ifdef CONFIG_IA32_EMULATION
 	{ entry_INT80_compat,          xen_entry_INT80_compat,          false },
 	{ entry_INT80_compat,          xen_entry_INT80_compat,          false },
@@ -811,15 +811,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 	}
 	}
 }
 }
 
 
-static void xen_load_sp0(struct tss_struct *tss,
-			 struct thread_struct *thread)
+static void xen_load_sp0(unsigned long sp0)
 {
 {
 	struct multicall_space mcs;
 	struct multicall_space mcs;
 
 
 	mcs = xen_mc_entry(0);
 	mcs = xen_mc_entry(0);
-	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
+	MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
-	tss->x86_tss.sp0 = thread->sp0;
+	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
 }
 }
 
 
 void xen_set_iopl_mask(unsigned mask)
 void xen_set_iopl_mask(unsigned mask)

+ 14 - 3
arch/x86/xen/smp_pv.c

@@ -14,6 +14,7 @@
  * single-threaded.
  * single-threaded.
  */
  */
 #include <linux/sched.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/err.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
 #include <linux/smp.h>
@@ -294,12 +295,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 #endif
 #endif
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
 
+	/*
+	 * Bring up the CPU in cpu_bringup_and_idle() with the stack
+	 * pointing just below where pt_regs would be if it were a normal
+	 * kernel entry.
+	 */
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
 	ctxt->user_regs.ss = __KERNEL_DS;
 	ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.cs = __KERNEL_CS;
+	ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);
 
 
 	xen_copy_trap_info(ctxt->trap_ctxt);
 	xen_copy_trap_info(ctxt->trap_ctxt);
 
 
@@ -314,8 +322,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 
 
+	/*
+	 * Set SS:SP that Xen will use when entering guest kernel mode
+	 * from guest user mode.  Subsequent calls to load_sp0() can
+	 * change this value.
+	 */
 	ctxt->kernel_ss = __KERNEL_DS;
 	ctxt->kernel_ss = __KERNEL_DS;
-	ctxt->kernel_sp = idle->thread.sp0;
+	ctxt->kernel_sp = task_top_of_stack(idle);
 
 
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 	ctxt->event_callback_cs     = __KERNEL_CS;
 	ctxt->event_callback_cs     = __KERNEL_CS;
@@ -327,10 +340,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 		(unsigned long)xen_hypervisor_callback;
 		(unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip =
 	ctxt->failsafe_callback_eip =
 		(unsigned long)xen_failsafe_callback;
 		(unsigned long)xen_failsafe_callback;
-	ctxt->user_regs.cs = __KERNEL_CS;
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
 
 
-	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
 		BUG();
 		BUG();
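The bring-up context above now derives both the initial user_regs.esp and kernel_sp from the idle task's stack instead of from thread.sp0. A hedged sketch of the relationship these helpers encode (the real macros live in <asm/processor.h> and also account for TOP_OF_KERNEL_STACK_PADDING, which is omitted here):

/* Sketch only: the top of the task's kernel stack ... */
#define task_top_of_stack(task) \
	((unsigned long)task_stack_page(task) + THREAD_SIZE)

/* ... and the pt_regs frame that sits just below it. */
#define task_pt_regs(task) \
	((struct pt_regs *)task_top_of_stack(task) - 1)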

+ 1 - 1
arch/x86/xen/xen-asm_64.S

@@ -30,7 +30,7 @@ xen_pv_trap debug
 xen_pv_trap xendebug
 xen_pv_trap xendebug
 xen_pv_trap int3
 xen_pv_trap int3
 xen_pv_trap xenint3
 xen_pv_trap xenint3
-xen_pv_trap nmi
+xen_pv_trap xennmi
 xen_pv_trap overflow
 xen_pv_trap overflow
 xen_pv_trap bounds
 xen_pv_trap bounds
 xen_pv_trap invalid_op
 xen_pv_trap invalid_op

+ 8 - 3
arch/x86/xen/xen-head.S

@@ -10,6 +10,7 @@
 #include <asm/boot.h>
 #include <asm/boot.h>
 #include <asm/asm.h>
 #include <asm/asm.h>
 #include <asm/page_types.h>
 #include <asm/page_types.h>
+#include <asm/unwind_hints.h>
 
 
 #include <xen/interface/elfnote.h>
 #include <xen/interface/elfnote.h>
 #include <xen/interface/features.h>
 #include <xen/interface/features.h>
@@ -20,6 +21,7 @@
 #ifdef CONFIG_XEN_PV
 #ifdef CONFIG_XEN_PV
 	__INIT
 	__INIT
 ENTRY(startup_xen)
 ENTRY(startup_xen)
+	UNWIND_HINT_EMPTY
 	cld
 	cld
 
 
 	/* Clear .bss */
 	/* Clear .bss */
@@ -34,21 +36,24 @@ ENTRY(startup_xen)
 	mov $init_thread_union+THREAD_SIZE, %_ASM_SP
 	mov $init_thread_union+THREAD_SIZE, %_ASM_SP
 
 
 	jmp xen_start_kernel
 	jmp xen_start_kernel
-
+END(startup_xen)
 	__FINIT
 	__FINIT
 #endif
 #endif
 
 
 .pushsection .text
 .pushsection .text
 	.balign PAGE_SIZE
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE
+	.rept (PAGE_SIZE / 32)
+		UNWIND_HINT_EMPTY
+		.skip 32
+	.endr
 
 
 #define HYPERCALL(n) \
 #define HYPERCALL(n) \
 	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
 	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
 	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
 	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
 #include <asm/xen-hypercalls.h>
 #include <asm/xen-hypercalls.h>
 #undef HYPERCALL
 #undef HYPERCALL
-
+END(hypercall_page)
 .popsection
 .popsection
 
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
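The .rept loop above carves hypercall_page into PAGE_SIZE/32 = 128 stubs and tags each 32-byte slot with its own UNWIND_HINT_EMPTY, matching the HYPERCALL() aliases that place xen_hypercall_##n at hypercall_page + __HYPERVISOR_##n * 32. A userspace sketch of the slot arithmetic, using a made-up load address and hypercall number:

#include <stdio.h>

#define PAGE_SIZE	4096
#define STUB_SIZE	32

int main(void)
{
	unsigned long hypercall_page = 0x1000;	/* pretend load address */
	int nr = 3;				/* hypothetical hypercall number */

	printf("slots per page: %d\n", PAGE_SIZE / STUB_SIZE);		/* 128 */
	printf("stub %d lives at %#lx\n", nr, hypercall_page + nr * STUB_SIZE);
	return 0;
}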

+ 1 - 1
include/asm-generic/vmlinux.lds.h

@@ -687,7 +687,7 @@
 #define BUG_TABLE
 #define BUG_TABLE
 #endif
 #endif
 
 
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 #define ORC_UNWIND_TABLE						\
 #define ORC_UNWIND_TABLE						\
 	. = ALIGN(4);							\
 	. = ALIGN(4);							\
 	.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) {	\
 	.orc_unwind_ip : AT(ADDR(.orc_unwind_ip) - LOAD_OFFSET) {	\

+ 26 - 0
include/linux/bitops.h

@@ -228,6 +228,32 @@ static inline unsigned long __ffs64(u64 word)
 	return __ffs((unsigned long)word);
 	return __ffs((unsigned long)word);
 }
 }
 
 
+/*
+ * clear_bit32 - Clear a bit in memory for u32 array
+ * @nr: Bit to clear
+ * @addr: u32 * address of bitmap
+ *
+ * Same as clear_bit, but avoids needing casts for u32 arrays.
+ */
+
+static __always_inline void clear_bit32(long nr, volatile u32 *addr)
+{
+	clear_bit(nr, (volatile unsigned long *)addr);
+}
+
+/*
+ * set_bit32 - Set a bit in memory for u32 array
+ * @nr: Bit to set
+ * @addr: u32 * address of bitmap
+ *
+ * Same as set_bit, but avoids needing casts for u32 arrays.
+ */
+
+static __always_inline void set_bit32(long nr, volatile u32 *addr)
+{
+	set_bit(nr, (volatile unsigned long *)addr);
+}
+
 #ifdef __KERNEL__
 #ifdef __KERNEL__
 
 
 #ifndef set_mask_bits
 #ifndef set_mask_bits
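A hedged usage sketch of the new helpers (the bitmap name below is illustrative, not taken from this diff): callers that keep bitmaps as u32 words can set and clear bits without casting to unsigned long * at every site:

static void example_u32_bitmap(void)
{
	u32 bitmap[4] = { 0 };		/* 128-bit map stored as u32 words */

	set_bit32(37, bitmap);		/* was: set_bit(37, (volatile unsigned long *)bitmap) */
	clear_bit32(37, bitmap);
}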

+ 1 - 1
lib/Kconfig.debug

@@ -376,7 +376,7 @@ config STACK_VALIDATION
 	  that runtime stack traces are more reliable.
 	  that runtime stack traces are more reliable.
 
 
 	  This is also a prerequisite for generation of ORC unwind data, which
 	  This is also a prerequisite for generation of ORC unwind data, which
-	  is needed for CONFIG_ORC_UNWINDER.
+	  is needed for CONFIG_UNWINDER_ORC.
 
 
 	  For more information, see
 	  For more information, see
 	  tools/objtool/Documentation/stack-validation.txt.
 	  tools/objtool/Documentation/stack-validation.txt.

+ 1 - 1
scripts/Makefile.build

@@ -259,7 +259,7 @@ ifneq ($(SKIP_STACK_VALIDATION),1)
 
 
 __objtool_obj := $(objtree)/tools/objtool/objtool
 __objtool_obj := $(objtree)/tools/objtool/objtool
 
 
-objtool_args = $(if $(CONFIG_ORC_UNWINDER),orc generate,check)
+objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check)
 
 
 ifndef CONFIG_FRAME_POINTER
 ifndef CONFIG_FRAME_POINTER
 objtool_args += --no-fp
 objtool_args += --no-fp

+ 5 - 2
tools/objtool/check.c

@@ -1757,11 +1757,14 @@ static int validate_branch(struct objtool_file *file, struct instruction *first,
 		if (insn->dead_end)
 		if (insn->dead_end)
 			return 0;
 			return 0;
 
 
-		insn = next_insn;
-		if (!insn) {
+		if (!next_insn) {
+			if (state.cfa.base == CFI_UNDEFINED)
+				return 0;
 			WARN("%s: unexpected end of section", sec->name);
 			WARN("%s: unexpected end of section", sec->name);
 			return 1;
 			return 1;
 		}
 		}
+
+		insn = next_insn;
 	}
 	}
 
 
 	return 0;
 	return 0;

+ 2 - 4
tools/objtool/objtool.c

@@ -70,7 +70,7 @@ static void cmd_usage(void)
 
 
 	printf("\n");
 	printf("\n");
 
 
-	exit(1);
+	exit(129);
 }
 }
 
 
 static void handle_options(int *argc, const char ***argv)
 static void handle_options(int *argc, const char ***argv)
@@ -86,9 +86,7 @@ static void handle_options(int *argc, const char ***argv)
 			break;
 			break;
 		} else {
 		} else {
 			fprintf(stderr, "Unknown option: %s\n", cmd);
 			fprintf(stderr, "Unknown option: %s\n", cmd);
-			fprintf(stderr, "\n Usage: %s\n",
-				objtool_usage_string);
-			exit(1);
+			cmd_usage();
 		}
 		}
 
 
 		(*argv)++;
 		(*argv)++;