
Merge branch 'x86/pti' into x86/mm, to pick up dependencies

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar
commit 3c76db70eb
69 changed files with 910 additions and 471 deletions
  1. Makefile (+5 -0)
  2. arch/x86/Kconfig (+2 -10)
  3. arch/x86/Makefile (+3 -4)
  4. arch/x86/entry/calling.h (+19 -15)
  5. arch/x86/entry/entry_32.S (+1 -2)
  6. arch/x86/entry/entry_64.S (+92 -61)
  7. arch/x86/entry/entry_64_compat.S (+41 -42)
  8. arch/x86/entry/syscalls/syscall_32.tbl (+19 -19)
  9. arch/x86/entry/vsyscall/vsyscall_64.c (+3 -13)
  10. arch/x86/ia32/sys_ia32.c (+44 -30)
  11. arch/x86/include/asm/apm.h (+6 -0)
  12. arch/x86/include/asm/asm-prototypes.h (+0 -3)
  13. arch/x86/include/asm/cpufeatures.h (+3 -0)
  14. arch/x86/include/asm/efi.h (+15 -2)
  15. arch/x86/include/asm/microcode.h (+7 -2)
  16. arch/x86/include/asm/mmu_context.h (+1 -0)
  17. arch/x86/include/asm/nospec-branch.h (+118 -20)
  18. arch/x86/include/asm/paravirt.h (+13 -4)
  19. arch/x86/include/asm/paravirt_types.h (+4 -1)
  20. arch/x86/include/asm/pgtable.h (+4 -4)
  21. arch/x86/include/asm/pgtable_32.h (+1 -0)
  22. arch/x86/include/asm/pgtable_64.h (+1 -0)
  23. arch/x86/include/asm/pgtable_types.h (+10 -2)
  24. arch/x86/include/asm/processor.h (+1 -0)
  25. arch/x86/include/asm/refcount.h (+2 -2)
  26. arch/x86/include/asm/rmwcc.h (+8 -8)
  27. arch/x86/include/asm/sections.h (+1 -0)
  28. arch/x86/include/asm/sys_ia32.h (+30 -18)
  29. arch/x86/kernel/apic/io_apic.c (+1 -1)
  30. arch/x86/kernel/cpu/bugs.c (+11 -1)
  31. arch/x86/kernel/cpu/common.c (+30 -0)
  32. arch/x86/kernel/cpu/intel.c (+7 -0)
  33. arch/x86/kernel/cpu/microcode/amd.c (+5 -5)
  34. arch/x86/kernel/cpu/microcode/core.c (+123 -38)
  35. arch/x86/kernel/cpu/microcode/intel.c (+45 -13)
  36. arch/x86/kernel/head_64.S (+2 -0)
  37. arch/x86/kernel/ioport.c (+1 -1)
  38. arch/x86/kernel/kprobes/core.c (+9 -1)
  39. arch/x86/kernel/setup.c (+5 -12)
  40. arch/x86/kernel/setup_percpu.c (+4 -13)
  41. arch/x86/kernel/unwind_orc.c (+1 -2)
  42. arch/x86/kernel/vmlinux.lds.S (+2 -0)
  43. arch/x86/kvm/svm.c (+5 -4)
  44. arch/x86/kvm/vmx.c (+5 -4)
  45. arch/x86/lib/Makefile (+0 -1)
  46. arch/x86/lib/retpoline.S (+0 -56)
  47. arch/x86/mm/cpu_entry_area.c (+6 -0)
  48. arch/x86/mm/fault.c (+0 -4)
  49. arch/x86/mm/init_32.c (+15 -0)
  50. arch/x86/mm/mem_encrypt_boot.S (+2 -0)
  51. arch/x86/mm/pti.c (+1 -1)
  52. arch/x86/realmode/rm/trampoline_64.S (+1 -1)
  53. arch/x86/xen/suspend.c (+16 -0)
  54. include/linux/compiler-clang.h (+5 -0)
  55. include/linux/compiler-gcc.h (+4 -0)
  56. include/linux/init.h (+4 -4)
  57. include/linux/jump_label.h (+3 -0)
  58. include/linux/kernel.h (+1 -0)
  59. include/linux/nospec.h (+3 -23)
  60. init/main.c (+2 -0)
  61. kernel/extable.c (+1 -1)
  62. kernel/jump_label.c (+22 -5)
  63. scripts/Makefile.build (+8 -0)
  64. tools/objtool/builtin-check.c (+4 -2)
  65. tools/objtool/builtin-orc.c (+1 -5)
  66. tools/objtool/builtin.h (+5 -0)
  67. tools/objtool/check.c (+88 -5)
  68. tools/objtool/check.h (+2 -1)
  69. tools/testing/selftests/x86/test_vsyscall.c (+6 -5)

+ 5 - 0
Makefile

@@ -489,6 +489,11 @@ KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
 KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
 endif
 
+RETPOLINE_CFLAGS_GCC := -mindirect-branch=thunk-extern -mindirect-branch-register
+RETPOLINE_CFLAGS_CLANG := -mretpoline-external-thunk
+RETPOLINE_CFLAGS := $(call cc-option,$(RETPOLINE_CFLAGS_GCC),$(call cc-option,$(RETPOLINE_CFLAGS_CLANG)))
+export RETPOLINE_CFLAGS
+
 ifeq ($(config-targets),1)
 # ===========================================================================
 # *config targets only - make sure prerequisites are updated, and descend

+ 2 - 10
arch/x86/Kconfig

@@ -430,6 +430,7 @@ config GOLDFISH
 config RETPOLINE
 	bool "Avoid speculative indirect branches in kernel"
 	bool "Avoid speculative indirect branches in kernel"
 	default y
+	select STACK_VALIDATION if HAVE_STACK_VALIDATION
 	help
 	  Compile kernel with the retpoline compiler options to guard against
 	  kernel-to-user data leaks by avoiding speculative indirect
@@ -2315,7 +2316,7 @@ choice
 	  it can be used to assist security vulnerability exploitation.
 
 	  This setting can be changed at boot time via the kernel command
-	  line parameter vsyscall=[native|emulate|none].
+	  line parameter vsyscall=[emulate|none].
 
 	  On a system with recent enough glibc (2.14 or newer) and no
 	  static binaries, you can say None without a performance penalty
@@ -2323,15 +2324,6 @@ choice
 
 	  If unsure, select "Emulate".
 
-	config LEGACY_VSYSCALL_NATIVE
-		bool "Native"
-		help
-		  Actual executable code is located in the fixed vsyscall
-		  address mapping, implementing time() efficiently. Since
-		  this makes the mapping executable, it can be used during
-		  security vulnerability exploitation (traditionally as
-		  ROP gadgets). This configuration is not recommended.
-
 	config LEGACY_VSYSCALL_EMULATE
 		bool "Emulate"
 		bool "Emulate"
 		help

+ 3 - 4
arch/x86/Makefile

@@ -232,10 +232,9 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
 # Avoid indirect branches in kernel to deal with Spectre
 ifdef CONFIG_RETPOLINE
-    RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
-    ifneq ($(RETPOLINE_CFLAGS),)
-        KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
-    endif
+ifneq ($(RETPOLINE_CFLAGS),)
+  KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
+endif
 endif
 
 archscripts: scripts_basic

+ 19 - 15
arch/x86/entry/calling.h

@@ -97,7 +97,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 #define SIZEOF_PTREGS	21*8
 
-.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax
+.macro PUSH_AND_CLEAR_REGS rdx=%rdx rax=%rax save_ret=0
 	/*
 	 * Push registers and sanitize registers of values that a
 	 * speculation attack might otherwise want to exploit. The
@@ -105,32 +105,41 @@ For 32-bit we have the following conventions - kernel is built with
 	 * could be put to use in a speculative execution gadget.
 	 * Interleave XOR with PUSH for better uop scheduling:
 	 */
+	.if \save_ret
+	pushq	%rsi		/* pt_regs->si */
+	movq	8(%rsp), %rsi	/* temporarily store the return address in %rsi */
+	movq	%rdi, 8(%rsp)	/* pt_regs->di (overwriting original return address) */
+	.else
 	pushq   %rdi		/* pt_regs->di */
 	pushq   %rsi		/* pt_regs->si */
+	.endif
 	pushq	\rdx		/* pt_regs->dx */
 	pushq   %rcx		/* pt_regs->cx */
 	pushq   \rax		/* pt_regs->ax */
 	pushq   %r8		/* pt_regs->r8 */
-	xorq    %r8, %r8	/* nospec   r8 */
+	xorl	%r8d, %r8d	/* nospec   r8 */
 	pushq   %r9		/* pt_regs->r9 */
-	xorq    %r9, %r9	/* nospec   r9 */
+	xorl	%r9d, %r9d	/* nospec   r9 */
 	pushq   %r10		/* pt_regs->r10 */
-	xorq    %r10, %r10	/* nospec   r10 */
+	xorl	%r10d, %r10d	/* nospec   r10 */
 	pushq   %r11		/* pt_regs->r11 */
-	xorq    %r11, %r11	/* nospec   r11*/
+	xorl	%r11d, %r11d	/* nospec   r11*/
 	pushq	%rbx		/* pt_regs->rbx */
 	xorl    %ebx, %ebx	/* nospec   rbx*/
 	pushq	%rbp		/* pt_regs->rbp */
 	xorl    %ebp, %ebp	/* nospec   rbp*/
 	pushq	%r12		/* pt_regs->r12 */
-	xorq    %r12, %r12	/* nospec   r12*/
+	xorl	%r12d, %r12d	/* nospec   r12*/
 	pushq	%r13		/* pt_regs->r13 */
-	xorq    %r13, %r13	/* nospec   r13*/
+	xorl	%r13d, %r13d	/* nospec   r13*/
 	pushq	%r14		/* pt_regs->r14 */
-	xorq    %r14, %r14	/* nospec   r14*/
+	xorl	%r14d, %r14d	/* nospec   r14*/
 	pushq	%r15		/* pt_regs->r15 */
-	xorq    %r15, %r15	/* nospec   r15*/
+	xorl	%r15d, %r15d	/* nospec   r15*/
 	UNWIND_HINT_REGS
+	.if \save_ret
+	pushq	%rsi		/* return address on top of stack */
+	.endif
 .endm
 
 .macro POP_REGS pop_rdi=1 skip_r11rcx=0
@@ -172,12 +181,7 @@ For 32-bit we have the following conventions - kernel is built with
  */
 .macro ENCODE_FRAME_POINTER ptregs_offset=0
 #ifdef CONFIG_FRAME_POINTER
-	.if \ptregs_offset
-		leaq \ptregs_offset(%rsp), %rbp
-	.else
-		mov %rsp, %rbp
-	.endif
-	orq	$0x1, %rbp
+	leaq 1+\ptregs_offset(%rsp), %rbp
 #endif
 .endm
 

+ 1 - 2
arch/x86/entry/entry_32.S

@@ -252,8 +252,7 @@ ENTRY(__switch_to_asm)
 	 * exist, overwrite the RSB with entries which capture
 	 * speculative execution to prevent attack.
 	 */
-	/* Clobbers %ebx */
-	FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+	FILL_RETURN_BUFFER %ebx, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
 #endif
 
 	/* restore callee-saved registers */

+ 92 - 61
arch/x86/entry/entry_64.S

@@ -369,8 +369,7 @@ ENTRY(__switch_to_asm)
 	 * exist, overwrite the RSB with entries which capture
 	 * speculative execution to prevent attack.
 	 */
-	/* Clobbers %rbx */
-	FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
+	FILL_RETURN_BUFFER %r12, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW
 #endif
 
 	/* restore callee-saved registers */
@@ -454,9 +453,19 @@ END(irq_entries_start)
  *
  * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
  */
-.macro ENTER_IRQ_STACK regs=1 old_rsp
+.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
 	DEBUG_ENTRY_ASSERT_IRQS_OFF
+
+	.if \save_ret
+	/*
+	 * If save_ret is set, the original stack contains one additional
+	 * entry -- the return address. Therefore, move the address one
+	 * entry below %rsp to \old_rsp.
+	 */
+	leaq	8(%rsp), \old_rsp
+	.else
 	movq	%rsp, \old_rsp
+	.endif
 
 	.if \regs
 	UNWIND_HINT_REGS base=\old_rsp
@@ -502,6 +511,15 @@ END(irq_entries_start)
 	.if \regs
 	UNWIND_HINT_REGS indirect=1
 	.endif
+
+	.if \save_ret
+	/*
+	 * Push the return address to the stack. This return address can
+	 * be found at the "real" original RSP, which was offset by 8 at
+	 * the beginning of this macro.
+	 */
+	pushq	-8(\old_rsp)
+	.endif
 .endm
 
 /*
@@ -525,27 +543,65 @@ END(irq_entries_start)
 .endm
 
 /*
- * Interrupt entry/exit.
- *
- * Interrupt entry points save only callee clobbered registers in fast path.
+ * Interrupt entry helper function.
  *
- * Entry runs with interrupts off.
+ * Entry runs with interrupts off. Stack layout at entry:
+ * +----------------------------------------------------+
+ * | regs->ss						|
+ * | regs->rsp						|
+ * | regs->eflags					|
+ * | regs->cs						|
+ * | regs->ip						|
+ * +----------------------------------------------------+
+ * | regs->orig_ax = ~(interrupt number)		|
+ * +----------------------------------------------------+
+ * | return address					|
+ * +----------------------------------------------------+
  */
-
-/* 0(%rsp): ~(interrupt number) */
-	.macro interrupt func
+ENTRY(interrupt_entry)
+	UNWIND_HINT_FUNC
+	ASM_CLAC
 	cld
 
-	testb	$3, CS-ORIG_RAX(%rsp)
+	testb	$3, CS-ORIG_RAX+8(%rsp)
 	jz	1f
 	SWAPGS
-	call	switch_to_thread_stack
+
+	/*
+	 * Switch to the thread stack. The IRET frame and orig_ax are
+	 * on the stack, as well as the return address. RDI..R12 are
+	 * not (yet) on the stack and space has not (yet) been
+	 * allocated for them.
+	 */
+	pushq	%rdi
+
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+
+	 /*
+	  * We have RDI, return address, and orig_ax on the stack on
+	  * top of the IRET frame. That means offset=24
+	  */
+	UNWIND_HINT_IRET_REGS base=%rdi offset=24
+
+	pushq	7*8(%rdi)		/* regs->ss */
+	pushq	6*8(%rdi)		/* regs->rsp */
+	pushq	5*8(%rdi)		/* regs->eflags */
+	pushq	4*8(%rdi)		/* regs->cs */
+	pushq	3*8(%rdi)		/* regs->ip */
+	pushq	2*8(%rdi)		/* regs->orig_ax */
+	pushq	8(%rdi)			/* return address */
+	UNWIND_HINT_FUNC
+
+	movq	(%rdi), %rdi
 1:
 
-	PUSH_AND_CLEAR_REGS
-	ENCODE_FRAME_POINTER
+	PUSH_AND_CLEAR_REGS save_ret=1
+	ENCODE_FRAME_POINTER 8
 
-	testb	$3, CS(%rsp)
+	testb	$3, CS+8(%rsp)
 	jz	1f
 
 	/*
@@ -553,7 +609,7 @@ END(irq_entries_start)
 	 *
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
-	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
+	 * (which can take locks).  Since TRACE_IRQS_OFF is idempotent,
 	 * the simplest way to handle it is to just call it twice if
 	 * we enter from user mode.  There's no reason to optimize this since
 	 * TRACE_IRQS_OFF is a no-op if lockdep is off.
@@ -563,12 +619,15 @@ END(irq_entries_start)
 	CALL_enter_from_user_mode
 
 1:
-	ENTER_IRQ_STACK old_rsp=%rdi
+	ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
-	call	\func	/* rdi points to pt_regs */
-	.endm
+	ret
+END(interrupt_entry)
+
+
+/* Interrupt entry/exit. */
 
 	/*
 	 * The interrupt stubs push (~vector+0x80) onto the stack and
@@ -576,9 +635,10 @@ END(irq_entries_start)
 	 */
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 common_interrupt:
-	ASM_CLAC
 	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
-	interrupt do_IRQ
+	call	interrupt_entry
+	UNWIND_HINT_REGS indirect=1
+	call	do_IRQ	/* rdi points to pt_regs */
 	/* 0(%rsp): old RSP */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_ANY)
@@ -771,10 +831,11 @@ END(common_interrupt)
 .macro apicinterrupt3 num sym do_sym
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS
-	ASM_CLAC
 	pushq	$~(\num)
 .Lcommon_\sym:
-	interrupt \do_sym
+	call	interrupt_entry
+	UNWIND_HINT_REGS indirect=1
+	call	\do_sym	/* rdi points to pt_regs */
 	jmp	ret_from_intr
 END(\sym)
 .endm
@@ -837,34 +898,6 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
 
-/*
- * Switch to the thread stack.  This is called with the IRET frame and
- * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
- * space has not been allocated for them.)
- */
-ENTRY(switch_to_thread_stack)
-	UNWIND_HINT_FUNC
-
-	pushq	%rdi
-	/* Need to switch before accessing the thread stack. */
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
-	movq	%rsp, %rdi
-	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
-
-	pushq	7*8(%rdi)		/* regs->ss */
-	pushq	6*8(%rdi)		/* regs->rsp */
-	pushq	5*8(%rdi)		/* regs->eflags */
-	pushq	4*8(%rdi)		/* regs->cs */
-	pushq	3*8(%rdi)		/* regs->ip */
-	pushq	2*8(%rdi)		/* regs->orig_ax */
-	pushq	8(%rdi)			/* return address */
-	UNWIND_HINT_FUNC
-
-	movq	(%rdi), %rdi
-	ret
-END(switch_to_thread_stack)
-
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@@ -880,12 +913,8 @@ ENTRY(\sym)
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	/* Save all registers in pt_regs */
-	PUSH_AND_CLEAR_REGS
-	ENCODE_FRAME_POINTER
-
 	.if \paranoid < 2
-	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
+	testb	$3, CS-ORIG_RAX(%rsp)		/* If coming from userspace, switch stacks */
 	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
 
@@ -1135,13 +1164,15 @@ idtentry machine_check		do_mce			has_error_code=0	paranoid=1
 #endif
 
 /*
- * Switch gs if needed.
+ * Save all registers in pt_regs, and switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
 	UNWIND_HINT_FUNC
 	cld
+	PUSH_AND_CLEAR_REGS save_ret=1
+	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
 	rdmsr
@@ -1186,12 +1217,14 @@ ENTRY(paranoid_exit)
 END(paranoid_exit)
 
 /*
- * Switch gs if needed.
+ * Save all registers in pt_regs, and switch GS if needed.
  * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
-	UNWIND_HINT_REGS offset=8
+	UNWIND_HINT_FUNC
 	cld
+	PUSH_AND_CLEAR_REGS save_ret=1
+	ENCODE_FRAME_POINTER 8
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
 
@@ -1582,8 +1615,6 @@ end_repeat_nmi:
 	 * frame to point back to repeat_nmi.
 	 */
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
-	PUSH_AND_CLEAR_REGS
-	ENCODE_FRAME_POINTER
 
 	/*
 	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit

+ 41 - 42
arch/x86/entry/entry_64_compat.S

@@ -85,25 +85,25 @@ ENTRY(entry_SYSENTER_compat)
 	pushq	%rcx			/* pt_regs->cx */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   $0			/* pt_regs->r8  = 0 */
-	xorq	%r8, %r8		/* nospec   r8 */
+	xorl	%r8d, %r8d		/* nospec   r8 */
 	pushq   $0			/* pt_regs->r9  = 0 */
-	xorq	%r9, %r9		/* nospec   r9 */
+	xorl	%r9d, %r9d		/* nospec   r9 */
 	pushq   $0			/* pt_regs->r10 = 0 */
-	xorq	%r10, %r10		/* nospec   r10 */
+	xorl	%r10d, %r10d		/* nospec   r10 */
 	pushq   $0			/* pt_regs->r11 = 0 */
-	xorq	%r11, %r11		/* nospec   r11 */
+	xorl	%r11d, %r11d		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
 	xorl	%ebx, %ebx		/* nospec   rbx */
 	pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
 	xorl	%ebp, %ebp		/* nospec   rbp */
 	pushq   $0			/* pt_regs->r12 = 0 */
-	xorq	%r12, %r12		/* nospec   r12 */
+	xorl	%r12d, %r12d		/* nospec   r12 */
 	pushq   $0			/* pt_regs->r13 = 0 */
-	xorq	%r13, %r13		/* nospec   r13 */
+	xorl	%r13d, %r13d		/* nospec   r13 */
 	pushq   $0			/* pt_regs->r14 = 0 */
-	xorq	%r14, %r14		/* nospec   r14 */
+	xorl	%r14d, %r14d		/* nospec   r14 */
 	pushq   $0			/* pt_regs->r15 = 0 */
-	xorq	%r15, %r15		/* nospec   r15 */
+	xorl	%r15d, %r15d		/* nospec   r15 */
 	cld
 
 	/*
@@ -224,25 +224,25 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq	%rbp			/* pt_regs->cx (stashed in bp) */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   $0			/* pt_regs->r8  = 0 */
-	xorq	%r8, %r8		/* nospec   r8 */
+	xorl	%r8d, %r8d		/* nospec   r8 */
 	pushq   $0			/* pt_regs->r9  = 0 */
-	xorq	%r9, %r9		/* nospec   r9 */
+	xorl	%r9d, %r9d		/* nospec   r9 */
 	pushq   $0			/* pt_regs->r10 = 0 */
-	xorq	%r10, %r10		/* nospec   r10 */
+	xorl	%r10d, %r10d		/* nospec   r10 */
 	pushq   $0			/* pt_regs->r11 = 0 */
-	xorq	%r11, %r11		/* nospec   r11 */
+	xorl	%r11d, %r11d		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
 	xorl	%ebx, %ebx		/* nospec   rbx */
 	pushq   %rbp                    /* pt_regs->rbp (will be overwritten) */
 	xorl	%ebp, %ebp		/* nospec   rbp */
 	pushq   $0			/* pt_regs->r12 = 0 */
-	xorq	%r12, %r12		/* nospec   r12 */
+	xorl	%r12d, %r12d		/* nospec   r12 */
 	pushq   $0			/* pt_regs->r13 = 0 */
-	xorq	%r13, %r13		/* nospec   r13 */
+	xorl	%r13d, %r13d		/* nospec   r13 */
 	pushq   $0			/* pt_regs->r14 = 0 */
-	xorq	%r14, %r14		/* nospec   r14 */
+	xorl	%r14d, %r14d		/* nospec   r14 */
 	pushq   $0			/* pt_regs->r15 = 0 */
-	xorq	%r15, %r15		/* nospec   r15 */
+	xorl	%r15d, %r15d		/* nospec   r15 */
 
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
@@ -298,9 +298,9 @@ sysret32_from_system_call:
 	 */
 	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
 
-	xorq	%r8, %r8
-	xorq	%r9, %r9
-	xorq	%r10, %r10
+	xorl	%r8d, %r8d
+	xorl	%r9d, %r9d
+	xorl	%r10d, %r10d
 	swapgs
 	sysretl
 END(entry_SYSCALL_compat)
@@ -347,36 +347,47 @@ ENTRY(entry_INT80_compat)
 	 */
 	movl	%eax, %eax
 
+	/* switch to thread stack expects orig_ax and rdi to be pushed */
 	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
 
-	/* switch to thread stack expects orig_ax to be pushed */
-	call	switch_to_thread_stack
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
-	pushq	%rdi			/* pt_regs->di */
+	pushq	6*8(%rdi)		/* regs->ss */
+	pushq	5*8(%rdi)		/* regs->rsp */
+	pushq	4*8(%rdi)		/* regs->eflags */
+	pushq	3*8(%rdi)		/* regs->cs */
+	pushq	2*8(%rdi)		/* regs->ip */
+	pushq	1*8(%rdi)		/* regs->orig_ax */
+
+	pushq	(%rdi)			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
 	pushq	%rcx			/* pt_regs->cx */
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	pushq   $0			/* pt_regs->r8  = 0 */
-	xorq	%r8, %r8		/* nospec   r8 */
+	xorl	%r8d, %r8d		/* nospec   r8 */
 	pushq   $0			/* pt_regs->r9  = 0 */
-	xorq	%r9, %r9		/* nospec   r9 */
+	xorl	%r9d, %r9d		/* nospec   r9 */
 	pushq   $0			/* pt_regs->r10 = 0 */
-	xorq	%r10, %r10		/* nospec   r10 */
+	xorl	%r10d, %r10d		/* nospec   r10 */
 	pushq   $0			/* pt_regs->r11 = 0 */
-	xorq	%r11, %r11		/* nospec   r11 */
+	xorl	%r11d, %r11d		/* nospec   r11 */
 	pushq   %rbx                    /* pt_regs->rbx */
 	xorl	%ebx, %ebx		/* nospec   rbx */
 	pushq   %rbp                    /* pt_regs->rbp */
 	xorl	%ebp, %ebp		/* nospec   rbp */
 	pushq   %r12                    /* pt_regs->r12 */
-	xorq	%r12, %r12		/* nospec   r12 */
+	xorl	%r12d, %r12d		/* nospec   r12 */
 	pushq   %r13                    /* pt_regs->r13 */
-	xorq	%r13, %r13		/* nospec   r13 */
+	xorl	%r13d, %r13d		/* nospec   r13 */
 	pushq   %r14                    /* pt_regs->r14 */
-	xorq	%r14, %r14		/* nospec   r14 */
+	xorl	%r14d, %r14d		/* nospec   r14 */
 	pushq   %r15                    /* pt_regs->r15 */
-	xorq	%r15, %r15		/* nospec   r15 */
+	xorl	%r15d, %r15d		/* nospec   r15 */
 	cld
 
 	/*
@@ -393,15 +404,3 @@ ENTRY(entry_INT80_compat)
 	TRACE_IRQS_ON
 	jmp	swapgs_restore_regs_and_return_to_usermode
 END(entry_INT80_compat)
-
-ENTRY(stub32_clone)
-	/*
-	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
-	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
-	 *
-	 * The native 64-bit kernel's sys_clone() implements the latter,
-	 * so we need to swap arguments here before calling it:
-	 */
-	xchg	%r8, %rcx
-	jmp	sys_clone
-ENDPROC(stub32_clone)

+ 19 - 19
arch/x86/entry/syscalls/syscall_32.tbl

@@ -8,12 +8,12 @@
 #
 0	i386	restart_syscall		sys_restart_syscall
 1	i386	exit			sys_exit
-2	i386	fork			sys_fork			sys_fork
+2	i386	fork			sys_fork
 3	i386	read			sys_read
 4	i386	write			sys_write
 5	i386	open			sys_open			compat_sys_open
 6	i386	close			sys_close
-7	i386	waitpid			sys_waitpid			sys32_waitpid
+7	i386	waitpid			sys_waitpid			compat_sys_x86_waitpid
 8	i386	creat			sys_creat
 9	i386	link			sys_link
 10	i386	unlink			sys_unlink
@@ -78,7 +78,7 @@
 69	i386	ssetmask		sys_ssetmask
 70	i386	setreuid		sys_setreuid16
 71	i386	setregid		sys_setregid16
-72	i386	sigsuspend		sys_sigsuspend			sys_sigsuspend
+72	i386	sigsuspend		sys_sigsuspend
 73	i386	sigpending		sys_sigpending			compat_sys_sigpending
 74	i386	sethostname		sys_sethostname
 75	i386	setrlimit		sys_setrlimit			compat_sys_setrlimit
@@ -96,7 +96,7 @@
 87	i386	swapon			sys_swapon
 88	i386	reboot			sys_reboot
 89	i386	readdir			sys_old_readdir			compat_sys_old_readdir
-90	i386	mmap			sys_old_mmap			sys32_mmap
+90	i386	mmap			sys_old_mmap			compat_sys_x86_mmap
 91	i386	munmap			sys_munmap
 92	i386	truncate		sys_truncate			compat_sys_truncate
 93	i386	ftruncate		sys_ftruncate			compat_sys_ftruncate
@@ -126,7 +126,7 @@
 117	i386	ipc			sys_ipc				compat_sys_ipc
 118	i386	fsync			sys_fsync
 119	i386	sigreturn		sys_sigreturn			sys32_sigreturn
-120	i386	clone			sys_clone			stub32_clone
+120	i386	clone			sys_clone			compat_sys_x86_clone
 121	i386	setdomainname		sys_setdomainname
 122	i386	uname			sys_newuname
 123	i386	modify_ldt		sys_modify_ldt
@@ -186,8 +186,8 @@
 177	i386	rt_sigtimedwait		sys_rt_sigtimedwait		compat_sys_rt_sigtimedwait
 178	i386	rt_sigqueueinfo		sys_rt_sigqueueinfo		compat_sys_rt_sigqueueinfo
 179	i386	rt_sigsuspend		sys_rt_sigsuspend
-180	i386	pread64			sys_pread64			sys32_pread
-181	i386	pwrite64		sys_pwrite64			sys32_pwrite
+180	i386	pread64			sys_pread64			compat_sys_x86_pread
+181	i386	pwrite64		sys_pwrite64			compat_sys_x86_pwrite
 182	i386	chown			sys_chown16
 183	i386	getcwd			sys_getcwd
 184	i386	capget			sys_capget
@@ -196,14 +196,14 @@
 187	i386	sendfile		sys_sendfile			compat_sys_sendfile
 188	i386	getpmsg
 189	i386	putpmsg
-190	i386	vfork			sys_vfork			sys_vfork
+190	i386	vfork			sys_vfork
 191	i386	ugetrlimit		sys_getrlimit			compat_sys_getrlimit
 192	i386	mmap2			sys_mmap_pgoff
-193	i386	truncate64		sys_truncate64			sys32_truncate64
-194	i386	ftruncate64		sys_ftruncate64			sys32_ftruncate64
-195	i386	stat64			sys_stat64			sys32_stat64
-196	i386	lstat64			sys_lstat64			sys32_lstat64
-197	i386	fstat64			sys_fstat64			sys32_fstat64
+193	i386	truncate64		sys_truncate64			compat_sys_x86_truncate64
+194	i386	ftruncate64		sys_ftruncate64			compat_sys_x86_ftruncate64
+195	i386	stat64			sys_stat64			compat_sys_x86_stat64
+196	i386	lstat64			sys_lstat64			compat_sys_x86_lstat64
+197	i386	fstat64			sys_fstat64			compat_sys_x86_fstat64
 198	i386	lchown32		sys_lchown
 199	i386	getuid32		sys_getuid
 200	i386	getgid32		sys_getgid
@@ -231,7 +231,7 @@
 # 222 is unused
 # 223 is unused
 224	i386	gettid			sys_gettid
-225	i386	readahead		sys_readahead			sys32_readahead
+225	i386	readahead		sys_readahead			compat_sys_x86_readahead
 226	i386	setxattr		sys_setxattr
 227	i386	lsetxattr		sys_lsetxattr
 228	i386	fsetxattr		sys_fsetxattr
@@ -256,7 +256,7 @@
 247	i386	io_getevents		sys_io_getevents		compat_sys_io_getevents
 248	i386	io_submit		sys_io_submit			compat_sys_io_submit
 249	i386	io_cancel		sys_io_cancel
-250	i386	fadvise64		sys_fadvise64			sys32_fadvise64
+250	i386	fadvise64		sys_fadvise64			compat_sys_x86_fadvise64
 # 251 is available for reuse (was briefly sys_set_zone_reclaim)
 252	i386	exit_group		sys_exit_group
 253	i386	lookup_dcookie		sys_lookup_dcookie		compat_sys_lookup_dcookie
@@ -278,7 +278,7 @@
 269	i386	fstatfs64		sys_fstatfs64			compat_sys_fstatfs64
 270	i386	tgkill			sys_tgkill
 271	i386	utimes			sys_utimes			compat_sys_utimes
-272	i386	fadvise64_64		sys_fadvise64_64		sys32_fadvise64_64
+272	i386	fadvise64_64		sys_fadvise64_64		compat_sys_x86_fadvise64_64
 273	i386	vserver
 274	i386	mbind			sys_mbind
 275	i386	get_mempolicy		sys_get_mempolicy		compat_sys_get_mempolicy
@@ -306,7 +306,7 @@
 297	i386	mknodat			sys_mknodat
 298	i386	fchownat		sys_fchownat
 299	i386	futimesat		sys_futimesat			compat_sys_futimesat
-300	i386	fstatat64		sys_fstatat64			sys32_fstatat
+300	i386	fstatat64		sys_fstatat64			compat_sys_x86_fstatat
 301	i386	unlinkat		sys_unlinkat
 302	i386	renameat		sys_renameat
 303	i386	linkat			sys_linkat
@@ -320,7 +320,7 @@
 311	i386	set_robust_list		sys_set_robust_list		compat_sys_set_robust_list
 312	i386	get_robust_list		sys_get_robust_list		compat_sys_get_robust_list
 313	i386	splice			sys_splice
-314	i386	sync_file_range		sys_sync_file_range		sys32_sync_file_range
+314	i386	sync_file_range		sys_sync_file_range		compat_sys_x86_sync_file_range
 315	i386	tee			sys_tee
 316	i386	vmsplice		sys_vmsplice			compat_sys_vmsplice
 317	i386	move_pages		sys_move_pages			compat_sys_move_pages
@@ -330,7 +330,7 @@
 321	i386	signalfd		sys_signalfd			compat_sys_signalfd
 322	i386	timerfd_create		sys_timerfd_create
 323	i386	eventfd			sys_eventfd
-324	i386	fallocate		sys_fallocate			sys32_fallocate
+324	i386	fallocate		sys_fallocate			compat_sys_x86_fallocate
 325	i386	timerfd_settime		sys_timerfd_settime		compat_sys_timerfd_settime
 326	i386	timerfd_gettime		sys_timerfd_gettime		compat_sys_timerfd_gettime
 327	i386	signalfd4		sys_signalfd4			compat_sys_signalfd4

+ 3 - 13
arch/x86/entry/vsyscall/vsyscall_64.c

@@ -42,10 +42,8 @@
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
 
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode =
-#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE)
-	NATIVE;
-#elif defined(CONFIG_LEGACY_VSYSCALL_NONE)
+static enum { EMULATE, NONE } vsyscall_mode =
+#ifdef CONFIG_LEGACY_VSYSCALL_NONE
 	NONE;
 #else
 	EMULATE;
@@ -56,8 +54,6 @@ static int __init vsyscall_setup(char *str)
 	if (str) {
 		if (!strcmp("emulate", str))
 			vsyscall_mode = EMULATE;
-		else if (!strcmp("native", str))
-			vsyscall_mode = NATIVE;
 		else if (!strcmp("none", str))
 			vsyscall_mode = NONE;
 		else
@@ -139,10 +135,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 	WARN_ON_ONCE(address != regs->ip);
 
-	/* This should be unreachable in NATIVE mode. */
-	if (WARN_ON(vsyscall_mode == NATIVE))
-		return false;
-
 	if (vsyscall_mode == NONE) {
 		warn_bad_vsyscall(KERN_INFO, regs,
 				  "vsyscall attempted with vsyscall=none");
 				  "vsyscall attempted with vsyscall=none");
@@ -370,9 +362,7 @@ void __init map_vsyscall(void)
 
 	if (vsyscall_mode != NONE) {
 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
-			     vsyscall_mode == NATIVE
-			     ? PAGE_KERNEL_VSYSCALL
-			     : PAGE_KERNEL_VVAR);
+			     PAGE_KERNEL_VVAR);
 		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 	}
 

+ 44 - 30
arch/x86/ia32/sys_ia32.c

@@ -51,15 +51,14 @@
 #define AA(__x)		((unsigned long)(__x))
 
 
-asmlinkage long sys32_truncate64(const char __user *filename,
-				 unsigned long offset_low,
-				 unsigned long offset_high)
+COMPAT_SYSCALL_DEFINE3(x86_truncate64, const char __user *, filename,
+		       unsigned long, offset_low, unsigned long, offset_high)
 {
        return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low);
 }
 
-asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low,
-				  unsigned long offset_high)
+COMPAT_SYSCALL_DEFINE3(x86_ftruncate64, unsigned int, fd,
+		       unsigned long, offset_low, unsigned long, offset_high)
 {
        return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low);
 }
@@ -96,8 +95,8 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
 	return 0;
 }
 
-asmlinkage long sys32_stat64(const char __user *filename,
-			     struct stat64 __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(x86_stat64, const char __user *, filename,
+		       struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_stat(filename, &stat);
@@ -107,8 +106,8 @@ asmlinkage long sys32_stat64(const char __user *filename,
 	return ret;
 }
 
-asmlinkage long sys32_lstat64(const char __user *filename,
-			      struct stat64 __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(x86_lstat64, const char __user *, filename,
+		       struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_lstat(filename, &stat);
@@ -117,7 +116,8 @@ asmlinkage long sys32_lstat64(const char __user *filename,
 	return ret;
 }
 
-asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(x86_fstat64, unsigned int, fd,
+		       struct stat64 __user *, statbuf)
 {
 	struct kstat stat;
 	int ret = vfs_fstat(fd, &stat);
@@ -126,8 +126,9 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf)
 	return ret;
 }
 
-asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename,
-			      struct stat64 __user *statbuf, int flag)
+COMPAT_SYSCALL_DEFINE4(x86_fstatat, unsigned int, dfd,
+		       const char __user *, filename,
+		       struct stat64 __user *, statbuf, int, flag)
 {
 	struct kstat stat;
 	int error;
@@ -153,7 +154,7 @@ struct mmap_arg_struct32 {
 	unsigned int offset;
 };
 
-asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)
+COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg)
 {
 	struct mmap_arg_struct32 a;
 
@@ -167,22 +168,22 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)
 			       a.offset>>PAGE_SHIFT);
 }
 
-asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr,
-			      int options)
+COMPAT_SYSCALL_DEFINE3(x86_waitpid, compat_pid_t, pid, unsigned int __user *,
+		       stat_addr, int, options)
 {
 	return compat_sys_wait4(pid, stat_addr, options, NULL);
 }
 
 /* warning: next two assume little endian */
-asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
-			    u32 poslo, u32 poshi)
+COMPAT_SYSCALL_DEFINE5(x86_pread, unsigned int, fd, char __user *, ubuf,
+		       u32, count, u32, poslo, u32, poshi)
 {
 	return sys_pread64(fd, ubuf, count,
 			 ((loff_t)AA(poshi) << 32) | AA(poslo));
 }
 
-asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
-			     u32 count, u32 poslo, u32 poshi)
+COMPAT_SYSCALL_DEFINE5(x86_pwrite, unsigned int, fd, const char __user *, ubuf,
+		       u32, count, u32, poslo, u32, poshi)
 {
 	return sys_pwrite64(fd, ubuf, count,
 			  ((loff_t)AA(poshi) << 32) | AA(poslo));
@@ -193,8 +194,9 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf,
  * Some system calls that need sign extended arguments. This could be
  * done by a generic wrapper.
  */
-long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
-			__u32 len_low, __u32 len_high, int advice)
+COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low,
+		       __u32, offset_high, __u32, len_low, __u32, len_high,
+		       int, advice)
 {
 	return sys_fadvise64_64(fd,
 			       (((u64)offset_high)<<32) | offset_low,
@@ -202,31 +204,43 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
 				advice);
 }
 
-asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
-				   size_t count)
+COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo,
+		       unsigned int, off_hi, size_t, count)
 {
 	return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count);
 }
 
-asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi,
-				      unsigned n_low, unsigned n_hi,  int flags)
+COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low,
+		       unsigned int, off_hi, unsigned int, n_low,
+		       unsigned int, n_hi, int, flags)
 {
 	return sys_sync_file_range(fd,
 				   ((u64)off_hi << 32) | off_low,
 				   ((u64)n_hi << 32) | n_low, flags);
 }
 
-asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi,
-				size_t len, int advice)
+COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo,
+		       unsigned int, offset_hi, size_t, len, int, advice)
 {
 	return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo,
 				len, advice);
 }
 
-asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo,
-				unsigned offset_hi, unsigned len_lo,
-				unsigned len_hi)
+COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode,
+		       unsigned int, offset_lo, unsigned int, offset_hi,
+		       unsigned int, len_lo, unsigned int, len_hi)
 {
 	return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo,
 			     ((u64)len_hi << 32) | len_lo);
 }
+
+/*
+ * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS
+ */
+COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
+		       unsigned long, newsp, int __user *, parent_tidptr,
+		       unsigned long, tls_val, int __user *, child_tidptr)
+{
+	return sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr,
+			tls_val);
+}

+ 6 - 0
arch/x86/include/asm/apm.h

@@ -7,6 +7,8 @@
 #ifndef _ASM_X86_MACH_DEFAULT_APM_H
 #define _ASM_X86_MACH_DEFAULT_APM_H
 
+#include <asm/nospec-branch.h>
+
 #ifdef APM_ZERO_SEGS
 #	define APM_DO_ZERO_SEGS \
 		"pushl %%ds\n\t" \
 		"pushl %%ds\n\t" \
@@ -32,6 +34,7 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
 	 * N.B. We do NOT need a cld after the BIOS call
 	 * because we always save and restore the flags.
 	 */
+	firmware_restrict_branch_speculation_start();
 	__asm__ __volatile__(APM_DO_ZERO_SEGS
 		"pushl %%edi\n\t"
 		"pushl %%edi\n\t"
 		"pushl %%ebp\n\t"
 		"pushl %%ebp\n\t"
@@ -44,6 +47,7 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
 		  "=S" (*esi)
 		  "=S" (*esi)
 		: "a" (func), "b" (ebx_in), "c" (ecx_in)
 		: "a" (func), "b" (ebx_in), "c" (ecx_in)
 		: "memory", "cc");
 		: "memory", "cc");
+	firmware_restrict_branch_speculation_end();
 }
 
 static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in,
@@ -56,6 +60,7 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in,
 	 * N.B. We do NOT need a cld after the BIOS call
 	 * because we always save and restore the flags.
 	 */
+	firmware_restrict_branch_speculation_start();
 	__asm__ __volatile__(APM_DO_ZERO_SEGS
 		"pushl %%edi\n\t"
 		"pushl %%edi\n\t"
 		"pushl %%ebp\n\t"
 		"pushl %%ebp\n\t"
@@ -68,6 +73,7 @@ static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in,
 		  "=S" (si)
 		  "=S" (si)
 		: "a" (func), "b" (ebx_in), "c" (ecx_in)
 		: "a" (func), "b" (ebx_in), "c" (ecx_in)
 		: "memory", "cc");
 		: "memory", "cc");
+	firmware_restrict_branch_speculation_end();
 	return error;
 }
 

+ 0 - 3
arch/x86/include/asm/asm-prototypes.h

@@ -38,7 +38,4 @@ INDIRECT_THUNK(dx)
 INDIRECT_THUNK(si)
 INDIRECT_THUNK(di)
 INDIRECT_THUNK(bp)
-asmlinkage void __fill_rsb(void);
-asmlinkage void __clear_rsb(void);
-
 #endif /* CONFIG_RETPOLINE */

+ 3 - 0
arch/x86/include/asm/cpufeatures.h

@@ -213,6 +213,7 @@
 #define X86_FEATURE_SEV			( 7*32+20) /* AMD Secure Encrypted Virtualization */
 
 #define X86_FEATURE_USE_IBPB		( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */
+#define X86_FEATURE_USE_IBRS_FW		( 7*32+22) /* "" Use IBRS during runtime firmware calls */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
@@ -315,6 +316,7 @@
 #define X86_FEATURE_VPCLMULQDQ		(16*32+10) /* Carry-Less Multiplication Double Quadword */
 #define X86_FEATURE_AVX512_VNNI		(16*32+11) /* Vector Neural Network Instructions */
 #define X86_FEATURE_AVX512_BITALG	(16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_TME			(16*32+13) /* Intel Total Memory Encryption */
 #define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* POPCNT for vectors of DW/QW */
 #define X86_FEATURE_LA57		(16*32+16) /* 5-level page tables */
 #define X86_FEATURE_RDPID		(16*32+22) /* RDPID instruction */
@@ -327,6 +329,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
 #define X86_FEATURE_AVX512_4VNNIW	(18*32+ 2) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS	(18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ARCH_CAPABILITIES	(18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */

+ 15 - 2
arch/x86/include/asm/efi.h

@@ -6,6 +6,7 @@
 #include <asm/pgtable.h>
 #include <asm/processor-flags.h>
 #include <asm/tlb.h>
+#include <asm/nospec-branch.h>
 
 /*
  * We map the EFI regions needed for runtime services non-contiguously,
@@ -36,8 +37,18 @@
 
 extern asmlinkage unsigned long efi_call_phys(void *, ...);
 
-#define arch_efi_call_virt_setup()	kernel_fpu_begin()
-#define arch_efi_call_virt_teardown()	kernel_fpu_end()
+#define arch_efi_call_virt_setup()					\
+({									\
+	kernel_fpu_begin();						\
+	firmware_restrict_branch_speculation_start();			\
+})
+
+#define arch_efi_call_virt_teardown()					\
+({									\
+	firmware_restrict_branch_speculation_end();			\
+	kernel_fpu_end();						\
+})
+
 
 /*
  * Wrap all the virtual calls in a way that forces the parameters on the stack.
@@ -73,6 +84,7 @@ struct efi_scratch {
 	efi_sync_low_kernel_mappings();					\
 	preempt_disable();						\
 	__kernel_fpu_begin();						\
+	firmware_restrict_branch_speculation_start();			\
 									\
 	if (efi_scratch.use_pgd) {					\
 		efi_scratch.prev_cr3 = __read_cr3();			\
@@ -91,6 +103,7 @@ struct efi_scratch {
 		__flush_tlb_all();					\
 	}								\
 									\
+	firmware_restrict_branch_speculation_end();			\
 	__kernel_fpu_end();						\
 	preempt_enable();						\
 })

+ 7 - 2
arch/x86/include/asm/microcode.h

@@ -37,7 +37,12 @@ struct cpu_signature {
 
 struct device;
 
-enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+enum ucode_state {
+	UCODE_OK	= 0,
+	UCODE_UPDATED,
+	UCODE_NFOUND,
+	UCODE_ERROR,
+};
 
 struct microcode_ops {
 	enum ucode_state (*request_microcode_user) (int cpu,
@@ -54,7 +59,7 @@ struct microcode_ops {
 	 * are being called.
 	 * See also the "Synchronization" section in microcode_core.c.
 	 */
-	int (*apply_microcode) (int cpu);
+	enum ucode_state (*apply_microcode) (int cpu);
 	int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
 };
 

+ 1 - 0
arch/x86/include/asm/mmu_context.h

@@ -74,6 +74,7 @@ static inline void *ldt_slot_va(int slot)
 	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
 #else
 	BUG();
+	return (void *)fix_to_virt(FIX_HOLE);
 #endif
 }
 

+ 118 - 20
arch/x86/include/asm/nospec-branch.h

@@ -8,6 +8,50 @@
 #include <asm/cpufeatures.h>
 #include <asm/msr-index.h>
 
+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; lfence; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * We define a CPP macro such that it can be used from both .S files and
+ * inline assembly. It's possible to do a .macro and then include that
+ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+ */
+
+#define RSB_CLEAR_LOOPS		32	/* To forcibly overwrite all entries */
+#define RSB_FILL_LOOPS		16	/* To avoid underflow */
+
+/*
+ * Google experimented with loop-unrolling and this turned out to be
+ * the optimal version — two calls, each with their own speculation
+ * trap should their return address end up getting used, in a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr, sp)	\
+	mov	$(nr/2), reg;			\
+771:						\
+	call	772f;				\
+773:	/* speculation trap */			\
+	pause;					\
+	lfence;					\
+	jmp	773b;				\
+772:						\
+	call	774f;				\
+775:	/* speculation trap */			\
+	pause;					\
+	lfence;					\
+	jmp	775b;				\
+774:						\
+	dec	reg;				\
+	jnz	771b;				\
+	add	$(BITS_PER_LONG/8) * nr, sp;
+
 #ifdef __ASSEMBLY__
 
 /*
@@ -23,6 +67,18 @@
 	.popsection
 .endm
 
+/*
+ * This should be used immediately before an indirect jump/call. It tells
+ * objtool the subsequent indirect jump/call is vouched safe for retpoline
+ * builds.
+ */
+.macro ANNOTATE_RETPOLINE_SAFE
+	.Lannotate_\@:
+	.pushsection .discard.retpoline_safe
+	_ASM_PTR .Lannotate_\@
+	.popsection
+.endm
+
 /*
 /*
  * These are the bare retpoline primitives for indirect jmp and call.
  * These are the bare retpoline primitives for indirect jmp and call.
  * Do not use these directly; they only exist to make the ALTERNATIVE
  * Do not use these directly; they only exist to make the ALTERNATIVE
@@ -59,9 +115,9 @@
 .macro JMP_NOSPEC reg:req
 .macro JMP_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
 #ifdef CONFIG_RETPOLINE
 	ANNOTATE_NOSPEC_ALTERNATIVE
 	ANNOTATE_NOSPEC_ALTERNATIVE
-	ALTERNATIVE_2 __stringify(jmp *\reg),				\
+	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg),	\
 		__stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE,	\
 		__stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE,	\
-		__stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
+		__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
 #else
 #else
 	jmp	*\reg
 	jmp	*\reg
 #endif
 #endif
@@ -70,18 +126,25 @@
 .macro CALL_NOSPEC reg:req
 .macro CALL_NOSPEC reg:req
 #ifdef CONFIG_RETPOLINE
 #ifdef CONFIG_RETPOLINE
 	ANNOTATE_NOSPEC_ALTERNATIVE
 	ANNOTATE_NOSPEC_ALTERNATIVE
-	ALTERNATIVE_2 __stringify(call *\reg),				\
+	ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg),	\
 		__stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
 		__stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
-		__stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
+		__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD
 #else
 #else
 	call	*\reg
 	call	*\reg
 #endif
 #endif
 .endm
 .endm
 
 
-/* This clobbers the BX register */
-.macro FILL_RETURN_BUFFER nr:req ftr:req
+ /*
+  * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+  * monstrosity above, manually.
+  */
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
 #ifdef CONFIG_RETPOLINE
 #ifdef CONFIG_RETPOLINE
-	ALTERNATIVE "", "call __clear_rsb", \ftr
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE "jmp .Lskip_rsb_\@",				\
+		__stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP))	\
+		\ftr
+.Lskip_rsb_\@:
 #endif
 #endif
 .endm
 .endm
 
 
@@ -93,6 +156,12 @@
 	".long 999b - .\n\t"					\
 	".long 999b - .\n\t"					\
 	".popsection\n\t"
 	".popsection\n\t"
 
 
+#define ANNOTATE_RETPOLINE_SAFE					\
+	"999:\n\t"						\
+	".pushsection .discard.retpoline_safe\n\t"		\
+	_ASM_PTR " 999b\n\t"					\
+	".popsection\n\t"
+
 #if defined(CONFIG_X86_64) && defined(RETPOLINE)
 #if defined(CONFIG_X86_64) && defined(RETPOLINE)
 
 
 /*
 /*
@@ -102,6 +171,7 @@
 # define CALL_NOSPEC						\
 # define CALL_NOSPEC						\
 	ANNOTATE_NOSPEC_ALTERNATIVE				\
 	ANNOTATE_NOSPEC_ALTERNATIVE				\
 	ALTERNATIVE(						\
 	ALTERNATIVE(						\
+	ANNOTATE_RETPOLINE_SAFE					\
 	"call *%[thunk_target]\n",				\
 	"call *%[thunk_target]\n",				\
 	"call __x86_indirect_thunk_%V[thunk_target]\n",		\
 	"call __x86_indirect_thunk_%V[thunk_target]\n",		\
 	X86_FEATURE_RETPOLINE)
 	X86_FEATURE_RETPOLINE)
@@ -156,26 +226,54 @@ extern char __indirect_thunk_end[];
 static inline void vmexit_fill_RSB(void)
 static inline void vmexit_fill_RSB(void)
 {
 {
 #ifdef CONFIG_RETPOLINE
 #ifdef CONFIG_RETPOLINE
-	alternative_input("",
-			  "call __fill_rsb",
-			  X86_FEATURE_RETPOLINE,
-			  ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory"));
+	unsigned long loops;
+
+	asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
+		      ALTERNATIVE("jmp 910f",
+				  __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
+				  X86_FEATURE_RETPOLINE)
+		      "910:"
+		      : "=r" (loops), ASM_CALL_CONSTRAINT
+		      : : "memory" );
 #endif
 #endif
 }
 }
 
 
+#define alternative_msr_write(_msr, _val, _feature)		\
+	asm volatile(ALTERNATIVE("",				\
+				 "movl %[msr], %%ecx\n\t"	\
+				 "movl %[val], %%eax\n\t"	\
+				 "movl $0, %%edx\n\t"		\
+				 "wrmsr",			\
+				 _feature)			\
+		     : : [msr] "i" (_msr), [val] "i" (_val)	\
+		     : "eax", "ecx", "edx", "memory")
+
 static inline void indirect_branch_prediction_barrier(void)
 static inline void indirect_branch_prediction_barrier(void)
 {
 {
-	asm volatile(ALTERNATIVE("",
-				 "movl %[msr], %%ecx\n\t"
-				 "movl %[val], %%eax\n\t"
-				 "movl $0, %%edx\n\t"
-				 "wrmsr",
-				 X86_FEATURE_USE_IBPB)
-		     : : [msr] "i" (MSR_IA32_PRED_CMD),
-			 [val] "i" (PRED_CMD_IBPB)
-		     : "eax", "ecx", "edx", "memory");
+	alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB,
+			      X86_FEATURE_USE_IBPB);
 }
 }
 
 
+/*
+ * With retpoline, we must use IBRS to restrict branch prediction
+ * before calling into firmware.
+ *
+ * (Implemented as CPP macros due to header hell.)
+ */
+#define firmware_restrict_branch_speculation_start()			\
+do {									\
+	preempt_disable();						\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS,	\
+			      X86_FEATURE_USE_IBRS_FW);			\
+} while (0)
+
+#define firmware_restrict_branch_speculation_end()			\
+do {									\
+	alternative_msr_write(MSR_IA32_SPEC_CTRL, 0,			\
+			      X86_FEATURE_USE_IBRS_FW);			\
+	preempt_enable();						\
+} while (0)
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASSEMBLY__ */
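When the named feature bit is set, alternative_msr_write() patches in nothing more than a single WRMSR with ECX = msr and EDX:EAX = 0:val; otherwise the site stays a NOP. A rough C-level equivalent of indirect_branch_prediction_barrier() on a CPU with X86_FEATURE_USE_IBPB, written as a sketch with a runtime check in place of ALTERNATIVE patching:

	if (boot_cpu_has(X86_FEATURE_USE_IBPB))
		native_wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);	/* high half is 0 */

The same helper drives the firmware_restrict_branch_speculation_*() pair above, writing SPEC_CTRL_IBRS into MSR_IA32_SPEC_CTRL around firmware calls.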
 
 
 /*
 /*

+ 13 - 4
arch/x86/include/asm/paravirt.h

@@ -7,6 +7,7 @@
 #ifdef CONFIG_PARAVIRT
 #ifdef CONFIG_PARAVIRT
 #include <asm/pgtable_types.h>
 #include <asm/pgtable_types.h>
 #include <asm/asm.h>
 #include <asm/asm.h>
+#include <asm/nospec-branch.h>
 
 
 #include <asm/paravirt_types.h>
 #include <asm/paravirt_types.h>
 
 
@@ -884,23 +885,27 @@ extern void default_banner(void);
 
 
 #define INTERRUPT_RETURN						\
 #define INTERRUPT_RETURN						\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
+		  ANNOTATE_RETPOLINE_SAFE;					\
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);)
 
 
 #define DISABLE_INTERRUPTS(clobbers)					\
 #define DISABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
 		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
+		  ANNOTATE_RETPOLINE_SAFE;					\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);	\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);	\
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 
 #define ENABLE_INTERRUPTS(clobbers)					\
 #define ENABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
 		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
+		  ANNOTATE_RETPOLINE_SAFE;					\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 #define GET_CR0_INTO_EAX				\
 #define GET_CR0_INTO_EAX				\
 	push %ecx; push %edx;				\
 	push %ecx; push %edx;				\
+	ANNOTATE_RETPOLINE_SAFE;				\
 	call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0);	\
 	call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0);	\
 	pop %edx; pop %ecx
 	pop %edx; pop %ecx
 #else	/* !CONFIG_X86_32 */
 #else	/* !CONFIG_X86_32 */
@@ -922,21 +927,25 @@ extern void default_banner(void);
  */
  */
 #define SWAPGS								\
 #define SWAPGS								\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE,	\
-		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs)		\
+		  ANNOTATE_RETPOLINE_SAFE;					\
+		  call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs);		\
 		 )
 		 )
 
 
 #define GET_CR2_INTO_RAX				\
 #define GET_CR2_INTO_RAX				\
-	call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)
+	ANNOTATE_RETPOLINE_SAFE;				\
+	call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2);
 
 
 #define USERGS_SYSRET64							\
 #define USERGS_SYSRET64							\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
 		  CLBR_NONE,						\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+		  ANNOTATE_RETPOLINE_SAFE;					\
+		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);)
 
 
 #ifdef CONFIG_DEBUG_ENTRY
 #ifdef CONFIG_DEBUG_ENTRY
 #define SAVE_FLAGS(clobbers)                                        \
 #define SAVE_FLAGS(clobbers)                                        \
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
 		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
 		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+		  ANNOTATE_RETPOLINE_SAFE;				    \
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 #endif
 #endif

+ 4 - 1
arch/x86/include/asm/paravirt_types.h

@@ -43,6 +43,7 @@
 #include <asm/desc_defs.h>
 #include <asm/desc_defs.h>
 #include <asm/kmap_types.h>
 #include <asm/kmap_types.h>
 #include <asm/pgtable_types.h>
 #include <asm/pgtable_types.h>
+#include <asm/nospec-branch.h>
 
 
 struct page;
 struct page;
 struct thread_struct;
 struct thread_struct;
@@ -392,7 +393,9 @@ int paravirt_disable_iospace(void);
  * offset into the paravirt_patch_template structure, and can therefore be
  * offset into the paravirt_patch_template structure, and can therefore be
  * freely converted back into a structure offset.
  * freely converted back into a structure offset.
  */
  */
-#define PARAVIRT_CALL	"call *%c[paravirt_opptr];"
+#define PARAVIRT_CALL					\
+	ANNOTATE_RETPOLINE_SAFE				\
+	"call *%c[paravirt_opptr];"
 
 
 /*
 /*
  * These macros are intended to wrap calls through one of the paravirt
  * These macros are intended to wrap calls through one of the paravirt

+ 4 - 4
arch/x86/include/asm/pgtable.h

@@ -350,14 +350,14 @@ static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
 {
 {
 	pmdval_t v = native_pmd_val(pmd);
 	pmdval_t v = native_pmd_val(pmd);
 
 
-	return __pmd(v | set);
+	return native_make_pmd(v | set);
 }
 }
 
 
 static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
 static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
 {
 {
 	pmdval_t v = native_pmd_val(pmd);
 	pmdval_t v = native_pmd_val(pmd);
 
 
-	return __pmd(v & ~clear);
+	return native_make_pmd(v & ~clear);
 }
 }
 
 
 static inline pmd_t pmd_mkold(pmd_t pmd)
 static inline pmd_t pmd_mkold(pmd_t pmd)
@@ -409,14 +409,14 @@ static inline pud_t pud_set_flags(pud_t pud, pudval_t set)
 {
 {
 	pudval_t v = native_pud_val(pud);
 	pudval_t v = native_pud_val(pud);
 
 
-	return __pud(v | set);
+	return native_make_pud(v | set);
 }
 }
 
 
 static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
 static inline pud_t pud_clear_flags(pud_t pud, pudval_t clear)
 {
 {
 	pudval_t v = native_pud_val(pud);
 	pudval_t v = native_pud_val(pud);
 
 
-	return __pud(v & ~clear);
+	return native_make_pud(v & ~clear);
 }
 }
 
 
 static inline pud_t pud_mkold(pud_t pud)
 static inline pud_t pud_mkold(pud_t pud)

+ 1 - 0
arch/x86/include/asm/pgtable_32.h

@@ -32,6 +32,7 @@ extern pmd_t initial_pg_pmd[];
 static inline void pgtable_cache_init(void) { }
 static inline void pgtable_cache_init(void) { }
 static inline void check_pgt_cache(void) { }
 static inline void check_pgt_cache(void) { }
 void paging_init(void);
 void paging_init(void);
+void sync_initial_page_table(void);
 
 
 static inline int pgd_large(pgd_t pgd) { return 0; }
 static inline int pgd_large(pgd_t pgd) { return 0; }
 
 

+ 1 - 0
arch/x86/include/asm/pgtable_64.h

@@ -28,6 +28,7 @@ extern pgd_t init_top_pgt[];
 #define swapper_pg_dir init_top_pgt
 #define swapper_pg_dir init_top_pgt
 
 
 extern void paging_init(void);
 extern void paging_init(void);
+static inline void sync_initial_page_table(void) { }
 
 
 #define pte_ERROR(e)					\
 #define pte_ERROR(e)					\
 	pr_err("%s:%d: bad pte %p(%016lx)\n",		\
 	pr_err("%s:%d: bad pte %p(%016lx)\n",		\

+ 10 - 2
arch/x86/include/asm/pgtable_types.h

@@ -174,7 +174,6 @@ enum page_cache_mode {
 #define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
 #define __PAGE_KERNEL_RO		(__PAGE_KERNEL & ~_PAGE_RW)
 #define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
 #define __PAGE_KERNEL_RX		(__PAGE_KERNEL_EXEC & ~_PAGE_RW)
 #define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_NOCACHE)
 #define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_NOCACHE)
-#define __PAGE_KERNEL_VSYSCALL		(__PAGE_KERNEL_RX | _PAGE_USER)
 #define __PAGE_KERNEL_VVAR		(__PAGE_KERNEL_RO | _PAGE_USER)
 #define __PAGE_KERNEL_VVAR		(__PAGE_KERNEL_RO | _PAGE_USER)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -206,7 +205,6 @@ enum page_cache_mode {
 #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
 #define PAGE_KERNEL_NOCACHE	__pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
 #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
 #define PAGE_KERNEL_LARGE	__pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
 #define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
 #define PAGE_KERNEL_LARGE_EXEC	__pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
-#define PAGE_KERNEL_VSYSCALL	__pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC)
 #define PAGE_KERNEL_VVAR	__pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
 #define PAGE_KERNEL_VVAR	__pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
 
 
 #define PAGE_KERNEL_IO		__pgprot(__PAGE_KERNEL_IO)
 #define PAGE_KERNEL_IO		__pgprot(__PAGE_KERNEL_IO)
@@ -323,6 +321,11 @@ static inline pudval_t native_pud_val(pud_t pud)
 #else
 #else
 #include <asm-generic/pgtable-nopud.h>
 #include <asm-generic/pgtable-nopud.h>
 
 
+static inline pud_t native_make_pud(pudval_t val)
+{
+	return (pud_t) { .p4d.pgd = native_make_pgd(val) };
+}
+
 static inline pudval_t native_pud_val(pud_t pud)
 static inline pudval_t native_pud_val(pud_t pud)
 {
 {
 	return native_pgd_val(pud.p4d.pgd);
 	return native_pgd_val(pud.p4d.pgd);
@@ -344,6 +347,11 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
 #else
 #else
 #include <asm-generic/pgtable-nopmd.h>
 #include <asm-generic/pgtable-nopmd.h>
 
 
+static inline pmd_t native_make_pmd(pmdval_t val)
+{
+	return (pmd_t) { .pud.p4d.pgd = native_make_pgd(val) };
+}
+
 static inline pmdval_t native_pmd_val(pmd_t pmd)
 static inline pmdval_t native_pmd_val(pmd_t pmd)
 {
 {
 	return native_pgd_val(pmd.pud.p4d.pgd);
 	return native_pgd_val(pmd.pud.p4d.pgd);

+ 1 - 0
arch/x86/include/asm/processor.h

@@ -977,4 +977,5 @@ bool xen_set_default_idle(void);
 
 
 void stop_this_cpu(void *dummy);
 void stop_this_cpu(void *dummy);
 void df_debug(struct pt_regs *regs, long error_code);
 void df_debug(struct pt_regs *regs, long error_code);
+void microcode_check(void);
 #endif /* _ASM_X86_PROCESSOR_H */
 #endif /* _ASM_X86_PROCESSOR_H */

+ 2 - 2
arch/x86/include/asm/refcount.h

@@ -67,13 +67,13 @@ static __always_inline __must_check
 bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 {
 {
 	GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
 	GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl", REFCOUNT_CHECK_LT_ZERO,
-				  r->refs.counter, "er", i, "%0", e);
+				  r->refs.counter, "er", i, "%0", e, "cx");
 }
 }
 
 
 static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
 static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
 {
 {
 	GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
 	GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl", REFCOUNT_CHECK_LT_ZERO,
-				 r->refs.counter, "%0", e);
+				 r->refs.counter, "%0", e, "cx");
 }
 }
 
 
 static __always_inline __must_check
 static __always_inline __must_check

+ 8 - 8
arch/x86/include/asm/rmwcc.h

@@ -2,8 +2,7 @@
 #ifndef _ASM_X86_RMWcc
 #ifndef _ASM_X86_RMWcc
 #define _ASM_X86_RMWcc
 #define _ASM_X86_RMWcc
 
 
-#define __CLOBBERS_MEM		"memory"
-#define __CLOBBERS_MEM_CC_CX	"memory", "cc", "cx"
+#define __CLOBBERS_MEM(clb...)	"memory", ## clb
 
 
 #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
 #if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO)
 
 
@@ -40,18 +39,19 @@ do {									\
 #endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 #endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */
 
 
 #define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
 #define GEN_UNARY_RMWcc(op, var, arg0, cc)				\
-	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM)
+	__GEN_RMWcc(op " " arg0, var, cc, __CLOBBERS_MEM())
 
 
-#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc)		\
+#define GEN_UNARY_SUFFIXED_RMWcc(op, suffix, var, arg0, cc, clobbers...)\
 	__GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc,			\
 	__GEN_RMWcc(op " " arg0 "\n\t" suffix, var, cc,			\
-		    __CLOBBERS_MEM_CC_CX)
+		    __CLOBBERS_MEM(clobbers))
 
 
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
 #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc)			\
 	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc,		\
 	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0, var, cc,		\
-		    __CLOBBERS_MEM, vcon (val))
+		    __CLOBBERS_MEM(), vcon (val))
 
 
-#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc)	\
+#define GEN_BINARY_SUFFIXED_RMWcc(op, suffix, var, vcon, val, arg0, cc,	\
+				  clobbers...)				\
 	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc,	\
 	__GEN_RMWcc(op __BINARY_RMWcc_ARG arg0 "\n\t" suffix, var, cc,	\
-		    __CLOBBERS_MEM_CC_CX, vcon (val))
+		    __CLOBBERS_MEM(clobbers), vcon (val))
 
 
 #endif /* _ASM_X86_RMWcc */
 #endif /* _ASM_X86_RMWcc */

+ 1 - 0
arch/x86/include/asm/sections.h

@@ -10,6 +10,7 @@ extern struct exception_table_entry __stop___ex_table[];
 
 
 #if defined(CONFIG_X86_64)
 #if defined(CONFIG_X86_64)
 extern char __end_rodata_hpage_align[];
 extern char __end_rodata_hpage_align[];
+extern char __entry_trampoline_start[], __entry_trampoline_end[];
 #endif
 #endif
 
 
 #endif	/* _ASM_X86_SECTIONS_H */
 #endif	/* _ASM_X86_SECTIONS_H */

+ 30 - 18
arch/x86/include/asm/sys_ia32.h

@@ -20,31 +20,43 @@
 #include <asm/ia32.h>
 #include <asm/ia32.h>
 
 
 /* ia32/sys_ia32.c */
 /* ia32/sys_ia32.c */
-asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long);
-asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
+asmlinkage long compat_sys_x86_truncate64(const char __user *, unsigned long,
+					  unsigned long);
+asmlinkage long compat_sys_x86_ftruncate64(unsigned int, unsigned long,
+					   unsigned long);
 
 
-asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *);
-asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *);
-asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
-asmlinkage long sys32_fstatat(unsigned int, const char __user *,
+asmlinkage long compat_sys_x86_stat64(const char __user *,
+				      struct stat64 __user *);
+asmlinkage long compat_sys_x86_lstat64(const char __user *,
+				       struct stat64 __user *);
+asmlinkage long compat_sys_x86_fstat64(unsigned int, struct stat64 __user *);
+asmlinkage long compat_sys_x86_fstatat(unsigned int, const char __user *,
 			      struct stat64 __user *, int);
 			      struct stat64 __user *, int);
 struct mmap_arg_struct32;
 struct mmap_arg_struct32;
-asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
+asmlinkage long compat_sys_x86_mmap(struct mmap_arg_struct32 __user *);
 
 
-asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int);
+asmlinkage long compat_sys_x86_waitpid(compat_pid_t, unsigned int __user *,
+				       int);
 
 
-asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
-asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
+asmlinkage long compat_sys_x86_pread(unsigned int, char __user *, u32, u32,
+				     u32);
+asmlinkage long compat_sys_x86_pwrite(unsigned int, const char __user *, u32,
+				      u32, u32);
 
 
-long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
-long sys32_vm86_warning(void);
+asmlinkage long compat_sys_x86_fadvise64_64(int, __u32, __u32, __u32, __u32,
+					    int);
 
 
-asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
-asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
-				      unsigned, unsigned, int);
-asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
-asmlinkage long sys32_fallocate(int, int, unsigned,
-				unsigned, unsigned, unsigned);
+asmlinkage ssize_t compat_sys_x86_readahead(int, unsigned int, unsigned int,
+					    size_t);
+asmlinkage long compat_sys_x86_sync_file_range(int, unsigned int, unsigned int,
+					       unsigned int, unsigned int,
+					       int);
+asmlinkage long compat_sys_x86_fadvise64(int, unsigned int, unsigned int,
+					 size_t, int);
+asmlinkage long compat_sys_x86_fallocate(int, int, unsigned int, unsigned int,
+					 unsigned int, unsigned int);
+asmlinkage long compat_sys_x86_clone(unsigned long, unsigned long, int __user *,
+				     unsigned long, int __user *);
 
 
 /* ia32/ia32_signal.c */
 /* ia32/ia32_signal.c */
 asmlinkage long sys32_sigreturn(void);
 asmlinkage long sys32_sigreturn(void);

+ 1 - 1
arch/x86/kernel/apic/io_apic.c

@@ -1603,7 +1603,7 @@ static void __init delay_with_tsc(void)
 	do {
 	do {
 		rep_nop();
 		rep_nop();
 		now = rdtsc();
 		now = rdtsc();
-	} while ((now - start) < 40000000000UL / HZ &&
+	} while ((now - start) < 40000000000ULL / HZ &&
 		time_before_eq(jiffies, end));
 		time_before_eq(jiffies, end));
 }
 }
 
 

+ 11 - 1
arch/x86/kernel/cpu/bugs.c

@@ -300,6 +300,15 @@ retpoline_auto:
 		setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
 		setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
 		pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
 		pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
 	}
 	}
+
+	/*
+	 * Retpoline means the kernel is safe because it has no indirect
+	 * branches. But firmware isn't, so use IBRS to protect that.
+	 */
+	if (boot_cpu_has(X86_FEATURE_IBRS)) {
+		setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
+		pr_info("Enabling Restricted Speculation for firmware calls\n");
+	}
 }
 }
 
 
 #undef pr_fmt
 #undef pr_fmt
@@ -326,8 +335,9 @@ ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, c
 	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
 	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
 		return sprintf(buf, "Not affected\n");
 		return sprintf(buf, "Not affected\n");
 
 
-	return sprintf(buf, "%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
+	return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
 		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
 		       boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
+		       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
 		       spectre_v2_module_string());
 		       spectre_v2_module_string());
 }
 }
 #endif
 #endif
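With the extra %s, the spectre_v2 sysfs attribute can now report the firmware mitigation as well. On a retpoline kernel with both IBPB and the new IBRS_FW flag set, the reported line would look roughly like the following (the leading part comes from spectre_v2_strings[] and varies by mitigation mode):

	Mitigation: Full generic retpoline, IBPB, IBRS_FW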

+ 30 - 0
arch/x86/kernel/cpu/common.c

@@ -1749,3 +1749,33 @@ static int __init init_cpu_syscore(void)
 	return 0;
 	return 0;
 }
 }
 core_initcall(init_cpu_syscore);
 core_initcall(init_cpu_syscore);
+
+/*
+ * The microcode loader calls this upon late microcode load to recheck features,
+ * but only when the microcode has actually been updated. The caller holds the
+ * microcode_mutex and the CPU hotplug lock.
+ */
+void microcode_check(void)
+{
+	struct cpuinfo_x86 info;
+
+	perf_check_microcode();
+
+	/* Reload CPUID max function as it might've changed. */
+	info.cpuid_level = cpuid_eax(0);
+
+	/*
+	 * Copy all capability leaves to pick up the synthetic ones so that
+	 * memcmp() below doesn't fail on that. The ones coming from CPUID will
+	 * get overwritten in get_cpu_cap().
+	 */
+	memcpy(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability));
+
+	get_cpu_cap(&info);
+
+	if (!memcmp(&info.x86_capability, &boot_cpu_data.x86_capability, sizeof(info.x86_capability)))
+		return;
+
+	pr_warn("x86/CPU: CPU features have changed after loading microcode, but might not take effect.\n");
+	pr_warn("x86/CPU: Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
+}

+ 7 - 0
arch/x86/kernel/cpu/intel.c

@@ -144,6 +144,13 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c)
 {
 {
 	int i;
 	int i;
 
 
+	/*
+	 * We know that hypervisors lie to us about the microcode version, so
+	 * we may as well hope that they are running the correct version.
+	 */
+	if (cpu_has(c, X86_FEATURE_HYPERVISOR))
+		return false;
+
 	for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
 	for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) {
 		if (c->x86_model == spectre_bad_microcodes[i].model &&
 		if (c->x86_model == spectre_bad_microcodes[i].model &&
 		    c->x86_stepping == spectre_bad_microcodes[i].stepping)
 		    c->x86_stepping == spectre_bad_microcodes[i].stepping)

+ 5 - 5
arch/x86/kernel/cpu/microcode/amd.c

@@ -498,7 +498,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size,
 	return patch_size;
 	return patch_size;
 }
 }
 
 
-static int apply_microcode_amd(int cpu)
+static enum ucode_state apply_microcode_amd(int cpu)
 {
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct microcode_amd *mc_amd;
 	struct microcode_amd *mc_amd;
@@ -512,7 +512,7 @@ static int apply_microcode_amd(int cpu)
 
 
 	p = find_patch(cpu);
 	p = find_patch(cpu);
 	if (!p)
 	if (!p)
-		return 0;
+		return UCODE_NFOUND;
 
 
 	mc_amd  = p->data;
 	mc_amd  = p->data;
 	uci->mc = p->data;
 	uci->mc = p->data;
@@ -523,13 +523,13 @@ static int apply_microcode_amd(int cpu)
 	if (rev >= mc_amd->hdr.patch_id) {
 	if (rev >= mc_amd->hdr.patch_id) {
 		c->microcode = rev;
 		c->microcode = rev;
 		uci->cpu_sig.rev = rev;
 		uci->cpu_sig.rev = rev;
-		return 0;
+		return UCODE_OK;
 	}
 	}
 
 
 	if (__apply_microcode_amd(mc_amd)) {
 	if (__apply_microcode_amd(mc_amd)) {
 		pr_err("CPU%d: update failed for patch_level=0x%08x\n",
 		pr_err("CPU%d: update failed for patch_level=0x%08x\n",
 			cpu, mc_amd->hdr.patch_id);
 			cpu, mc_amd->hdr.patch_id);
-		return -1;
+		return UCODE_ERROR;
 	}
 	}
 	pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
 	pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
 		mc_amd->hdr.patch_id);
 		mc_amd->hdr.patch_id);
@@ -537,7 +537,7 @@ static int apply_microcode_amd(int cpu)
 	uci->cpu_sig.rev = mc_amd->hdr.patch_id;
 	uci->cpu_sig.rev = mc_amd->hdr.patch_id;
 	c->microcode = mc_amd->hdr.patch_id;
 	c->microcode = mc_amd->hdr.patch_id;
 
 
-	return 0;
+	return UCODE_UPDATED;
 }
 }
 
 
 static int install_equiv_cpu_table(const u8 *buf)
 static int install_equiv_cpu_table(const u8 *buf)

+ 123 - 38
arch/x86/kernel/cpu/microcode/core.c

@@ -22,13 +22,16 @@
 #define pr_fmt(fmt) "microcode: " fmt
 #define pr_fmt(fmt) "microcode: " fmt
 
 
 #include <linux/platform_device.h>
 #include <linux/platform_device.h>
+#include <linux/stop_machine.h>
 #include <linux/syscore_ops.h>
 #include <linux/syscore_ops.h>
 #include <linux/miscdevice.h>
 #include <linux/miscdevice.h>
 #include <linux/capability.h>
 #include <linux/capability.h>
 #include <linux/firmware.h>
 #include <linux/firmware.h>
 #include <linux/kernel.h>
 #include <linux/kernel.h>
+#include <linux/delay.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include <linux/cpu.h>
 #include <linux/cpu.h>
+#include <linux/nmi.h>
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mm.h>
 
 
@@ -64,6 +67,11 @@ LIST_HEAD(microcode_cache);
  */
  */
 static DEFINE_MUTEX(microcode_mutex);
 static DEFINE_MUTEX(microcode_mutex);
 
 
+/*
+ * Serialize late loading so that CPUs get updated one-by-one.
+ */
+static DEFINE_SPINLOCK(update_lock);
+
 struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 
 
 struct cpu_info_ctx {
 struct cpu_info_ctx {
@@ -373,26 +381,23 @@ static int collect_cpu_info(int cpu)
 	return ret;
 	return ret;
 }
 }
 
 
-struct apply_microcode_ctx {
-	int err;
-};
-
 static void apply_microcode_local(void *arg)
 static void apply_microcode_local(void *arg)
 {
 {
-	struct apply_microcode_ctx *ctx = arg;
+	enum ucode_state *err = arg;
 
 
-	ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+	*err = microcode_ops->apply_microcode(smp_processor_id());
 }
 }
 
 
 static int apply_microcode_on_target(int cpu)
 static int apply_microcode_on_target(int cpu)
 {
 {
-	struct apply_microcode_ctx ctx = { .err = 0 };
+	enum ucode_state err;
 	int ret;
 	int ret;
 
 
-	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
-	if (!ret)
-		ret = ctx.err;
-
+	ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1);
+	if (!ret) {
+		if (err == UCODE_ERROR)
+			ret = 1;
+	}
 	return ret;
 	return ret;
 }
 }
 
 
@@ -489,31 +494,110 @@ static void __exit microcode_dev_exit(void)
 /* fake device for request_firmware */
 /* fake device for request_firmware */
 static struct platform_device	*microcode_pdev;
 static struct platform_device	*microcode_pdev;
 
 
-static int reload_for_cpu(int cpu)
+/*
+ * Late loading dance. Why the heavy-handed stomp_machine effort?
+ *
+ * - HT siblings must be idle and not execute other code while the other sibling
+ *   is loading microcode in order to avoid any negative interactions caused by
+ *   the loading.
+ *
+ * - In addition, microcode update on the cores must be serialized until this
+ *   requirement can be relaxed in the future. Right now, this is conservative
+ *   and good.
+ */
+#define SPINUNIT 100 /* 100 nsec */
+
+static int check_online_cpus(void)
 {
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	enum ucode_state ustate;
-	int err = 0;
+	if (num_online_cpus() == num_present_cpus())
+		return 0;
 
 
-	if (!uci->valid)
-		return err;
+	pr_err("Not all CPUs online, aborting microcode update.\n");
 
 
-	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
-	if (ustate == UCODE_OK)
-		apply_microcode_on_target(cpu);
-	else
-		if (ustate == UCODE_ERROR)
-			err = -EINVAL;
-	return err;
+	return -EINVAL;
+}
+
+static atomic_t late_cpus;
+
+/*
+ * Returns:
+ * < 0 - on error
+ *   0 - no update done
+ *   1 - microcode was updated
+ */
+static int __reload_late(void *info)
+{
+	unsigned int timeout = NSEC_PER_SEC;
+	int all_cpus = num_online_cpus();
+	int cpu = smp_processor_id();
+	enum ucode_state err;
+	int ret = 0;
+
+	atomic_dec(&late_cpus);
+
+	/*
+	 * Wait for all CPUs to arrive. A load will not be attempted unless all
+	 * CPUs show up.
+	 */
+	while (atomic_read(&late_cpus)) {
+		if (timeout < SPINUNIT) {
+			pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n",
+				atomic_read(&late_cpus));
+			return -1;
+		}
+
+		ndelay(SPINUNIT);
+		timeout -= SPINUNIT;
+
+		touch_nmi_watchdog();
+	}
+
+	spin_lock(&update_lock);
+	apply_microcode_local(&err);
+	spin_unlock(&update_lock);
+
+	if (err > UCODE_NFOUND) {
+		pr_warn("Error reloading microcode on CPU %d\n", cpu);
+		ret = -1;
+	} else if (err == UCODE_UPDATED) {
+		ret = 1;
+	}
+
+	atomic_inc(&late_cpus);
+
+	while (atomic_read(&late_cpus) != all_cpus)
+		cpu_relax();
+
+	return ret;
+}
+
+/*
+ * Reload microcode late on all CPUs. Wait for a sec until they
+ * all gather together.
+ */
+static int microcode_reload_late(void)
+{
+	int ret;
+
+	atomic_set(&late_cpus, num_online_cpus());
+
+	ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
+	if (ret < 0)
+		return ret;
+	else if (ret > 0)
+		microcode_check();
+
+	return ret;
 }
 }
 
 
 static ssize_t reload_store(struct device *dev,
 static ssize_t reload_store(struct device *dev,
 			    struct device_attribute *attr,
 			    struct device_attribute *attr,
 			    const char *buf, size_t size)
 			    const char *buf, size_t size)
 {
 {
+	enum ucode_state tmp_ret = UCODE_OK;
+	int bsp = boot_cpu_data.cpu_index;
 	unsigned long val;
 	unsigned long val;
-	int cpu;
-	ssize_t ret = 0, tmp_ret;
+	ssize_t ret = 0;
 
 
 	ret = kstrtoul(buf, 0, &val);
 	ret = kstrtoul(buf, 0, &val);
 	if (ret)
 	if (ret)
@@ -522,23 +606,24 @@ static ssize_t reload_store(struct device *dev,
 	if (val != 1)
 	if (val != 1)
 		return size;
 		return size;
 
 
+	tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+	if (tmp_ret != UCODE_OK)
+		return size;
+
 	get_online_cpus();
 	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-	for_each_online_cpu(cpu) {
-		tmp_ret = reload_for_cpu(cpu);
-		if (tmp_ret != 0)
-			pr_warn("Error reloading microcode on CPU %d\n", cpu);
 
 
-		/* save retval of the first encountered reload error */
-		if (!ret)
-			ret = tmp_ret;
-	}
-	if (!ret)
-		perf_check_microcode();
+	ret = check_online_cpus();
+	if (ret)
+		goto put;
+
+	mutex_lock(&microcode_mutex);
+	ret = microcode_reload_late();
 	mutex_unlock(&microcode_mutex);
 	mutex_unlock(&microcode_mutex);
+
+put:
 	put_online_cpus();
 	put_online_cpus();
 
 
-	if (!ret)
+	if (ret >= 0)
 		ret = size;
 		ret = size;
 
 
 	return ret;
 	return ret;
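The core of __reload_late() is a two-phase rendezvous on the late_cpus counter: every CPU decrements and spins until it reaches zero, the update runs serialized under update_lock, then every CPU increments and spins until the counter is back at the online-CPU count. A simplified user-space analogue of that pattern using C11 atomics and pthreads, purely illustrative; the kernel version additionally has the timeout, the NMI-watchdog touch and the error handling shown above:

/* Build with: cc -pthread rendezvous.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_THREADS 4

static atomic_int late_cpus = NR_THREADS;
static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;

static void *reload_late(void *arg)
{
	int id = (int)(long)arg;

	/* Phase 1: wait until every thread has arrived. */
	atomic_fetch_sub(&late_cpus, 1);
	while (atomic_load(&late_cpus))
		;	/* cpu_relax() equivalent */

	/* Serialized "update", one thread at a time. */
	pthread_mutex_lock(&update_lock);
	printf("thread %d: applying update\n", id);
	pthread_mutex_unlock(&update_lock);

	/* Phase 2: wait until everyone is done before returning. */
	atomic_fetch_add(&late_cpus, 1);
	while (atomic_load(&late_cpus) != NR_THREADS)
		;
	return NULL;
}

int main(void)
{
	pthread_t t[NR_THREADS];
	int i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&t[i], NULL, reload_late, (void *)(long)i);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(t[i], NULL);
	return 0;
}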

+ 45 - 13
arch/x86/kernel/cpu/microcode/intel.c

@@ -589,6 +589,23 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
 	if (!mc)
 	if (!mc)
 		return 0;
 		return 0;
 
 
+	/*
+	 * Save us the MSR write below - which is a particularly expensive
+	 * operation - when the other hyperthread has updated the microcode
+	 * already.
+	 */
+	rev = intel_get_microcode_revision();
+	if (rev >= mc->hdr.rev) {
+		uci->cpu_sig.rev = rev;
+		return UCODE_OK;
+	}
+
+	/*
+	 * Writeback and invalidate caches before updating microcode to avoid
+	 * internal issues depending on what the microcode is updating.
+	 */
+	native_wbinvd();
+
 	/* write microcode via MSR 0x79 */
 	/* write microcode via MSR 0x79 */
 	native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
 	native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
 
 
@@ -772,27 +789,44 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 	return 0;
 	return 0;
 }
 }
 
 
-static int apply_microcode_intel(int cpu)
+static enum ucode_state apply_microcode_intel(int cpu)
 {
 {
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct microcode_intel *mc;
 	struct microcode_intel *mc;
-	struct ucode_cpu_info *uci;
-	struct cpuinfo_x86 *c;
 	static int prev_rev;
 	static int prev_rev;
 	u32 rev;
 	u32 rev;
 
 
 	/* We should bind the task to the CPU */
 	/* We should bind the task to the CPU */
 	if (WARN_ON(raw_smp_processor_id() != cpu))
 	if (WARN_ON(raw_smp_processor_id() != cpu))
-		return -1;
+		return UCODE_ERROR;
 
 
-	uci = ucode_cpu_info + cpu;
-	mc = uci->mc;
+	/* Look for a newer patch in our cache: */
+	mc = find_patch(uci);
 	if (!mc) {
 	if (!mc) {
-		/* Look for a newer patch in our cache: */
-		mc = find_patch(uci);
+		mc = uci->mc;
 		if (!mc)
 		if (!mc)
-			return 0;
+			return UCODE_NFOUND;
 	}
 	}
 
 
+	/*
+	 * Save us the MSR write below - which is a particularly expensive
+	 * operation - when the other hyperthread has updated the microcode
+	 * already.
+	 */
+	rev = intel_get_microcode_revision();
+	if (rev >= mc->hdr.rev) {
+		uci->cpu_sig.rev = rev;
+		c->microcode = rev;
+		return UCODE_OK;
+	}
+
+	/*
+	 * Writeback and invalidate caches before updating microcode to avoid
+	 * internal issues depending on what the microcode is updating.
+	 */
+	native_wbinvd();
+
 	/* write microcode via MSR 0x79 */
 	/* write microcode via MSR 0x79 */
 	wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
 	wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
 
 
@@ -801,7 +835,7 @@ static int apply_microcode_intel(int cpu)
 	if (rev != mc->hdr.rev) {
 	if (rev != mc->hdr.rev) {
 		pr_err("CPU%d update to revision 0x%x failed\n",
 		pr_err("CPU%d update to revision 0x%x failed\n",
 		       cpu, mc->hdr.rev);
 		       cpu, mc->hdr.rev);
-		return -1;
+		return UCODE_ERROR;
 	}
 	}
 
 
 	if (rev != prev_rev) {
 	if (rev != prev_rev) {
@@ -813,12 +847,10 @@ static int apply_microcode_intel(int cpu)
 		prev_rev = rev;
 		prev_rev = rev;
 	}
 	}
 
 
-	c = &cpu_data(cpu);
-
 	uci->cpu_sig.rev = rev;
 	uci->cpu_sig.rev = rev;
 	c->microcode = rev;
 	c->microcode = rev;
 
 
-	return 0;
+	return UCODE_UPDATED;
 }
 }
 
 
 static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,

+ 2 - 0
arch/x86/kernel/head_64.S

@@ -23,6 +23,7 @@
 #include <asm/nops.h>
 #include <asm/nops.h>
 #include "../entry/calling.h"
 #include "../entry/calling.h"
 #include <asm/export.h>
 #include <asm/export.h>
+#include <asm/nospec-branch.h>
 
 
 #ifdef CONFIG_PARAVIRT
 #ifdef CONFIG_PARAVIRT
 #include <asm/asm-offsets.h>
 #include <asm/asm-offsets.h>
@@ -137,6 +138,7 @@ ENTRY(secondary_startup_64)
 
 
 	/* Ensure I am executing from virtual addresses */
 	/* Ensure I am executing from virtual addresses */
 	movq	$1f, %rax
 	movq	$1f, %rax
+	ANNOTATE_RETPOLINE_SAFE
 	jmp	*%rax
 	jmp	*%rax
 1:
 1:
 	UNWIND_HINT_EMPTY
 	UNWIND_HINT_EMPTY

+ 1 - 1
arch/x86/kernel/ioport.c

@@ -23,7 +23,7 @@
 /*
 /*
  * this changes the io permissions bitmap in the current task.
  * this changes the io permissions bitmap in the current task.
  */
  */
-asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on)
 {
 {
 	struct thread_struct *t = &current->thread;
 	struct thread_struct *t = &current->thread;
 	struct tss_struct *tss;
 	struct tss_struct *tss;

+ 9 - 1
arch/x86/kernel/kprobes/core.c

@@ -1168,10 +1168,18 @@ NOKPROBE_SYMBOL(longjmp_break_handler);
 
 
 bool arch_within_kprobe_blacklist(unsigned long addr)
 bool arch_within_kprobe_blacklist(unsigned long addr)
 {
 {
+	bool is_in_entry_trampoline_section = false;
+
+#ifdef CONFIG_X86_64
+	is_in_entry_trampoline_section =
+		(addr >= (unsigned long)__entry_trampoline_start &&
+		 addr < (unsigned long)__entry_trampoline_end);
+#endif
 	return  (addr >= (unsigned long)__kprobes_text_start &&
 	return  (addr >= (unsigned long)__kprobes_text_start &&
 		 addr < (unsigned long)__kprobes_text_end) ||
 		 addr < (unsigned long)__kprobes_text_end) ||
 		(addr >= (unsigned long)__entry_text_start &&
 		(addr >= (unsigned long)__entry_text_start &&
-		 addr < (unsigned long)__entry_text_end);
+		 addr < (unsigned long)__entry_text_end) ||
+		is_in_entry_trampoline_section;
 }
 }
 
 
 int __init arch_init_kprobes(void)
 int __init arch_init_kprobes(void)

+ 5 - 12
arch/x86/kernel/setup.c

@@ -1203,20 +1203,13 @@ void __init setup_arch(char **cmdline_p)
 
 
 	kasan_init();
 	kasan_init();
 
 
-#ifdef CONFIG_X86_32
-	/* sync back kernel address range */
-	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
-			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
-			KERNEL_PGD_PTRS);
-
 	/*
 	/*
-	 * sync back low identity map too.  It is used for example
-	 * in the 32-bit EFI stub.
+	 * Sync back kernel address range.
+	 *
+	 * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+	 * this call?
 	 */
 	 */
-	clone_pgd_range(initial_page_table,
-			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
-			min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
-#endif
+	sync_initial_page_table();
 
 
 	tboot_probe();
 	tboot_probe();
 
 

+ 4 - 13
arch/x86/kernel/setup_percpu.c

@@ -287,24 +287,15 @@ void __init setup_per_cpu_areas(void)
 	/* Setup cpu initialized, callin, callout masks */
 	/* Setup cpu initialized, callin, callout masks */
 	setup_cpu_local_masks();
 	setup_cpu_local_masks();
 
 
-#ifdef CONFIG_X86_32
 	/*
 	/*
 	 * Sync back kernel address range again.  We already did this in
 	 * Sync back kernel address range again.  We already did this in
 	 * setup_arch(), but percpu data also needs to be available in
 	 * setup_arch(), but percpu data also needs to be available in
 	 * the smpboot asm.  We can't reliably pick up percpu mappings
 	 * the smpboot asm.  We can't reliably pick up percpu mappings
 	 * using vmalloc_fault(), because exception dispatch needs
 	 * using vmalloc_fault(), because exception dispatch needs
 	 * percpu data.
 	 * percpu data.
+	 *
+	 * FIXME: Can the later sync in setup_cpu_entry_areas() replace
+	 * this call?
 	 */
 	 */
-	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
-			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
-			KERNEL_PGD_PTRS);
-
-	/*
-	 * sync back low identity map too.  It is used for example
-	 * in the 32-bit EFI stub.
-	 */
-	clone_pgd_range(initial_page_table,
-			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
-			min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
-#endif
+	sync_initial_page_table();
 }
 }

+ 1 - 2
arch/x86/kernel/unwind_orc.c

@@ -5,7 +5,6 @@
 #include <asm/unwind.h>
 #include <asm/unwind.h>
 #include <asm/orc_types.h>
 #include <asm/orc_types.h>
 #include <asm/orc_lookup.h>
 #include <asm/orc_lookup.h>
-#include <asm/sections.h>
 
 
 #define orc_warn(fmt, ...) \
 #define orc_warn(fmt, ...) \
 	printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
 	printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
@@ -148,7 +147,7 @@ static struct orc_entry *orc_find(unsigned long ip)
 	}
 	}
 
 
 	/* vmlinux .init slow lookup: */
 	/* vmlinux .init slow lookup: */
-	if (ip >= (unsigned long)_sinittext && ip < (unsigned long)_einittext)
+	if (init_kernel_text(ip))
 		return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
 		return __orc_find(__start_orc_unwind_ip, __start_orc_unwind,
 				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
 				  __stop_orc_unwind_ip - __start_orc_unwind_ip, ip);
 
 

+ 2 - 0
arch/x86/kernel/vmlinux.lds.S

@@ -118,9 +118,11 @@ SECTIONS
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 		. = ALIGN(PAGE_SIZE);
 		. = ALIGN(PAGE_SIZE);
+		VMLINUX_SYMBOL(__entry_trampoline_start) = .;
 		_entry_trampoline = .;
 		_entry_trampoline = .;
 		*(.entry_trampoline)
 		*(.entry_trampoline)
 		. = ALIGN(PAGE_SIZE);
 		. = ALIGN(PAGE_SIZE);
+		VMLINUX_SYMBOL(__entry_trampoline_end) = .;
 		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
 		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
 #endif
 #endif
 
 

+ 5 - 4
arch/x86/kvm/svm.c

@@ -49,6 +49,7 @@
 #include <asm/debugreg.h>
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
 #include <asm/irq_remapping.h>
+#include <asm/microcode.h>
 #include <asm/nospec-branch.h>
 #include <asm/nospec-branch.h>
 
 
 #include <asm/virtext.h>
 #include <asm/virtext.h>
@@ -5355,7 +5356,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * being speculatively taken.
 	 * being speculatively taken.
 	 */
 	 */
 	if (svm->spec_ctrl)
 	if (svm->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+		native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
 
 
 	asm volatile (
 	asm volatile (
 		"push %%" _ASM_BP "; \n\t"
 		"push %%" _ASM_BP "; \n\t"
@@ -5464,11 +5465,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * save it.
 	 * save it.
 	 */
 	 */
-	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
-		rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+		svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
 
 	if (svm->spec_ctrl)
 	if (svm->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
 
 
 	/* Eliminate branch target predictions from guest mode */
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
 	vmexit_fill_RSB();

+ 5 - 4
arch/x86/kvm/vmx.c

@@ -51,6 +51,7 @@
 #include <asm/apic.h>
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
 #include <asm/mmu_context.h>
+#include <asm/microcode.h>
 #include <asm/nospec-branch.h>
 #include <asm/nospec-branch.h>
 
 
 #include "trace.h"
 #include "trace.h"
@@ -9452,7 +9453,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * being speculatively taken.
 	 * being speculatively taken.
 	 */
 	 */
 	if (vmx->spec_ctrl)
 	if (vmx->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+		native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
 
 
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 	asm(
@@ -9587,11 +9588,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
 	 * save it.
 	 * save it.
 	 */
 	 */
-	if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
-		rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+	if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+		vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
 
 
 	if (vmx->spec_ctrl)
 	if (vmx->spec_ctrl)
-		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+		native_wrmsrl(MSR_IA32_SPEC_CTRL, 0);
 
 
 	/* Eliminate branch target predictions from guest mode */
 	/* Eliminate branch target predictions from guest mode */
 	vmexit_fill_RSB();
 	vmexit_fill_RSB();

+ 0 - 1
arch/x86/lib/Makefile

@@ -28,7 +28,6 @@ lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 lib-$(CONFIG_FUNCTION_ERROR_INJECTION)	+= error-inject.o
 lib-$(CONFIG_RETPOLINE) += retpoline.o
 lib-$(CONFIG_RETPOLINE) += retpoline.o
-OBJECT_FILES_NON_STANDARD_retpoline.o :=y
 
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
 

+ 0 - 56
arch/x86/lib/retpoline.S

@@ -7,7 +7,6 @@
 #include <asm/alternative-asm.h>
 #include <asm/alternative-asm.h>
 #include <asm/export.h>
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/nospec-branch.h>
-#include <asm/bitsperlong.h>
 
 
 .macro THUNK reg
 .macro THUNK reg
 	.section .text.__x86.indirect_thunk
 	.section .text.__x86.indirect_thunk
@@ -47,58 +46,3 @@ GENERATE_THUNK(r13)
 GENERATE_THUNK(r14)
 GENERATE_THUNK(r14)
 GENERATE_THUNK(r15)
 GENERATE_THUNK(r15)
 #endif
 #endif
-
-/*
- * Fill the CPU return stack buffer.
- *
- * Each entry in the RSB, if used for a speculative 'ret', contains an
- * infinite 'pause; lfence; jmp' loop to capture speculative execution.
- *
- * This is required in various cases for retpoline and IBRS-based
- * mitigations for the Spectre variant 2 vulnerability. Sometimes to
- * eliminate potentially bogus entries from the RSB, and sometimes
- * purely to ensure that it doesn't get empty, which on some CPUs would
- * allow predictions from other (unwanted!) sources to be used.
- *
- * Google experimented with loop-unrolling and this turned out to be
- * the optimal version - two calls, each with their own speculation
- * trap should their return address end up getting used, in a loop.
- */
-.macro STUFF_RSB nr:req sp:req
-	mov	$(\nr / 2), %_ASM_BX
-	.align 16
-771:
-	call	772f
-773:						/* speculation trap */
-	pause
-	lfence
-	jmp	773b
-	.align 16
-772:
-	call	774f
-775:						/* speculation trap */
-	pause
-	lfence
-	jmp	775b
-	.align 16
-774:
-	dec	%_ASM_BX
-	jnz	771b
-	add	$((BITS_PER_LONG/8) * \nr), \sp
-.endm
-
-#define RSB_FILL_LOOPS		16	/* To avoid underflow */
-
-ENTRY(__fill_rsb)
-	STUFF_RSB RSB_FILL_LOOPS, %_ASM_SP
-	ret
-END(__fill_rsb)
-EXPORT_SYMBOL_GPL(__fill_rsb)
-
-#define RSB_CLEAR_LOOPS		32	/* To forcibly overwrite all entries */
-
-ENTRY(__clear_rsb)
-	STUFF_RSB RSB_CLEAR_LOOPS, %_ASM_SP
-	ret
-END(__clear_rsb)
-EXPORT_SYMBOL_GPL(__clear_rsb)

+ 6 - 0
arch/x86/mm/cpu_entry_area.c

@@ -163,4 +163,10 @@ void __init setup_cpu_entry_areas(void)
 
 
 	for_each_possible_cpu(cpu)
 	for_each_possible_cpu(cpu)
 		setup_cpu_entry_area(cpu);
 		setup_cpu_entry_area(cpu);
+
+	/*
+	 * This is the last essential update to swapper_pg_dir which needs
+	 * to be synchronized to initial_page_table on 32bit.
+	 */
+	sync_initial_page_table();
 }
 }

+ 0 - 4
arch/x86/mm/fault.c

@@ -1248,10 +1248,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	tsk = current;
 	tsk = current;
 	mm = tsk->mm;
 	mm = tsk->mm;
 
 
-	/*
-	 * Detect and handle instructions that would cause a page fault for
-	 * both a tracked kernel page and a userspace page.
-	 */
 	prefetchw(&mm->mmap_sem);
 	prefetchw(&mm->mmap_sem);
 
 
 	if (unlikely(kmmio_fault(regs, address)))
 	if (unlikely(kmmio_fault(regs, address)))

+ 15 - 0
arch/x86/mm/init_32.c

@@ -453,6 +453,21 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base)
 }
 }
 #endif /* CONFIG_HIGHMEM */
 #endif /* CONFIG_HIGHMEM */
 
 
+void __init sync_initial_page_table(void)
+{
+	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
+			KERNEL_PGD_PTRS);
+
+	/*
+	 * sync back low identity map too.  It is used for example
+	 * in the 32-bit EFI stub.
+	 */
+	clone_pgd_range(initial_page_table,
+			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
+			min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+}
+
 void __init native_pagetable_init(void)
 void __init native_pagetable_init(void)
 {
 {
 	unsigned long pfn, va;
 	unsigned long pfn, va;

+ 2 - 0
arch/x86/mm/mem_encrypt_boot.S

@@ -15,6 +15,7 @@
 #include <asm/page.h>
 #include <asm/page.h>
 #include <asm/processor-flags.h>
 #include <asm/processor-flags.h>
 #include <asm/msr-index.h>
 #include <asm/msr-index.h>
+#include <asm/nospec-branch.h>
 
 
 	.text
 	.text
 	.code64
 	.code64
@@ -59,6 +60,7 @@ ENTRY(sme_encrypt_execute)
 	movq	%rax, %r8		/* Workarea encryption routine */
 	movq	%rax, %r8		/* Workarea encryption routine */
 	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
 	addq	$PAGE_SIZE, %r8		/* Workarea intermediate copy buffer */
 
 
+	ANNOTATE_RETPOLINE_SAFE
 	call	*%rax			/* Call the encryption routine */
 	call	*%rax			/* Call the encryption routine */
 
 
 	pop	%r12
 	pop	%r12

+ 1 - 1
arch/x86/mm/pti.c

@@ -332,7 +332,7 @@ static void __init pti_clone_user_shared(void)
 }
 }
 
 
 /*
 /*
- * Clone the ESPFIX P4D into the user space visinble page table
+ * Clone the ESPFIX P4D into the user space visible page table
  */
  */
 static void __init pti_setup_espfix64(void)
 static void __init pti_setup_espfix64(void)
 {
 {

+ 1 - 1
arch/x86/realmode/rm/trampoline_64.S

@@ -102,7 +102,7 @@ ENTRY(startup_32)
 	 * don't we'll eventually crash trying to execute encrypted
 	 * don't we'll eventually crash trying to execute encrypted
 	 * instructions.
 	 * instructions.
 	 */
 	 */
-	bt	$TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
+	btl	$TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags
 	jnc	.Ldone
 	jnc	.Ldone
 	movl	$MSR_K8_SYSCFG, %ecx
 	movl	$MSR_K8_SYSCFG, %ecx
 	rdmsr
 	rdmsr

+ 16 - 0
arch/x86/xen/suspend.c

@@ -1,12 +1,15 @@
 // SPDX-License-Identifier: GPL-2.0
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/tick.h>
 #include <linux/tick.h>
+#include <linux/percpu-defs.h>
 
 
 #include <xen/xen.h>
 #include <xen/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/interface/xen.h>
 #include <xen/grant_table.h>
 #include <xen/grant_table.h>
 #include <xen/events.h>
 #include <xen/events.h>
 
 
+#include <asm/cpufeatures.h>
+#include <asm/msr-index.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/page.h>
 #include <asm/xen/page.h>
 #include <asm/fixmap.h>
 #include <asm/fixmap.h>
@@ -15,6 +18,8 @@
 #include "mmu.h"
 #include "mmu.h"
 #include "pmu.h"
 #include "pmu.h"
 
 
+static DEFINE_PER_CPU(u64, spec_ctrl);
+
 void xen_arch_pre_suspend(void)
 void xen_arch_pre_suspend(void)
 {
 {
 	xen_save_time_memory_area();
 	xen_save_time_memory_area();
@@ -35,6 +40,9 @@ void xen_arch_post_suspend(int cancelled)
 
 
 static void xen_vcpu_notify_restore(void *data)
 static void xen_vcpu_notify_restore(void *data)
 {
 {
+	if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+		wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl));
+
 	/* Boot processor notified via generic timekeeping_resume() */
 	/* Boot processor notified via generic timekeeping_resume() */
 	if (smp_processor_id() == 0)
 	if (smp_processor_id() == 0)
 		return;
 		return;
@@ -44,7 +52,15 @@ static void xen_vcpu_notify_restore(void *data)
 
 
 static void xen_vcpu_notify_suspend(void *data)
 static void xen_vcpu_notify_suspend(void *data)
 {
 {
+	u64 tmp;
+
 	tick_suspend_local();
 	tick_suspend_local();
+
+	if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) {
+		rdmsrl(MSR_IA32_SPEC_CTRL, tmp);
+		this_cpu_write(spec_ctrl, tmp);
+		wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+	}
 }
 }
 
 
 void xen_arch_resume(void)
 void xen_arch_resume(void)

+ 5 - 0
include/linux/compiler-clang.h

@@ -27,3 +27,8 @@
 #if __has_feature(address_sanitizer)
 #if __has_feature(address_sanitizer)
 #define __SANITIZE_ADDRESS__
 #define __SANITIZE_ADDRESS__
 #endif
 #endif
+
+/* Clang doesn't have a way to turn it off per-function, yet. */
+#ifdef __noretpoline
+#undef __noretpoline
+#endif

+ 4 - 0
include/linux/compiler-gcc.h

@@ -93,6 +93,10 @@
 #define __weak		__attribute__((weak))
 #define __weak		__attribute__((weak))
 #define __alias(symbol)	__attribute__((alias(#symbol)))
 #define __alias(symbol)	__attribute__((alias(#symbol)))
 
 
+#ifdef RETPOLINE
+#define __noretpoline __attribute__((indirect_branch("keep")))
+#endif
+
 /*
 /*
  * it doesn't make sense on ARM (currently the only user of __naked)
  * it doesn't make sense on ARM (currently the only user of __naked)
  * to trace naked functions because then mcount is called without
  * to trace naked functions because then mcount is called without

+ 4 - 4
include/linux/init.h

@@ -6,10 +6,10 @@
 #include <linux/types.h>
 #include <linux/types.h>
 
 
 /* Built-in __init functions needn't be compiled with retpoline */
 /* Built-in __init functions needn't be compiled with retpoline */
-#if defined(RETPOLINE) && !defined(MODULE)
-#define __noretpoline __attribute__((indirect_branch("keep")))
+#if defined(__noretpoline) && !defined(MODULE)
+#define __noinitretpoline __noretpoline
 #else
 #else
-#define __noretpoline
+#define __noinitretpoline
 #endif
 #endif
 
 
 /* These macros are used to mark some functions or 
 /* These macros are used to mark some functions or 
@@ -47,7 +47,7 @@
 
 
 /* These are for everybody (although not all archs will actually
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
    discard it in modules) */
-#define __init		__section(.init.text) __cold  __latent_entropy __noretpoline
+#define __init		__section(.init.text) __cold  __latent_entropy __noinitretpoline
 #define __initdata	__section(.init.data)
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
 #define __exitdata	__section(.exit.data)
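Taken together with the compiler-gcc.h and compiler-clang.h hunks above, the effect for a built-in (non-module) translation unit is roughly the expansion below; this is written out by hand for illustration, not copied from any header:

/* GCC, RETPOLINE defined, !MODULE: */
#define __noretpoline		__attribute__((indirect_branch("keep")))
#define __noinitretpoline	__noretpoline
#define __init			__section(.init.text) __cold __latent_entropy __noinitretpoline

/* Clang: compiler-clang.h #undefs __noretpoline, so __noinitretpoline
 * expands to nothing and __init carries no indirect_branch attribute. */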

+ 3 - 0
include/linux/jump_label.h

@@ -151,6 +151,7 @@ extern struct jump_entry __start___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 
 
 extern void jump_label_init(void);
 extern void jump_label_init(void);
+extern void jump_label_invalidate_init(void);
 extern void jump_label_lock(void);
 extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
 extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -198,6 +199,8 @@ static __always_inline void jump_label_init(void)
 	static_key_initialized = true;
 	static_key_initialized = true;
 }
 }
 
 
+static inline void jump_label_invalidate_init(void) {}
+
 static __always_inline bool static_key_false(struct static_key *key)
 static __always_inline bool static_key_false(struct static_key *key)
 {
 {
 	if (unlikely(static_key_count(key) > 0))
 	if (unlikely(static_key_count(key) > 0))

+ 1 - 0
include/linux/kernel.h

@@ -472,6 +472,7 @@ extern bool parse_option_str(const char *str, const char *option);
 extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
+extern int init_kernel_text(unsigned long addr);
 extern int core_kernel_data(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);

+ 3 - 23
include/linux/nospec.h

@@ -5,6 +5,7 @@
 
 #ifndef _LINUX_NOSPEC_H
 #define _LINUX_NOSPEC_H
+#include <asm/barrier.h>
 
 /**
  * array_index_mask_nospec() - generate a ~0 mask when index < size, 0 otherwise
@@ -29,26 +30,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 }
 #endif
 
-/*
- * Warn developers about inappropriate array_index_nospec() usage.
- *
- * Even if the CPU speculates past the WARN_ONCE branch, the
- * sign bit of @index is taken into account when generating the
- * mask.
- *
- * This warning is compiled out when the compiler can infer that
- * @index and @size are less than LONG_MAX.
- */
-#define array_index_mask_nospec_check(index, size)				\
-({										\
-	if (WARN_ONCE(index > LONG_MAX || size > LONG_MAX,			\
-	    "array_index_nospec() limited to range of [0, LONG_MAX]\n"))	\
-		_mask = 0;							\
-	else									\
-		_mask = array_index_mask_nospec(index, size);			\
-	_mask;									\
-})
-
 /*
  * array_index_nospec - sanitize an array index after a bounds check
  *
@@ -67,12 +48,11 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
 ({									\
 	typeof(index) _i = (index);					\
 	typeof(size) _s = (size);					\
-	unsigned long _mask = array_index_mask_nospec_check(_i, _s);	\
+	unsigned long _mask = array_index_mask_nospec(_i, _s);		\
 									\
 	BUILD_BUG_ON(sizeof(_i) > sizeof(long));			\
 	BUILD_BUG_ON(sizeof(_s) > sizeof(long));			\
 									\
-	_i &= _mask;							\
-	_i;								\
+	(typeof(_i)) (_i & _mask);					\
 })
 #endif /* _LINUX_NOSPEC_H */
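
A typical caller of the reworked macro, for reference; the result now carries the type of the index, so it can be assigned straight back. The helper below is a hypothetical sketch, not part of this patch:

	#include <linux/kernel.h>
	#include <linux/errno.h>
	#include <linux/nospec.h>

	static int example_table[16];

	/* Clamp a user-controlled index after the bounds check so a
	 * speculatively executed load cannot use an out-of-range value. */
	static int example_read(unsigned long idx)
	{
		if (idx >= ARRAY_SIZE(example_table))
			return -EINVAL;

		idx = array_index_nospec(idx, ARRAY_SIZE(example_table));
		return example_table[idx];
	}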

+ 2 - 0
init/main.c

@@ -89,6 +89,7 @@
 #include <linux/io.h>
 #include <linux/cache.h>
 #include <linux/rodata_test.h>
+#include <linux/jump_label.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -1000,6 +1001,7 @@ static int __ref kernel_init(void *unused)
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
 	ftrace_free_init_mem();
+	jump_label_invalidate_init();
 	free_initmem();
 	mark_readonly();
 	system_state = SYSTEM_RUNNING;

+ 1 - 1
kernel/extable.c

@@ -64,7 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
-static inline int init_kernel_text(unsigned long addr)
+int init_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_sinittext &&
 	    addr < (unsigned long)_einittext)

+ 22 - 5
kernel/jump_label.c

@@ -366,12 +366,15 @@ static void __jump_label_update(struct static_key *key,
 {
 	for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
 		/*
-		 * entry->code set to 0 invalidates module init text sections
-		 * kernel_text_address() verifies we are not in core kernel
-		 * init code, see jump_label_invalidate_module_init().
+		 * An entry->code of 0 indicates an entry which has been
+		 * disabled because it was in an init text area.
 		 */
 		 */
-		if (entry->code && kernel_text_address(entry->code))
-			arch_jump_label_transform(entry, jump_label_type(entry));
+		if (entry->code) {
+			if (kernel_text_address(entry->code))
+				arch_jump_label_transform(entry, jump_label_type(entry));
+			else
+				WARN_ONCE(1, "can't patch jump_label at %pS", (void *)entry->code);
+		}
 	}
 }
 
@@ -417,6 +420,19 @@ void __init jump_label_init(void)
 	cpus_read_unlock();
 }
 
+/* Disable any jump label entries in __init code */
+void __init jump_label_invalidate_init(void)
+{
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	struct jump_entry *iter;
+
+	for (iter = iter_start; iter < iter_stop; iter++) {
+		if (init_kernel_text(iter->code))
+			iter->code = 0;
+	}
+}
+
 #ifdef CONFIG_MODULES
 
 static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
@@ -633,6 +649,7 @@ static void jump_label_del_module(struct module *mod)
 	}
 }
 
+/* Disable any jump label entries in module init code */
 static void jump_label_invalidate_module_init(struct module *mod)
 {
 	struct jump_entry *iter_start = mod->jump_entries;
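
To see why the core kernel now needs the same treatment as module init sections: a static branch used from __init code leaves its jump_entry pointing into .init.text, which free_initmem() releases. Once jump_label_invalidate_init() has zeroed those entries, a later key toggle skips them instead of patching freed memory. A hedged sketch with made-up names:

	#include <linux/init.h>
	#include <linux/jump_label.h>
	#include <linux/printk.h>

	static DEFINE_STATIC_KEY_FALSE(example_key);

	static int __init example_setup(void)
	{
		/* This branch emits a jump_entry whose ->code lies in .init.text. */
		if (static_branch_unlikely(&example_key))
			pr_info("example: fast path enabled at boot\n");
		return 0;
	}

	/* Called after boot: without the invalidation above, __jump_label_update()
	 * would be asked to patch an address inside already-freed init memory. */
	void example_enable(void)
	{
		static_branch_enable(&example_key);
	}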

+ 8 - 0
scripts/Makefile.build

@@ -256,6 +256,8 @@ __objtool_obj := $(objtree)/tools/objtool/objtool
 
 objtool_args = $(if $(CONFIG_UNWINDER_ORC),orc generate,check)
 
+objtool_args += $(if $(part-of-module), --module,)
+
 ifndef CONFIG_FRAME_POINTER
 objtool_args += --no-fp
 endif
@@ -264,6 +266,12 @@ objtool_args += --no-unreachable
 else
 objtool_args += $(call cc-ifversion, -lt, 0405, --no-unreachable)
 endif
+ifdef CONFIG_RETPOLINE
+ifneq ($(RETPOLINE_CFLAGS),)
+  objtool_args += --retpoline
+endif
+endif
+
 
 ifdef CONFIG_MODVERSIONS
 objtool_o = $(@D)/.tmp_$(@F)
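
Putting the pieces together: for a module object in a CONFIG_RETPOLINE=y, CONFIG_UNWINDER_ORC=y build whose compiler accepts RETPOLINE_CFLAGS (and with frame pointers disabled, hence --no-fp), the generated invocation ends up roughly as below; the object path is illustrative only:

	./tools/objtool/objtool orc generate --module --no-fp --retpoline drivers/foo/bar.o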

+ 4 - 2
tools/objtool/builtin-check.c

@@ -29,7 +29,7 @@
 #include "builtin.h"
 #include "check.h"
 
-bool no_fp, no_unreachable;
+bool no_fp, no_unreachable, retpoline, module;
 
 static const char * const check_usage[] = {
 	"objtool check [<options>] file.o",
@@ -39,6 +39,8 @@ static const char * const check_usage[] = {
 const struct option check_options[] = {
 	OPT_BOOLEAN('f', "no-fp", &no_fp, "Skip frame pointer validation"),
 	OPT_BOOLEAN('u', "no-unreachable", &no_unreachable, "Skip 'unreachable instruction' warnings"),
+	OPT_BOOLEAN('r', "retpoline", &retpoline, "Validate retpoline assumptions"),
+	OPT_BOOLEAN('m', "module", &module, "Indicates the object will be part of a kernel module"),
 	OPT_END(),
 };
 
@@ -53,5 +55,5 @@ int cmd_check(int argc, const char **argv)
 
 	objname = argv[0];
 
-	return check(objname, no_fp, no_unreachable, false);
+	return check(objname, false);
 }

+ 1 - 5
tools/objtool/builtin-orc.c

@@ -25,7 +25,6 @@
  */
 
 #include <string.h>
-#include <subcmd/parse-options.h>
 #include "builtin.h"
 #include "check.h"
 
@@ -36,9 +35,6 @@ static const char *orc_usage[] = {
 	NULL,
 };
 
-extern const struct option check_options[];
-extern bool no_fp, no_unreachable;
-
 int cmd_orc(int argc, const char **argv)
 {
 	const char *objname;
@@ -54,7 +50,7 @@
 
 		objname = argv[0];
 
-		return check(objname, no_fp, no_unreachable, true);
+		return check(objname, true);
 	}
 
 	if (!strcmp(argv[0], "dump")) {

+ 5 - 0
tools/objtool/builtin.h

@@ -17,6 +17,11 @@
 #ifndef _BUILTIN_H
 #define _BUILTIN_H
 
+#include <subcmd/parse-options.h>
+
+extern const struct option check_options[];
+extern bool no_fp, no_unreachable, retpoline, module;
+
 extern int cmd_check(int argc, const char **argv);
 extern int cmd_orc(int argc, const char **argv);
 

+ 88 - 5
tools/objtool/check.c

@@ -18,6 +18,7 @@
 #include <string.h>
 #include <stdlib.h>
 
+#include "builtin.h"
 #include "check.h"
 #include "elf.h"
 #include "special.h"
@@ -33,7 +34,6 @@ struct alternative {
 };
 
 const char *objname;
-static bool no_fp;
 struct cfi_state initial_func_cfi;
 
 struct instruction *find_insn(struct objtool_file *file,
@@ -497,6 +497,7 @@ static int add_jump_destinations(struct objtool_file *file)
 			 * disguise, so convert them accordingly.
 			 */
 			insn->type = INSN_JUMP_DYNAMIC;
+			insn->retpoline_safe = true;
 			continue;
 		} else {
 			/* sibling call */
@@ -548,7 +549,8 @@ static int add_call_destinations(struct objtool_file *file)
 			if (!insn->call_dest && !insn->ignore) {
 				WARN_FUNC("unsupported intra-function call",
 					  insn->sec, insn->offset);
-				WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.");
+				if (retpoline)
+					WARN("If this is a retpoline, please patch it in with alternatives and annotate it with ANNOTATE_NOSPEC_ALTERNATIVE.");
 				return -1;
 			}
 
@@ -923,7 +925,11 @@ static struct rela *find_switch_table(struct objtool_file *file,
 		if (find_symbol_containing(file->rodata, text_rela->addend))
 			continue;
 
-		return find_rela_by_dest(file->rodata, text_rela->addend);
+		rodata_rela = find_rela_by_dest(file->rodata, text_rela->addend);
+		if (!rodata_rela)
+			continue;
+
+		return rodata_rela;
 	}
 
 	return NULL;
@@ -1108,6 +1114,41 @@ static int read_unwind_hints(struct objtool_file *file)
 	return 0;
 }
 
+static int read_retpoline_hints(struct objtool_file *file)
+{
+	struct section *sec;
+	struct instruction *insn;
+	struct rela *rela;
+
+	sec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe");
+	if (!sec)
+		return 0;
+
+	list_for_each_entry(rela, &sec->rela_list, list) {
+		if (rela->sym->type != STT_SECTION) {
+			WARN("unexpected relocation symbol type in %s", sec->name);
+			return -1;
+		}
+
+		insn = find_insn(file, rela->sym->sec, rela->addend);
+		if (!insn) {
+			WARN("bad .discard.retpoline_safe entry");
+			return -1;
+		}
+
+		if (insn->type != INSN_JUMP_DYNAMIC &&
+		    insn->type != INSN_CALL_DYNAMIC) {
+			WARN_FUNC("retpoline_safe hint not an indirect jump/call",
+				  insn->sec, insn->offset);
+			return -1;
+		}
+
+		insn->retpoline_safe = true;
+	}
+
+	return 0;
+}
+
 static int decode_sections(struct objtool_file *file)
 {
 	int ret;
@@ -1146,6 +1187,10 @@ static int decode_sections(struct objtool_file *file)
 	if (ret)
 		return ret;
 
+	ret = read_retpoline_hints(file);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -1891,6 +1936,38 @@ static int validate_unwind_hints(struct objtool_file *file)
 	return warnings;
 }
 
+static int validate_retpoline(struct objtool_file *file)
+{
+	struct instruction *insn;
+	int warnings = 0;
+
+	for_each_insn(file, insn) {
+		if (insn->type != INSN_JUMP_DYNAMIC &&
+		    insn->type != INSN_CALL_DYNAMIC)
+			continue;
+
+		if (insn->retpoline_safe)
+			continue;
+
+		/*
+		 * .init.text code is ran before userspace and thus doesn't
+		 * strictly need retpolines, except for modules which are
+		 * loaded late, they very much do need retpoline in their
+		 * .init.text
+		 */
+		if (!strcmp(insn->sec->name, ".init.text") && !module)
+			continue;
+
+		WARN_FUNC("indirect %s found in RETPOLINE build",
+			  insn->sec, insn->offset,
+			  insn->type == INSN_JUMP_DYNAMIC ? "jump" : "call");
+
+		warnings++;
+	}
+
+	return warnings;
+}
+
 static bool is_kasan_insn(struct instruction *insn)
 {
 	return (insn->type == INSN_CALL &&
@@ -2022,13 +2099,12 @@ static void cleanup(struct objtool_file *file)
 	elf_close(file->elf);
 }
 
-int check(const char *_objname, bool _no_fp, bool no_unreachable, bool orc)
+int check(const char *_objname, bool orc)
 {
 	struct objtool_file file;
 	int ret, warnings = 0;
 
 	objname = _objname;
-	no_fp = _no_fp;
 
 	file.elf = elf_open(objname, orc ? O_RDWR : O_RDONLY);
 	if (!file.elf)
@@ -2052,6 +2128,13 @@ int check(const char *_objname, bool _no_fp, bool no_unreachable, bool orc)
 	if (list_empty(&file.insn_list))
 		goto out;
 
+	if (retpoline) {
+		ret = validate_retpoline(&file);
+		if (ret < 0)
+			return ret;
+		warnings += ret;
+	}
+
 	ret = validate_functions(&file);
 	if (ret < 0)
 		goto out;
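
validate_retpoline() stays quiet only for indirect branches that read_retpoline_hints() has marked safe via the .rela.discard.retpoline_safe section; the x86 headers pulled in by this merge wrap that annotation in a macro. An open-coded sketch of the underlying mechanism, for illustration only and not the kernel's exact macro:

	/* Record the address of one deliberate indirect call in
	 * .discard.retpoline_safe; objtool's read_retpoline_hints() then sets
	 * insn->retpoline_safe for it, so validate_retpoline() does not warn.
	 * Hypothetical helper; the clobber list is trimmed for brevity. */
	static void example_indirect_call(void (*fn)(void))
	{
		asm volatile("999:\n\t"
			     ".pushsection .discard.retpoline_safe\n\t"
			     ".quad 999b\n\t"		/* _ASM_PTR on 64-bit */
			     ".popsection\n\t"
			     "call *%0"
			     : : "r" (fn) : "memory");
	}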

+ 2 - 1
tools/objtool/check.h

@@ -45,6 +45,7 @@ struct instruction {
 	unsigned char type;
 	unsigned long immediate;
 	bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts;
+	bool retpoline_safe;
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
 	struct instruction *first_jump_src;
@@ -63,7 +64,7 @@ struct objtool_file {
 	bool ignore_unreachables, c_file, hints;
 };
 
-int check(const char *objname, bool no_fp, bool no_unreachable, bool orc);
+int check(const char *objname, bool orc);
 
 struct instruction *find_insn(struct objtool_file *file,
 			      struct section *sec, unsigned long offset);

+ 6 - 5
tools/testing/selftests/x86/test_vsyscall.c

@@ -450,7 +450,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
 		num_vsyscall_traps++;
 }
 
-static int test_native_vsyscall(void)
+static int test_emulation(void)
 {
 	time_t tmp;
 	bool is_native;
@@ -458,7 +458,7 @@ static int test_native_vsyscall(void)
 	if (!vtime)
 		return 0;
 
-	printf("[RUN]\tchecking for native vsyscall\n");
+	printf("[RUN]\tchecking that vsyscalls are emulated\n");
 	sethandler(SIGTRAP, sigtrap, 0);
 	set_eflags(get_eflags() | X86_EFLAGS_TF);
 	vtime(&tmp);
@@ -474,11 +474,12 @@ static int test_native_vsyscall(void)
 	 */
 
 	is_native = (num_vsyscall_traps > 1);
-	printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n",
+	printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n",
+	       (is_native ? "FAIL" : "OK"),
 	       (is_native ? "native" : "emulated"),
 	       (int)num_vsyscall_traps);
 
-	return 0;
+	return is_native;
 }
 #endif
 
@@ -498,7 +499,7 @@ int main(int argc, char **argv)
 	nerrs += test_vsys_r();
 
 #ifdef __x86_64__
-	nerrs += test_native_vsyscall();
+	nerrs += test_emulation();
 #endif
 
 	return nerrs ? 1 : 0;