
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page
     tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how
     the ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and
     the user space visible page tables

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
Linus Torvalds, 7 years ago
Commit 5aa90a8458
45 changed files, 1636 insertions(+) and 202 deletions(-) (per file: additions, deletions):
  1. 8 0
      Documentation/admin-guide/kernel-parameters.txt
  2. 3 2
      Documentation/x86/x86_64/mm.txt
  3. 3 0
      arch/x86/boot/compressed/pagetable.c
  4. 145 0
      arch/x86/entry/calling.h
  5. 41 7
      arch/x86/entry/entry_64.S
  6. 23 1
      arch/x86/entry/entry_64_compat.S
  7. 3 3
      arch/x86/entry/vsyscall/vsyscall_64.c
  8. 83 47
      arch/x86/events/intel/ds.c
  9. 4 19
      arch/x86/events/perf_event.h
  10. 13 0
      arch/x86/include/asm/cpu_entry_area.h
  11. 3 1
      arch/x86/include/asm/cpufeatures.h
  12. 2 0
      arch/x86/include/asm/desc.h
  13. 7 1
      arch/x86/include/asm/disabled-features.h
  14. 36 0
      arch/x86/include/asm/intel_ds.h
  15. 53 6
      arch/x86/include/asm/mmu_context.h
  16. 11 0
      arch/x86/include/asm/pgalloc.h
  17. 26 4
      arch/x86/include/asm/pgtable.h
  18. 92 0
      arch/x86/include/asm/pgtable_64.h
  19. 6 2
      arch/x86/include/asm/pgtable_64_types.h
  20. 5 0
      arch/x86/include/asm/processor-flags.h
  21. 16 7
      arch/x86/include/asm/processor.h
  22. 14 0
      arch/x86/include/asm/pti.h
  23. 171 31
      arch/x86/include/asm/tlbflush.h
  24. 1 0
      arch/x86/include/asm/vsyscall.h
  25. 6 1
      arch/x86/include/uapi/asm/processor-flags.h
  26. 4 0
      arch/x86/kernel/asm-offsets.c
  27. 8 1
      arch/x86/kernel/cpu/common.c
  28. 4 2
      arch/x86/kernel/dumpstack.c
  29. 27 3
      arch/x86/kernel/head_64.S
  30. 141 3
      arch/x86/kernel/ldt.c
  31. 2 9
      arch/x86/kernel/tls.c
  32. 8 0
      arch/x86/kernel/vmlinux.lds.S
  33. 4 3
      arch/x86/mm/Makefile
  34. 27 0
      arch/x86/mm/cpu_entry_area.c
  35. 74 6
      arch/x86/mm/debug_pagetables.c
  36. 38 5
      arch/x86/mm/dump_pagetables.c
  37. 49 31
      arch/x86/mm/init.c
  38. 3 2
      arch/x86/mm/pgtable.c
  39. 387 0
      arch/x86/mm/pti.c
  40. 56 2
      arch/x86/mm/tlb.c
  41. 4 1
      arch/x86/platform/efi/efi_64.c
  42. 11 0
      include/linux/pti.h
  43. 3 0
      init/main.c
  44. 10 0
      security/Kconfig
  45. 1 2
      tools/testing/selftests/x86/ldt_gdt.c
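
The pull message above mentions documentation of how the ASID/PCID mechanism works. As a quick orientation before the per-file diffs, here is a small user-space sketch (not kernel code) of the ASID/kPCID/uPCID numbering introduced in the arch/x86/include/asm/tlbflush.h hunk below: the kernel PCID for a dynamic ASID is ASID + 1 (PCID 0 stays reserved for non-PCID-aware use), and the matching user PCID is the same value with bit 11 (X86_CR3_PTI_SWITCH_BIT) set, i.e. kPCID + 2048.

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PTI_SWITCH_BIT	11	/* added in processor-flags.h below */
#define TLB_NR_DYN_ASIDS	6	/* per-CPU dynamic ASID slots */

/* kern_pcid(): ASID 0..5 -> kPCID 1..6 (PCID 0 is reserved). */
static uint16_t kern_pcid(uint16_t asid)
{
	return asid + 1;
}

/* user_pcid(): the same PCID with the user/kernel switch bit set. */
static uint16_t user_pcid(uint16_t asid)
{
	return kern_pcid(asid) | (1u << X86_CR3_PTI_SWITCH_BIT);
}

int main(void)
{
	for (uint16_t asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       asid, kern_pcid(asid), user_pcid(asid));
	return 0;
}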

+ 8 - 0
Documentation/admin-guide/kernel-parameters.txt

@@ -2708,6 +2708,8 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
+	nopti		[X86-64] Disable kernel page table isolation
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.

+ 3 - 2
Documentation/x86/x86_64/mm.txt

@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...

+ 3 - 0
arch/x86/boot/compressed/pagetable.c

@@ -23,6 +23,9 @@
  */
 #undef CONFIG_AMD_MEM_ENCRYPT
 
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */

+ 145 - 0
arch/x86/entry/calling.h

@@ -1,6 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 /*
 
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq    $(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask   \
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq     $(PTI_SWITCH_MASK), \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+	movq	%cr3, \scratch_reg
+	movq	\scratch_reg, \save_reg
+	/*
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
+	 */
+	testq	$(PTI_SWITCH_MASK), \scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 \scratch_reg
+	movq	\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * KERNEL pages can always resume with NOFLUSH as we do
+	 * explicit flushes.
+	 */
+	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	jnc	.Lnoflush_\@
+
+	/*
+	 * Check if there's a pending flush for the user ASID we're
+	 * about to set.
+	 */
+	movq	\save_reg, \scratch_reg
+	andq	$(0x7FF), \scratch_reg
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+	/*
+	 * The CR3 write could be avoided when not changing its value,
+	 * but would require a CR3 read *and* a scratch register.
+	 */
+	movq	\save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*

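The macros above are assembly; the following is a rough user-space C model (a sketch, not the kernel implementation) of what SWITCH_TO_USER_CR3_NOSTACK does with the per-CPU user_pcid_flush_mask: flip CR3 to the user PGD half (bit 12) and the user PCID (bit 11), then either perform a flushing CR3 write and clear the pending bit, or set the PCID NOFLUSH bit (bit 63) when no flush is pending for that ASID.

#include <stdint.h>
#include <stdio.h>

#define PTI_SWITCH_PGTABLES_MASK	(1ULL << 12)	/* user half of the 8k PGD */
#define PTI_SWITCH_MASK			(PTI_SWITCH_PGTABLES_MASK | (1ULL << 11))
#define CR3_NOFLUSH			(1ULL << 63)	/* X86_CR3_PCID_NOFLUSH */

/* Stand-in for the per-CPU cpu_tlbstate.user_pcid_flush_mask. */
static uint16_t user_pcid_flush_mask;

/* Rough analogue of invalidate_user_asid(): defer the flush to the next switch. */
static void mark_user_pcid_stale(unsigned int pcid)
{
	user_pcid_flush_mask |= 1u << pcid;
}

/* Rough analogue of SWITCH_TO_USER_CR3_NOSTACK. */
static uint64_t switch_to_user_cr3(uint64_t kernel_cr3)
{
	unsigned int pcid = kernel_cr3 & 0x7ff;		/* same mask the asm uses */
	uint64_t cr3 = kernel_cr3 | PTI_SWITCH_MASK;	/* user PGD half + user PCID */

	if (user_pcid_flush_mask & (1u << pcid))
		user_pcid_flush_mask &= ~(1u << pcid);	/* flushing CR3 write */
	else
		cr3 |= CR3_NOFLUSH;			/* keep TLB entries for this PCID */

	return cr3;
}

int main(void)
{
	uint64_t kcr3 = 0x1234000ULL | 2;	/* kernel PGD address + kPCID 2 */

	mark_user_pcid_stale(2);
	printf("return to user (flush pending): %#llx\n",
	       (unsigned long long)switch_to_user_cr3(kcr3));
	printf("return to user (no flush):      %#llx\n",
	       (unsigned long long)switch_to_user_cr3(kcr3));
	return 0;
}
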
+ 41 - 7
arch/x86/entry/entry_64.S

@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <asm/frame.h>
 #include <linux/err.h>
 #include <linux/err.h>
 
 
+#include "calling.h"
+
 .code64
 .code64
 .section .entry.text, "ax"
 .section .entry.text, "ax"
 
 
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	/* Stash the user RSP. */
 	/* Stash the user RSP. */
 	movq	%rsp, RSP_SCRATCH
 	movq	%rsp, RSP_SCRATCH
 
 
+	/* Note: using %rsp as a scratch reg. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	/* Load the top of the task stack into RSP */
 	/* Load the top of the task stack into RSP */
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 
 
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
 	 */
 	 */
 
 
 	swapgs
 	swapgs
+	/*
+	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * is not required to switch CR3.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 * We can do future final exit work right here.
 	 */
 	 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 
 	popq	%rdi
 	popq	%rdi
 	popq	%rsp
 	popq	%rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 * We can do future final exit work right here.
 	 */
 	 */
 
 
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
 	/* Restore RDI. */
 	/* Restore RDI. */
 	popq	%rdi
 	popq	%rdi
 	SWAPGS
 	SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
 	 */
 	 */
 
 
 	pushq	%rdi				/* Stash user RDI */
 	pushq	%rdi				/* Stash user RDI */
-	SWAPGS
+	SWAPGS					/* to kernel GS */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
+
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
 	/* Now RAX == RSP. */
 	/* Now RAX == RSP. */
 
 
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
-	popq	%rdi				/* Restore user RDI */
 
 
 	/*
 	/*
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
 	 * still points to an RO alias of the ESPFIX stack.
 	 * still points to an RO alias of the ESPFIX stack.
 	 */
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
 	orq	PER_CPU_VAR(espfix_stack), %rax
-	SWAPGS
+
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+	SWAPGS					/* to user GS */
+	popq	%rdi				/* Restore user RDI */
+
 	movq	%rax, %rsp
 	movq	%rax, %rsp
 	UNWIND_HINT_IRET_REGS offset=8
 	UNWIND_HINT_IRET_REGS offset=8
 
 
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
 	UNWIND_HINT_FUNC
 	UNWIND_HINT_FUNC
 
 
 	pushq	%rdi
 	pushq	%rdi
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	movq	%rsp, %rdi
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
 	js	1f				/* negative -> in kernel */
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	SWAPGS
 	xorl	%ebx, %ebx
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 END(paranoid_entry)
 
 
 /*
 /*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
 .Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 * from user mode due to an IRET fault.
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 
 .Lerror_entry_from_usermode_after_swapgs:
 .Lerror_entry_from_usermode_after_swapgs:
 	/* Put us onto the real thread stack. */
 	/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
 	 * .Lgs_change's error handler with kernel gsbase.
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	jmp .Lerror_entry_done
 	jmp .Lerror_entry_done
 
 
 .Lbstep_iret:
 .Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
 
 
 .Lerror_bad_iret:
 .Lerror_bad_iret:
 	/*
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 
 	/*
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
 /*
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
  * so we can use real assembly here.
+ *
+ * Registers:
+ *	%r14: Used to save/restore the CR3 of the interrupted context
+ *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
  */
 ENTRY(nmi)
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
 	UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
 
 
 	swapgs
 	swapgs
 	cld
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	movq	$-1, %rsi
 	call	do_nmi
 	call	do_nmi
 
 
+	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 	jnz	nmi_restore
 nmi_swapgs:
 nmi_swapgs:

+ 23 - 1
arch/x86/entry/entry_64_compat.S

@@ -49,6 +49,10 @@
 ENTRY(entry_SYSENTER_compat)
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	/* Interrupts are off on entry. */
 	SWAPGS
 	SWAPGS
+
+	/* We are about to clobber %rsp anyway, clobbering here is OK */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 
 	/*
 	/*
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
 
+	/*
+	 * We just saved %rdi so it is safe to clobber.  It is not
+	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
 	/*
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
 	 * turned them off.
@@ -256,10 +266,22 @@ sysret32_from_system_call:
 	 * when the system call started, which is already known to user
 	 * when the system call started, which is already known to user
 	 * code.  We zero R8-R10 to avoid info leaks.
 	 * code.  We zero R8-R10 to avoid info leaks.
          */
          */
+	movq	RSP-ORIG_RAX(%rsp), %rsp
+
+	/*
+	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+	 * on the process stack which is not mapped to userspace and
+	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
+	 * switch until after after the last reference to the process
+	 * stack.
+	 *
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+	 */
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
 	xorq	%r8, %r8
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r9, %r9
 	xorq	%r10, %r10
 	xorq	%r10, %r10
-	movq	RSP-ORIG_RAX(%rsp), %rsp
 	swapgs
 	swapgs
 	sysretl
 	sysretl
 END(entry_SYSCALL_compat)
 END(entry_SYSCALL_compat)

+ 3 - 3
arch/x86/entry/vsyscall/vsyscall_64.c

@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
  * vsyscalls but leave the page not present.  If so, we skip calling
  * this.
  */
-static void __init set_vsyscall_pgtable_user_bits(void)
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
-	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
 	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
 	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
 #if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
-		set_vsyscall_pgtable_user_bits();
+		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 	}
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=

+ 83 - 47
arch/x86/events/intel/ds.c

@@ -3,16 +3,18 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 #include <asm/insn.h>
 
 
 #include "../perf_event.h"
 #include "../perf_event.h"
 
 
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 #define BTS_RECORD_SIZE		24
 
 
-#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 
 /*
 /*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
 
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 static DEFINE_PER_CPU(void *, insn_buffer);
 
 
-static int alloc_pebs_buffer(int cpu)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	phys_addr_t pa;
+	size_t msz = 0;
+
+	pa = virt_to_phys(addr);
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+	unsigned int order = get_order(size);
 	int node = cpu_to_node(cpu);
 	int node = cpu_to_node(cpu);
-	int max;
-	void *buffer, *ibuffer;
+	struct page *page;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+	if (buffer)
+		free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	size_t bsiz = x86_pmu.pebs_buffer_size;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 
 	if (!x86_pmu.pebs)
 	if (!x86_pmu.pebs)
 		return 0;
 		return 0;
 
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 	if (unlikely(!buffer))
 	if (unlikely(!buffer))
 		return -ENOMEM;
 		return -ENOMEM;
 
 
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree_pages(buffer, bsiz);
 			return -ENOMEM;
 			return -ENOMEM;
 		}
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
 	}
-
-	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
-	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_pebs_vaddr = buffer;
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
 	ds->pebs_index = ds->pebs_buffer_base;
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-		max * x86_pmu.pebs_record_size;
-
+	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 
 	if (!ds || !x86_pmu.pebs)
 	if (!ds || !x86_pmu.pebs)
 		return;
 		return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 	per_cpu(insn_buffer, cpu) = NULL;
 
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
 	ds->pebs_buffer_base = 0;
+	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+	hwev->ds_pebs_vaddr = NULL;
 }
 }
 
 
 static int alloc_bts_buffer(int cpu)
 static int alloc_bts_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max, thresh;
-	void *buffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *buffer, *cea;
+	int max;
 
 
 	if (!x86_pmu.bts)
 	if (!x86_pmu.bts)
 		return 0;
 		return 0;
 
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 	if (unlikely(!buffer)) {
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
-
-	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-	thresh = max / 16;
-
-	ds->bts_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_bts_vaddr = buffer;
+	/* Update the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
 	ds->bts_index = ds->bts_buffer_base;
-	ds->bts_absolute_maximum = ds->bts_buffer_base +
-		max * BTS_RECORD_SIZE;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-		thresh * BTS_RECORD_SIZE;
-
+	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 
 	if (!ds || !x86_pmu.bts)
 	if (!ds || !x86_pmu.bts)
 		return;
 		return;
 
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
 	ds->bts_buffer_base = 0;
+	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+	hwev->ds_bts_vaddr = NULL;
 }
 }
 
 
 static int alloc_ds_buffer(int cpu)
 static int alloc_ds_buffer(int cpu)
 {
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
 	per_cpu(cpu_hw_events, cpu).ds = ds;
-
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_ds_buffer(int cpu)
 static void release_ds_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 }
 
 
 void release_ds_buffers(void)
 void release_ds_buffers(void)

+ 4 - 19
arch/x86/events/perf_event.h

@@ -14,6 +14,8 @@
 
 
 #include <linux/perf_event.h>
 #include <linux/perf_event.h>
 
 
+#include <asm/intel_ds.h>
+
 /* To enable MSR tracing please use the generic trace points. */
 /* To enable MSR tracing please use the generic trace points. */
 
 
 /*
 /*
@@ -77,8 +79,6 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 };
 
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		8
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 
 /*
 /*
@@ -95,23 +95,6 @@ struct amd_nb {
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 #define PEBS_REGS \
 #define PEBS_REGS \
 	(PERF_REG_X86_AX | \
 	(PERF_REG_X86_AX | \
 	 PERF_REG_X86_BX | \
 	 PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 * Intel DebugStore bits
 	 */
 	 */
 	struct debug_store	*ds;
 	struct debug_store	*ds;
+	void			*ds_pebs_vaddr;
+	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_pebs;
 	int			n_large_pebs;
 	int			n_large_pebs;

+ 13 - 0
arch/x86/include/asm/cpu_entry_area.h

@@ -5,6 +5,7 @@
 
 #include <linux/percpu-defs.h>
 #include <asm/processor.h>
+#include <asm/intel_ds.h>
 
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
 	 */
 	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
 #endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring. Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
 };
 
 #define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))

+ 3 - 1
arch/x86/include/asm/cpufeatures.h

@@ -197,11 +197,12 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */

+ 2 - 0
arch/x86/include/asm/desc.h

@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 	desc->type		= (info->read_exec_only ^ 1) << 1;
 	desc->type	       |= info->contents << 2;
+	/* Set the ACCESS bit so it can be mapped RO */
+	desc->type	       |= 1;
 
 	desc->s			= 1;
 	desc->dpl		= 0x3;

+ 7 - 1
arch/x86/include/asm/disabled-features.h

@@ -50,6 +50,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -60,7 +66,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0

+ 36 - 0
arch/x86/include/asm/intel_ds.h

@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+	char	bts_buffer[BTS_BUFFER_SIZE];
+	char	pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
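
A small host-side check of the layout above (a sketch assuming 4k pages, compiled as ordinary C): __aligned(PAGE_SIZE) pads struct debug_store from 128 bytes of fields to a full page, which is what the "Waste a full page" comment in ds.c refers to, and the two buffers reserve 64 KiB each per CPU in the cpu_entry_area.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)	/* 64 KiB */
#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)	/* 64 KiB */
#define MAX_PEBS_EVENTS		8

/* Same layout as the new intel_ds.h, compiled host-side only to check sizes. */
struct debug_store {
	uint64_t bts_buffer_base, bts_index;
	uint64_t bts_absolute_maximum, bts_interrupt_threshold;
	uint64_t pebs_buffer_base, pebs_index;
	uint64_t pebs_absolute_maximum, pebs_interrupt_threshold;
	uint64_t pebs_event_reset[MAX_PEBS_EVENTS];
} __attribute__((aligned(PAGE_SIZE)));

struct debug_store_buffers {
	char bts_buffer[BTS_BUFFER_SIZE];
	char pebs_buffer[PEBS_BUFFER_SIZE];
};

int main(void)
{
	/* 16 u64 fields (128 bytes) padded to a whole page by the alignment. */
	printf("sizeof(struct debug_store)         = %zu\n",
	       sizeof(struct debug_store));
	/* Two 64 KiB buffers reserved per CPU in the cpu_entry_area. */
	printf("sizeof(struct debug_store_buffers) = %zu\n",
	       sizeof(struct debug_store_buffers));
	return 0;
}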

+ 53 - 6
arch/x86/include/asm/mmu_context.h

@@ -50,10 +50,33 @@ struct ldt_struct {
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * allocations, but it's not worth trying to optimize.
 	 * allocations, but it's not worth trying to optimize.
 	 */
 	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct	*entries;
+	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the addressed
+	 * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
 };
 
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
 /*
  * Used for LDT copy/destruction.
  * Used for LDT copy/destruction.
  */
  */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
 static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
 {
 {
 	return 0;
 	return 0;
 }
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 #endif
 
 
 static inline void load_mm_ldt(struct mm_struct *mm)
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 * that we can see.
 	 */
 	 */
 
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
 		clear_LDT();
+	}
 #else
 #else
 	clear_LDT();
 	clear_LDT();
 #endif
 #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 {
 	paravirt_arch_exit_mmap(mm);
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 }
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
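
A host-side sketch of the ldt_slot_va() arithmetic above. It assumes the usual LDT_ENTRIES/LDT_ENTRY_SIZE values from asm/segment.h (8192 entries of 8 bytes, which are not part of this diff), the 4-level LDT_BASE_ADDR added in pgtable_64_types.h below, and a 64-bit unsigned long; the two slots sit 64 KiB apart so a new LDT can be mapped and enabled while the old alias stays valid.

#include <stdio.h>

#define LDT_ENTRIES	8192	/* from asm/segment.h, not part of this diff */
#define LDT_ENTRY_SIZE	8
#define LDT_SLOT_STRIDE	(LDT_ENTRIES * LDT_ENTRY_SIZE)	/* 64 KiB, a multiple of PAGE_SIZE */
#define LDT_BASE_ADDR	((unsigned long)-4 << 39)	/* 4-level paging: 0xfffffe0000000000 */

/* Same computation as ldt_slot_va() above (x86-64 case). */
static unsigned long ldt_slot_va(int slot)
{
	return LDT_BASE_ADDR + (unsigned long)LDT_SLOT_STRIDE * slot;
}

int main(void)
{
	printf("LDT alias, slot 0: 0x%016lx\n", ldt_slot_va(0));
	printf("LDT alias, slot 1: 0x%016lx\n", ldt_slot_va(1));
	return 0;
}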

+ 11 - 0
arch/x86/include/asm/pgalloc.h

@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
  */
 extern gfp_t __userpte_alloc_gfp;
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 /*
  * Allocate and free page tables.
  */

+ 26 - 4
arch/x86/include/asm/pgtable.h

@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
 void ptdump_walk_pgd_level_checkwx(void);
 void ptdump_walk_pgd_level_checkwx(void);
 
 
 #ifdef CONFIG_DEBUG_WX
 #ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
 
 static inline int p4d_bad(p4d_t p4d)
 static inline int p4d_bad(p4d_t p4d)
 {
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
 
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
 
 static inline int pgd_bad(pgd_t pgd)
 static inline int pgd_bad(pgd_t pgd)
 {
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 }
 
 
 static inline int pgd_none(pgd_t pgd)
 static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
  * pgd_offset() returns a (pgd_t *)
  * pgd_offset() returns a (pgd_t *)
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  */
  */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
 /*
 /*
  * a shortcut which implies the use of the kernel's pgd, instead
  * a shortcut which implies the use of the kernel's pgd, instead
  * of a process's
  * of a process's
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
  */
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/* Clone the user space pgd as well */
+	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 }
 
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)

+ 92 - 0
arch/x86/include/asm/pgtable_64.h

@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
 #endif
 #endif
 }
 }
 
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
+ * the user one is in the last 4k.  To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT	PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr |= BIT(bit);
+	return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr &= ~BIT(bit);
+	return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+	return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+	return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+	return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+	return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return pgd;
+	return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+#endif
+
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
 {
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
 	*p4dp = p4d;
 	*p4dp = p4d;
+#endif
 }
 }
 
 
 static inline void native_p4d_clear(p4d_t *p4d)
 static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
 
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	*pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
 	*pgdp = pgd;
 	*pgdp = pgd;
+#endif
 }
 }
 
 
 static inline void native_pgd_clear(pgd_t *pgd)
 static inline void native_pgd_clear(pgd_t *pgd)
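
The helpers above reduce to pointer arithmetic on the order-1 PGD allocation described in pgalloc.h: flip bit 12 to move between the kernel and user 4k halves, and treat the lower half of each 4k table as the userspace entries. A minimal user-space sketch of that arithmetic, using an ordinary 8k-aligned allocation in place of the real PGD pages:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE		4096UL
#define PTI_PGTABLE_SWITCH_BIT	12	/* == PAGE_SHIFT */
#define PGD_ALLOCATION_ORDER	1	/* two pages: kernel half + user half */

/* kernel_to_user_pgdp()/user_to_kernel_pgdp(), on an ordinary allocation. */
static void *kernel_to_user_half(void *pgd)
{
	return (void *)((uintptr_t)pgd | (1UL << PTI_PGTABLE_SWITCH_BIT));
}

static void *user_to_kernel_half(void *pgd)
{
	return (void *)((uintptr_t)pgd & ~(1UL << PTI_PGTABLE_SWITCH_BIT));
}

/* pgdp_maps_userspace(): entries in the lower half of a 4k table map userspace. */
static int maps_userspace(void *pgdp)
{
	return ((uintptr_t)pgdp & (PAGE_SIZE - 1)) < PAGE_SIZE / 2;
}

int main(void)
{
	size_t size = PAGE_SIZE << PGD_ALLOCATION_ORDER;	/* 8k */
	char *pgd = aligned_alloc(size, size);			/* 8k-aligned, like the order-1 page */

	assert(pgd && ((uintptr_t)pgd & (size - 1)) == 0);
	printf("kernel PGD half: %p\n", (void *)pgd);
	printf("user   PGD half: %p\n", kernel_to_user_half(pgd));
	assert(user_to_kernel_half(kernel_to_user_half(pgd)) == (void *)pgd);

	printf("entry 0   maps userspace? %d\n", maps_userspace(pgd + 0 * 8));
	printf("entry 511 maps userspace? %d\n", maps_userspace(pgd + 511 * 8));

	free(pgd);
	return 0;
}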

+ 6 - 2
arch/x86/include/asm/pgtable_64_types.h

@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
 #define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB	_AC(16384, UL)
-# define __VMALLOC_BASE		_AC(0xff92000000000000, UL)
+# define VMALLOC_SIZE_TB	_AC(12800, UL)
+# define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-112, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY

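The new LDT_BASE_ADDR values can be cross-checked against the LDT remap ranges shown in the Documentation/x86/x86_64/mm.txt hunk earlier in this diff with a few lines of C (assuming a 64-bit unsigned long):

#include <stdio.h>

int main(void)
{
	/* 4-level paging: LDT_PGD_ENTRY = -4, PGDIR_SHIFT = 39 */
	unsigned long ldt_base_4l = (unsigned long)-4 << 39;
	/* 5-level paging: LDT_PGD_ENTRY = -112, PGDIR_SHIFT = 48 */
	unsigned long ldt_base_5l = (unsigned long)-112 << 48;

	printf("4-level LDT_BASE_ADDR: 0x%016lx\n", ldt_base_4l);	/* fffffe0000000000 */
	printf("5-level LDT_BASE_ADDR: 0x%016lx\n", ldt_base_5l);	/* ff90000000000000 */
	return 0;
}
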
+ 5 - 0
arch/x86/include/asm/processor-flags.h

@@ -38,6 +38,11 @@
 #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK	0xFFFull
 #define CR3_NOFLUSH	BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT	11
+#endif
+
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save

+ 16 - 7
arch/x86/include/asm/processor.h

@@ -852,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 
 #else
 #else
 /*
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
 

+ 14 - 0
arch/x86/include/asm/pti.h

@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */

+ 171 - 31
arch/x86/include/asm/tlbflush.h

@@ -10,38 +10,90 @@
 #include <asm/special_insns.h>
 #include <asm/special_insns.h>
 #include <asm/smp.h>
 #include <asm/smp.h>
 #include <asm/invpcid.h>
 #include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
 
 
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
-{
-	/*
-	 * Bump the generation count.  This also serves as a full barrier
-	 * that synchronizes with switch_mm(): callers are required to order
-	 * their read of mm_cpumask after their writes to the paging
-	 * structures.
-	 */
-	return atomic64_inc_return(&mm->context.tlb_gen);
-}
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
 
 
 /* There are 12 bits of space for ASIDS in CR3 */
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
 #define CR3_HW_ASID_BITS		12
+
 /*
 /*
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * user/kernel switches
  * user/kernel switches
  */
  */
-#define PTI_CONSUMED_ASID_BITS		0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
 
 
-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
 /*
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
- * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+ * for them being zero-based.  Another -1 is because PCID 0 is reserved for
  * use by non-PCID-aware users.
  * use by non-PCID-aware users.
  */
  */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
 
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS	6
+
+/*
+ * Given @asid, compute kPCID
+ */
 static inline u16 kern_pcid(u16 asid)
 static inline u16 kern_pcid(u16 asid)
 {
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
 	/*
 	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * situation in which PCID-unaware code saves CR3, loads some other
 	 * situation in which PCID-unaware code saves CR3, loads some other
@@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
 	return asid + 1;
 	return asid + 1;
 }
 }
 
 
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+#endif
+	return ret;
+}
+
 struct pgd_t;
 struct pgd_t;
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
 {
@@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
 	return !static_cpu_has(X86_FEATURE_PCID);
 	return !static_cpu_has(X86_FEATURE_PCID);
 }
 }
 
 
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
 struct tlb_context {
 struct tlb_context {
 	u64 ctx_id;
 	u64 ctx_id;
 	u64 tlb_gen;
 	u64 tlb_gen;
@@ -134,6 +192,24 @@ struct tlb_state {
 	 */
 	 */
 	bool is_lazy;
 	bool is_lazy;
 
 
+	/*
+	 * If set we changed the page tables in such a way that we
+	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+	 * This tells us to go invalidate all the non-loaded ctxs[]
+	 * on the next context switch.
+	 *
+	 * The current ctx was kept up-to-date as it ran and does not
+	 * need to be invalidated.
+	 */
+	bool invalidate_other;
+
+	/*
+	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+	 * the corresponding user PCID needs a flush next time we
+	 * switch to it; see SWITCH_TO_USER_CR3.
+	 */
+	unsigned short user_pcid_flush_mask;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
@@ -214,6 +290,14 @@ static inline unsigned long cr4_read_shadow(void)
 	return this_cpu_read(cpu_tlbstate.cr4);
 }

+/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
 /*
  * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
  * enable and PPro Global page enable), so that any CPU's that boot
@@ -233,15 +317,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)

 extern void initialize_tlbstate_and_flush(void);

+/*
+ * Given an ASID, flush the corresponding user ASID.  We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 /*
  * flush the entire current user mapping
  */
 static inline void __native_flush_tlb(void)
 {
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * If current->mm == NULL then we borrow a mm which may change
+	 * during a task switch and therefore we must not be preempted
+	 * while we write CR3 back:
 	 */
 	preempt_disable();
 	native_write_cr3(__native_read_cr3());
@@ -259,6 +370,8 @@ static inline void __native_flush_tlb_global(void)
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
 		invpcid_flush_all();
 		return;
@@ -285,7 +398,21 @@ static inline void __native_flush_tlb_global(void)
  */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
 }

 /*
@@ -301,14 +428,6 @@ static inline void __flush_tlb_all(void)
 		 */
 		__flush_tlb();
 	}
-
-	/*
-	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
-	 * we'd end up flushing kernel translations for the current ASID but
-	 * we might fail to flush kernel translations for other cached ASIDs.
-	 *
-	 * To avoid this issue, we force PCID off if PGE is off.
-	 */
 }

 /*
@@ -318,6 +437,16 @@ static inline void __flush_tlb_one(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+	 * but since kernel space is replicated across all, we must also
+	 * invalidate all others.
+	 */
+	invalidate_other_asid();
 }

 #define TLB_FLUSH_ALL	-1UL
@@ -378,6 +507,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info);

+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+	/*
+	 * Bump the generation count.  This also serves as a full barrier
+	 * that synchronizes with switch_mm(): callers are required to order
+	 * their read of mm_cpumask after their writes to the paging
+	 * structures.
+	 */
+	return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {

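As a quick illustration (not part of the patch itself): the helpers above turn a dynamic ASID into a kernel PCID by adding 1, and into a user PCID by additionally setting the PTI switch bit. A minimal user-space sketch, assuming X86_CR3_PTI_SWITCH_BIT is the top PCID bit (bit 11) — the real value comes from the headers in this series:

	#include <stdio.h>
	#include <stdint.h>

	#define X86_CR3_PTI_SWITCH_BIT 11	/* assumed value for this sketch */

	static uint16_t kern_pcid(uint16_t asid) { return asid + 1; }

	static uint16_t user_pcid(uint16_t asid)
	{
		return kern_pcid(asid) | (1u << X86_CR3_PTI_SWITCH_BIT);
	}

	int main(void)
	{
		/* ASID 0, the first dynamic ASID: kernel PCID 0x001, user PCID 0x801 */
		printf("kPCID=%#x uPCID=%#x\n", kern_pcid(0), user_pcid(0));
		return 0;
	}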
+ 1 - 0
arch/x86/include/asm/vsyscall.h

@@ -7,6 +7,7 @@

 #ifdef CONFIG_X86_VSYSCALL_EMULATION
 extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);

 /*
  * Called on instruction fetch fault in vsyscall page.

+ 6 - 1
arch/x86/include/uapi/asm/processor-flags.h

@@ -78,7 +78,12 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS	12
+#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)

 /*
  * Intel CPU features in CR4

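For example (illustrative only, mirroring build_cr3()/build_cr3_noflush() from the tlbflush.h changes above): a CR3 value is the physical address of the PGD with the kernel PCID in the low X86_CR3_PCID_BITS, and bit 63 set when the implicit TLB flush on the CR3 write should be suppressed:

	/* sketch only; __pa(), kern_pcid() and the macros are used as above */
	unsigned long cr3         = __pa(pgd) | kern_pcid(asid);
	unsigned long cr3_noflush = __pa(pgd) | kern_pcid(asid) | X86_CR3_PCID_NOFLUSH;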
+ 4 - 0
arch/x86/kernel/asm-offsets.c

@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>

 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));

+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);

+ 8 - 1
arch/x86/kernel/cpu/common.c

@@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	}

 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+	/* Assume for now that ALL x86 CPUs are insecure */
+	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
 	fpu__init_system(c);

 #ifdef CONFIG_X86_32
@@ -1360,7 +1364,10 @@ void syscall_init(void)
 		(entry_SYSCALL_64_trampoline - _entry_trampoline);

 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	if (static_cpu_has(X86_FEATURE_PTI))
+		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	else
+		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);

+ 4 - 2
arch/x86/kernel/dumpstack.c

@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	unsigned long sp;
 #endif
 	printk(KERN_DEFAULT
-	       "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
 	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
 	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
-	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "");
+	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
+	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");

 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)

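With the extra format specifier, the first line of an Oops now reports whether PTI is active. Illustrative output (not from the patch):

	general protection fault: 0000 [#1] SMP PTI

" NOPTI" is printed instead when CONFIG_PAGE_TABLE_ISOLATION is built in but isolation is disabled at runtime.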
+ 27 - 3
arch/x86/kernel/head_64.S

@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
 	.balign	PAGE_SIZE; \
 GLOBAL(name)

+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -350,13 +371,14 @@ GLOBAL(name)
 	.endr

 	__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
 	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
+	.fill	PTI_USER_PGD_FILL,8,0

 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data

 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+	.fill	PTI_USER_PGD_FILL,8,0

 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.fill	512,8,0
+	.fill	PTI_USER_PGD_FILL,8,0
 #endif

 #ifdef CONFIG_X86_5LEVEL

+ 141 - 3
arch/x86/kernel/ldt.c

@@ -24,6 +24,7 @@
 #include <linux/uaccess.h>

 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;

 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;

-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	load_mm_ldt(mm);

 	refresh_ldt_segments();
 }
@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 		return NULL;
 	}

+	/* The new LDT isn't aliased for PTI yet. */
+	new_ldt->slot = -1;
+
 	new_ldt->nr_entries = num_entries;
 	return new_ldt;
 }

+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	bool is_vmalloc, had_top_level_entry;
+	unsigned long va;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return 0;
+
+	/*
+	 * Any given ldt_struct should have map_ldt_struct() called at most
+	 * once.
+	 */
+	WARN_ON(ldt->slot != -1);
+
+	/*
+	 * Did we already have the top level entry allocated?  We can't
+	 * use pgd_none() for this because it doesn't do anything on
+	 * 4-level page table kernels.
+	 */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	had_top_level_entry = (pgd->pgd != 0);
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn;
+		pte_t pte, *ptep;
+
+		va = (unsigned long)ldt_slot_va(slot) + offset;
+		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+		/*
+		 * Treat the PTI LDT range as a *userspace* range.
+		 * get_locked_pte() will allocate all needed pagetables
+		 * and account for them in this mm.
+		 */
+		ptep = get_locked_pte(mm, va, &ptl);
+		if (!ptep)
+			return -ENOMEM;
+		/*
+		 * Map it RO so the easy to find address is not a primary
+		 * target via some kernel interface which misses a
+		 * permission check.
+		 */
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
+		set_pte_at(mm, va, ptep, pte);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	if (mm->context.ldt) {
+		/*
+		 * We already had an LDT.  The top-level entry should already
+		 * have been allocated and synchronized with the usermode
+		 * tables.
+		 */
+		WARN_ON(!had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI))
+			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+	} else {
+		/*
+		 * This is the first time we're mapping an LDT for this process.
+		 * Sync the pgd to the usermode tables.
+		 */
+		WARN_ON(had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+			set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+		}
+	}
+
+	va = (unsigned long)ldt_slot_va(slot);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+	ldt->slot = slot;
+#endif
+	return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);

+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval) {
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 	mm->context.ldt = new_ldt;

 out_unlock:
@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }

+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -287,6 +413,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);

+	/*
+	 * If we are using PTI, map the new LDT into the userspace pagetables.
+	 * If there is already an LDT, use the other slot so that other CPUs
+	 * will continue to use the old LDT until install_ldt() switches
+	 * them over to the new LDT.
+	 */
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
+
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
 	error = 0;

+ 2 - 9
arch/x86/kernel/tls.c

@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
 	cpu = get_cpu();

 	while (n-- > 0) {
-		if (LDT_empty(info) || LDT_zero(info)) {
+		if (LDT_empty(info) || LDT_zero(info))
 			memset(desc, 0, sizeof(*desc));
-		} else {
+		else
 			fill_ldt(desc, info);
-
-			/*
-			 * Always set the accessed bit so that the CPU
-			 * doesn't try to write to the (read-only) GDT.
-			 */
-			desc->type |= 1;
-		}
 		++info;
 		++desc;
 	}

+ 8 - 0
arch/x86/kernel/vmlinux.lds.S

@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
 		. = ALIGN(HPAGE_SIZE);				\
 		__end_rodata_hpage_align = .;

+#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
+
 #else

 #define X64_ALIGN_RODATA_BEGIN
 #define X64_ALIGN_RODATA_END

+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
 #endif

 PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
 		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ALIGN_ENTRY_TEXT_BEGIN
 		ENTRY_TEXT
 		IRQENTRY_TEXT
+		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)

+ 4 - 3
arch/x86/mm/Makefile

@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o

-obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o

 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o

+ 27 - 0
arch/x86/mm/cpu_entry_area.c

@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
 		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }

+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	/*
+	 * Force the population of PMDs for not yet allocated per cpu
+	 * memory like debug store buffers.
+	 */
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	for (; npages; npages--, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
 /* Setup the fixmap mappings only once per-processor */
 static void __init setup_cpu_entry_area(int cpu)
 {
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
+	percpu_setup_debug_store(cpu);
 }

 static __init void setup_cpu_entry_area_ptes(void)

+ 74 - 6
arch/x86/mm/debug_pagetables.c

@@ -5,7 +5,7 @@

 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL, false);
 	return 0;
 }

@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };

-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curknl,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curusr,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;

 static int __init pt_dump_debug_init(void)
 {
-	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
-				 &ptdump_fops);
-	if (!pe)
+	dir = debugfs_create_dir("page_tables", NULL);
+	if (!dir)
 		return -ENOMEM;

+	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+				     &ptdump_fops);
+	if (!pe_knl)
+		goto err;
+
+	pe_curknl = debugfs_create_file("current_kernel", 0400,
+					dir, NULL, &ptdump_curknl_fops);
+	if (!pe_curknl)
+		goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pe_curusr = debugfs_create_file("current_user", 0400,
+					dir, NULL, &ptdump_curusr_fops);
+	if (!pe_curusr)
+		goto err;
+#endif
 	return 0;
+err:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
 }

 static void __exit pt_dump_debug_exit(void)
 {
-	debugfs_remove_recursive(pe);
+	debugfs_remove_recursive(dir);
 }

 module_init(pt_dump_debug_init);

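Usage of the new debugfs hierarchy (illustrative; requires debugfs to be mounted, and CONFIG_PAGE_TABLE_ISOLATION for the last file):

	# mount -t debugfs none /sys/kernel/debug
	# cat /sys/kernel/debug/page_tables/kernel
	# cat /sys/kernel/debug/page_tables/current_kernel
	# cat /sys/kernel/debug/page_tables/current_user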
+ 38 - 5
arch/x86/mm/dump_pagetables.c

@@ -52,11 +52,17 @@ enum address_markers_idx {
 	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
+#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
 #endif
 	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
 	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
 	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
 #endif
 	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
 }

 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
-				       bool checkwx)
+				       bool checkwx, bool dmesg)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,

 	if (pgd) {
 		start = pgd;
-		st.to_dmesg = true;
+		st.to_dmesg = dmesg;
 	}

 	st.check_wx = checkwx;
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,

 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 {
-	ptdump_walk_pgd_level_core(m, pgd, false);
+	ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (user && static_cpu_has(X86_FEATURE_PTI))
+		pgd = kernel_to_user_pgdp(pgd);
+#endif
+	ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("x86/mm: Checking user space page tables\n");
+	pgd = kernel_to_user_pgdp(pgd);
+	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
 }
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);

 void ptdump_walk_pgd_level_checkwx(void)
 {
-	ptdump_walk_pgd_level_core(NULL, NULL, true);
+	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+	ptdump_walk_user_pgd_level_checkwx();
 }

 static int __init pt_dump_init(void)

+ 49 - 31
arch/x86/mm/init.c

@@ -20,6 +20,7 @@
 #include <asm/kaslr.h>
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
+#include <asm/pti.h>

 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {

 static int page_size_mask;

+static void enable_global_pages(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		__supported_pte_mask |= _PAGE_GLOBAL;
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);

 	/* Enable PGE if available */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	} else
-		__supported_pte_mask &= ~_PAGE_GLOBAL;
+		enable_global_pages();
+	}

 	/* Enable 1 GB linear kernel mappings if available: */
 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)

 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
-			setup_clear_cpu_cap(X86_FEATURE_PCID);
-		}
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() -- the
+		 * trampoline code can't handle CR4.PCIDE and it wouldn't
+		 * do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually cause
+		 * the bits in question to remain set all the way through
+		 * the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE manually in
+		 * start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work if we set
+		 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
+		 * on systems that have X86_CR4_PCIDE clear, or that have
+		 * no INVPCID support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't work if
+		 * PCID is on but PGE is not.  Since that combination
+		 * doesn't exist on real hardware, there's no reason to try
+		 * to fully support it, but it's polite to avoid corrupting
+		 * data if we're on an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }

 #ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
 {
 	unsigned long end;

+	pti_check_boottime_disable();
 	probe_page_size_mask();
 	setup_pcid();

@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */

+ 3 - 2
arch/x86/mm/pgtable.c

@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }

 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */


+ 387 - 0
arch/x86/mm/pti.c

@@ -0,0 +1,387 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ *	https://github.com/IAIK/KAISER
+ *
+ * The original work was written by, and signed off for the Linux
+ * kernel by:
+ *
+ *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ *		       Andy Lutomirski <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK	0
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+	char arg[5];
+	int ret;
+
+	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+		pti_print_if_insecure("disabled on XEN PV.");
+		return;
+	}
+
+	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+	if (ret > 0)  {
+		if (ret == 3 && !strncmp(arg, "off", 3)) {
+			pti_print_if_insecure("disabled on command line.");
+			return;
+		}
+		if (ret == 2 && !strncmp(arg, "on", 2)) {
+			pti_print_if_secure("force enabled on command line.");
+			goto enable;
+		}
+		if (ret == 4 && !strncmp(arg, "auto", 4))
+			goto autosel;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+		pti_print_if_insecure("disabled on command line.");
+		return;
+	}
+
+autosel:
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		return;
+enable:
+	setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	/*
+	 * Changes to the high (kernel) portion of the kernelmode page
+	 * tables are not automatically propagated to the usermode tables.
+	 *
+	 * Users should keep in mind that, unlike the kernelmode tables,
+	 * there is no vmalloc_fault equivalent for the usermode tables.
+	 * Top-level entries added to init_mm's usermode pgd after boot
+	 * will not be automatically propagated to other mms.
+	 */
+	if (!pgdp_maps_userspace(pgdp))
+		return pgd;
+
+	/*
+	 * The user page tables get the full PGD, accessible from
+	 * userspace:
+	 */
+	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+	/*
+	 * If this is normal user memory, make it NX in the kernel
+	 * pagetables so that, if we somehow screw up and return to
+	 * usermode with the kernel CR3 loaded, we'll get a page fault
+	 * instead of allowing user code to execute with the wrong CR3.
+	 *
+	 * As exceptions, we don't set NX if:
+	 *  - _PAGE_USER is not set.  This could be an executable
+	 *     EFI runtime mapping or something similar, and the kernel
+	 *     may execute from it
+	 *  - we don't have NX support
+	 *  - we're clearing the PGD (i.e. the new pgd is not present).
+	 */
+	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+	    (__supported_pte_mask & _PAGE_NX))
+		pgd.pgd |= _PAGE_NX;
+
+	/* return the copy of the PGD we want the kernel to use: */
+	return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	if (address < PAGE_OFFSET) {
+		WARN_ONCE(1, "attempt to walk user address\n");
+		return NULL;
+	}
+
+	if (pgd_none(*pgd)) {
+		unsigned long new_p4d_page = __get_free_page(gfp);
+		if (!new_p4d_page)
+			return NULL;
+
+		if (pgd_none(*pgd)) {
+			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+			new_p4d_page = 0;
+		}
+		if (new_p4d_page)
+			free_page(new_p4d_page);
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+	return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+	pud_t *pud;
+
+	BUILD_BUG_ON(p4d_large(*p4d) != 0);
+	if (p4d_none(*p4d)) {
+		unsigned long new_pud_page = __get_free_page(gfp);
+		if (!new_pud_page)
+			return NULL;
+
+		if (p4d_none(*p4d)) {
+			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+			new_pud_page = 0;
+		}
+		if (new_pud_page)
+			free_page(new_pud_page);
+	}
+
+	pud = pud_offset(p4d, address);
+	/* The user page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+			new_pmd_page = 0;
+		}
+		if (new_pmd_page)
+			free_page(new_pmd_page);
+	}
+
+	return pmd_offset(pud, address);
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.  Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables.  It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+	pte_t *pte;
+
+	/* We can't do anything sensible if we hit a large mapping. */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+			new_pte_page = 0;
+		}
+		if (new_pte_page)
+			free_page(new_pte_page);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	if (pte_flags(*pte) & _PAGE_USER) {
+		WARN_ONCE(1, "attempt to walk to user pte\n");
+		return NULL;
+	}
+	return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+	pte_t *pte, *target_pte;
+	unsigned int level;
+
+	pte = lookup_address(VSYSCALL_ADDR, &level);
+	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+		return;
+
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	if (WARN_ON(!target_pte))
+		return;
+
+	*target_pte = *pte;
+	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+	unsigned long addr;
+
+	/*
+	 * Clone the populated PMDs which cover start to end. These PMD areas
+	 * can have holes.
+	 */
+	for (addr = start; addr < end; addr += PMD_SIZE) {
+		pmd_t *pmd, *target_pmd;
+		pgd_t *pgd;
+		p4d_t *p4d;
+		pud_t *pud;
+
+		pgd = pgd_offset_k(addr);
+		if (WARN_ON(pgd_none(*pgd)))
+			return;
+		p4d = p4d_offset(pgd, addr);
+		if (WARN_ON(p4d_none(*p4d)))
+			return;
+		pud = pud_offset(p4d, addr);
+		if (pud_none(*pud))
+			continue;
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd))
+			continue;
+
+		target_pmd = pti_user_pagetable_walk_pmd(addr);
+		if (WARN_ON(!target_pmd))
+			return;
+
+		/*
+		 * Copy the PMD.  That is, the kernelmode and usermode
+		 * tables will share the last-level page tables of this
+		 * address range
+		 */
+		*target_pmd = pmd_clear_flags(*pmd, clear);
+	}
+}
+
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+	p4d_t *kernel_p4d, *user_p4d;
+	pgd_t *kernel_pgd;
+
+	user_p4d = pti_user_pagetable_walk_p4d(addr);
+	kernel_pgd = pgd_offset_k(addr);
+	kernel_p4d = p4d_offset(kernel_pgd, addr);
+	*user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+/*
+ * Clone the ESPFIX P4D into the user space visible page table.
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+	pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+	pti_clone_pmds((unsigned long) __entry_text_start,
+			(unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("enabled\n");
+
+	pti_clone_user_shared();
+	pti_clone_entry_text();
+	pti_setup_espfix64();
+	pti_setup_vsyscall();
+}

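The runtime switch wired up in pti_check_boottime_disable() can be exercised from the kernel command line (illustrative summary, matching the parsing above):

	pti=off  or  nopti    - force page table isolation off
	pti=on                - force it on, even without X86_BUG_CPU_INSECURE
	pti=auto              - default: enable only when the CPU is marked insecure

For example, booting an affected CPU with "pti=off" logs "Kernel/User page tables isolation: disabled on command line." via the pr_fmt prefix defined in pti.c.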
+ 56 - 2
arch/x86/mm/tlb.c

@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */

+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts.  We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (!static_cpu_has(X86_FEATURE_PTI)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);


@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 		return;
 	}

+	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+		clear_asid_other();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }

+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		invalidate_user_asid(new_asid);
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, true);

 			/*
 			 * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, false);

 			/* See above wrt _rcuidle. */
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);

+ 4 - 1
arch/x86/platform/efi/efi_64.c

@@ -196,6 +196,9 @@ static pgd_t *efi_pgd;
  * because we want to avoid inserting EFI region mappings (EFI_VA_END
  * to EFI_VA_START) into the standard kernel page tables. Everything
  * else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
  */
 int __init efi_alloc_page_tables(void)
 {
@@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void)
 		return 0;

 	gfp_mask = GFP_KERNEL | __GFP_ZERO;
-	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
 	if (!efi_pgd)
 		return -ENOMEM;


+ 11 - 0
include/linux/pti.h

@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _INCLUDE_PTI_H
+#define _INCLUDE_PTI_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/pti.h>
+#else
+static inline void pti_init(void) { }
+#endif
+
+#endif

+ 3 - 0
init/main.c

@@ -75,6 +75,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
+#include <linux/pti.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
@@ -506,6 +507,8 @@ static void __init mm_init(void)
 	ioremap_huge_init();
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
+	/* Should be run after espfix64 is set up. */
+	pti_init();
 }

 asmlinkage __visible void __init start_kernel(void)

+ 10 - 0
security/Kconfig

@@ -54,6 +54,16 @@ config SECURITY_NETWORK
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.

+config PAGE_TABLE_ISOLATION
+	bool "Remove the kernel mapping in user mode"
+	depends on X86_64 && !UML
+	help
+	  This feature reduces the number of hardware side channels by
+	  ensuring that the majority of kernel addresses are not mapped
+	  into userspace.
+
+	  See Documentation/x86/pagetable-isolation.txt for more details.
+
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"
 	depends on SECURITY && INFINIBAND

+ 1 - 2
tools/testing/selftests/x86/ldt_gdt.c

@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
 	 * NB: Different Linux versions do different things with the
 	 * accessed bit in set_thread_area().
 	 */
-	if (ar != expected_ar &&
-	    (ldt || ar != (expected_ar | AR_ACCESSED))) {
+	if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
 		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
 		       (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
 		nerrs++;