
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 page table isolation updates from Thomas Gleixner:
 "This is the final set of enabling page table isolation on x86:

   - Infrastructure patches for handling the extra page tables.

   - Patches which map the various bits and pieces which are required to
     get in and out of user space into the user space visible page
     tables.

   - The required changes to have CR3 switching in the entry/exit code.

   - Optimizations for the CR3 switching along with documentation how
     the ASID/PCID mechanism works.

   - Updates to dump pagetables to cover the user space page tables for
     W+X scans and extra debugfs files to analyze both the kernel and
     the user space visible page tables

  The whole functionality is compile time controlled via a config switch
  and can be turned on/off on the command line as well"

* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
  x86/ldt: Make the LDT mapping RO
  x86/mm/dump_pagetables: Allow dumping current pagetables
  x86/mm/dump_pagetables: Check user space page table for WX pages
  x86/mm/dump_pagetables: Add page table directory to the debugfs VFS hierarchy
  x86/mm/pti: Add Kconfig
  x86/dumpstack: Indicate in Oops whether PTI is configured and enabled
  x86/mm: Clarify the whole ASID/kernel PCID/user PCID naming
  x86/mm: Use INVPCID for __native_flush_tlb_single()
  x86/mm: Optimize RESTORE_CR3
  x86/mm: Use/Fix PCID to optimize user/kernel switches
  x86/mm: Abstract switching CR3
  x86/mm: Allow flushing for future ASID switches
  x86/pti: Map the vsyscall page if needed
  x86/pti: Put the LDT in its own PGD if PTI is on
  x86/mm/64: Make a full PGD-entry size hole in the memory map
  x86/events/intel/ds: Map debug buffers in cpu_entry_area
  x86/cpu_entry_area: Add debugstore entries to cpu_entry_area
  x86/mm/pti: Map ESPFIX into user space
  x86/mm/pti: Share entry text PMD
  x86/entry: Align entry text section to PMD boundary
  ...
Linus Torvalds, 7 years ago
Commit 5aa90a8458
45 changed files, 1636 insertions(+) and 202 deletions(-) (per file: additions, deletions):
  1. 8 0
      Documentation/admin-guide/kernel-parameters.txt
  2. 3 2
      Documentation/x86/x86_64/mm.txt
  3. 3 0
      arch/x86/boot/compressed/pagetable.c
  4. 145 0
      arch/x86/entry/calling.h
  5. 41 7
      arch/x86/entry/entry_64.S
  6. 23 1
      arch/x86/entry/entry_64_compat.S
  7. 3 3
      arch/x86/entry/vsyscall/vsyscall_64.c
  8. 83 47
      arch/x86/events/intel/ds.c
  9. 4 19
      arch/x86/events/perf_event.h
  10. 13 0
      arch/x86/include/asm/cpu_entry_area.h
  11. 3 1
      arch/x86/include/asm/cpufeatures.h
  12. 2 0
      arch/x86/include/asm/desc.h
  13. 7 1
      arch/x86/include/asm/disabled-features.h
  14. 36 0
      arch/x86/include/asm/intel_ds.h
  15. 53 6
      arch/x86/include/asm/mmu_context.h
  16. 11 0
      arch/x86/include/asm/pgalloc.h
  17. 26 4
      arch/x86/include/asm/pgtable.h
  18. 92 0
      arch/x86/include/asm/pgtable_64.h
  19. 6 2
      arch/x86/include/asm/pgtable_64_types.h
  20. 5 0
      arch/x86/include/asm/processor-flags.h
  21. 16 7
      arch/x86/include/asm/processor.h
  22. 14 0
      arch/x86/include/asm/pti.h
  23. 171 31
      arch/x86/include/asm/tlbflush.h
  24. 1 0
      arch/x86/include/asm/vsyscall.h
  25. 6 1
      arch/x86/include/uapi/asm/processor-flags.h
  26. 4 0
      arch/x86/kernel/asm-offsets.c
  27. 8 1
      arch/x86/kernel/cpu/common.c
  28. 4 2
      arch/x86/kernel/dumpstack.c
  29. 27 3
      arch/x86/kernel/head_64.S
  30. 141 3
      arch/x86/kernel/ldt.c
  31. 2 9
      arch/x86/kernel/tls.c
  32. 8 0
      arch/x86/kernel/vmlinux.lds.S
  33. 4 3
      arch/x86/mm/Makefile
  34. 27 0
      arch/x86/mm/cpu_entry_area.c
  35. 74 6
      arch/x86/mm/debug_pagetables.c
  36. 38 5
      arch/x86/mm/dump_pagetables.c
  37. 49 31
      arch/x86/mm/init.c
  38. 3 2
      arch/x86/mm/pgtable.c
  39. 387 0
      arch/x86/mm/pti.c
  40. 56 2
      arch/x86/mm/tlb.c
  41. 4 1
      arch/x86/platform/efi/efi_64.c
  42. 11 0
      include/linux/pti.h
  43. 3 0
      init/main.c
  44. 10 0
      security/Kconfig
  45. 1 2
      tools/testing/selftests/x86/ldt_gdt.c
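
The pull message above mentions documentation of how the ASID/PCID mechanism works. As a quick orientation before the per-file diffs, here is a small user-space sketch (not kernel code) of the ASID/kPCID/uPCID numbering introduced in the arch/x86/include/asm/tlbflush.h hunk below: the kernel PCID for a dynamic ASID is ASID + 1 (PCID 0 stays reserved for non-PCID-aware use), and the matching user PCID is the same value with bit 11 (X86_CR3_PTI_SWITCH_BIT) set, i.e. kPCID + 2048.

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PTI_SWITCH_BIT	11	/* added in processor-flags.h below */
#define TLB_NR_DYN_ASIDS	6	/* per-CPU dynamic ASID slots */

/* kern_pcid(): ASID 0..5 -> kPCID 1..6 (PCID 0 is reserved). */
static uint16_t kern_pcid(uint16_t asid)
{
	return asid + 1;
}

/* user_pcid(): the same PCID with the user/kernel switch bit set. */
static uint16_t user_pcid(uint16_t asid)
{
	return kern_pcid(asid) | (1u << X86_CR3_PTI_SWITCH_BIT);
}

int main(void)
{
	for (uint16_t asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       asid, kern_pcid(asid), user_pcid(asid));
	return 0;
}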

+ 8 - 0
Documentation/admin-guide/kernel-parameters.txt

@@ -2708,6 +2708,8 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
+	nopti		[X86-64] Disable kernel page table isolation
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3284,12 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.

+ 3 - 2
Documentation/x86/x86_64/mm.txt

@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,8 +30,8 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...

+ 3 - 0
arch/x86/boot/compressed/pagetable.c

@@ -23,6 +23,9 @@
  */
 #undef CONFIG_AMD_MEM_ENCRYPT
 
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"
 
 /* These actually do the work of building the kernel identity maps. */

+ 145 - 0
arch/x86/entry/calling.h

@@ -1,6 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
 /*
 
@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq    $(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask   \
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq     $(PTI_SWITCH_MASK), \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+	movq	%cr3, \scratch_reg
+	movq	\scratch_reg, \save_reg
+	/*
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
+	 */
+	testq	$(PTI_SWITCH_MASK), \scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 \scratch_reg
+	movq	\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * KERNEL pages can always resume with NOFLUSH as we do
+	 * explicit flushes.
+	 */
+	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	jnc	.Lnoflush_\@
+
+	/*
+	 * Check if there's a pending flush for the user ASID we're
+	 * about to set.
+	 */
+	movq	\save_reg, \scratch_reg
+	andq	$(0x7FF), \scratch_reg
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+	/*
+	 * The CR3 write could be avoided when not changing its value,
+	 * but would require a CR3 read *and* a scratch register.
+	 */
+	movq	\save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */
 
 /*

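The macros above are assembly; the following is a rough user-space C model (a sketch, not the kernel implementation) of what SWITCH_TO_USER_CR3_NOSTACK does with the per-CPU user_pcid_flush_mask: flip CR3 to the user PGD half (bit 12) and the user PCID (bit 11), then either perform a flushing CR3 write and clear the pending bit, or set the PCID NOFLUSH bit (bit 63) when no flush is pending for that ASID.

#include <stdint.h>
#include <stdio.h>

#define PTI_SWITCH_PGTABLES_MASK	(1ULL << 12)	/* user half of the 8k PGD */
#define PTI_SWITCH_MASK			(PTI_SWITCH_PGTABLES_MASK | (1ULL << 11))
#define CR3_NOFLUSH			(1ULL << 63)	/* X86_CR3_PCID_NOFLUSH */

/* Stand-in for the per-CPU cpu_tlbstate.user_pcid_flush_mask. */
static uint16_t user_pcid_flush_mask;

/* Rough analogue of invalidate_user_asid(): defer the flush to the next switch. */
static void mark_user_pcid_stale(unsigned int pcid)
{
	user_pcid_flush_mask |= 1u << pcid;
}

/* Rough analogue of SWITCH_TO_USER_CR3_NOSTACK. */
static uint64_t switch_to_user_cr3(uint64_t kernel_cr3)
{
	unsigned int pcid = kernel_cr3 & 0x7ff;		/* same mask the asm uses */
	uint64_t cr3 = kernel_cr3 | PTI_SWITCH_MASK;	/* user PGD half + user PCID */

	if (user_pcid_flush_mask & (1u << pcid))
		user_pcid_flush_mask &= ~(1u << pcid);	/* flushing CR3 write */
	else
		cr3 |= CR3_NOFLUSH;			/* keep TLB entries for this PCID */

	return cr3;
}

int main(void)
{
	uint64_t kcr3 = 0x1234000ULL | 2;	/* kernel PGD address + kPCID 2 */

	mark_user_pcid_stale(2);
	printf("return to user (flush pending): %#llx\n",
	       (unsigned long long)switch_to_user_cr3(kcr3));
	printf("return to user (no flush):      %#llx\n",
	       (unsigned long long)switch_to_user_cr3(kcr3));
	return 0;
}
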
+ 41 - 7
arch/x86/entry/entry_64.S

@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <asm/frame.h>
 #include <linux/err.h>
 #include <linux/err.h>
 
 
+#include "calling.h"
+
 .code64
 .code64
 .section .entry.text, "ax"
 .section .entry.text, "ax"
 
 
@@ -168,6 +169,9 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	/* Stash the user RSP. */
 	/* Stash the user RSP. */
 	movq	%rsp, RSP_SCRATCH
 	movq	%rsp, RSP_SCRATCH
 
 
+	/* Note: using %rsp as a scratch reg. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	/* Load the top of the task stack into RSP */
 	/* Load the top of the task stack into RSP */
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
 
 
@@ -207,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
 	 */
 	 */
 
 
 	swapgs
 	swapgs
+	/*
+	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * is not required to switch CR3.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 
@@ -403,6 +411,7 @@ syscall_return_via_sysret:
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We are on the trampoline stack.  All regs except RDI are live.
 	 * We can do future final exit work right here.
 	 * We can do future final exit work right here.
 	 */
 	 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 
 
 	popq	%rdi
 	popq	%rdi
 	popq	%rsp
 	popq	%rsp
@@ -740,6 +749,8 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	 * We can do future final exit work right here.
 	 * We can do future final exit work right here.
 	 */
 	 */
 
 
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
 	/* Restore RDI. */
 	/* Restore RDI. */
 	popq	%rdi
 	popq	%rdi
 	SWAPGS
 	SWAPGS
@@ -822,7 +833,9 @@ native_irq_return_ldt:
 	 */
 	 */
 
 
 	pushq	%rdi				/* Stash user RDI */
 	pushq	%rdi				/* Stash user RDI */
-	SWAPGS
+	SWAPGS					/* to kernel GS */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
+
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -838,7 +851,6 @@ native_irq_return_ldt:
 	/* Now RAX == RSP. */
 	/* Now RAX == RSP. */
 
 
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
-	popq	%rdi				/* Restore user RDI */
 
 
 	/*
 	/*
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -849,7 +861,11 @@ native_irq_return_ldt:
 	 * still points to an RO alias of the ESPFIX stack.
 	 * still points to an RO alias of the ESPFIX stack.
 	 */
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
 	orq	PER_CPU_VAR(espfix_stack), %rax
-	SWAPGS
+
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+	SWAPGS					/* to user GS */
+	popq	%rdi				/* Restore user RDI */
+
 	movq	%rax, %rsp
 	movq	%rax, %rsp
 	UNWIND_HINT_IRET_REGS offset=8
 	UNWIND_HINT_IRET_REGS offset=8
 
 
@@ -949,6 +965,8 @@ ENTRY(switch_to_thread_stack)
 	UNWIND_HINT_FUNC
 	UNWIND_HINT_FUNC
 
 
 	pushq	%rdi
 	pushq	%rdi
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
 	movq	%rsp, %rdi
 	movq	%rsp, %rdi
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
 	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
@@ -1250,7 +1268,11 @@ ENTRY(paranoid_entry)
 	js	1f				/* negative -> in kernel */
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	SWAPGS
 	xorl	%ebx, %ebx
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 END(paranoid_entry)
 
 
 /*
 /*
@@ -1272,6 +1294,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
 .Lparanoid_exit_no_swapgs:
@@ -1299,6 +1322,8 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 * from user mode due to an IRET fault.
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 
 .Lerror_entry_from_usermode_after_swapgs:
 .Lerror_entry_from_usermode_after_swapgs:
 	/* Put us onto the real thread stack. */
 	/* Put us onto the real thread stack. */
@@ -1345,6 +1370,7 @@ ENTRY(error_entry)
 	 * .Lgs_change's error handler with kernel gsbase.
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	jmp .Lerror_entry_done
 	jmp .Lerror_entry_done
 
 
 .Lbstep_iret:
 .Lbstep_iret:
@@ -1354,10 +1380,11 @@ ENTRY(error_entry)
 
 
 .Lerror_bad_iret:
 .Lerror_bad_iret:
 	/*
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 
 	/*
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1389,6 +1416,10 @@ END(error_exit)
 /*
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
  * so we can use real assembly here.
+ *
+ * Registers:
+ *	%r14: Used to save/restore the CR3 of the interrupted context
+ *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
  */
 ENTRY(nmi)
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
 	UNWIND_HINT_IRET_REGS
@@ -1452,6 +1483,7 @@ ENTRY(nmi)
 
 
 	swapgs
 	swapgs
 	cld
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1704,6 +1736,8 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	movq	$-1, %rsi
 	call	do_nmi
 	call	do_nmi
 
 
+	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 	jnz	nmi_restore
 nmi_swapgs:
 nmi_swapgs:

+ 23 - 1
arch/x86/entry/entry_64_compat.S

@@ -49,6 +49,10 @@
 ENTRY(entry_SYSENTER_compat)
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	/* Interrupts are off on entry. */
 	SWAPGS
 	SWAPGS
+
+	/* We are about to clobber %rsp anyway, clobbering here is OK */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 
 	/*
 	/*
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
 
+	/*
+	 * We just saved %rdi so it is safe to clobber.  It is not
+	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
 	/*
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
 	 * turned them off.
@@ -256,10 +266,22 @@ sysret32_from_system_call:
 	 * when the system call started, which is already known to user
 	 * when the system call started, which is already known to user
 	 * code.  We zero R8-R10 to avoid info leaks.
 	 * code.  We zero R8-R10 to avoid info leaks.
          */
          */
+	movq	RSP-ORIG_RAX(%rsp), %rsp
+
+	/*
+	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+	 * on the process stack which is not mapped to userspace and
+	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
+	 * switch until after after the last reference to the process
+	 * stack.
+	 *
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+	 */
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
 	xorq	%r8, %r8
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r9, %r9
 	xorq	%r10, %r10
 	xorq	%r10, %r10
-	movq	RSP-ORIG_RAX(%rsp), %rsp
 	swapgs
 	swapgs
 	sysretl
 	sysretl
 END(entry_SYSCALL_compat)
 END(entry_SYSCALL_compat)

+ 3 - 3
arch/x86/entry/vsyscall/vsyscall_64.c

@@ -344,14 +344,14 @@ int in_gate_area_no_mm(unsigned long addr)
  * vsyscalls but leave the page not present.  If so, we skip calling
  * this.
  */
-static void __init set_vsyscall_pgtable_user_bits(void)
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 
-	pgd = pgd_offset_k(VSYSCALL_ADDR);
+	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
 	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
 	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
 #if CONFIG_PGTABLE_LEVELS >= 5
@@ -373,7 +373,7 @@ void __init map_vsyscall(void)
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
-		set_vsyscall_pgtable_user_bits();
+		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 	}
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=

+ 83 - 47
arch/x86/events/intel/ds.c

@@ -3,16 +3,18 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 #include <asm/insn.h>
 
 
 #include "../perf_event.h"
 #include "../perf_event.h"
 
 
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 #define BTS_RECORD_SIZE		24
 
 
-#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 
 /*
 /*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
 
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 static DEFINE_PER_CPU(void *, insn_buffer);
 
 
-static int alloc_pebs_buffer(int cpu)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	phys_addr_t pa;
+	size_t msz = 0;
+
+	pa = virt_to_phys(addr);
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+	unsigned int order = get_order(size);
 	int node = cpu_to_node(cpu);
 	int node = cpu_to_node(cpu);
-	int max;
-	void *buffer, *ibuffer;
+	struct page *page;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+	if (buffer)
+		free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	size_t bsiz = x86_pmu.pebs_buffer_size;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 
 	if (!x86_pmu.pebs)
 	if (!x86_pmu.pebs)
 		return 0;
 		return 0;
 
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 	if (unlikely(!buffer))
 	if (unlikely(!buffer))
 		return -ENOMEM;
 		return -ENOMEM;
 
 
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree_pages(buffer, bsiz);
 			return -ENOMEM;
 			return -ENOMEM;
 		}
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
 	}
-
-	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
-	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_pebs_vaddr = buffer;
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
 	ds->pebs_index = ds->pebs_buffer_base;
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-		max * x86_pmu.pebs_record_size;
-
+	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 
 	if (!ds || !x86_pmu.pebs)
 	if (!ds || !x86_pmu.pebs)
 		return;
 		return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 	per_cpu(insn_buffer, cpu) = NULL;
 
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
 	ds->pebs_buffer_base = 0;
+	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+	hwev->ds_pebs_vaddr = NULL;
 }
 }
 
 
 static int alloc_bts_buffer(int cpu)
 static int alloc_bts_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max, thresh;
-	void *buffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *buffer, *cea;
+	int max;
 
 
 	if (!x86_pmu.bts)
 	if (!x86_pmu.bts)
 		return 0;
 		return 0;
 
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 	if (unlikely(!buffer)) {
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
-
-	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-	thresh = max / 16;
-
-	ds->bts_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_bts_vaddr = buffer;
+	/* Update the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
 	ds->bts_index = ds->bts_buffer_base;
-	ds->bts_absolute_maximum = ds->bts_buffer_base +
-		max * BTS_RECORD_SIZE;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-		thresh * BTS_RECORD_SIZE;
-
+	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 
 	if (!ds || !x86_pmu.bts)
 	if (!ds || !x86_pmu.bts)
 		return;
 		return;
 
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
 	ds->bts_buffer_base = 0;
+	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+	hwev->ds_bts_vaddr = NULL;
 }
 }
 
 
 static int alloc_ds_buffer(int cpu)
 static int alloc_ds_buffer(int cpu)
 {
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
 	per_cpu(cpu_hw_events, cpu).ds = ds;
-
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_ds_buffer(int cpu)
 static void release_ds_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 }
 
 
 void release_ds_buffers(void)
 void release_ds_buffers(void)

+ 4 - 19
arch/x86/events/perf_event.h

@@ -14,6 +14,8 @@
 
 
 #include <linux/perf_event.h>
 #include <linux/perf_event.h>
 
 
+#include <asm/intel_ds.h>
+
 /* To enable MSR tracing please use the generic trace points. */
 /* To enable MSR tracing please use the generic trace points. */
 
 
 /*
 /*
@@ -77,8 +79,6 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 };
 
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		8
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 
 /*
 /*
@@ -95,23 +95,6 @@ struct amd_nb {
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 #define PEBS_REGS \
 #define PEBS_REGS \
 	(PERF_REG_X86_AX | \
 	(PERF_REG_X86_AX | \
 	 PERF_REG_X86_BX | \
 	 PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 * Intel DebugStore bits
 	 */
 	 */
 	struct debug_store	*ds;
 	struct debug_store	*ds;
+	void			*ds_pebs_vaddr;
+	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_pebs;
 	int			n_large_pebs;
 	int			n_large_pebs;

+ 13 - 0
arch/x86/include/asm/cpu_entry_area.h

@@ -5,6 +5,7 @@
 
 #include <linux/percpu-defs.h>
 #include <asm/processor.h>
+#include <asm/intel_ds.h>
 
 /*
  * cpu_entry_area is a percpu region that contains things needed by the CPU
@@ -40,6 +41,18 @@ struct cpu_entry_area {
 	 */
 	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
 #endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring. Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
 };
 
 #define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))

+ 3 - 1
arch/x86/include/asm/cpufeatures.h

@@ -197,11 +197,12 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
 
 #endif /* _ASM_X86_CPUFEATURES_H */

+ 2 - 0
arch/x86/include/asm/desc.h

@@ -21,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 	desc->type		= (info->read_exec_only ^ 1) << 1;
 	desc->type	       |= info->contents << 2;
+	/* Set the ACCESS bit so it can be mapped RO */
+	desc->type	       |= 1;
 
 	desc->s			= 1;
 	desc->dpl		= 0x3;

+ 7 - 1
arch/x86/include/asm/disabled-features.h

@@ -50,6 +50,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -60,7 +66,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0

+ 36 - 0
arch/x86/include/asm/intel_ds.h

@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+	char	bts_buffer[BTS_BUFFER_SIZE];
+	char	pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif
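
A small host-side check of the layout above (a sketch assuming 4k pages, compiled as ordinary C): __aligned(PAGE_SIZE) pads struct debug_store from 128 bytes of fields to a full page, which is what the "Waste a full page" comment in ds.c refers to, and the two buffers reserve 64 KiB each per CPU in the cpu_entry_area.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)	/* 64 KiB */
#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)	/* 64 KiB */
#define MAX_PEBS_EVENTS		8

/* Same layout as the new intel_ds.h, compiled host-side only to check sizes. */
struct debug_store {
	uint64_t bts_buffer_base, bts_index;
	uint64_t bts_absolute_maximum, bts_interrupt_threshold;
	uint64_t pebs_buffer_base, pebs_index;
	uint64_t pebs_absolute_maximum, pebs_interrupt_threshold;
	uint64_t pebs_event_reset[MAX_PEBS_EVENTS];
} __attribute__((aligned(PAGE_SIZE)));

struct debug_store_buffers {
	char bts_buffer[BTS_BUFFER_SIZE];
	char pebs_buffer[PEBS_BUFFER_SIZE];
};

int main(void)
{
	/* 16 u64 fields (128 bytes) padded to a whole page by the alignment. */
	printf("sizeof(struct debug_store)         = %zu\n",
	       sizeof(struct debug_store));
	/* Two 64 KiB buffers reserved per CPU in the cpu_entry_area. */
	printf("sizeof(struct debug_store_buffers) = %zu\n",
	       sizeof(struct debug_store_buffers));
	return 0;
}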

+ 53 - 6
arch/x86/include/asm/mmu_context.h

@@ -50,10 +50,33 @@ struct ldt_struct {
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * allocations, but it's not worth trying to optimize.
 	 * allocations, but it's not worth trying to optimize.
 	 */
 	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct	*entries;
+	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the addressed
+	 * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
 };
 
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
 /*
  * Used for LDT copy/destruction.
  * Used for LDT copy/destruction.
  */
  */
@@ -64,6 +87,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
 static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -71,7 +95,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
 {
 {
 	return 0;
 	return 0;
 }
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 #endif
 
 
 static inline void load_mm_ldt(struct mm_struct *mm)
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -96,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 * that we can see.
 	 */
 	 */
 
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
 		clear_LDT();
+	}
 #else
 #else
 	clear_LDT();
 	clear_LDT();
 #endif
 #endif
@@ -194,6 +240,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 {
 	paravirt_arch_exit_mmap(mm);
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 }
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
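
A host-side sketch of the ldt_slot_va() arithmetic above. It assumes the usual LDT_ENTRIES/LDT_ENTRY_SIZE values from asm/segment.h (8192 entries of 8 bytes, which are not part of this diff), the 4-level LDT_BASE_ADDR added in pgtable_64_types.h below, and a 64-bit unsigned long; the two slots sit 64 KiB apart so a new LDT can be mapped and enabled while the old alias stays valid.

#include <stdio.h>

#define LDT_ENTRIES	8192	/* from asm/segment.h, not part of this diff */
#define LDT_ENTRY_SIZE	8
#define LDT_SLOT_STRIDE	(LDT_ENTRIES * LDT_ENTRY_SIZE)	/* 64 KiB, a multiple of PAGE_SIZE */
#define LDT_BASE_ADDR	((unsigned long)-4 << 39)	/* 4-level paging: 0xfffffe0000000000 */

/* Same computation as ldt_slot_va() above (x86-64 case). */
static unsigned long ldt_slot_va(int slot)
{
	return LDT_BASE_ADDR + (unsigned long)LDT_SLOT_STRIDE * slot;
}

int main(void)
{
	printf("LDT alias, slot 0: 0x%016lx\n", ldt_slot_va(0));
	printf("LDT alias, slot 1: 0x%016lx\n", ldt_slot_va(1));
	return 0;
}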

+ 11 - 0
arch/x86/include/asm/pgalloc.h

@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
  */
 extern gfp_t __userpte_alloc_gfp;
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 /*
  * Allocate and free page tables.
  */

+ 26 - 4
arch/x86/include/asm/pgtable.h

@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
 void ptdump_walk_pgd_level_checkwx(void);
 void ptdump_walk_pgd_level_checkwx(void);
 
 
 #ifdef CONFIG_DEBUG_WX
 #ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
 
 static inline int p4d_bad(p4d_t p4d)
 static inline int p4d_bad(p4d_t p4d)
 {
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
 
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
 
 static inline int pgd_bad(pgd_t pgd)
 static inline int pgd_bad(pgd_t pgd)
 {
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 }
 
 
 static inline int pgd_none(pgd_t pgd)
 static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
  * pgd_offset() returns a (pgd_t *)
  * pgd_offset() returns a (pgd_t *)
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  */
  */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
 /*
 /*
  * a shortcut which implies the use of the kernel's pgd, instead
  * a shortcut which implies the use of the kernel's pgd, instead
  * of a process's
  * of a process's
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
  */
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/* Clone the user space pgd as well */
+	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 }
 
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)

+ 92 - 0
arch/x86/include/asm/pgtable_64.h

@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
 #endif
 #endif
 }
 }
 
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
+ * the user one is in the last 4k.  To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT	PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr |= BIT(bit);
+	return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr &= ~BIT(bit);
+	return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+	return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+	return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+	return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+	return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return pgd;
+	return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+#endif
+
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
 {
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
 	*p4dp = p4d;
 	*p4dp = p4d;
+#endif
 }
 }
 
 
 static inline void native_p4d_clear(p4d_t *p4d)
 static inline void native_p4d_clear(p4d_t *p4d)
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
 
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	*pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
 	*pgdp = pgd;
 	*pgdp = pgd;
+#endif
 }
 }
 
 
 static inline void native_pgd_clear(pgd_t *pgd)
 static inline void native_pgd_clear(pgd_t *pgd)
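
The helpers above reduce to pointer arithmetic on the order-1 PGD allocation described in pgalloc.h: flip bit 12 to move between the kernel and user 4k halves, and treat the lower half of each 4k table as the userspace entries. A minimal user-space sketch of that arithmetic, using an ordinary 8k-aligned allocation in place of the real PGD pages:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE		4096UL
#define PTI_PGTABLE_SWITCH_BIT	12	/* == PAGE_SHIFT */
#define PGD_ALLOCATION_ORDER	1	/* two pages: kernel half + user half */

/* kernel_to_user_pgdp()/user_to_kernel_pgdp(), on an ordinary allocation. */
static void *kernel_to_user_half(void *pgd)
{
	return (void *)((uintptr_t)pgd | (1UL << PTI_PGTABLE_SWITCH_BIT));
}

static void *user_to_kernel_half(void *pgd)
{
	return (void *)((uintptr_t)pgd & ~(1UL << PTI_PGTABLE_SWITCH_BIT));
}

/* pgdp_maps_userspace(): entries in the lower half of a 4k table map userspace. */
static int maps_userspace(void *pgdp)
{
	return ((uintptr_t)pgdp & (PAGE_SIZE - 1)) < PAGE_SIZE / 2;
}

int main(void)
{
	size_t size = PAGE_SIZE << PGD_ALLOCATION_ORDER;	/* 8k */
	char *pgd = aligned_alloc(size, size);			/* 8k-aligned, like the order-1 page */

	assert(pgd && ((uintptr_t)pgd & (size - 1)) == 0);
	printf("kernel PGD half: %p\n", (void *)pgd);
	printf("user   PGD half: %p\n", kernel_to_user_half(pgd));
	assert(user_to_kernel_half(kernel_to_user_half(pgd)) == (void *)pgd);

	printf("entry 0   maps userspace? %d\n", maps_userspace(pgd + 0 * 8));
	printf("entry 511 maps userspace? %d\n", maps_userspace(pgd + 511 * 8));

	free(pgd);
	return 0;
}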

+ 6 - 2
arch/x86/include/asm/pgtable_64_types.h

@@ -79,13 +79,17 @@ typedef struct { pteval_t pte; } pte_t;
 #define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
 
 #ifdef CONFIG_X86_5LEVEL
-# define VMALLOC_SIZE_TB	_AC(16384, UL)
-# define __VMALLOC_BASE		_AC(0xff92000000000000, UL)
+# define VMALLOC_SIZE_TB	_AC(12800, UL)
+# define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-112, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY

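The new LDT_BASE_ADDR values can be cross-checked against the LDT remap ranges shown in the Documentation/x86/x86_64/mm.txt hunk earlier in this diff with a few lines of C (assuming a 64-bit unsigned long):

#include <stdio.h>

int main(void)
{
	/* 4-level paging: LDT_PGD_ENTRY = -4, PGDIR_SHIFT = 39 */
	unsigned long ldt_base_4l = (unsigned long)-4 << 39;
	/* 5-level paging: LDT_PGD_ENTRY = -112, PGDIR_SHIFT = 48 */
	unsigned long ldt_base_5l = (unsigned long)-112 << 48;

	printf("4-level LDT_BASE_ADDR: 0x%016lx\n", ldt_base_4l);	/* fffffe0000000000 */
	printf("5-level LDT_BASE_ADDR: 0x%016lx\n", ldt_base_5l);	/* ff90000000000000 */
	return 0;
}
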
+ 5 - 0
arch/x86/include/asm/processor-flags.h

@@ -38,6 +38,11 @@
 #define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
 #define CR3_PCID_MASK	0xFFFull
 #define CR3_NOFLUSH	BIT_ULL(63)
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define X86_CR3_PTI_SWITCH_BIT	11
+#endif
+
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save

+ 16 - 7
arch/x86/include/asm/processor.h

@@ -852,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)
 
 
 #else
 #else
 /*
 /*
- * User space process size. 47bits minus one guard page.  The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously.  We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size.  This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen.  This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
  */
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 #define TASK_SIZE_MAX	((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
 
 

+ 14 - 0
arch/x86/include/asm/pti.h

@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _ASM_X86_PTI_H
+#define _ASM_X86_PTI_H
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern void pti_init(void);
+extern void pti_check_boottime_disable(void);
+#else
+static inline void pti_check_boottime_disable(void) { }
+#endif
+
+#endif /* __ASSEMBLY__ */
+#endif /* _ASM_X86_PTI_H */

+ 171 - 31
arch/x86/include/asm/tlbflush.h

@@ -10,38 +10,90 @@
 #include <asm/special_insns.h>
 #include <asm/special_insns.h>
 #include <asm/smp.h>
 #include <asm/smp.h>
 #include <asm/invpcid.h>
 #include <asm/invpcid.h>
+#include <asm/pti.h>
+#include <asm/processor-flags.h>
 
 
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
-{
-	/*
-	 * Bump the generation count.  This also serves as a full barrier
-	 * that synchronizes with switch_mm(): callers are required to order
-	 * their read of mm_cpumask after their writes to the paging
-	 * structures.
-	 */
-	return atomic64_inc_return(&mm->context.tlb_gen);
-}
+/*
+ * The x86 feature is called PCID (Process Context IDentifier). It is similar
+ * to what is traditionally called ASID on the RISC processors.
+ *
+ * We don't use the traditional ASID implementation, where each process/mm gets
+ * its own ASID and flush/restart when we run out of ASID space.
+ *
+ * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
+ * that came by on this CPU, allowing cheaper switch_mm between processes on
+ * this CPU.
+ *
+ * We end up with different spaces for different things. To avoid confusion we
+ * use different names for each of them:
+ *
+ * ASID  - [0, TLB_NR_DYN_ASIDS-1]
+ *         the canonical identifier for an mm
+ *
+ * kPCID - [1, TLB_NR_DYN_ASIDS]
+ *         the value we write into the PCID part of CR3; corresponds to the
+ *         ASID+1, because PCID 0 is special.
+ *
+ * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
+ *         for KPTI each mm has two address spaces and thus needs two
+ *         PCID values, but we can still do with a single ASID denomination
+ *         for each mm. Corresponds to kPCID + 2048.
+ *
+ */
 
 
 /* There are 12 bits of space for ASIDS in CR3 */
 /* There are 12 bits of space for ASIDS in CR3 */
 #define CR3_HW_ASID_BITS		12
 #define CR3_HW_ASID_BITS		12
+
 /*
 /*
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
  * user/kernel switches
  * user/kernel switches
  */
  */
-#define PTI_CONSUMED_ASID_BITS		0
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define PTI_CONSUMED_PCID_BITS	1
+#else
+# define PTI_CONSUMED_PCID_BITS	0
+#endif
+
+#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
 
 
-#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
 /*
 /*
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
  * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account
- * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+ * for them being zero-based.  Another -1 is because PCID 0 is reserved for
  * use by non-PCID-aware users.
  * use by non-PCID-aware users.
  */
  */
-#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
 
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in two cache
+ * lines.
+ */
+#define TLB_NR_DYN_ASIDS	6
+
+/*
+ * Given @asid, compute kPCID
+ */
 static inline u16 kern_pcid(u16 asid)
 static inline u16 kern_pcid(u16 asid)
 {
 {
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
 	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	/*
+	 * Make sure that the dynamic ASID space does not confict with the
+	 * bit we are using to switch between user and kernel ASIDs.
+	 */
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+
+	/*
+	 * The ASID being passed in here should have respected the
+	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
+	 */
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+#endif
 	/*
 	/*
+	 * The dynamically-assigned ASIDs that get passed in are small
+	 * (<TLB_NR_DYN_ASIDS).  They never have the high switch bit set,
+	 * so do not bother to clear it.
+	 *
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * If PCID is on, ASID-aware code paths put the ASID+1 into the
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * PCID bits.  This serves two purposes.  It prevents a nasty
 	 * situation in which PCID-unaware code saves CR3, loads some other
 	 * situation in which PCID-unaware code saves CR3, loads some other
@@ -53,6 +105,18 @@ static inline u16 kern_pcid(u16 asid)
 	return asid + 1;
 	return asid + 1;
 }
 }
 
 
+/*
+ * Given @asid, compute uPCID
+ */
+static inline u16 user_pcid(u16 asid)
+{
+	u16 ret = kern_pcid(asid);
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+#endif
+	return ret;
+}
+
 struct pgd_t;
 struct pgd_t;
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
 {
@@ -95,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
 	return !static_cpu_has(X86_FEATURE_PCID);
 	return !static_cpu_has(X86_FEATURE_PCID);
 }
 }
 
 
-/*
- * 6 because 6 should be plenty and struct tlb_state will fit in
- * two cache lines.
- */
-#define TLB_NR_DYN_ASIDS 6
-
 struct tlb_context {
 struct tlb_context {
 	u64 ctx_id;
 	u64 ctx_id;
 	u64 tlb_gen;
 	u64 tlb_gen;
@@ -134,6 +192,24 @@ struct tlb_state {
 	 */
 	 */
 	bool is_lazy;
 	bool is_lazy;
 
 
+	/*
+	 * If set we changed the page tables in such a way that we
+	 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
+	 * This tells us to go invalidate all the non-loaded ctxs[]
+	 * on the next context switch.
+	 *
+	 * The current ctx was kept up-to-date as it ran and does not
+	 * need to be invalidated.
+	 */
+	bool invalidate_other;
+
+	/*
+	 * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
+	 * the corresponding user PCID needs a flush next time we
+	 * switch to it; see SWITCH_TO_USER_CR3.
+	 */
+	unsigned short user_pcid_flush_mask;
+
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
 	 * disabling interrupts when modifying either one.
@@ -214,6 +290,14 @@ static inline unsigned long cr4_read_shadow(void)
 	return this_cpu_read(cpu_tlbstate.cr4);
 }

+/*
+ * Mark all other ASIDs as invalid, preserves the current.
+ */
+static inline void invalidate_other_asid(void)
+{
+	this_cpu_write(cpu_tlbstate.invalidate_other, true);
+}
+
 /*
  * Save some of cr4 feature set we're using (e.g.  Pentium 4MB
  * enable and PPro Global page enable), so that any CPU's that boot
@@ -233,15 +317,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)

 extern void initialize_tlbstate_and_flush(void);

+/*
+ * Given an ASID, flush the corresponding user ASID.  We can delay this
+ * until the next time we switch to it.
+ *
+ * See SWITCH_TO_USER_CR3.
+ */
+static inline void invalidate_user_asid(u16 asid)
+{
+	/* There is no user ASID if address space separation is off */
+	if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		return;
+
+	/*
+	 * We only have a single ASID if PCID is off and the CR3
+	 * write will have flushed it.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_PCID))
+		return;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	__set_bit(kern_pcid(asid),
+		  (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
+}
+
 /*
  * flush the entire current user mapping
  */
 static inline void __native_flush_tlb(void)
 {
+	invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 	/*
-	 * If current->mm == NULL then we borrow a mm which may change during a
-	 * task switch and therefore we must not be preempted while we write CR3
-	 * back:
+	 * If current->mm == NULL then we borrow a mm which may change
+	 * during a task switch and therefore we must not be preempted
+	 * while we write CR3 back:
 	 */
 	preempt_disable();
 	native_write_cr3(__native_read_cr3());
@@ -259,6 +370,8 @@ static inline void __native_flush_tlb_global(void)
 		/*
 		 * Using INVPCID is considerably faster than a pair of writes
 		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 *
+		 * Note, this works with CR4.PCIDE=0 or 1.
 		 */
 		invpcid_flush_all();
 		return;
@@ -285,7 +398,21 @@ static inline void __native_flush_tlb_global(void)
  */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
+	 * Just use invalidate_user_asid() in case we are called early.
+	 */
+	if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
+		invalidate_user_asid(loaded_mm_asid);
+	else
+		invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
 }

 /*
@@ -301,14 +428,6 @@ static inline void __flush_tlb_all(void)
 		 */
 		__flush_tlb();
 	}
-
-	/*
-	 * Note: if we somehow had PCID but not PGE, then this wouldn't work --
-	 * we'd end up flushing kernel translations for the current ASID but
-	 * we might fail to flush kernel translations for other cached ASIDs.
-	 *
-	 * To avoid this issue, we force PCID off if PGE is off.
-	 */
 }

 /*
@@ -318,6 +437,16 @@ static inline void __flush_tlb_one(unsigned long addr)
 {
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
 	__flush_tlb_single(addr);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	/*
+	 * __flush_tlb_single() will have cleared the TLB entry for this ASID,
+	 * but since kernel space is replicated across all, we must also
+	 * invalidate all others.
+	 */
+	invalidate_other_asid();
 }

 #define TLB_FLUSH_ALL	-1UL
@@ -378,6 +507,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info);

+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
+{
+	/*
+	 * Bump the generation count.  This also serves as a full barrier
+	 * that synchronizes with switch_mm(): callers are required to order
+	 * their read of mm_cpumask after their writes to the paging
+	 * structures.
+	 */
+	return atomic64_inc_return(&mm->context.tlb_gen);
+}
+
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
 					struct mm_struct *mm)
 {

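As a quick illustration (not part of the patch itself): the helpers above turn a dynamic ASID into a kernel PCID by adding 1, and into a user PCID by additionally setting the PTI switch bit. A minimal user-space sketch, assuming X86_CR3_PTI_SWITCH_BIT is the top PCID bit (bit 11) — the real value comes from the headers in this series:

	#include <stdio.h>
	#include <stdint.h>

	#define X86_CR3_PTI_SWITCH_BIT 11	/* assumed value for this sketch */

	static uint16_t kern_pcid(uint16_t asid) { return asid + 1; }

	static uint16_t user_pcid(uint16_t asid)
	{
		return kern_pcid(asid) | (1u << X86_CR3_PTI_SWITCH_BIT);
	}

	int main(void)
	{
		/* ASID 0, the first dynamic ASID: kernel PCID 0x001, user PCID 0x801 */
		printf("kPCID=%#x uPCID=%#x\n", kern_pcid(0), user_pcid(0));
		return 0;
	}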
+ 1 - 0
arch/x86/include/asm/vsyscall.h

@@ -7,6 +7,7 @@

 #ifdef CONFIG_X86_VSYSCALL_EMULATION
 extern void map_vsyscall(void);
+extern void set_vsyscall_pgtable_user_bits(pgd_t *root);

 /*
  * Called on instruction fetch fault in vsyscall page.

+ 6 - 1
arch/x86/include/uapi/asm/processor-flags.h

@@ -78,7 +78,12 @@
 #define X86_CR3_PWT		_BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT		4 /* Page Cache Disable */
 #define X86_CR3_PCD		_BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK	_AC(0x00000fff,UL) /* PCID Mask */
+
+#define X86_CR3_PCID_BITS	12
+#define X86_CR3_PCID_MASK	(_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
+
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)

 /*
  * Intel CPU features in CR4

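For example (illustrative only, mirroring build_cr3()/build_cr3_noflush() from the tlbflush.h changes above): a CR3 value is the physical address of the PGD with the kernel PCID in the low X86_CR3_PCID_BITS, and bit 63 set when the implicit TLB flush on the CR3 write should be suppressed:

	/* sketch only; __pa(), kern_pcid() and the macros are used as above */
	unsigned long cr3         = __pa(pgd) | kern_pcid(asid);
	unsigned long cr3_noflush = __pa(pgd) | kern_pcid(asid) | X86_CR3_PCID_NOFLUSH;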
+ 4 - 0
arch/x86/kernel/asm-offsets.c

@@ -17,6 +17,7 @@
 #include <asm/sigframe.h>
 #include <asm/bootparam.h>
 #include <asm/suspend.h>
+#include <asm/tlbflush.h>

 #ifdef CONFIG_XEN
 #include <xen/interface/xen.h>
@@ -94,6 +95,9 @@ void common(void) {
 	BLANK();
 	DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));

+	/* TLB state for the entry code */
+	OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
+
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
 	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);

+ 8 - 1
arch/x86/kernel/cpu/common.c

@@ -922,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	}

 	setup_force_cpu_cap(X86_FEATURE_ALWAYS);
+
+	/* Assume for now that ALL x86 CPUs are insecure */
+	setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
+
 	fpu__init_system(c);

 #ifdef CONFIG_X86_32
@@ -1360,7 +1364,10 @@ void syscall_init(void)
 		(entry_SYSCALL_64_trampoline - _entry_trampoline);

 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	if (static_cpu_has(X86_FEATURE_PTI))
+		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
+	else
+		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);

+ 4 - 2
arch/x86/kernel/dumpstack.c

@@ -297,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	unsigned long sp;
 #endif
 	printk(KERN_DEFAULT
-	       "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter,
+	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
 	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT"         : "",
 	       IS_ENABLED(CONFIG_SMP)     ? " SMP"             : "",
 	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
-	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "");
+	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN"           : "",
+	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
+	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");

 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)

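With the extra format specifier, the first line of an Oops now reports whether PTI is active. Illustrative output (not from the patch):

	general protection fault: 0000 [#1] SMP PTI

" NOPTI" is printed instead when CONFIG_PAGE_TABLE_ISOLATION is built in but isolation is disabled at runtime.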
+ 27 - 3
arch/x86/kernel/head_64.S

@@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
 	.balign	PAGE_SIZE; \
 GLOBAL(name)

+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define PTI_USER_PGD_FILL	512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define PTI_USER_PGD_FILL	0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -350,13 +371,14 @@ GLOBAL(name)
 	.endr

 	__INITDATA
-NEXT_PAGE(early_top_pgt)
+NEXT_PGD_PAGE(early_top_pgt)
 	.fill	511,8,0
 #ifdef CONFIG_X86_5LEVEL
 	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #else
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
 #endif
+	.fill	PTI_USER_PGD_FILL,8,0

 NEXT_PAGE(early_dynamic_pgts)
 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data

 #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_START_KERNEL*8, 0
 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
+	.fill	PTI_USER_PGD_FILL,8,0

 NEXT_PAGE(level3_ident_pgt)
 	.quad	level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #else
-NEXT_PAGE(init_top_pgt)
+NEXT_PGD_PAGE(init_top_pgt)
 	.fill	512,8,0
+	.fill	PTI_USER_PGD_FILL,8,0
 #endif

 #ifdef CONFIG_X86_5LEVEL

+ 141 - 3
arch/x86/kernel/ldt.c

@@ -24,6 +24,7 @@
 #include <linux/uaccess.h>

 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -51,13 +52,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
 	struct mm_struct *mm = __mm;
-	mm_context_t *pc;

 	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
 		return;

-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	load_mm_ldt(mm);

 	refresh_ldt_segments();
 }
@@ -94,10 +93,126 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
 		return NULL;
 	}

+	/* The new LDT isn't aliased for PTI yet. */
+	new_ldt->slot = -1;
+
 	new_ldt->nr_entries = num_entries;
 	return new_ldt;
 }

+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function.  Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	bool is_vmalloc, had_top_level_entry;
+	unsigned long va;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return 0;
+
+	/*
+	 * Any given ldt_struct should have map_ldt_struct() called at most
+	 * once.
+	 */
+	WARN_ON(ldt->slot != -1);
+
+	/*
+	 * Did we already have the top level entry allocated?  We can't
+	 * use pgd_none() for this because it doesn't do anything on
+	 * 4-level page table kernels.
+	 */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	had_top_level_entry = (pgd->pgd != 0);
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn;
+		pte_t pte, *ptep;
+
+		va = (unsigned long)ldt_slot_va(slot) + offset;
+		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+		/*
+		 * Treat the PTI LDT range as a *userspace* range.
+		 * get_locked_pte() will allocate all needed pagetables
+		 * and account for them in this mm.
+		 */
+		ptep = get_locked_pte(mm, va, &ptl);
+		if (!ptep)
+			return -ENOMEM;
+		/*
+		 * Map it RO so the easy to find address is not a primary
+		 * target via some kernel interface which misses a
+		 * permission check.
+		 */
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
+		set_pte_at(mm, va, ptep, pte);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	if (mm->context.ldt) {
+		/*
+		 * We already had an LDT.  The top-level entry should already
+		 * have been allocated and synchronized with the usermode
+		 * tables.
+		 */
+		WARN_ON(!had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI))
+			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+	} else {
+		/*
+		 * This is the first time we're mapping an LDT for this process.
+		 * Sync the pgd to the usermode tables.
+		 */
+		WARN_ON(had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+			set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+		}
+	}
+
+	va = (unsigned long)ldt_slot_va(slot);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+	ldt->slot = slot;
+#endif
+	return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -156,6 +271,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
 	finalize_ldt_struct(new_ldt);

+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval) {
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
 	mm->context.ldt = new_ldt;

 out_unlock:
@@ -174,6 +295,11 @@ void destroy_context_ldt(struct mm_struct *mm)
 	mm->context.ldt = NULL;
 }

+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
 	struct mm_struct *mm = current->mm;
@@ -287,6 +413,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 	new_ldt->entries[ldt_info.entry_number] = ldt;
 	finalize_ldt_struct(new_ldt);

+	/*
+	 * If we are using PTI, map the new LDT into the userspace pagetables.
+	 * If there is already an LDT, use the other slot so that other CPUs
+	 * will continue to use the old LDT until install_ldt() switches
+	 * them over to the new LDT.
+	 */
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
+
 	install_ldt(mm, new_ldt);
 	free_ldt_struct(old_ldt);
 	error = 0;

+ 2 - 9
arch/x86/kernel/tls.c

@@ -93,17 +93,10 @@ static void set_tls_desc(struct task_struct *p, int idx,
 	cpu = get_cpu();

 	while (n-- > 0) {
-		if (LDT_empty(info) || LDT_zero(info)) {
+		if (LDT_empty(info) || LDT_zero(info))
 			memset(desc, 0, sizeof(*desc));
-		} else {
+		else
 			fill_ldt(desc, info);
-
-			/*
-			 * Always set the accessed bit so that the CPU
-			 * doesn't try to write to the (read-only) GDT.
-			 */
-			desc->type |= 1;
-		}
 		++info;
 		++desc;
 	}

+ 8 - 0
arch/x86/kernel/vmlinux.lds.S

@@ -61,11 +61,17 @@ jiffies_64 = jiffies;
 		. = ALIGN(HPAGE_SIZE);				\
 		__end_rodata_hpage_align = .;

+#define ALIGN_ENTRY_TEXT_BEGIN	. = ALIGN(PMD_SIZE);
+#define ALIGN_ENTRY_TEXT_END	. = ALIGN(PMD_SIZE);
+
 #else

 #define X64_ALIGN_RODATA_BEGIN
 #define X64_ALIGN_RODATA_END

+#define ALIGN_ENTRY_TEXT_BEGIN
+#define ALIGN_ENTRY_TEXT_END
+
 #endif

 PHDRS {
@@ -102,8 +108,10 @@ SECTIONS
 		CPUIDLE_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ALIGN_ENTRY_TEXT_BEGIN
 		ENTRY_TEXT
 		IRQENTRY_TEXT
+		ALIGN_ENTRY_TEXT_END
 		SOFTIRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)

+ 4 - 3
arch/x86/mm/Makefile

@@ -41,9 +41,10 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o

-obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
-obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_X86_INTEL_MPX)			+= mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)	+= pkeys.o
+obj-$(CONFIG_RANDOMIZE_MEMORY)			+= kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)		+= pti.o

 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)	+= mem_encrypt_boot.o

+ 27 - 0
arch/x86/mm/cpu_entry_area.c

@@ -38,6 +38,32 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
 		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
 }

+static void percpu_setup_debug_store(int cpu)
+{
+#ifdef CONFIG_CPU_SUP_INTEL
+	int npages;
+	void *cea;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return;
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_store;
+	npages = sizeof(struct debug_store) / PAGE_SIZE;
+	BUILD_BUG_ON(sizeof(struct debug_store) % PAGE_SIZE != 0);
+	cea_map_percpu_pages(cea, &per_cpu(cpu_debug_store, cpu), npages,
+			     PAGE_KERNEL);
+
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers;
+	/*
+	 * Force the population of PMDs for not yet allocated per cpu
+	 * memory like debug store buffers.
+	 */
+	npages = sizeof(struct debug_store_buffers) / PAGE_SIZE;
+	for (; npages; npages--, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+#endif
+}
+
 /* Setup the fixmap mappings only once per-processor */
 static void __init setup_cpu_entry_area(int cpu)
 {
@@ -109,6 +135,7 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
 		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
+	percpu_setup_debug_store(cpu);
 }

 static __init void setup_cpu_entry_area_ptes(void)

+ 74 - 6
arch/x86/mm/debug_pagetables.c

@@ -5,7 +5,7 @@

 static int ptdump_show(struct seq_file *m, void *v)
 {
-	ptdump_walk_pgd_level(m, NULL);
+	ptdump_walk_pgd_level_debugfs(m, NULL, false);
 	return 0;
 }

@@ -22,21 +22,89 @@ static const struct file_operations ptdump_fops = {
 	.release	= single_release,
 };

-static struct dentry *pe;
+static int ptdump_show_curknl(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curknl(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curknl, NULL);
+}
+
+static const struct file_operations ptdump_curknl_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curknl,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+static struct dentry *pe_curusr;
+
+static int ptdump_show_curusr(struct seq_file *m, void *v)
+{
+	if (current->mm->pgd) {
+		down_read(&current->mm->mmap_sem);
+		ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
+		up_read(&current->mm->mmap_sem);
+	}
+	return 0;
+}
+
+static int ptdump_open_curusr(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ptdump_show_curusr, NULL);
+}
+
+static const struct file_operations ptdump_curusr_fops = {
+	.owner		= THIS_MODULE,
+	.open		= ptdump_open_curusr,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif
+
+static struct dentry *dir, *pe_knl, *pe_curknl;

 static int __init pt_dump_debug_init(void)
 {
-	pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL,
-				 &ptdump_fops);
-	if (!pe)
+	dir = debugfs_create_dir("page_tables", NULL);
+	if (!dir)
 		return -ENOMEM;

+	pe_knl = debugfs_create_file("kernel", 0400, dir, NULL,
+				     &ptdump_fops);
+	if (!pe_knl)
+		goto err;
+
+	pe_curknl = debugfs_create_file("current_kernel", 0400,
+					dir, NULL, &ptdump_curknl_fops);
+	if (!pe_curknl)
+		goto err;
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pe_curusr = debugfs_create_file("current_user", 0400,
+					dir, NULL, &ptdump_curusr_fops);
+	if (!pe_curusr)
+		goto err;
+#endif
 	return 0;
+err:
+	debugfs_remove_recursive(dir);
+	return -ENOMEM;
 }

 static void __exit pt_dump_debug_exit(void)
 {
-	debugfs_remove_recursive(pe);
+	debugfs_remove_recursive(dir);
 }

 module_init(pt_dump_debug_init);

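Usage of the new debugfs hierarchy (illustrative; requires debugfs to be mounted, and CONFIG_PAGE_TABLE_ISOLATION for the last file):

	# mount -t debugfs none /sys/kernel/debug
	# cat /sys/kernel/debug/page_tables/kernel
	# cat /sys/kernel/debug/page_tables/current_kernel
	# cat /sys/kernel/debug/page_tables/current_user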
+ 38 - 5
arch/x86/mm/dump_pagetables.c

@@ -52,11 +52,17 @@ enum address_markers_idx {
 	USER_SPACE_NR = 0,
 	KERNEL_SPACE_NR,
 	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
 	VMALLOC_START_NR,
 	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
 	KASAN_SHADOW_START_NR,
 	KASAN_SHADOW_END_NR,
+#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
 #endif
 	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
 	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
 	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
 #endif
 	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64
@@ -467,7 +476,7 @@ static inline bool is_hypervisor_range(int idx)
 }

 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
-				       bool checkwx)
+				       bool checkwx, bool dmesg)
 {
 #ifdef CONFIG_X86_64
 	pgd_t *start = (pgd_t *) &init_top_pgt;
@@ -480,7 +489,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,

 	if (pgd) {
 		start = pgd;
-		st.to_dmesg = true;
+		st.to_dmesg = dmesg;
 	}

 	st.check_wx = checkwx;
@@ -518,13 +527,37 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,

 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
 {
-	ptdump_walk_pgd_level_core(m, pgd, false);
+	ptdump_walk_pgd_level_core(m, pgd, false, true);
+}
+
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (user && static_cpu_has(X86_FEATURE_PTI))
+		pgd = kernel_to_user_pgdp(pgd);
+#endif
+	ptdump_walk_pgd_level_core(m, pgd, false, false);
+}
+EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
+
+static void ptdump_walk_user_pgd_level_checkwx(void)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	pgd_t *pgd = (pgd_t *) &init_top_pgt;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("x86/mm: Checking user space page tables\n");
+	pgd = kernel_to_user_pgdp(pgd);
+	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+#endif
 }
-EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level);

 void ptdump_walk_pgd_level_checkwx(void)
 {
-	ptdump_walk_pgd_level_core(NULL, NULL, true);
+	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+	ptdump_walk_user_pgd_level_checkwx();
 }

 static int __init pt_dump_init(void)

+ 49 - 31
arch/x86/mm/init.c

@@ -20,6 +20,7 @@
 #include <asm/kaslr.h>
 #include <asm/hypervisor.h>
 #include <asm/cpufeature.h>
+#include <asm/pti.h>

 /*
  * We need to define the tracepoints somewhere, and tlb.c
@@ -160,6 +161,12 @@ struct map_range {

 static int page_size_mask;

+static void enable_global_pages(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		__supported_pte_mask |= _PAGE_GLOBAL;
+}
+
 static void __init probe_page_size_mask(void)
 {
 	/*
@@ -177,11 +184,11 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);

 	/* Enable PGE if available */
+	__supported_pte_mask &= ~_PAGE_GLOBAL;
 	if (boot_cpu_has(X86_FEATURE_PGE)) {
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	} else
-		__supported_pte_mask &= ~_PAGE_GLOBAL;
+		enable_global_pages();
+	}

 	/* Enable 1 GB linear kernel mappings if available: */
 	if (direct_gbpages && boot_cpu_has(X86_FEATURE_GBPAGES)) {
@@ -194,34 +201,44 @@ static void __init probe_page_size_mask(void)

 static void setup_pcid(void)
 {
-#ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_PCID)) {
-		if (boot_cpu_has(X86_FEATURE_PGE)) {
-			/*
-			 * This can't be cr4_set_bits_and_update_boot() --
-			 * the trampoline code can't handle CR4.PCIDE and
-			 * it wouldn't do any good anyway.  Despite the name,
-			 * cr4_set_bits_and_update_boot() doesn't actually
-			 * cause the bits in question to remain set all the
-			 * way through the secondary boot asm.
-			 *
-			 * Instead, we brute-force it and set CR4.PCIDE
-			 * manually in start_secondary().
-			 */
-			cr4_set_bits(X86_CR4_PCIDE);
-		} else {
-			/*
-			 * flush_tlb_all(), as currently implemented, won't
-			 * work if PCID is on but PGE is not.  Since that
-			 * combination doesn't exist on real hardware, there's
-			 * no reason to try to fully support it, but it's
-			 * polite to avoid corrupting data if we're on
-			 * an improperly configured VM.
-			 */
-			setup_clear_cpu_cap(X86_FEATURE_PCID);
-		}
+	if (!IS_ENABLED(CONFIG_X86_64))
+		return;
+
+	if (!boot_cpu_has(X86_FEATURE_PCID))
+		return;
+
+	if (boot_cpu_has(X86_FEATURE_PGE)) {
+		/*
+		 * This can't be cr4_set_bits_and_update_boot() -- the
+		 * trampoline code can't handle CR4.PCIDE and it wouldn't
+		 * do any good anyway.  Despite the name,
+		 * cr4_set_bits_and_update_boot() doesn't actually cause
+		 * the bits in question to remain set all the way through
+		 * the secondary boot asm.
+		 *
+		 * Instead, we brute-force it and set CR4.PCIDE manually in
+		 * start_secondary().
+		 */
+		cr4_set_bits(X86_CR4_PCIDE);
+
+		/*
+		 * INVPCID's single-context modes (2/3) only work if we set
+		 * X86_CR4_PCIDE, *and* we have INVPCID support.  It's unusable
+		 * on systems that have X86_CR4_PCIDE clear, or that have
+		 * no INVPCID support at all.
+		 */
+		if (boot_cpu_has(X86_FEATURE_INVPCID))
+			setup_force_cpu_cap(X86_FEATURE_INVPCID_SINGLE);
+	} else {
+		/*
+		 * flush_tlb_all(), as currently implemented, won't work if
+		 * PCID is on but PGE is not.  Since that combination
+		 * doesn't exist on real hardware, there's no reason to try
+		 * to fully support it, but it's polite to avoid corrupting
+		 * data if we're on an improperly configured VM.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_PCID);
 	}
-#endif
 }

 #ifdef CONFIG_X86_32
@@ -622,6 +639,7 @@ void __init init_mem_mapping(void)
 {
 	unsigned long end;

+	pti_check_boottime_disable();
 	probe_page_size_mask();
 	setup_pcid();

@@ -845,7 +863,7 @@ void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }

-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
 	.next_asid = 1,
 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */

+ 3 - 2
arch/x86/mm/pgtable.c

@@ -355,14 +355,15 @@ static inline void _pgd_free(pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }

 static inline void _pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */


+ 387 - 0
arch/x86/mm/pti.c

@@ -0,0 +1,387 @@
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This code is based in part on work published here:
+ *
+ *	https://github.com/IAIK/KAISER
+ *
+ * The original work was written by, and signed off for the Linux
+ * kernel by:
+ *
+ *   Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at>
+ *   Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at>
+ *   Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at>
+ *   Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at>
+ *
+ * Major changes to the original code by: Dave Hansen <dave.hansen@intel.com>
+ * Mostly rewritten by Thomas Gleixner <tglx@linutronix.de> and
+ *		       Andy Lutomirski <luto@amacapital.net>
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#include <asm/cpufeature.h>
+#include <asm/hypervisor.h>
+#include <asm/vsyscall.h>
+#include <asm/cmdline.h>
+#include <asm/pti.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
+/* Backporting helper */
+#ifndef __GFP_NOTRACK
+#define __GFP_NOTRACK	0
+#endif
+
+static void __init pti_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
+static void __init pti_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		pr_info("%s\n", reason);
+}
+
+void __init pti_check_boottime_disable(void)
+{
+	char arg[5];
+	int ret;
+
+	if (hypervisor_is_type(X86_HYPER_XEN_PV)) {
+		pti_print_if_insecure("disabled on XEN PV.");
+		return;
+	}
+
+	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+	if (ret > 0)  {
+		if (ret == 3 && !strncmp(arg, "off", 3)) {
+			pti_print_if_insecure("disabled on command line.");
+			return;
+		}
+		if (ret == 2 && !strncmp(arg, "on", 2)) {
+			pti_print_if_secure("force enabled on command line.");
+			goto enable;
+		}
+		if (ret == 4 && !strncmp(arg, "auto", 4))
+			goto autosel;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "nopti")) {
+		pti_print_if_insecure("disabled on command line.");
+		return;
+	}
+
+autosel:
+	if (!boot_cpu_has_bug(X86_BUG_CPU_INSECURE))
+		return;
+enable:
+	setup_force_cpu_cap(X86_FEATURE_PTI);
+}
+
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	/*
+	 * Changes to the high (kernel) portion of the kernelmode page
+	 * tables are not automatically propagated to the usermode tables.
+	 *
+	 * Users should keep in mind that, unlike the kernelmode tables,
+	 * there is no vmalloc_fault equivalent for the usermode tables.
+	 * Top-level entries added to init_mm's usermode pgd after boot
+	 * will not be automatically propagated to other mms.
+	 */
+	if (!pgdp_maps_userspace(pgdp))
+		return pgd;
+
+	/*
+	 * The user page tables get the full PGD, accessible from
+	 * userspace:
+	 */
+	kernel_to_user_pgdp(pgdp)->pgd = pgd.pgd;
+
+	/*
+	 * If this is normal user memory, make it NX in the kernel
+	 * pagetables so that, if we somehow screw up and return to
+	 * usermode with the kernel CR3 loaded, we'll get a page fault
+	 * instead of allowing user code to execute with the wrong CR3.
+	 *
+	 * As exceptions, we don't set NX if:
+	 *  - _PAGE_USER is not set.  This could be an executable
+	 *     EFI runtime mapping or something similar, and the kernel
+	 *     may execute from it
+	 *  - we don't have NX support
+	 *  - we're clearing the PGD (i.e. the new pgd is not present).
+	 */
+	if ((pgd.pgd & (_PAGE_USER|_PAGE_PRESENT)) == (_PAGE_USER|_PAGE_PRESENT) &&
+	    (__supported_pte_mask & _PAGE_NX))
+		pgd.pgd |= _PAGE_NX;
+
+	/* return the copy of the PGD we want the kernel to use: */
+	return pgd;
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a P4D on success, or NULL on failure.
+ */
+static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+{
+	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+	if (address < PAGE_OFFSET) {
+		WARN_ONCE(1, "attempt to walk user address\n");
+		return NULL;
+	}
+
+	if (pgd_none(*pgd)) {
+		unsigned long new_p4d_page = __get_free_page(gfp);
+		if (!new_p4d_page)
+			return NULL;
+
+		if (pgd_none(*pgd)) {
+			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
+			new_p4d_page = 0;
+		}
+		if (new_p4d_page)
+			free_page(new_p4d_page);
+	}
+	BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+	return p4d_offset(pgd, address);
+}
+
+/*
+ * Walk the user copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.
+ *
+ * Returns a pointer to a PMD on success, or NULL on failure.
+ */
+static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+	pud_t *pud;
+
+	BUILD_BUG_ON(p4d_large(*p4d) != 0);
+	if (p4d_none(*p4d)) {
+		unsigned long new_pud_page = __get_free_page(gfp);
+		if (!new_pud_page)
+			return NULL;
+
+		if (p4d_none(*p4d)) {
+			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
+			new_pud_page = 0;
+		}
+		if (new_pud_page)
+			free_page(new_pud_page);
+	}
+
+	pud = pud_offset(p4d, address);
+	/* The user page tables do not use large mappings: */
+	if (pud_large(*pud)) {
+		WARN_ON(1);
+		return NULL;
+	}
+	if (pud_none(*pud)) {
+		unsigned long new_pmd_page = __get_free_page(gfp);
+		if (!new_pmd_page)
+			return NULL;
+
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+			new_pmd_page = 0;
+		}
+		if (new_pmd_page)
+			free_page(new_pmd_page);
+	}
+
+	return pmd_offset(pud, address);
+}
+
+#ifdef CONFIG_X86_VSYSCALL_EMULATION
+/*
+ * Walk the shadow copy of the page tables (optionally) trying to allocate
+ * page table pages on the way down.  Does not support large pages.
+ *
+ * Note: this is only used when mapping *new* kernel data into the
+ * user/shadow page tables.  It is never used for userspace data.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+{
+	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+	pte_t *pte;
+
+	/* We can't do anything sensible if we hit a large mapping. */
+	if (pmd_large(*pmd)) {
+		WARN_ON(1);
+		return NULL;
+	}
+
+	if (pmd_none(*pmd)) {
+		unsigned long new_pte_page = __get_free_page(gfp);
+		if (!new_pte_page)
+			return NULL;
+
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+			new_pte_page = 0;
+		}
+		if (new_pte_page)
+			free_page(new_pte_page);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	if (pte_flags(*pte) & _PAGE_USER) {
+		WARN_ONCE(1, "attempt to walk to user pte\n");
+		return NULL;
+	}
+	return pte;
+}
+
+static void __init pti_setup_vsyscall(void)
+{
+	pte_t *pte, *target_pte;
+	unsigned int level;
+
+	pte = lookup_address(VSYSCALL_ADDR, &level);
+	if (!pte || WARN_ON(level != PG_LEVEL_4K) || pte_none(*pte))
+		return;
+
+	target_pte = pti_user_pagetable_walk_pte(VSYSCALL_ADDR);
+	if (WARN_ON(!target_pte))
+		return;
+
+	*target_pte = *pte;
+	set_vsyscall_pgtable_user_bits(kernel_to_user_pgdp(swapper_pg_dir));
+}
+#else
+static void __init pti_setup_vsyscall(void) { }
+#endif
+
+static void __init
+pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
+{
+	unsigned long addr;
+
+	/*
+	 * Clone the populated PMDs which cover start to end. These PMD areas
+	 * can have holes.
+	 */
+	for (addr = start; addr < end; addr += PMD_SIZE) {
+		pmd_t *pmd, *target_pmd;
+		pgd_t *pgd;
+		p4d_t *p4d;
+		pud_t *pud;
+
+		pgd = pgd_offset_k(addr);
+		if (WARN_ON(pgd_none(*pgd)))
+			return;
+		p4d = p4d_offset(pgd, addr);
+		if (WARN_ON(p4d_none(*p4d)))
+			return;
+		pud = pud_offset(p4d, addr);
+		if (pud_none(*pud))
+			continue;
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd))
+			continue;
+
+		target_pmd = pti_user_pagetable_walk_pmd(addr);
+		if (WARN_ON(!target_pmd))
+			return;
+
+		/*
+		 * Copy the PMD.  That is, the kernelmode and usermode
+		 * tables will share the last-level page tables of this
+		 * address range
+		 */
+		*target_pmd = pmd_clear_flags(*pmd, clear);
+	}
+}
+
+/*
+ * Clone a single p4d (i.e. a top-level entry on 4-level systems and a
+ * next-level entry on 5-level systems.
+ */
+static void __init pti_clone_p4d(unsigned long addr)
+{
+	p4d_t *kernel_p4d, *user_p4d;
+	pgd_t *kernel_pgd;
+
+	user_p4d = pti_user_pagetable_walk_p4d(addr);
+	kernel_pgd = pgd_offset_k(addr);
+	kernel_p4d = p4d_offset(kernel_pgd, addr);
+	*user_p4d = *kernel_p4d;
+}
+
+/*
+ * Clone the CPU_ENTRY_AREA into the user space visible page table.
+ */
+static void __init pti_clone_user_shared(void)
+{
+	pti_clone_p4d(CPU_ENTRY_AREA_BASE);
+}
+
+/*
+ * Clone the ESPFIX P4D into the user space visible page table.
+ */
+static void __init pti_setup_espfix64(void)
+{
+#ifdef CONFIG_X86_ESPFIX64
+	pti_clone_p4d(ESPFIX_BASE_ADDR);
+#endif
+}
+
+/*
+ * Clone the populated PMDs of the entry and irqentry text and force it RO.
+ */
+static void __init pti_clone_entry_text(void)
+{
+	pti_clone_pmds((unsigned long) __entry_text_start,
+			(unsigned long) __irqentry_text_end, _PAGE_RW);
+}
+
+/*
+ * Initialize kernel page table isolation
+ */
+void __init pti_init(void)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	pr_info("enabled\n");
+
+	pti_clone_user_shared();
+	pti_clone_entry_text();
+	pti_setup_espfix64();
+	pti_setup_vsyscall();
+}

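The runtime switch wired up in pti_check_boottime_disable() can be exercised from the kernel command line (illustrative summary, matching the parsing above):

	pti=off  or  nopti    - force page table isolation off
	pti=on                - force it on, even without X86_BUG_CPU_INSECURE
	pti=auto              - default: enable only when the CPU is marked insecure

For example, booting an affected CPU with "pti=off" logs "Kernel/User page tables isolation: disabled on command line." via the pr_fmt prefix defined in pti.c.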
+ 56 - 2
arch/x86/mm/tlb.c

@@ -28,6 +28,38 @@
  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */

+/*
+ * We get here when we do something requiring a TLB invalidation
+ * but could not go invalidate all of the contexts.  We do the
+ * necessary invalidation by clearing out the 'ctx_id' which
+ * forces a TLB flush when the context is loaded.
+ */
+void clear_asid_other(void)
+{
+	u16 asid;
+
+	/*
+	 * This is only expected to be set if we have disabled
+	 * kernel _PAGE_GLOBAL pages.
+	 */
+	if (!static_cpu_has(X86_FEATURE_PTI)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		/* Do not need to flush the current asid */
+		if (asid == this_cpu_read(cpu_tlbstate.loaded_mm_asid))
+			continue;
+		/*
+		 * Make sure the next time we go to switch to
+		 * this asid, we do a flush:
+		 */
+		this_cpu_write(cpu_tlbstate.ctxs[asid].ctx_id, 0);
+	}
+	this_cpu_write(cpu_tlbstate.invalidate_other, false);
+}
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);


@@ -42,6 +74,9 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 		return;
 	}

+	if (this_cpu_read(cpu_tlbstate.invalidate_other))
+		clear_asid_other();
+
 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
 		    next->context.ctx_id)
@@ -65,6 +100,25 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
 	*need_flush = true;
 }

+static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush)
+{
+	unsigned long new_mm_cr3;
+
+	if (need_flush) {
+		invalidate_user_asid(new_asid);
+		new_mm_cr3 = build_cr3(pgdir, new_asid);
+	} else {
+		new_mm_cr3 = build_cr3_noflush(pgdir, new_asid);
+	}
+
+	/*
+	 * Caution: many callers of this function expect
+	 * that load_cr3() is serializing and orders TLB
+	 * fills with respect to the mm_cpumask writes.
+	 */
+	write_cr3(new_mm_cr3);
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -195,7 +249,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, true);

 			/*
 			 * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +262,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next->pgd, new_asid));
+			load_new_mm_cr3(next->pgd, new_asid, false);

 			/* See above wrt _rcuidle. */
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);

+ 4 - 1
arch/x86/platform/efi/efi_64.c

@@ -196,6 +196,9 @@ static pgd_t *efi_pgd;
  * because we want to avoid inserting EFI region mappings (EFI_VA_END
  * to EFI_VA_START) into the standard kernel page tables. Everything
  * else can be shared, see efi_sync_low_kernel_mappings().
+ *
+ * We don't want the pgd on the pgd_list and cannot use pgd_alloc() for the
+ * allocation.
  */
 int __init efi_alloc_page_tables(void)
 {
@@ -208,7 +211,7 @@ int __init efi_alloc_page_tables(void)
 		return 0;

 	gfp_mask = GFP_KERNEL | __GFP_ZERO;
-	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+	efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER);
 	if (!efi_pgd)
 		return -ENOMEM;


+ 11 - 0
include/linux/pti.h

@@ -0,0 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _INCLUDE_PTI_H
+#define _INCLUDE_PTI_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/pti.h>
+#else
+static inline void pti_init(void) { }
+#endif
+
+#endif

+ 3 - 0
init/main.c

@@ -75,6 +75,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
+#include <linux/pti.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/sched_clock.h>
@@ -506,6 +507,8 @@ static void __init mm_init(void)
 	ioremap_huge_init();
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
+	/* Should be run after espfix64 is set up. */
+	pti_init();
 }

 asmlinkage __visible void __init start_kernel(void)

+ 10 - 0
security/Kconfig

@@ -54,6 +54,16 @@ config SECURITY_NETWORK
 	  implement socket and networking access controls.
 	  If you are unsure how to answer this question, answer N.

+config PAGE_TABLE_ISOLATION
+	bool "Remove the kernel mapping in user mode"
+	depends on X86_64 && !UML
+	help
+	  This feature reduces the number of hardware side channels by
+	  ensuring that the majority of kernel addresses are not mapped
+	  into userspace.
+
+	  See Documentation/x86/pagetable-isolation.txt for more details.
+
 config SECURITY_INFINIBAND
 	bool "Infiniband Security Hooks"
 	depends on SECURITY && INFINIBAND

+ 1 - 2
tools/testing/selftests/x86/ldt_gdt.c

@@ -122,8 +122,7 @@ static void check_valid_segment(uint16_t index, int ldt,
 	 * NB: Different Linux versions do different things with the
 	 * accessed bit in set_thread_area().
 	 */
-	if (ar != expected_ar &&
-	    (ldt || ar != (expected_ar | AR_ACCESSED))) {
+	if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
 		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
 		       (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
 		nerrs++;