8 年之前 · 7a69f9c60b
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1638,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL
 
				 config HAVE_ARCH_PFN_VALID
			
 
				 	def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
			
 
				 
			
 
				-config HAVE_GENERIC_RCU_GUP
			
 
				+config HAVE_GENERIC_GUP
			
 
				 	def_bool y
			
 
				 	depends on ARM_LPAE
			
 
				 
			
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
 
				 config ZONE_DMA
			
 
				 	def_bool y
			
 
				 
			
 
				-config HAVE_GENERIC_RCU_GUP
			
 
				+config HAVE_GENERIC_GUP
			
 
				 	def_bool y
			
 
				 
			
 
				 config ARCH_DMA_ADDR_T_64BIT
			
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -184,7 +184,7 @@ config PPC
 
				 	select HAVE_FUNCTION_GRAPH_TRACER
			
 
				 	select HAVE_FUNCTION_TRACER
			
 
				 	select HAVE_GCC_PLUGINS
			
 
				-	select HAVE_GENERIC_RCU_GUP
			
 
				+	select HAVE_GENERIC_GUP
			
 
				 	select HAVE_HW_BREAKPOINT		if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
			
 
				 	select HAVE_IDE
			
 
				 	select HAVE_IOREMAP_PROT
			
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86
 
				 	select ARCH_USE_BUILTIN_BSWAP
			
 
				 	select ARCH_USE_QUEUED_RWLOCKS
			
 
				 	select ARCH_USE_QUEUED_SPINLOCKS
			
 
				-	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
			
 
				+	select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
			
 
				 	select ARCH_WANT_FRAME_POINTERS
			
 
				 	select ARCH_WANTS_DYNAMIC_TASK_STRUCT
			
 
				 	select BUILDTIME_EXTABLE_SORT
			
@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
 
				 	bool
			
 
				 	depends on STA2X11
			
 
				 
			
 
				+config HAVE_GENERIC_GUP
			
 
				+	def_bool y
			
 
				+
			
 
				 source "net/Kconfig"
			
 
				 
			
 
				 source "drivers/Kconfig"
			
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
 
				 	memset((char *)gdt->address, 0x0, gdt->size);
			
 
				 	desc = (struct desc_struct *)gdt->address;
			
 
				 
			
 
				-	/* The first GDT is a dummy and the second is unused. */
			
 
				-	desc += 2;
			
 
				+	/* The first GDT is a dummy. */
			
 
				+	desc++;
			
 
				+
			
 
				+	if (IS_ENABLED(CONFIG_X86_64)) {
			
 
				+		/* __KERNEL32_CS */
			
 
				+		desc->limit0 = 0xffff;
			
 
				+		desc->base0 = 0x0000;
			
 
				+		desc->base1 = 0x0000;
			
 
				+		desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
			
 
				+		desc->s = DESC_TYPE_CODE_DATA;
			
 
				+		desc->dpl = 0;
			
 
				+		desc->p = 1;
			
 
				+		desc->limit = 0xf;
			
 
				+		desc->avl = 0;
			
 
				+		desc->l = 0;
			
 
				+		desc->d = SEG_OP_SIZE_32BIT;
			
 
				+		desc->g = SEG_GRANULARITY_4KB;
			
 
				+		desc->base2 = 0x00;
			
 
				+		desc++;
			
 
				+	} else {
			
 
				+		/* Second entry is unused on 32-bit */
			
 
				+		desc++;
			
 
				+	}
			
 
				 
			
 
				+	/* __KERNEL_CS */
			
 
				 	desc->limit0 = 0xffff;
			
 
				 	desc->base0 = 0x0000;
			
 
				 	desc->base1 = 0x0000;
			
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
 
				 	desc->p = 1;
			
 
				 	desc->limit = 0xf;
			
 
				 	desc->avl = 0;
			
 
				-	desc->l = 0;
			
 
				-	desc->d = SEG_OP_SIZE_32BIT;
			
 
				+	if (IS_ENABLED(CONFIG_X86_64)) {
			
 
				+		desc->l = 1;
			
 
				+		desc->d = 0;
			
 
				+	} else {
			
 
				+		desc->l = 0;
			
 
				+		desc->d = SEG_OP_SIZE_32BIT;
			
 
				+	}
			
 
				 	desc->g = SEG_GRANULARITY_4KB;
			
 
				 	desc->base2 = 0x00;
			
 
				-
			
 
				 	desc++;
			
 
				+
			
 
				+	/* __KERNEL_DS */
			
 
				 	desc->limit0 = 0xffff;
			
 
				 	desc->base0 = 0x0000;
			
 
				 	desc->base1 = 0x0000;
			
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
 
				 	desc->d = SEG_OP_SIZE_32BIT;
			
 
				 	desc->g = SEG_GRANULARITY_4KB;
			
 
				 	desc->base2 = 0x00;
			
 
				-
			
 
				-#ifdef CONFIG_X86_64
			
 
				-	/* Task segment value */
			
 
				 	desc++;
			
 
				-	desc->limit0 = 0x0000;
			
 
				-	desc->base0 = 0x0000;
			
 
				-	desc->base1 = 0x0000;
			
 
				-	desc->type = SEG_TYPE_TSS;
			
 
				-	desc->s = 0;
			
 
				-	desc->dpl = 0;
			
 
				-	desc->p = 1;
			
 
				-	desc->limit = 0x0;
			
 
				-	desc->avl = 0;
			
 
				-	desc->l = 0;
			
 
				-	desc->d = 0;
			
 
				-	desc->g = SEG_GRANULARITY_4KB;
			
 
				-	desc->base2 = 0x00;
			
 
				-#endif /* CONFIG_X86_64 */
			
 
				+
			
 
				+	if (IS_ENABLED(CONFIG_X86_64)) {
			
 
				+		/* Task segment value */
			
 
				+		desc->limit0 = 0x0000;
			
 
				+		desc->base0 = 0x0000;
			
 
				+		desc->base1 = 0x0000;
			
 
				+		desc->type = SEG_TYPE_TSS;
			
 
				+		desc->s = 0;
			
 
				+		desc->dpl = 0;
			
 
				+		desc->p = 1;
			
 
				+		desc->limit = 0x0;
			
 
				+		desc->avl = 0;
			
 
				+		desc->l = 0;
			
 
				+		desc->d = 0;
			
 
				+		desc->g = SEG_GRANULARITY_4KB;
			
 
				+		desc->base2 = 0x00;
			
 
				+		desc++;
			
 
				+	}
			
 
				 
			
 
				 	asm volatile("cli");
			
 
				 	asm volatile ("lgdt %0" : : "m" (*gdt));
			
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr:
 
				 	/* Set up the stack */
			
 
				 	leaq	boot_stack_end(%rbx), %rsp
			
 
				 
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+	/* Check if 5-level paging has already been enabled */
			
 
				+	movq	%cr4, %rax
			
 
				+	testl	$X86_CR4_LA57, %eax
			
 
				+	jnz	lvl5
			
 
				+
			
 
				+	/*
			
 
				+	 * At this point we are in long mode with 4-level paging enabled,
			
 
				+	 * but we want to enable 5-level paging.
			
 
				+	 *
			
 
				+	 * The problem is that we cannot do it directly. Setting LA57 in
			
 
				+	 * long mode would trigger #GP. So we need to switch off long mode
			
 
				+	 * first.
			
 
				+	 *
			
 
				+	 * NOTE: This is not going to work if the bootloader put us above the 4G
			
 
				+	 * limit.
			
 
				+	 *
			
 
				+	 * The first step is to go into compatibility mode.
			
 
				+	 */
			
 
				+
			
 
				+	/* Clear additional page table */
			
 
				+	leaq	lvl5_pgtable(%rbx), %rdi
			
 
				+	xorq	%rax, %rax
			
 
				+	movq	$(PAGE_SIZE/8), %rcx
			
 
				+	rep	stosq
			
 
				+
			
 
				+	/*
			
 
				+	 * Setup current CR3 as the first and only entry in a new top level
			
 
				+	 * page table.
			
 
				+	 */
			
 
				+	movq	%cr3, %rdi
			
 
				+	leaq	0x7 (%rdi), %rax
			
 
				+	movq	%rax, lvl5_pgtable(%rbx)
			
 
				+
			
 
				+	/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
			
 
				+	pushq	$__KERNEL32_CS
			
 
				+	leaq	compatible_mode(%rip), %rax
			
 
				+	pushq	%rax
			
 
				+	lretq
			
 
				+lvl5:
			
 
				+#endif
			
 
				+
			
 
				 	/* Zero EFLAGS */
			
 
				 	pushq	$0
			
 
				 	popfq
			
@@ -429,6 +471,44 @@ relocated:
 
				 	jmp	*%rax
			
 
				 
			
 
				 	.code32
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+compatible_mode:
			
 
				+	/* Setup data and stack segments */
			
 
				+	movl	$__KERNEL_DS, %eax
			
 
				+	movl	%eax, %ds
			
 
				+	movl	%eax, %ss
			
 
				+
			
 
				+	/* Disable paging */
			
 
				+	movl	%cr0, %eax
			
 
				+	btrl	$X86_CR0_PG_BIT, %eax
			
 
				+	movl	%eax, %cr0
			
 
				+
			
 
				+	/* Point CR3 to 5-level paging */
			
 
				+	leal	lvl5_pgtable(%ebx), %eax
			
 
				+	movl	%eax, %cr3
			
 
				+
			
 
				+	/* Enable PAE and LA57 mode */
			
 
				+	movl	%cr4, %eax
			
 
				+	orl	$(X86_CR4_PAE | X86_CR4_LA57), %eax
			
 
				+	movl	%eax, %cr4
			
 
				+
			
 
				+	/* Calculate address we are running at */
			
 
				+	call	1f
			
 
				+1:	popl	%edi
			
 
				+	subl	$1b, %edi
			
 
				+
			
 
				+	/* Prepare stack for far return to Long Mode */
			
 
				+	pushl	$__KERNEL_CS
			
 
				+	leal	lvl5(%edi), %eax
			
 
				+	push	%eax
			
 
				+
			
 
				+	/* Enable paging again */
			
 
				+	movl	$(X86_CR0_PG | X86_CR0_PE), %eax
			
 
				+	movl	%eax, %cr0
			
 
				+
			
 
				+	lret
			
 
				+#endif
			
 
				+
			
 
				 no_longmode:
			
 
				 	/* This isn't an x86-64 CPU so hang */
			
 
				 1:
			
@@ -442,7 +522,7 @@ gdt:
 
				 	.word	gdt_end - gdt
			
 
				 	.long	gdt
			
 
				 	.word	0
			
 
				-	.quad	0x0000000000000000	/* NULL descriptor */
			
 
				+	.quad	0x00cf9a000000ffff	/* __KERNEL32_CS */
			
 
				 	.quad	0x00af9a000000ffff	/* __KERNEL_CS */
			
 
				 	.quad	0x00cf92000000ffff	/* __KERNEL_DS */
			
 
				 	.quad	0x0080890000000000	/* TS descriptor */
			
@@ -486,3 +566,7 @@ boot_stack_end:
 
				 	.balign 4096
			
 
				 pgtable:
			
 
				 	.fill BOOT_PGT_SIZE, 1, 0
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+lvl5_pgtable:
			
 
				+	.fill PAGE_SIZE, 1, 0
			
 
				+#endif
			
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
 
				 static struct alloc_pgt_data pgt_data;
			
 
				 
			
 
				 /* The top level page table entry pointer. */
			
 
				-static unsigned long level4p;
			
 
				+static unsigned long top_level_pgt;
			
 
				 
			
 
				 /*
			
 
				  * Mapping information structure passed to kernel_ident_mapping_init().
			
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
 
				 	 * If we came here via startup_32(), cr3 will be _pgtable already
			
 
				 	 * and we must append to the existing area instead of entirely
			
 
				 	 * overwriting it.
			
 
				+	 *
			
 
				+	 * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
			
 
				+	 * the top-level page table is allocated separately.
			
 
				+	 *
			
 
				+	 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
			
 
				+	 * cases. On 4-level paging it's equal to 'top_level_pgt'.
			
 
				 	 */
			
 
				-	level4p = read_cr3();
			
 
				-	if (level4p == (unsigned long)_pgtable) {
			
 
				+	top_level_pgt = read_cr3_pa();
			
 
				+	if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
			
 
				 		debug_putstr("booted via startup_32()\n");
			
 
				 		pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
			
 
				 		pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
			
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
 
				 		pgt_data.pgt_buf = _pgtable;
			
 
				 		pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
			
 
				 		memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
			
 
				-		level4p = (unsigned long)alloc_pgt_page(&pgt_data);
			
 
				+		top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
 
				 		return;
			
 
				 
			
 
				 	/* Build the mapping. */
			
 
				-	kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
			
 
				+	kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
			
 
				 				  start, end);
			
 
				 }
			
 
				 
			
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
 
				  */
			
 
				 void finalize_identity_maps(void)
			
 
				 {
			
 
				-	write_cr3(level4p);
			
 
				+	write_cr3(top_level_pgt);
			
 
				 }
			
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
 
				 	 * If width of "canonical tail" ever becomes variable, this will need
			
 
				 	 * to be updated to remain correct on both old and new CPUs.
			
 
				 	 *
			
 
				-	 * Change top 16 bits to be the sign-extension of 47th bit
			
 
				+	 * Change top bits to match most significant bit (47th or 56th bit
			
 
				+	 * depending on paging mode) in the address.
			
 
				 	 */
			
 
				 	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
			
 
				 	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
			
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
 
				 
			
 
				 static void refresh_pce(void *ignored)
			
 
				 {
			
 
				-	if (current->active_mm)
			
 
				-		load_mm_cr4(current->active_mm);
			
 
				+	load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
			
 
				 }
			
 
				 
			
 
				 static void x86_pmu_event_mapped(struct perf_event *event)
			
@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
 
				 
			
 
				 		/* IRQs are off, so this synchronizes with smp_store_release */
			
 
				 		ldt = lockless_dereference(current->active_mm->context.ldt);
			
 
				-		if (!ldt || idx > ldt->size)
			
 
				+		if (!ldt || idx > ldt->nr_entries)
			
 
				 			return 0;
			
 
				 
			
 
				 		desc = &ldt->entries[idx];
			
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
 
				 	__kernel_fpu_begin();						\
			
 
				 									\
			
 
				 	if (efi_scratch.use_pgd) {					\
			
 
				-		efi_scratch.prev_cr3 = read_cr3();			\
			
 
				+		efi_scratch.prev_cr3 = __read_cr3();			\
			
 
				 		write_cr3((unsigned long)efi_scratch.efi_pgt);		\
			
 
				 		__flush_tlb_all();					\
			
 
				 	}								\
			
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
 
				 #ifdef CONFIG_SMP
			
 
				 	unsigned int irq_resched_count;
			
 
				 	unsigned int irq_call_count;
			
 
				-	unsigned int irq_tlb_count;
			
 
				 #endif
			
 
				+	unsigned int irq_tlb_count;
			
 
				 #ifdef CONFIG_X86_THERMAL_VECTOR
			
 
				 	unsigned int irq_thermal_count;
			
 
				 #endif
			
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct {
 
				 #endif
			
 
				 } mm_context_t;
			
 
				 
			
 
				-#ifdef CONFIG_SMP
			
 
				 void leave_mm(int cpu);
			
 
				-#else
			
 
				-static inline void leave_mm(int cpu)
			
 
				-{
			
 
				-}
			
 
				-#endif
			
 
				 
			
 
				 #endif /* _ASM_X86_MMU_H */
			
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
 
				 	 * allocations, but it's not worth trying to optimize.
			
 
				 	 */
			
 
				 	struct desc_struct *entries;
			
 
				-	unsigned int size;
			
 
				+	unsigned int nr_entries;
			
 
				 };
			
 
				 
			
 
				 /*
			
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 
				 	 */
			
 
				 
			
 
				 	if (unlikely(ldt))
			
 
				-		set_ldt(ldt->entries, ldt->size);
			
 
				+		set_ldt(ldt->entries, ldt->nr_entries);
			
 
				 	else
			
 
				 		clear_LDT();
			
 
				 #else
			
 
				 	clear_LDT();
			
 
				 #endif
			
 
				+}
			
 
				+
			
 
				+static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
			
 
				+{
			
 
				+#ifdef CONFIG_MODIFY_LDT_SYSCALL
			
 
				+	/*
			
 
				+	 * Load the LDT if either the old or new mm had an LDT.
			
 
				+	 *
			
 
				+	 * An mm will never go from having an LDT to not having an LDT.  Two
			
 
				+	 * mms never share an LDT, so we don't gain anything by checking to
			
 
				+	 * see whether the LDT changed.  There's also no guarantee that
			
 
				+	 * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
			
 
				+	 * then prev->context.ldt will also be non-NULL.
			
 
				+	 *
			
 
				+	 * If we really cared, we could optimize the case where prev == next
			
 
				+	 * and we're exiting lazy mode.  Most of the time, if this happens,
			
 
				+	 * we don't actually need to reload LDTR, but modify_ldt() is mostly
			
 
				+	 * used by legacy code and emulators where we don't need this level of
			
 
				+	 * performance.
			
 
				+	 *
			
 
				+	 * This uses | instead of || because it generates better code.
			
 
				+	 */
			
 
				+	if (unlikely((unsigned long)prev->context.ldt |
			
 
				+		     (unsigned long)next->context.ldt))
			
 
				+		load_mm_ldt(next);
			
 
				+#endif
			
 
				 
			
 
				 	DEBUG_LOCKS_WARN_ON(preemptible());
			
 
				 }
			
 
				 
			
 
				 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
			
 
				 {
			
 
				-#ifdef CONFIG_SMP
			
 
				 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
			
 
				 		this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
			
 
				-#endif
			
 
				 }
			
 
				 
			
 
				 static inline int init_new_context(struct task_struct *tsk,
			
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				-static inline bool __pkru_allows_pkey(u16 pkey, bool write)
			
 
				-{
			
 
				-	u32 pkru = read_pkru();
			
 
				-
			
 
				-	if (!__pkru_allows_read(pkru, pkey))
			
 
				-		return false;
			
 
				-	if (write && !__pkru_allows_write(pkru, pkey))
			
 
				-		return false;
			
 
				-
			
 
				-	return true;
			
 
				-}
			
 
				-
			
 
				 /*
			
 
				  * We only want to enforce protection keys on the current process
			
 
				  * because we effectively have no access to PKRU for other
			
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 
				 	return __pkru_allows_pkey(vma_pkey(vma), write);
			
 
				 }
			
 
				 
			
 
				+
			
 
				+/*
			
 
				+ * This can be used from process context to figure out what the value of
			
 
				+ * CR3 is without needing to do a (slow) __read_cr3().
			
 
				+ *
			
 
				+ * It's intended to be used for code like KVM that sneakily changes CR3
			
 
				+ * and needs to restore it.  It needs to be used very carefully.
			
 
				+ */
			
 
				+static inline unsigned long __get_current_cr3_fast(void)
			
 
				+{
			
 
				+	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
			
 
				+
			
 
				+	/* For now, be very restrictive about when this can be called. */
			
 
				+	VM_WARN_ON(in_nmi() || !in_atomic());
			
 
				+
			
 
				+	VM_BUG_ON(cr3 != __read_cr3());
			
 
				+	return cr3;
			
 
				+}
			
 
				+
			
 
				 #endif /* _ASM_X86_MMU_CONTEXT_H */
			
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
 
				 	PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
			
 
				 }
			
 
				 
			
 
				-static inline unsigned long read_cr3(void)
			
 
				+static inline unsigned long __read_cr3(void)
			
 
				 {
			
 
				 	return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
			
 
				 }
			
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
 
				 }
			
 
				 
			
 
				 static inline void flush_tlb_others(const struct cpumask *cpumask,
			
 
				-				    struct mm_struct *mm,
			
 
				-				    unsigned long start,
			
 
				-				    unsigned long end)
			
 
				+				    const struct flush_tlb_info *info)
			
 
				 {
			
 
				-	PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
			
 
				+	PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
			
 
				 }
			
 
				 
			
 
				 static inline int paravirt_pgd_alloc(struct mm_struct *mm)
			
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
 
				 struct desc_struct;
			
 
				 struct task_struct;
			
 
				 struct cpumask;
			
 
				+struct flush_tlb_info;
			
 
				 
			
 
				 /*
			
 
				  * Wrapper type for pointers to code which uses the non-standard
			
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
 
				 	void (*flush_tlb_kernel)(void);
			
 
				 	void (*flush_tlb_single)(unsigned long addr);
			
 
				 	void (*flush_tlb_others)(const struct cpumask *cpus,
			
 
				-				 struct mm_struct *mm,
			
 
				-				 unsigned long start,
			
 
				-				 unsigned long end);
			
 
				+				 const struct flush_tlb_info *info);
			
 
				 
			
 
				 	/* Hooks for allocating and freeing a pagetable top-level */
			
 
				 	int  (*pgd_alloc)(struct mm_struct *mm);
			
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
 
				 #define __pte_to_swp_entry(pte)		((swp_entry_t){ (pte).pte_high })
			
 
				 #define __swp_entry_to_pte(x)		((pte_t){ { .pte_high = (x).val } })
			
 
				 
			
 
				+#define gup_get_pte gup_get_pte
			
 
				+/*
			
 
				+ * WARNING: only to be used in the get_user_pages_fast() implementation.
			
 
				+ *
			
 
				+ * With get_user_pages_fast(), we walk down the pagetables without taking
			
 
				+ * any locks.  For this we would like to load the pointers atomically,
			
 
				+ * but that is not possible (without expensive cmpxchg8b) on PAE.  What
			
 
				+ * we do have is the guarantee that a PTE will only either go from not
			
 
				+ * present to present, or present to not present or both -- it will not
			
 
				+ * switch to a completely different present page without a TLB flush in
			
 
				+ * between; something that we are blocking by holding interrupts off.
			
 
				+ *
			
 
				+ * Setting ptes from not present to present goes:
			
 
				+ *
			
 
				+ *   ptep->pte_high = h;
			
 
				+ *   smp_wmb();
			
 
				+ *   ptep->pte_low = l;
			
 
				+ *
			
 
				+ * And present to not present goes:
			
 
				+ *
			
 
				+ *   ptep->pte_low = 0;
			
 
				+ *   smp_wmb();
			
 
				+ *   ptep->pte_high = 0;
			
 
				+ *
			
 
				+ * We must ensure here that the load of pte_low sees 'l' iff pte_high
			
 
				+ * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
			
 
				+ * don't see an older value of pte_high.  *Then* we recheck pte_low,
			
 
				+ * which ensures that we haven't picked up a changed pte high. We might
			
 
				+ * have gotten rubbish values from pte_low and pte_high, but we are
			
 
				+ * guaranteed that pte_low will not have the present bit set *unless*
			
 
				+ * it is 'l'. Because get_user_pages_fast() only operates on present ptes
			
 
				+ * we're safe.
			
 
				+ */
			
 
				+static inline pte_t gup_get_pte(pte_t *ptep)
			
 
				+{
			
 
				+	pte_t pte;
			
 
				+
			
 
				+	do {
			
 
				+		pte.pte_low = ptep->pte_low;
			
 
				+		smp_rmb();
			
 
				+		pte.pte_high = ptep->pte_high;
			
 
				+		smp_rmb();
			
 
				+	} while (unlikely(pte.pte_low != ptep->pte_low));
			
 
				+
			
 
				+	return pte;
			
 
				+}
			
 
				+
			
 
				 #endif /* _ASM_X86_PGTABLE_3LEVEL_H */
			
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
 
				 	return 0;
			
 
				 }
			
 
				 #endif
			
 
				+
			
 
				+static inline int pgd_devmap(pgd_t pgd)
			
 
				+{
			
 
				+	return 0;
			
 
				+}
			
 
				 #endif
			
 
				 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
			
 
				 
			
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
 
				 static inline void __meminit init_trampoline_default(void)
			
 
				 {
			
 
				 	/* Default trampoline pgd value */
			
 
				-	trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
			
 
				+	trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
			
 
				 }
			
 
				 # ifdef CONFIG_RANDOMIZE_MEMORY
			
 
				 void __meminit init_trampoline(void);
			
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
			
 
				+{
			
 
				+	u32 pkru = read_pkru();
			
 
				+
			
 
				+	if (!__pkru_allows_read(pkru, pkey))
			
 
				+		return false;
			
 
				+	if (write && !__pkru_allows_write(pkru, pkey))
			
 
				+		return false;
			
 
				+
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * 'pteval' can come from a PTE, PMD or PUD.  We only check
			
 
				+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
			
 
				+ * same value on all 3 types.
			
 
				+ */
			
 
				+static inline bool __pte_access_permitted(unsigned long pteval, bool write)
			
 
				+{
			
 
				+	unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
			
 
				+
			
 
				+	if (write)
			
 
				+		need_pte_bits |= _PAGE_RW;
			
 
				+
			
 
				+	if ((pteval & need_pte_bits) != need_pte_bits)
			
 
				+		return 0;
			
 
				+
			
 
				+	return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
			
 
				+}
			
 
				+
			
 
				+#define pte_access_permitted pte_access_permitted
			
 
				+static inline bool pte_access_permitted(pte_t pte, bool write)
			
 
				+{
			
 
				+	return __pte_access_permitted(pte_val(pte), write);
			
 
				+}
			
 
				+
			
 
				+#define pmd_access_permitted pmd_access_permitted
			
 
				+static inline bool pmd_access_permitted(pmd_t pmd, bool write)
			
 
				+{
			
 
				+	return __pte_access_permitted(pmd_val(pmd), write);
			
 
				+}
			
 
				+
			
 
				+#define pud_access_permitted pud_access_permitted
			
 
				+static inline bool pud_access_permitted(pud_t pud, bool write)
			
 
				+{
			
 
				+	return __pte_access_permitted(pud_val(pud), write);
			
 
				+}
			
 
				+
			
 
				 #include <asm-generic/pgtable.h>
			
 
				 #endif	/* __ASSEMBLY__ */
			
 
				 
			
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
 
				 #include <linux/bitops.h>
			
 
				 #include <linux/threads.h>
			
 
				 
			
 
				+extern p4d_t level4_kernel_pgt[512];
			
 
				+extern p4d_t level4_ident_pgt[512];
			
 
				 extern pud_t level3_kernel_pgt[512];
			
 
				 extern pud_t level3_ident_pgt[512];
			
 
				 extern pmd_t level2_kernel_pgt[512];
			
 
				 extern pmd_t level2_fixmap_pgt[512];
			
 
				 extern pmd_t level2_ident_pgt[512];
			
 
				 extern pte_t level1_fixmap_pgt[512];
			
 
				-extern pgd_t init_level4_pgt[];
			
 
				+extern pgd_t init_top_pgt[];
			
 
				 
			
 
				-#define swapper_pg_dir init_level4_pgt
			
 
				+#define swapper_pg_dir init_top_pgt
			
 
				 
			
 
				 extern void paging_init(void);
			
 
				 
			
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
 
				 extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
			
 
				 extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
			
 
				 
			
 
				-#endif /* !__ASSEMBLY__ */
			
 
				+#define gup_fast_permitted gup_fast_permitted
			
 
				+static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
			
 
				+		int write)
			
 
				+{
			
 
				+	unsigned long len, end;
			
 
				+
			
 
				+	len = (unsigned long)nr_pages << PAGE_SHIFT;
			
 
				+	end = start + len;
			
 
				+	if (end < start)
			
 
				+		return false;
			
 
				+	if (end >> __VIRTUAL_MASK_SHIFT)
			
 
				+		return false;
			
 
				+	return true;
			
 
				+}
			
 
				 
			
 
				+#endif /* !__ASSEMBLY__ */
			
 
				 #endif /* _ASM_X86_PGTABLE_64_H */
			
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
 
				 #else
			
 
				 #define X86_VM_MASK	0 /* No VM86 support */
			
 
				 #endif
			
 
				+
			
 
				+/*
			
 
				+ * CR3's layout varies depending on several things.
			
 
				+ *
			
 
				+ * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
			
 
				+ * If PAE is enabled, then CR3[11:5] is part of the PDPT address
			
 
				+ * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
			
 
				+ * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
			
 
				+ * CR3[2:0] and CR3[11:5] are ignored.
			
 
				+ *
			
 
				+ * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
			
 
				+ *
			
 
				+ * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
			
 
				+ * written as 1 to prevent the write to CR3 from flushing the TLB.
			
 
				+ *
			
 
				+ * On systems with SME, one bit (in a variable position!) is stolen to indicate
			
 
				+ * that the top-level paging structure is encrypted.
			
 
				+ *
			
 
				+ * All of the remaining bits indicate the physical address of the top-level
			
 
				+ * paging structure.
			
 
				+ *
			
 
				+ * CR3_ADDR_MASK is the mask used by read_cr3_pa().
			
 
				+ */
			
 
				+#ifdef CONFIG_X86_64
			
 
				+/* Mask off the address space ID bits. */
			
 
				+#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
			
 
				+#define CR3_PCID_MASK 0xFFFull
			
 
				+#else
			
 
				+/*
			
 
				+ * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
			
 
				+ * a tiny bit of code size by setting all the bits.
			
 
				+ */
			
 
				+#define CR3_ADDR_MASK 0xFFFFFFFFull
			
 
				+#define CR3_PCID_MASK 0ull
			
 
				+#endif
			
 
				+
			
 
				 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
			
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
 
				 native_cpuid_reg(ecx)
			
 
				 native_cpuid_reg(edx)
			
 
				 
			
 
				+/*
			
 
				+ * Friendlier CR3 helpers.
			
 
				+ */
			
 
				+static inline unsigned long read_cr3_pa(void)
			
 
				+{
			
 
				+	return __read_cr3() & CR3_ADDR_MASK;
			
 
				+}
			
 
				+
			
 
				 static inline void load_cr3(pgd_t *pgdir)
			
 
				 {
			
 
				 	write_cr3(__pa(pgdir));
			
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
 
				 	asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
			
 
				 }
			
 
				 
			
 
				-static inline unsigned long native_read_cr3(void)
			
 
				+static inline unsigned long __native_read_cr3(void)
			
 
				 {
			
 
				 	unsigned long val;
			
 
				 	asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
			
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
 
				 	native_write_cr2(x);
			
 
				 }
			
 
				 
			
 
				-static inline unsigned long read_cr3(void)
			
 
				+/*
			
 
				+ * Careful!  CR3 contains more than just an address.  You probably want
			
 
				+ * read_cr3_pa() instead.
			
 
				+ */
			
 
				+static inline unsigned long __read_cr3(void)
			
 
				 {
			
 
				-	return native_read_cr3();
			
 
				+	return __native_read_cr3();
			
 
				 }
			
 
				 
			
 
				 static inline void write_cr3(unsigned long x)
			
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@
 
				+#ifndef _ARCH_X86_TLBBATCH_H
			
 
				+#define _ARCH_X86_TLBBATCH_H
			
 
				+
			
 
				+#include <linux/cpumask.h>
			
 
				+
			
 
				+struct arch_tlbflush_unmap_batch {
			
 
				+	/*
			
 
				+	 * Each bit set is a CPU that potentially has a TLB entry for one of
			
 
				+	 * the PFNs being flushed.
			
 
				+	 */
			
 
				+	struct cpumask cpumask;
			
 
				+};
			
 
				+
			
 
				+#endif /* _ARCH_X86_TLBBATCH_H */
			
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@
 
				 #include <asm/processor.h>
			
 
				 #include <asm/cpufeature.h>
			
 
				 #include <asm/special_insns.h>
			
 
				+#include <asm/smp.h>
			
 
				 
			
 
				 static inline void __invpcid(unsigned long pcid, unsigned long addr,
			
 
				 			     unsigned long type)
			
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
 
				 #endif
			
 
				 
			
 
				 struct tlb_state {
			
 
				-#ifdef CONFIG_SMP
			
 
				-	struct mm_struct *active_mm;
			
 
				+	/*
			
 
				+	 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
			
 
				+	 * are on.  This means that it may not match current->active_mm,
			
 
				+	 * which will contain the previous user mm when we're in lazy TLB
			
 
				+	 * mode even if we've already switched back to swapper_pg_dir.
			
 
				+	 */
			
 
				+	struct mm_struct *loaded_mm;
			
 
				 	int state;
			
 
				-#endif
			
 
				 
			
 
				 	/*
			
 
				 	 * Access to this CR4 shadow and to H/W CR4 is protected by
			
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
 
				 	 * back:
			
 
				 	 */
			
 
				 	preempt_disable();
			
 
				-	native_write_cr3(native_read_cr3());
			
 
				+	native_write_cr3(__native_read_cr3());
			
 
				 	preempt_enable();
			
 
				 }
			
 
				 
			
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
 
				  *  - flush_tlb_page(vma, vmaddr) flushes one page
			
 
				  *  - flush_tlb_range(vma, start, end) flushes a range of pages
			
 
				  *  - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
			
 
				- *  - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
			
 
				+ *  - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
			
 
				  *
			
 
				  * ..but the i386 has somewhat limited tlb flushing capabilities,
			
 
				  * and page-granular flushes are available only on i486 and up.
			
 
				  */
			
 
				-
			
 
				-#ifndef CONFIG_SMP
			
 
				-
			
 
				-/* "_up" is for UniProcessor.
			
 
				- *
			
 
				- * This is a helper for other header functions.  *Not* intended to be called
			
 
				- * directly.  All global TLB flushes need to either call this, or to bump the
			
 
				- * vm statistics themselves.
			
 
				- */
			
 
				-static inline void __flush_tlb_up(void)
			
 
				-{
			
 
				-	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
			
 
				-	__flush_tlb();
			
 
				-}
			
 
				-
			
 
				-static inline void flush_tlb_all(void)
			
 
				-{
			
 
				-	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
			
 
				-	__flush_tlb_all();
			
 
				-}
			
 
				-
			
 
				-static inline void local_flush_tlb(void)
			
 
				-{
			
 
				-	__flush_tlb_up();
			
 
				-}
			
 
				-
			
 
				-static inline void flush_tlb_mm(struct mm_struct *mm)
			
 
				-{
			
 
				-	if (mm == current->active_mm)
			
 
				-		__flush_tlb_up();
			
 
				-}
			
 
				-
			
 
				-static inline void flush_tlb_page(struct vm_area_struct *vma,
			
 
				-				  unsigned long addr)
			
 
				-{
			
 
				-	if (vma->vm_mm == current->active_mm)
			
 
				-		__flush_tlb_one(addr);
			
 
				-}
			
 
				-
			
 
				-static inline void flush_tlb_range(struct vm_area_struct *vma,
			
 
				-				   unsigned long start, unsigned long end)
			
 
				-{
			
 
				-	if (vma->vm_mm == current->active_mm)
			
 
				-		__flush_tlb_up();
			
 
				-}
			
 
				-
			
 
				-static inline void flush_tlb_mm_range(struct mm_struct *mm,
			
 
				-	   unsigned long start, unsigned long end, unsigned long vmflag)
			
 
				-{
			
 
				-	if (mm == current->active_mm)
			
 
				-		__flush_tlb_up();
			
 
				-}
			
 
				-
			
 
				-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
			
 
				-					   struct mm_struct *mm,
			
 
				-					   unsigned long start,
			
 
				-					   unsigned long end)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static inline void reset_lazy_tlbstate(void)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static inline void flush_tlb_kernel_range(unsigned long start,
			
 
				-					  unsigned long end)
			
 
				-{
			
 
				-	flush_tlb_all();
			
 
				-}
			
 
				-
			
 
				-#else  /* SMP */
			
 
				-
			
 
				-#include <asm/smp.h>
			
 
				+struct flush_tlb_info {
			
 
				+	struct mm_struct *mm;
			
 
				+	unsigned long start;
			
 
				+	unsigned long end;
			
 
				+};
			
 
				 
			
 
				 #define local_flush_tlb() __flush_tlb()
			
 
				 
			
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 
				 		flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
			
 
				 
			
 
				 extern void flush_tlb_all(void);
			
 
				-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
			
 
				 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			
 
				 				unsigned long end, unsigned long vmflag);
			
 
				 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
			
 
				 
			
 
				+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
			
 
				+{
			
 
				+	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
			
 
				+}
			
 
				+
			
 
				 void native_flush_tlb_others(const struct cpumask *cpumask,
			
 
				-				struct mm_struct *mm,
			
 
				-				unsigned long start, unsigned long end);
			
 
				+			     const struct flush_tlb_info *info);
			
 
				 
			
 
				 #define TLBSTATE_OK	1
			
 
				 #define TLBSTATE_LAZY	2
			
 
				 
			
 
				-static inline void reset_lazy_tlbstate(void)
			
 
				+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
			
 
				+					struct mm_struct *mm)
			
 
				 {
			
 
				-	this_cpu_write(cpu_tlbstate.state, 0);
			
 
				-	this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
			
 
				+	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
			
 
				 }
			
 
				 
			
 
				-#endif	/* SMP */
			
 
				+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
			
 
				 
			
 
				 #ifndef CONFIG_PARAVIRT
			
 
				-#define flush_tlb_others(mask, mm, start, end)	\
			
 
				-	native_flush_tlb_others(mask, mm, start, end)
			
 
				+#define flush_tlb_others(mask, info)	\
			
 
				+	native_flush_tlb_others(mask, info)
			
 
				 #endif
			
 
				 
			
 
				 #endif /* _ASM_X86_TLBFLUSH_H */
			
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@
 
				 #ifndef _ASM_X86_UV_UV_H
			
 
				 #define _ASM_X86_UV_UV_H
			
 
				 
			
 
				+#include <asm/tlbflush.h>
			
 
				+
			
 
				 enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
			
 
				 
			
 
				 struct cpumask;
			
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
 
				 extern void uv_nmi_init(void);
			
 
				 extern void uv_system_init(void);
			
 
				 extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
			
 
				-						 struct mm_struct *mm,
			
 
				-						 unsigned long start,
			
 
				-						 unsigned long end,
			
 
				-						 unsigned int cpu);
			
 
				+						 const struct flush_tlb_info *info);
			
 
				 
			
 
				 #else	/* X86_UV */
			
 
				 
			
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void)	{ return 0; }
 
				 static inline void uv_cpu_init(void)	{ }
			
 
				 static inline void uv_system_init(void)	{ }
			
 
				 static inline const struct cpumask *
			
 
				-uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
			
 
				-		    unsigned long start, unsigned long end, unsigned int cpu)
			
 
				+uv_flush_tlb_others(const struct cpumask *cpumask,
			
 
				+		    const struct flush_tlb_info *info)
			
 
				 { return cpumask; }
			
 
				 
			
 
				 #endif	/* X86_UV */
			
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
 
				 #define X86_CR4_OSFXSR		_BITUL(X86_CR4_OSFXSR_BIT)
			
 
				 #define X86_CR4_OSXMMEXCPT_BIT	10 /* enable unmasked SSE exceptions */
			
 
				 #define X86_CR4_OSXMMEXCPT	_BITUL(X86_CR4_OSXMMEXCPT_BIT)
			
 
				+#define X86_CR4_LA57_BIT	12 /* enable 5-level page tables */
			
 
				+#define X86_CR4_LA57		_BITUL(X86_CR4_LA57_BIT)
			
 
				 #define X86_CR4_VMXE_BIT	13 /* enable VMX virtualization */
			
 
				 #define X86_CR4_VMXE		_BITUL(X86_CR4_VMXE_BIT)
			
 
				 #define X86_CR4_SMXE_BIT	14 /* enable safer mode (TXT) */
			
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
 
				 CFLAGS_REMOVE_kvmclock.o = -pg
			
 
				 CFLAGS_REMOVE_ftrace.o = -pg
			
 
				 CFLAGS_REMOVE_early_printk.o = -pg
			
 
				+CFLAGS_REMOVE_head64.o = -pg
			
 
				 endif
			
 
				 
			
 
				 KASAN_SANITIZE_head$(BITS).o				:= n
			
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
 
				 	p4d_t *p4d;
			
 
				 
			
 
				 	/* Install the espfix pud into the kernel page directory */
			
 
				-	pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
			
 
				+	pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
			
 
				 	p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
			
 
				 	p4d_populate(&init_mm, p4d, espfix_pud_page);
			
 
				 
			
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
 
				 /*
			
 
				  * Manage page tables very early on.
			
 
				  */
			
 
				-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
			
 
				+extern pgd_t early_top_pgt[PTRS_PER_PGD];
			
 
				 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
			
 
				-static unsigned int __initdata next_early_pgt = 2;
			
 
				+static unsigned int __initdata next_early_pgt;
			
 
				 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
			
 
				 
			
 
				+#define __head	__section(.head.text)
			
 
				+
			
 
				+static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
			
 
				+{
			
 
				+	return ptr - (void *)_text + (void *)physaddr;
			
 
				+}
			
 
				+
			
 
				+void __head __startup_64(unsigned long physaddr)
			
 
				+{
			
 
				+	unsigned long load_delta, *p;
			
 
				+	pgdval_t *pgd;
			
 
				+	p4dval_t *p4d;
			
 
				+	pudval_t *pud;
			
 
				+	pmdval_t *pmd, pmd_entry;
			
 
				+	int i;
			
 
				+
			
 
				+	/* Is the address too large? */
			
 
				+	if (physaddr >> MAX_PHYSMEM_BITS)
			
 
				+		for (;;);
			
 
				+
			
 
				+	/*
			
 
				+	 * Compute the delta between the address I am compiled to run at
			
 
				+	 * and the address I am actually running at.
			
 
				+	 */
			
 
				+	load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
			
 
				+
			
 
				+	/* Is the address not 2M aligned? */
			
 
				+	if (load_delta & ~PMD_PAGE_MASK)
			
 
				+		for (;;);
			
 
				+
			
 
				+	/* Fixup the physical addresses in the page table */
			
 
				+
			
 
				+	pgd = fixup_pointer(&early_top_pgt, physaddr);
			
 
				+	pgd[pgd_index(__START_KERNEL_map)] += load_delta;
			
 
				+
			
 
				+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
			
 
				+		p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
			
 
				+		p4d[511] += load_delta;
			
 
				+	}
			
 
				+
			
 
				+	pud = fixup_pointer(&level3_kernel_pgt, physaddr);
			
 
				+	pud[510] += load_delta;
			
 
				+	pud[511] += load_delta;
			
 
				+
			
 
				+	pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
			
 
				+	pmd[506] += load_delta;
			
 
				+
			
 
				+	/*
			
 
				+	 * Set up the identity mapping for the switchover.  These
			
 
				+	 * entries should *NOT* have the global bit set!  This also
			
 
				+	 * creates a bunch of nonsense entries but that is fine --
			
 
				+	 * it avoids problems around wraparound.
			
 
				+	 */
			
 
				+
			
 
				+	pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
			
 
				+	pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
			
 
				+
			
 
				+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
			
 
				+		p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
			
 
				+
			
 
				+		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
			
 
				+		pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
			
 
				+		pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
			
 
				+
			
 
				+		i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
			
 
				+		p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
			
 
				+		p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
			
 
				+	} else {
			
 
				+		i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
			
 
				+		pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
			
 
				+		pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
			
 
				+	}
			
 
				+
			
 
				+	i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
			
 
				+	pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
			
 
				+	pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
			
 
				+
			
 
				+	pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
			
 
				+	pmd_entry +=  physaddr;
			
 
				+
			
 
				+	for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
			
 
				+		int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
			
 
				+		pmd[idx] = pmd_entry + i * PMD_SIZE;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Fixup the kernel text+data virtual addresses. Note that
			
 
				+	 * we might write invalid pmds when the kernel is relocated;
			
 
				+	 * cleanup_highmap() fixes this up along with the mappings
			
 
				+	 * beyond _end.
			
 
				+	 */
			
 
				+
			
 
				+	pmd = fixup_pointer(level2_kernel_pgt, physaddr);
			
 
				+	for (i = 0; i < PTRS_PER_PMD; i++) {
			
 
				+		if (pmd[i] & _PAGE_PRESENT)
			
 
				+			pmd[i] += load_delta;
			
 
				+	}
			
 
				+
			
 
				+	/* Fixup phys_base */
			
 
				+	p = fixup_pointer(&phys_base, physaddr);
			
 
				+	*p += load_delta;
			
 
				+}
			
 
				+
			
 
				 /* Wipe all early page tables except for the kernel symbol map */
			
 
				 static void __init reset_early_page_tables(void)
			
 
				 {
			
 
				-	memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
			
 
				+	memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
			
 
				 	next_early_pgt = 0;
			
 
				-	write_cr3(__pa_nodebug(early_level4_pgt));
			
 
				+	write_cr3(__pa_nodebug(early_top_pgt));
			
 
				 }
			
 
				 
			
 
				 /* Create a new PMD entry */
			
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
 
				 {
			
 
				 	unsigned long physaddr = address - __PAGE_OFFSET;
			
 
				 	pgdval_t pgd, *pgd_p;
			
 
				+	p4dval_t p4d, *p4d_p;
			
 
				 	pudval_t pud, *pud_p;
			
 
				 	pmdval_t pmd, *pmd_p;
			
 
				 
			
 
				 	/* Invalid address or early pgt is done ?  */
			
 
				-	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
			
 
				+	if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
			
 
				 		return -1;
			
 
				 
			
 
				 again:
			
 
				-	pgd_p = &early_level4_pgt[pgd_index(address)].pgd;
			
 
				+	pgd_p = &early_top_pgt[pgd_index(address)].pgd;
			
 
				 	pgd = *pgd_p;
			
 
				 
			
 
				 	/*
			
@@ -67,8 +171,25 @@ again:
 
				 	 * critical -- __PAGE_OFFSET would point us back into the dynamic
			
 
				 	 * range and we might end up looping forever...
			
 
				 	 */
			
 
				-	if (pgd)
			
 
				-		pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
			
 
				+	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
			
 
				+		p4d_p = pgd_p;
			
 
				+	else if (pgd)
			
 
				+		p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
			
 
				+	else {
			
 
				+		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
			
 
				+			reset_early_page_tables();
			
 
				+			goto again;
			
 
				+		}
			
 
				+
			
 
				+		p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
			
 
				+		memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
			
 
				+		*pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
			
 
				+	}
			
 
				+	p4d_p += p4d_index(address);
			
 
				+	p4d = *p4d_p;
			
 
				+
			
 
				+	if (p4d)
			
 
				+		pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
			
 
				 	else {
			
 
				 		if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
			
 
				 			reset_early_page_tables();
			
@@ -77,7 +198,7 @@ again:
 
				 
			
 
				 		pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
			
 
				 		memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
			
 
				-		*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
			
 
				+		*p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
			
 
				 	}
			
 
				 	pud_p += pud_index(address);
			
 
				 	pud = *pud_p;
			
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
				 
			
 
				 	clear_bss();
			
 
				 
			
 
				-	clear_page(init_level4_pgt);
			
 
				+	clear_page(init_top_pgt);
			
 
				 
			
 
				 	kasan_early_init();
			
 
				 
			
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 
				 	 */
			
 
				 	load_ucode_bsp();
			
 
				 
			
 
				-	/* set init_level4_pgt kernel high mapping*/
			
 
				-	init_level4_pgt[511] = early_level4_pgt[511];
			
 
				+	/* Set init_top_pgt kernel high mapping */
			
 
				+	init_top_pgt[511] = early_top_pgt[511];
			
 
				 
			
 
				 	x86_64_start_reservations(real_mode_data);
			
 
				 }
			
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
 
				  *
			
 
				  */
			
 
				 
			
 
				+#define p4d_index(x)	(((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
			
 
				 #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
			
 
				 
			
 
				-L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
			
 
				-L4_START_KERNEL = pgd_index(__START_KERNEL_map)
			
 
				+PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
			
 
				+PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
			
 
				 L3_START_KERNEL = pud_index(__START_KERNEL_map)
			
 
				 
			
 
				 	.text
			
@@ -72,101 +73,12 @@ startup_64:
 
				 	/* Sanitize CPU configuration */
			
 
				 	call verify_cpu
			
 
				 
			
 
				-	/*
			
 
				-	 * Compute the delta between the address I am compiled to run at and the
			
 
				-	 * address I am actually running at.
			
 
				-	 */
			
 
				-	leaq	_text(%rip), %rbp
			
 
				-	subq	$_text - __START_KERNEL_map, %rbp
			
 
				-
			
 
				-	/* Is the address not 2M aligned? */
			
 
				-	testl	$~PMD_PAGE_MASK, %ebp
			
 
				-	jnz	bad_address
			
 
				-
			
 
				-	/*
			
 
				-	 * Is the address too large?
			
 
				-	 */
			
 
				-	leaq	_text(%rip), %rax
			
 
				-	shrq	$MAX_PHYSMEM_BITS, %rax
			
 
				-	jnz	bad_address
			
 
				-
			
 
				-	/*
			
 
				-	 * Fixup the physical addresses in the page table
			
 
				-	 */
			
 
				-	addq	%rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
			
 
				-
			
 
				-	addq	%rbp, level3_kernel_pgt + (510*8)(%rip)
			
 
				-	addq	%rbp, level3_kernel_pgt + (511*8)(%rip)
			
 
				-
			
 
				-	addq	%rbp, level2_fixmap_pgt + (506*8)(%rip)
			
 
				-
			
 
				-	/*
			
 
				-	 * Set up the identity mapping for the switchover.  These
			
 
				-	 * entries should *NOT* have the global bit set!  This also
			
 
				-	 * creates a bunch of nonsense entries but that is fine --
			
 
				-	 * it avoids problems around wraparound.
			
 
				-	 */
			
 
				 	leaq	_text(%rip), %rdi
			
 
				-	leaq	early_level4_pgt(%rip), %rbx
			
 
				-
			
 
				-	movq	%rdi, %rax
			
 
				-	shrq	$PGDIR_SHIFT, %rax
			
 
				-
			
 
				-	leaq	(PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
			
 
				-	movq	%rdx, 0(%rbx,%rax,8)
			
 
				-	movq	%rdx, 8(%rbx,%rax,8)
			
 
				-
			
 
				-	addq	$PAGE_SIZE, %rdx
			
 
				-	movq	%rdi, %rax
			
 
				-	shrq	$PUD_SHIFT, %rax
			
 
				-	andl	$(PTRS_PER_PUD-1), %eax
			
 
				-	movq	%rdx, PAGE_SIZE(%rbx,%rax,8)
			
 
				-	incl	%eax
			
 
				-	andl	$(PTRS_PER_PUD-1), %eax
			
 
				-	movq	%rdx, PAGE_SIZE(%rbx,%rax,8)
			
 
				-
			
 
				-	addq	$PAGE_SIZE * 2, %rbx
			
 
				-	movq	%rdi, %rax
			
 
				-	shrq	$PMD_SHIFT, %rdi
			
 
				-	addq	$(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
			
 
				-	leaq	(_end - 1)(%rip), %rcx
			
 
				-	shrq	$PMD_SHIFT, %rcx
			
 
				-	subq	%rdi, %rcx
			
 
				-	incl	%ecx
			
 
				-
			
 
				-1:
			
 
				-	andq	$(PTRS_PER_PMD - 1), %rdi
			
 
				-	movq	%rax, (%rbx,%rdi,8)
			
 
				-	incq	%rdi
			
 
				-	addq	$PMD_SIZE, %rax
			
 
				-	decl	%ecx
			
 
				-	jnz	1b
			
 
				-
			
 
				-	test %rbp, %rbp
			
 
				-	jz .Lskip_fixup
			
 
				+	pushq	%rsi
			
 
				+	call	__startup_64
			
 
				+	popq	%rsi
			
 
				 
			
 
				-	/*
			
 
				-	 * Fixup the kernel text+data virtual addresses. Note that
			
 
				-	 * we might write invalid pmds, when the kernel is relocated
			
 
				-	 * cleanup_highmap() fixes this up along with the mappings
			
 
				-	 * beyond _end.
			
 
				-	 */
			
 
				-	leaq	level2_kernel_pgt(%rip), %rdi
			
 
				-	leaq	PAGE_SIZE(%rdi), %r8
			
 
				-	/* See if it is a valid page table entry */
			
 
				-1:	testb	$_PAGE_PRESENT, 0(%rdi)
			
 
				-	jz	2f
			
 
				-	addq	%rbp, 0(%rdi)
			
 
				-	/* Go to the next page */
			
 
				-2:	addq	$8, %rdi
			
 
				-	cmp	%r8, %rdi
			
 
				-	jne	1b
			
 
				-
			
 
				-	/* Fixup phys_base */
			
 
				-	addq	%rbp, phys_base(%rip)
			
 
				-
			
 
				-.Lskip_fixup:
			
 
				-	movq	$(early_level4_pgt - __START_KERNEL_map), %rax
			
 
				+	movq	$(early_top_pgt - __START_KERNEL_map), %rax
			
 
				 	jmp 1f
			
 
				 ENTRY(secondary_startup_64)
			
 
				 	/*
			
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
 
				 	/* Sanitize CPU configuration */
			
 
				 	call verify_cpu
			
 
				 
			
 
				-	movq	$(init_level4_pgt - __START_KERNEL_map), %rax
			
 
				+	movq	$(init_top_pgt - __START_KERNEL_map), %rax
			
 
				 1:
			
 
				 
			
 
				-	/* Enable PAE mode and PGE */
			
 
				+	/* Enable PAE mode, PGE and LA57 */
			
 
				 	movl	$(X86_CR4_PAE | X86_CR4_PGE), %ecx
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+	orl	$X86_CR4_LA57, %ecx
			
 
				+#endif
			
 
				 	movq	%rcx, %cr4
			
 
				 
			
 
				-	/* Setup early boot stage 4 level pagetables. */
			
 
				+	/* Setup early boot stage 4-/5-level pagetables. */
			
 
				 	addq	phys_base(%rip), %rax
			
 
				 	movq	%rax, %cr3
			
 
				 
			
@@ -417,9 +332,13 @@ GLOBAL(name)
 
				 	.endr
			
 
				 
			
 
				 	__INITDATA
			
 
				-NEXT_PAGE(early_level4_pgt)
			
 
				+NEXT_PAGE(early_top_pgt)
			
 
				 	.fill	511,8,0
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+	.quad	level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
			
 
				+#else
			
 
				 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
			
 
				+#endif
			
 
				 
			
 
				 NEXT_PAGE(early_dynamic_pgts)
			
 
				 	.fill	512*EARLY_DYNAMIC_PAGE_TABLES,8,0
			
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
 
				 	.data
			
 
				 
			
 
				 #ifndef CONFIG_XEN
			
 
				-NEXT_PAGE(init_level4_pgt)
			
 
				+NEXT_PAGE(init_top_pgt)
			
 
				 	.fill	512,8,0
			
 
				 #else
			
 
				-NEXT_PAGE(init_level4_pgt)
			
 
				+NEXT_PAGE(init_top_pgt)
			
 
				 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
			
 
				-	.org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
			
 
				+	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
			
 
				 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
			
 
				-	.org    init_level4_pgt + L4_START_KERNEL*8, 0
			
 
				+	.org    init_top_pgt + PGD_START_KERNEL*8, 0
			
 
				 	/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
			
 
				 	.quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
			
 
				 
			
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
 
				 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
			
 
				 #endif
			
 
				 
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+NEXT_PAGE(level4_kernel_pgt)
			
 
				+	.fill	511,8,0
			
 
				+	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
			
 
				+#endif
			
 
				+
			
 
				 NEXT_PAGE(level3_kernel_pgt)
			
 
				 	.fill	L3_START_KERNEL,8,0
			
 
				 	/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
			
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
 
				 #include <asm/syscalls.h>
			
 
				 
			
 
				 /* context.lock is held for us, so we don't need any locking. */
			
 
				-static void flush_ldt(void *current_mm)
			
 
				+static void flush_ldt(void *__mm)
			
 
				 {
			
 
				+	struct mm_struct *mm = __mm;
			
 
				 	mm_context_t *pc;
			
 
				 
			
 
				-	if (current->active_mm != current_mm)
			
 
				+	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
			
 
				 		return;
			
 
				 
			
 
				-	pc = &current->active_mm->context;
			
 
				-	set_ldt(pc->ldt->entries, pc->ldt->size);
			
 
				+	pc = &mm->context;
			
 
				+	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
			
 
				 }
			
 
				 
			
 
				 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
			
 
				-static struct ldt_struct *alloc_ldt_struct(unsigned int size)
			
 
				+static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
			
 
				 {
			
 
				 	struct ldt_struct *new_ldt;
			
 
				 	unsigned int alloc_size;
			
 
				 
			
 
				-	if (size > LDT_ENTRIES)
			
 
				+	if (num_entries > LDT_ENTRIES)
			
 
				 		return NULL;
			
 
				 
			
 
				 	new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
			
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
 
				 		return NULL;
			
 
				 
			
 
				 	BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
			
 
				-	alloc_size = size * LDT_ENTRY_SIZE;
			
 
				+	alloc_size = num_entries * LDT_ENTRY_SIZE;
			
 
				 
			
 
				 	/*
			
 
				 	 * Xen is very picky: it requires a page-aligned LDT that has no
			
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
 
				 		return NULL;
			
 
				 	}
			
 
				 
			
 
				-	new_ldt->size = size;
			
 
				+	new_ldt->nr_entries = num_entries;
			
 
				 	return new_ldt;
			
 
				 }
			
 
				 
			
 
				 /* After calling this, the LDT is immutable. */
			
 
				 static void finalize_ldt_struct(struct ldt_struct *ldt)
			
 
				 {
			
 
				-	paravirt_alloc_ldt(ldt->entries, ldt->size);
			
 
				+	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
			
 
				 }
			
 
				 
			
 
				 /* context.lock is held */
			
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 
				 	if (likely(!ldt))
			
 
				 		return;
			
 
				 
			
 
				-	paravirt_free_ldt(ldt->entries, ldt->size);
			
 
				-	if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
			
 
				+	paravirt_free_ldt(ldt->entries, ldt->nr_entries);
			
 
				+	if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
			
 
				 		vfree_atomic(ldt->entries);
			
 
				 	else
			
 
				 		free_page((unsigned long)ldt->entries);
			
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				-	new_ldt = alloc_ldt_struct(old_mm->context.ldt->size);
			
 
				+	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
			
 
				 	if (!new_ldt) {
			
 
				 		retval = -ENOMEM;
			
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				 	memcpy(new_ldt->entries, old_mm->context.ldt->entries,
			
 
				-	       new_ldt->size * LDT_ENTRY_SIZE);
			
 
				+	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
			
 
				 	finalize_ldt_struct(new_ldt);
			
 
				 
			
 
				 	mm->context.ldt = new_ldt;
			
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
 
				 
			
 
				 static int read_ldt(void __user *ptr, unsigned long bytecount)
			
 
				 {
			
 
				-	int retval;
			
 
				-	unsigned long size;
			
 
				 	struct mm_struct *mm = current->mm;
			
 
				+	unsigned long entries_size;
			
 
				+	int retval;
			
 
				 
			
 
				 	mutex_lock(&mm->context.lock);
			
 
				 
			
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
 
				 	if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
			
 
				 		bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
			
 
				 
			
 
				-	size = mm->context.ldt->size * LDT_ENTRY_SIZE;
			
 
				-	if (size > bytecount)
			
 
				-		size = bytecount;
			
 
				+	entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
			
 
				+	if (entries_size > bytecount)
			
 
				+		entries_size = bytecount;
			
 
				 
			
 
				-	if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
			
 
				+	if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
			
 
				 		retval = -EFAULT;
			
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				-	if (size != bytecount) {
			
 
				+	if (entries_size != bytecount) {
			
 
				 		/* Zero-fill the rest and pretend we read bytecount bytes. */
			
 
				-		if (clear_user(ptr + size, bytecount - size)) {
			
 
				+		if (clear_user(ptr + entries_size, bytecount - entries_size)) {
			
 
				 			retval = -EFAULT;
			
 
				 			goto out_unlock;
			
 
				 		}
			
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 
				 {
			
 
				 	struct mm_struct *mm = current->mm;
			
 
				 	struct ldt_struct *new_ldt, *old_ldt;
			
 
				-	unsigned int oldsize, newsize;
			
 
				+	unsigned int old_nr_entries, new_nr_entries;
			
 
				 	struct user_desc ldt_info;
			
 
				 	struct desc_struct ldt;
			
 
				 	int error;
			
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
 
				 
			
 
				 	mutex_lock(&mm->context.lock);
			
 
				 
			
 
				-	old_ldt = mm->context.ldt;
			
 
				-	oldsize = old_ldt ? old_ldt->size : 0;
			
 
				-	newsize = max(ldt_info.entry_number + 1, oldsize);
			
 
				+	old_ldt       = mm->context.ldt;
			
 
				+	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
			
 
				+	new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
			
 
				 
			
 
				 	error = -ENOMEM;
			
 
				-	new_ldt = alloc_ldt_struct(newsize);
			
 
				+	new_ldt = alloc_ldt_struct(new_nr_entries);
			
 
				 	if (!new_ldt)
			
 
				 		goto out_unlock;
			
 
				 
			
 
				 	if (old_ldt)
			
 
				-		memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE);
			
 
				+		memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
			
 
				+
			
 
				 	new_ldt->entries[ldt_info.entry_number] = ldt;
			
 
				 	finalize_ldt_struct(new_ldt);
			
 
				 
			
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
 
				 void arch_crash_save_vmcoreinfo(void)
			
 
				 {
			
 
				 	VMCOREINFO_NUMBER(phys_base);
			
 
				-	VMCOREINFO_SYMBOL(init_level4_pgt);
			
 
				+	VMCOREINFO_SYMBOL(init_top_pgt);
			
 
				 
			
 
				 #ifdef CONFIG_NUMA
			
 
				 	VMCOREINFO_SYMBOL(node_data);
			
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 
				 
			
 
				 	.read_cr2 = native_read_cr2,
			
 
				 	.write_cr2 = native_write_cr2,
			
 
				-	.read_cr3 = native_read_cr3,
			
 
				+	.read_cr3 = __native_read_cr3,
			
 
				 	.write_cr3 = native_write_cr3,
			
 
				 
			
 
				 	.flush_tlb_user = native_flush_tlb,
			
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
				 
			
 
				 	cr0 = read_cr0();
			
 
				 	cr2 = read_cr2();
			
 
				-	cr3 = read_cr3();
			
 
				+	cr3 = __read_cr3();
			
 
				 	cr4 = __read_cr4();
			
 
				 	printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
			
 
				 			cr0, cr2, cr3, cr4);
			
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
 
				 
			
 
				 	cr0 = read_cr0();
			
 
				 	cr2 = read_cr2();
			
 
				-	cr3 = read_cr3();
			
 
				+	cr3 = __read_cr3();
			
 
				 	cr4 = __read_cr4();
			
 
				 
			
 
				 	printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
			
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
 
				 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
			
 
				 				dead_task->comm,
			
 
				 				dead_task->mm->context.ldt->entries,
			
 
				-				dead_task->mm->context.ldt->size);
			
 
				+				dead_task->mm->context.ldt->nr_entries);
			
 
				 			BUG();
			
 
				 		}
			
 
				 #endif
			
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
 
				 void play_dead_common(void)
			
 
				 {
			
 
				 	idle_task_exit();
			
 
				-	reset_lazy_tlbstate();
			
 
				 
			
 
				 	/* Ack it */
			
 
				 	(void)cpu_report_death();
			
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 
				 
			
 
				 		mutex_lock(&child->mm->context.lock);
			
 
				 		if (unlikely(!child->mm->context.ldt ||
			
 
				-			     seg >= child->mm->context.ldt->size))
			
 
				+			     seg >= child->mm->context.ldt->nr_entries))
			
 
				 			addr = -1L; /* bogus selector, access would fault */
			
 
				 		else {
			
 
				 			desc = &child->mm->context.ldt->entries[seg];
			
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,6 +49,7 @@
 
				 #include <asm/kexec.h>
			
 
				 #include <asm/apic.h>
			
 
				 #include <asm/irq_remapping.h>
			
 
				+#include <asm/mmu_context.h>
			
 
				 
			
 
				 #include "trace.h"
			
 
				 #include "pmu.h"
			
@@ -597,6 +598,7 @@ struct vcpu_vmx {
 
				 		int           gs_ldt_reload_needed;
			
 
				 		int           fs_reload_needed;
			
 
				 		u64           msr_host_bndcfgs;
			
 
				+		unsigned long vmcs_host_cr3;	/* May not match real cr3 */
			
 
				 		unsigned long vmcs_host_cr4;	/* May not match real cr4 */
			
 
				 	} host_state;
			
 
				 	struct {
			
@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 
				 	u32 low32, high32;
			
 
				 	unsigned long tmpl;
			
 
				 	struct desc_ptr dt;
			
 
				-	unsigned long cr0, cr4;
			
 
				+	unsigned long cr0, cr3, cr4;
			
 
				 
			
 
				 	cr0 = read_cr0();
			
 
				 	WARN_ON(cr0 & X86_CR0_TS);
			
 
				 	vmcs_writel(HOST_CR0, cr0);  /* 22.2.3 */
			
 
				-	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
			
 
				+
			
 
				+	/*
			
 
				+	 * Save the most likely value for this task's CR3 in the VMCS.
			
 
				+	 * We can't use __get_current_cr3_fast() because we're not atomic.
			
 
				+	 */
			
 
				+	cr3 = __read_cr3();
			
 
				+	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
			
 
				+	vmx->host_state.vmcs_host_cr3 = cr3;
			
 
				 
			
 
				 	/* Save the most likely value for this task's CR4 in the VMCS. */
			
 
				 	cr4 = cr4_read_shadow();
			
@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
 
				 static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				 	struct vcpu_vmx *vmx = to_vmx(vcpu);
			
 
				-	unsigned long debugctlmsr, cr4;
			
 
				+	unsigned long debugctlmsr, cr3, cr4;
			
 
				 
			
 
				 	/* Don't enter VMX if guest state is invalid, let the exit handler
			
 
				 	   start emulation until we arrive back to a valid state */
			
@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
				 	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
			
 
				 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
			
 
				 
			
 
				+	cr3 = __get_current_cr3_fast();
			
 
				+	if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
			
 
				+		vmcs_writel(HOST_CR3, cr3);
			
 
				+		vmx->host_state.vmcs_host_cr3 = cr3;
			
 
				+	}
			
 
				+
			
 
				 	cr4 = cr4_read_shadow();
			
 
				 	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
			
 
				 		vmcs_writel(HOST_CR4, cr4);
			
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
 
				 #ifdef CONFIG_MODIFY_LDT_SYSCALL
			
 
				 	seg >>= 3;
			
 
				 	mutex_lock(&current->mm->context.lock);
			
 
				-	if (current->mm->context.ldt && seg < current->mm->context.ldt->size)
			
 
				+	if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
			
 
				 		ret = current->mm->context.ldt->entries[seg];
			
 
				 	mutex_unlock(&current->mm->context.lock);
			
 
				 #endif
			
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
 
				 KCOV_INSTRUMENT_tlb.o	:= n
			
 
				 
			
 
				 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
			
 
				-	    pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
			
 
				+	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o
			
 
				 
			
 
				 # Make sure __phys_addr has no stackprotector
			
 
				 nostackp := $(call cc-option, -fno-stack-protector)
			
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 
				 				       bool checkwx)
			
 
				 {
			
 
				 #ifdef CONFIG_X86_64
			
 
				-	pgd_t *start = (pgd_t *) &init_level4_pgt;
			
 
				+	pgd_t *start = (pgd_t *) &init_top_pgt;
			
 
				 #else
			
 
				 	pgd_t *start = swapper_pg_dir;
			
 
				 #endif
			
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
 
				 	 * Do _not_ use "current" here. We might be inside
			
 
				 	 * an interrupt in the middle of a task switch..
			
 
				 	 */
			
 
				-	pgd_paddr = read_cr3();
			
 
				+	pgd_paddr = read_cr3_pa();
			
 
				 	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
			
 
				 	if (!pmd_k)
			
 
				 		return -1;
			
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
 
				 
			
 
				 static void dump_pagetable(unsigned long address)
			
 
				 {
			
 
				-	pgd_t *base = __va(read_cr3());
			
 
				+	pgd_t *base = __va(read_cr3_pa());
			
 
				 	pgd_t *pgd = &base[pgd_index(address)];
			
 
				 	p4d_t *p4d;
			
 
				 	pud_t *pud;
			
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
 
				 	 * happen within a race in page table update. In the later
			
 
				 	 * case just flush:
			
 
				 	 */
			
 
				-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address);
			
 
				+	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
			
 
				 	pgd_ref = pgd_offset_k(address);
			
 
				 	if (pgd_none(*pgd_ref))
			
 
				 		return -1;
			
@@ -555,7 +555,7 @@ static int bad_address(void *p)
 
				 
			
 
				 static void dump_pagetable(unsigned long address)
			
 
				 {
			
 
				-	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
			
 
				+	pgd_t *base = __va(read_cr3_pa());
			
 
				 	pgd_t *pgd = base + pgd_index(address);
			
 
				 	p4d_t *p4d;
			
 
				 	pud_t *pud;
			
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 
				 		pgd_t *pgd;
			
 
				 		pte_t *pte;
			
 
				 
			
 
				-		pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK);
			
 
				+		pgd = __va(read_cr3_pa());
			
 
				 		pgd += pgd_index(address);
			
 
				 
			
 
				 		pte = lookup_address_in_pgd(pgd, address, &level);
			
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -1,496 +0,0 @@
 
				-/*
			
 
				- * Lockless get_user_pages_fast for x86
			
 
				- *
			
 
				- * Copyright (C) 2008 Nick Piggin
			
 
				- * Copyright (C) 2008 Novell Inc.
			
 
				- */
			
 
				-#include <linux/sched.h>
			
 
				-#include <linux/mm.h>
			
 
				-#include <linux/vmstat.h>
			
 
				-#include <linux/highmem.h>
			
 
				-#include <linux/swap.h>
			
 
				-#include <linux/memremap.h>
			
 
				-
			
 
				-#include <asm/mmu_context.h>
			
 
				-#include <asm/pgtable.h>
			
 
				-
			
 
				-static inline pte_t gup_get_pte(pte_t *ptep)
			
 
				-{
			
 
				-#ifndef CONFIG_X86_PAE
			
 
				-	return READ_ONCE(*ptep);
			
 
				-#else
			
 
				-	/*
			
 
				-	 * With get_user_pages_fast, we walk down the pagetables without taking
			
 
				-	 * any locks.  For this we would like to load the pointers atomically,
			
 
				-	 * but that is not possible (without expensive cmpxchg8b) on PAE.  What
			
 
				-	 * we do have is the guarantee that a pte will only either go from not
			
 
				-	 * present to present, or present to not present or both -- it will not
			
 
				-	 * switch to a completely different present page without a TLB flush in
			
 
				-	 * between; something that we are blocking by holding interrupts off.
			
 
				-	 *
			
 
				-	 * Setting ptes from not present to present goes:
			
 
				-	 * ptep->pte_high = h;
			
 
				-	 * smp_wmb();
			
 
				-	 * ptep->pte_low = l;
			
 
				-	 *
			
 
				-	 * And present to not present goes:
			
 
				-	 * ptep->pte_low = 0;
			
 
				-	 * smp_wmb();
			
 
				-	 * ptep->pte_high = 0;
			
 
				-	 *
			
 
				-	 * We must ensure here that the load of pte_low sees l iff pte_high
			
 
				-	 * sees h. We load pte_high *after* loading pte_low, which ensures we
			
 
				-	 * don't see an older value of pte_high.  *Then* we recheck pte_low,
			
 
				-	 * which ensures that we haven't picked up a changed pte high. We might
			
 
				-	 * have got rubbish values from pte_low and pte_high, but we are
			
 
				-	 * guaranteed that pte_low will not have the present bit set *unless*
			
 
				-	 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
			
 
				-	 * we're safe.
			
 
				-	 *
			
 
				-	 * gup_get_pte should not be used or copied outside gup.c without being
			
 
				-	 * very careful -- it does not atomically load the pte or anything that
			
 
				-	 * is likely to be useful for you.
			
 
				-	 */
			
 
				-	pte_t pte;
			
 
				-
			
 
				-retry:
			
 
				-	pte.pte_low = ptep->pte_low;
			
 
				-	smp_rmb();
			
 
				-	pte.pte_high = ptep->pte_high;
			
 
				-	smp_rmb();
			
 
				-	if (unlikely(pte.pte_low != ptep->pte_low))
			
 
				-		goto retry;
			
 
				-
			
 
				-	return pte;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
			
 
				-{
			
 
				-	while ((*nr) - nr_start) {
			
 
				-		struct page *page = pages[--(*nr)];
			
 
				-
			
 
				-		ClearPageReferenced(page);
			
 
				-		put_page(page);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * 'pteval' can come from a pte, pmd, pud or p4d.  We only check
			
 
				- * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
			
 
				- * same value on all 4 types.
			
 
				- */
			
 
				-static inline int pte_allows_gup(unsigned long pteval, int write)
			
 
				-{
			
 
				-	unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
			
 
				-
			
 
				-	if (write)
			
 
				-		need_pte_bits |= _PAGE_RW;
			
 
				-
			
 
				-	if ((pteval & need_pte_bits) != need_pte_bits)
			
 
				-		return 0;
			
 
				-
			
 
				-	/* Check memory protection keys permissions. */
			
 
				-	if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
			
 
				-		return 0;
			
 
				-
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * The performance critical leaf functions are made noinline otherwise gcc
			
 
				- * inlines everything into a single function which results in too much
			
 
				- * register pressure.
			
 
				- */
			
 
				-static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
			
 
				-		unsigned long end, int write, struct page **pages, int *nr)
			
 
				-{
			
 
				-	struct dev_pagemap *pgmap = NULL;
			
 
				-	int nr_start = *nr, ret = 0;
			
 
				-	pte_t *ptep, *ptem;
			
 
				-
			
 
				-	/*
			
 
				-	 * Keep the original mapped PTE value (ptem) around since we
			
 
				-	 * might increment ptep off the end of the page when finishing
			
 
				-	 * our loop iteration.
			
 
				-	 */
			
 
				-	ptem = ptep = pte_offset_map(&pmd, addr);
			
 
				-	do {
			
 
				-		pte_t pte = gup_get_pte(ptep);
			
 
				-		struct page *page;
			
 
				-
			
 
				-		/* Similar to the PMD case, NUMA hinting must take slow path */
			
 
				-		if (pte_protnone(pte))
			
 
				-			break;
			
 
				-
			
 
				-		if (!pte_allows_gup(pte_val(pte), write))
			
 
				-			break;
			
 
				-
			
 
				-		if (pte_devmap(pte)) {
			
 
				-			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
			
 
				-			if (unlikely(!pgmap)) {
			
 
				-				undo_dev_pagemap(nr, nr_start, pages);
			
 
				-				break;
			
 
				-			}
			
 
				-		} else if (pte_special(pte))
			
 
				-			break;
			
 
				-
			
 
				-		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
			
 
				-		page = pte_page(pte);
			
 
				-		get_page(page);
			
 
				-		put_dev_pagemap(pgmap);
			
 
				-		SetPageReferenced(page);
			
 
				-		pages[*nr] = page;
			
 
				-		(*nr)++;
			
 
				-
			
 
				-	} while (ptep++, addr += PAGE_SIZE, addr != end);
			
 
				-	if (addr == end)
			
 
				-		ret = 1;
			
 
				-	pte_unmap(ptem);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static inline void get_head_page_multiple(struct page *page, int nr)
			
 
				-{
			
 
				-	VM_BUG_ON_PAGE(page != compound_head(page), page);
			
 
				-	VM_BUG_ON_PAGE(page_count(page) == 0, page);
			
 
				-	page_ref_add(page, nr);
			
 
				-	SetPageReferenced(page);
			
 
				-}
			
 
				-
			
 
				-static int __gup_device_huge(unsigned long pfn, unsigned long addr,
			
 
				-		unsigned long end, struct page **pages, int *nr)
			
 
				-{
			
 
				-	int nr_start = *nr;
			
 
				-	struct dev_pagemap *pgmap = NULL;
			
 
				-
			
 
				-	do {
			
 
				-		struct page *page = pfn_to_page(pfn);
			
 
				-
			
 
				-		pgmap = get_dev_pagemap(pfn, pgmap);
			
 
				-		if (unlikely(!pgmap)) {
			
 
				-			undo_dev_pagemap(nr, nr_start, pages);
			
 
				-			return 0;
			
 
				-		}
			
 
				-		SetPageReferenced(page);
			
 
				-		pages[*nr] = page;
			
 
				-		get_page(page);
			
 
				-		put_dev_pagemap(pgmap);
			
 
				-		(*nr)++;
			
 
				-		pfn++;
			
 
				-	} while (addr += PAGE_SIZE, addr != end);
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
			
 
				-		unsigned long end, struct page **pages, int *nr)
			
 
				-{
			
 
				-	unsigned long fault_pfn;
			
 
				-
			
 
				-	fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
			
 
				-	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
			
 
				-}
			
 
				-
			
 
				-static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
			
 
				-		unsigned long end, struct page **pages, int *nr)
			
 
				-{
			
 
				-	unsigned long fault_pfn;
			
 
				-
			
 
				-	fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
			
 
				-	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
			
 
				-}
			
 
				-
			
 
				-static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
			
 
				-		unsigned long end, int write, struct page **pages, int *nr)
			
 
				-{
			
 
				-	struct page *head, *page;
			
 
				-	int refs;
			
 
				-
			
 
				-	if (!pte_allows_gup(pmd_val(pmd), write))
			
 
				-		return 0;
			
 
				-
			
 
				-	VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
			
 
				-	if (pmd_devmap(pmd))
			
 
				-		return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
			
 
				-
			
 
				-	/* hugepages are never "special" */
			
 
				-	VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
			
 
				-
			
 
				-	refs = 0;
			
 
				-	head = pmd_page(pmd);
			
 
				-	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
			
 
				-	do {
			
 
				-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
			
 
				-		pages[*nr] = page;
			
 
				-		(*nr)++;
			
 
				-		page++;
			
 
				-		refs++;
			
 
				-	} while (addr += PAGE_SIZE, addr != end);
			
 
				-	get_head_page_multiple(head, refs);
			
 
				-
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
			
 
				-		int write, struct page **pages, int *nr)
			
 
				-{
			
 
				-	unsigned long next;
			
 
				-	pmd_t *pmdp;
			
 
				-
			
 
				-	pmdp = pmd_offset(&pud, addr);
			
 
				-	do {
			
 
				-		pmd_t pmd = *pmdp;
			
 
				-
			
 
				-		next = pmd_addr_end(addr, end);
			
 
				-		if (pmd_none(pmd))
			
 
				-			return 0;
			
 
				-		if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
			
 
				-			/*
			
 
				-			 * NUMA hinting faults need to be handled in the GUP
			
 
				-			 * slowpath for accounting purposes and so that they
			
 
				-			 * can be serialised against THP migration.
			
 
				-			 */
			
 
				-			if (pmd_protnone(pmd))
			
 
				-				return 0;
			
 
				-			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
			
 
				-				return 0;
			
 
				-		} else {
			
 
				-			if (!gup_pte_range(pmd, addr, next, write, pages, nr))
			
 
				-				return 0;
			
 
				-		}
			
 
				-	} while (pmdp++, addr = next, addr != end);
			
 
				-
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
			
 
				-		unsigned long end, int write, struct page **pages, int *nr)
			
 
				-{
			
 
				-	struct page *head, *page;
			
 
				-	int refs;
			
 
				-
			
 
				-	if (!pte_allows_gup(pud_val(pud), write))
			
 
				-		return 0;
			
 
				-
			
 
				-	VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
			
 
				-	if (pud_devmap(pud))
			
 
				-		return __gup_device_huge_pud(pud, addr, end, pages, nr);
			
 
				-
			
 
				-	/* hugepages are never "special" */
			
 
				-	VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
			
 
				-
			
 
				-	refs = 0;
			
 
				-	head = pud_page(pud);
			
 
				-	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
			
 
				-	do {
			
 
				-		VM_BUG_ON_PAGE(compound_head(page) != head, page);
			
 
				-		pages[*nr] = page;
			
 
				-		(*nr)++;
			
 
				-		page++;
			
 
				-		refs++;
			
 
				-	} while (addr += PAGE_SIZE, addr != end);
			
 
				-	get_head_page_multiple(head, refs);
			
 
				-
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
			
 
				-			int write, struct page **pages, int *nr)
			
 
				-{
			
 
				-	unsigned long next;
			
 
				-	pud_t *pudp;
			
 
				-
			
 
				-	pudp = pud_offset(&p4d, addr);
			
 
				-	do {
			
 
				-		pud_t pud = *pudp;
			
 
				-
			
 
				-		next = pud_addr_end(addr, end);
			
 
				-		if (pud_none(pud))
			
 
				-			return 0;
			
 
				-		if (unlikely(pud_large(pud))) {
			
 
				-			if (!gup_huge_pud(pud, addr, next, write, pages, nr))
			
 
				-				return 0;
			
 
				-		} else {
			
 
				-			if (!gup_pmd_range(pud, addr, next, write, pages, nr))
			
 
				-				return 0;
			
 
				-		}
			
 
				-	} while (pudp++, addr = next, addr != end);
			
 
				-
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
			
 
				-			int write, struct page **pages, int *nr)
			
 
				-{
			
 
				-	unsigned long next;
			
 
				-	p4d_t *p4dp;
			
 
				-
			
 
				-	p4dp = p4d_offset(&pgd, addr);
			
 
				-	do {
			
 
				-		p4d_t p4d = *p4dp;
			
 
				-
			
 
				-		next = p4d_addr_end(addr, end);
			
 
				-		if (p4d_none(p4d))
			
 
				-			return 0;
			
 
				-		BUILD_BUG_ON(p4d_large(p4d));
			
 
				-		if (!gup_pud_range(p4d, addr, next, write, pages, nr))
			
 
				-			return 0;
			
 
				-	} while (p4dp++, addr = next, addr != end);
			
 
				-
			
 
				-	return 1;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
			
 
				- * back to the regular GUP.
			
 
				- */
			
 
				-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
			
 
				-			  struct page **pages)
			
 
				-{
			
 
				-	struct mm_struct *mm = current->mm;
			
 
				-	unsigned long addr, len, end;
			
 
				-	unsigned long next;
			
 
				-	unsigned long flags;
			
 
				-	pgd_t *pgdp;
			
 
				-	int nr = 0;
			
 
				-
			
 
				-	start &= PAGE_MASK;
			
 
				-	addr = start;
			
 
				-	len = (unsigned long) nr_pages << PAGE_SHIFT;
			
 
				-	end = start + len;
			
 
				-	if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
			
 
				-					(void __user *)start, len)))
			
 
				-		return 0;
			
 
				-
			
 
				-	/*
			
 
				-	 * XXX: batch / limit 'nr', to avoid large irq off latency
			
 
				-	 * needs some instrumenting to determine the common sizes used by
			
 
				-	 * important workloads (eg. DB2), and whether limiting the batch size
			
 
				-	 * will decrease performance.
			
 
				-	 *
			
 
				-	 * It seems like we're in the clear for the moment. Direct-IO is
			
 
				-	 * the main guy that batches up lots of get_user_pages, and even
			
 
				-	 * they are limited to 64-at-a-time which is not so many.
			
 
				-	 */
			
 
				-	/*
			
 
				-	 * This doesn't prevent pagetable teardown, but does prevent
			
 
				-	 * the pagetables and pages from being freed on x86.
			
 
				-	 *
			
 
				-	 * So long as we atomically load page table pointers versus teardown
			
 
				-	 * (which we do on x86, with the above PAE exception), we can follow the
			
 
				-	 * address down to the the page and take a ref on it.
			
 
				-	 */
			
 
				-	local_irq_save(flags);
			
 
				-	pgdp = pgd_offset(mm, addr);
			
 
				-	do {
			
 
				-		pgd_t pgd = *pgdp;
			
 
				-
			
 
				-		next = pgd_addr_end(addr, end);
			
 
				-		if (pgd_none(pgd))
			
 
				-			break;
			
 
				-		if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
			
 
				-			break;
			
 
				-	} while (pgdp++, addr = next, addr != end);
			
 
				-	local_irq_restore(flags);
			
 
				-
			
 
				-	return nr;
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * get_user_pages_fast() - pin user pages in memory
			
 
				- * @start:	starting user address
			
 
				- * @nr_pages:	number of pages from start to pin
			
 
				- * @write:	whether pages will be written to
			
 
				- * @pages:	array that receives pointers to the pages pinned.
			
 
				- * 		Should be at least nr_pages long.
			
 
				- *
			
 
				- * Attempt to pin user pages in memory without taking mm->mmap_sem.
			
 
				- * If not successful, it will fall back to taking the lock and
			
 
				- * calling get_user_pages().
			
 
				- *
			
 
				- * Returns number of pages pinned. This may be fewer than the number
			
 
				- * requested. If nr_pages is 0 or negative, returns 0. If no pages
			
 
				- * were pinned, returns -errno.
			
 
				- */
			
 
				-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
			
 
				-			struct page **pages)
			
 
				-{
			
 
				-	struct mm_struct *mm = current->mm;
			
 
				-	unsigned long addr, len, end;
			
 
				-	unsigned long next;
			
 
				-	pgd_t *pgdp;
			
 
				-	int nr = 0;
			
 
				-
			
 
				-	start &= PAGE_MASK;
			
 
				-	addr = start;
			
 
				-	len = (unsigned long) nr_pages << PAGE_SHIFT;
			
 
				-
			
 
				-	end = start + len;
			
 
				-	if (end < start)
			
 
				-		goto slow_irqon;
			
 
				-
			
 
				-#ifdef CONFIG_X86_64
			
 
				-	if (end >> __VIRTUAL_MASK_SHIFT)
			
 
				-		goto slow_irqon;
			
 
				-#endif
			
 
				-
			
 
				-	/*
			
 
				-	 * XXX: batch / limit 'nr', to avoid large irq off latency
			
 
				-	 * needs some instrumenting to determine the common sizes used by
			
 
				-	 * important workloads (eg. DB2), and whether limiting the batch size
			
 
				-	 * will decrease performance.
			
 
				-	 *
			
 
				-	 * It seems like we're in the clear for the moment. Direct-IO is
			
 
				-	 * the main guy that batches up lots of get_user_pages, and even
			
 
				-	 * they are limited to 64-at-a-time which is not so many.
			
 
				-	 */
			
 
				-	/*
			
 
				-	 * This doesn't prevent pagetable teardown, but does prevent
			
 
				-	 * the pagetables and pages from being freed on x86.
			
 
				-	 *
			
 
				-	 * So long as we atomically load page table pointers versus teardown
			
 
				-	 * (which we do on x86, with the above PAE exception), we can follow the
			
 
				-	 * address down to the the page and take a ref on it.
			
 
				-	 */
			
 
				-	local_irq_disable();
			
 
				-	pgdp = pgd_offset(mm, addr);
			
 
				-	do {
			
 
				-		pgd_t pgd = *pgdp;
			
 
				-
			
 
				-		next = pgd_addr_end(addr, end);
			
 
				-		if (pgd_none(pgd))
			
 
				-			goto slow;
			
 
				-		if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
			
 
				-			goto slow;
			
 
				-	} while (pgdp++, addr = next, addr != end);
			
 
				-	local_irq_enable();
			
 
				-
			
 
				-	VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
			
 
				-	return nr;
			
 
				-
			
 
				-	{
			
 
				-		int ret;
			
 
				-
			
 
				-slow:
			
 
				-		local_irq_enable();
			
 
				-slow_irqon:
			
 
				-		/* Try to get the remaining pages with get_user_pages */
			
 
				-		start += nr << PAGE_SHIFT;
			
 
				-		pages += nr;
			
 
				-
			
 
				-		ret = get_user_pages_unlocked(start,
			
 
				-					      (end - start) >> PAGE_SHIFT,
			
 
				-					      pages, write ? FOLL_WRITE : 0);
			
 
				-
			
 
				-		/* Have to be a bit careful with return values */
			
 
				-		if (nr > 0) {
			
 
				-			if (ret < 0)
			
 
				-				ret = nr;
			
 
				-			else
			
 
				-				ret += nr;
			
 
				-		}
			
 
				-
			
 
				-		return ret;
			
 
				-	}
			
 
				-}
			
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
 
				 }
			
 
				 
			
 
				 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
			
 
				-#ifdef CONFIG_SMP
			
 
				-	.active_mm = &init_mm,
			
 
				+	.loaded_mm = &init_mm,
			
 
				 	.state = 0,
			
 
				-#endif
			
 
				 	.cr4 = ~0UL,	/* fail hard if we screw up cr4 shadow initialization */
			
 
				 };
			
 
				 EXPORT_SYMBOL_GPL(cpu_tlbstate);
			
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
 
				  * When memory was added make sure all the processes MM have
			
 
				  * suitable PGD entries in the local PGD level page.
			
 
				  */
			
 
				+#ifdef CONFIG_X86_5LEVEL
			
 
				+void sync_global_pgds(unsigned long start, unsigned long end)
			
 
				+{
			
 
				+	unsigned long addr;
			
 
				+
			
 
				+	for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
			
 
				+		const pgd_t *pgd_ref = pgd_offset_k(addr);
			
 
				+		struct page *page;
			
 
				+
			
 
				+		/* Check for overflow */
			
 
				+		if (addr < start)
			
 
				+			break;
			
 
				+
			
 
				+		if (pgd_none(*pgd_ref))
			
 
				+			continue;
			
 
				+
			
 
				+		spin_lock(&pgd_lock);
			
 
				+		list_for_each_entry(page, &pgd_list, lru) {
			
 
				+			pgd_t *pgd;
			
 
				+			spinlock_t *pgt_lock;
			
 
				+
			
 
				+			pgd = (pgd_t *)page_address(page) + pgd_index(addr);
			
 
				+			/* the pgt_lock only for Xen */
			
 
				+			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
			
 
				+			spin_lock(pgt_lock);
			
 
				+
			
 
				+			if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
			
 
				+				BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
			
 
				+
			
 
				+			if (pgd_none(*pgd))
			
 
				+				set_pgd(pgd, *pgd_ref);
			
 
				+
			
 
				+			spin_unlock(pgt_lock);
			
 
				+		}
			
 
				+		spin_unlock(&pgd_lock);
			
 
				+	}
			
 
				+}
			
 
				+#else
			
 
				 void sync_global_pgds(unsigned long start, unsigned long end)
			
 
				 {
			
 
				 	unsigned long addr;
			
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
 
				 		spin_unlock(&pgd_lock);
			
 
				 	}
			
 
				 }
			
 
				+#endif
			
 
				 
			
 
				 /*
			
 
				  * NOTE: This function is marked __ref because it calls __init function
			
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
 
				 	return paddr_last;
			
 
				 }
			
 
				 
			
 
				+static unsigned long __meminit
			
 
				+phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
			
 
				+	      unsigned long page_size_mask)
			
 
				+{
			
 
				+	unsigned long paddr_next, paddr_last = paddr_end;
			
 
				+	unsigned long vaddr = (unsigned long)__va(paddr);
			
 
				+	int i = p4d_index(vaddr);
			
 
				+
			
 
				+	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
			
 
				+		return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
			
 
				+
			
 
				+	for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
			
 
				+		p4d_t *p4d;
			
 
				+		pud_t *pud;
			
 
				+
			
 
				+		vaddr = (unsigned long)__va(paddr);
			
 
				+		p4d = p4d_page + p4d_index(vaddr);
			
 
				+		paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
			
 
				+
			
 
				+		if (paddr >= paddr_end) {
			
 
				+			if (!after_bootmem &&
			
 
				+			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
			
 
				+					     E820_TYPE_RAM) &&
			
 
				+			    !e820__mapped_any(paddr & P4D_MASK, paddr_next,
			
 
				+					     E820_TYPE_RESERVED_KERN))
			
 
				+				set_p4d(p4d, __p4d(0));
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		if (!p4d_none(*p4d)) {
			
 
				+			pud = pud_offset(p4d, 0);
			
 
				+			paddr_last = phys_pud_init(pud, paddr,
			
 
				+					paddr_end,
			
 
				+					page_size_mask);
			
 
				+			__flush_tlb_all();
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		pud = alloc_low_page();
			
 
				+		paddr_last = phys_pud_init(pud, paddr, paddr_end,
			
 
				+					   page_size_mask);
			
 
				+
			
 
				+		spin_lock(&init_mm.page_table_lock);
			
 
				+		p4d_populate(&init_mm, p4d, pud);
			
 
				+		spin_unlock(&init_mm.page_table_lock);
			
 
				+	}
			
 
				+	__flush_tlb_all();
			
 
				+
			
 
				+	return paddr_last;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Create page table mapping for the physical memory for specific physical
			
 
				  * addresses. The virtual and physical addresses have to be aligned on PMD level
			
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
 
				 	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
			
 
				 		pgd_t *pgd = pgd_offset_k(vaddr);
			
 
				 		p4d_t *p4d;
			
 
				-		pud_t *pud;
			
 
				 
			
 
				 		vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
			
 
				 
			
 
				-		BUILD_BUG_ON(pgd_none(*pgd));
			
 
				-		p4d = p4d_offset(pgd, vaddr);
			
 
				-		if (p4d_val(*p4d)) {
			
 
				-			pud = (pud_t *)p4d_page_vaddr(*p4d);
			
 
				-			paddr_last = phys_pud_init(pud, __pa(vaddr),
			
 
				+		if (pgd_val(*pgd)) {
			
 
				+			p4d = (p4d_t *)pgd_page_vaddr(*pgd);
			
 
				+			paddr_last = phys_p4d_init(p4d, __pa(vaddr),
			
 
				 						   __pa(vaddr_end),
			
 
				 						   page_size_mask);
			
 
				 			continue;
			
 
				 		}
			
 
				 
			
 
				-		pud = alloc_low_page();
			
 
				-		paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
			
 
				+		p4d = alloc_low_page();
			
 
				+		paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
			
 
				 					   page_size_mask);
			
 
				 
			
 
				 		spin_lock(&init_mm.page_table_lock);
			
 
				-		p4d_populate(&init_mm, p4d, pud);
			
 
				+		if (IS_ENABLED(CONFIG_X86_5LEVEL))
			
 
				+			pgd_populate(&init_mm, pgd, p4d);
			
 
				+		else
			
 
				+			p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
			
 
				 		spin_unlock(&init_mm.page_table_lock);
			
 
				 		pgd_changed = true;
			
 
				 	}
			
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
 
				 static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
			
 
				 {
			
 
				 	/* Don't assume we're using swapper_pg_dir at this point */
			
 
				-	pgd_t *base = __va(read_cr3());
			
 
				+	pgd_t *base = __va(read_cr3_pa());
			
 
				 	pgd_t *pgd = &base[pgd_index(addr)];
			
 
				 	p4d_t *p4d = p4d_offset(pgd, addr);
			
 
				 	pud_t *pud = pud_offset(p4d, addr);
			
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@
 
				 #include <asm/tlbflush.h>
			
 
				 #include <asm/sections.h>
			
 
				 
			
 
				-extern pgd_t early_level4_pgt[PTRS_PER_PGD];
			
 
				+extern pgd_t early_top_pgt[PTRS_PER_PGD];
			
 
				 extern struct range pfn_mapped[E820_MAX_ENTRIES];
			
 
				 
			
 
				 static int __init map_range(struct range *range)
			
@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
 
				 	for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
			
 
				 		kasan_zero_p4d[i] = __p4d(p4d_val);
			
 
				 
			
 
				-	kasan_map_early_shadow(early_level4_pgt);
			
 
				-	kasan_map_early_shadow(init_level4_pgt);
			
 
				+	kasan_map_early_shadow(early_top_pgt);
			
 
				+	kasan_map_early_shadow(init_top_pgt);
			
 
				 }
			
 
				 
			
 
				 void __init kasan_init(void)
			
@@ -121,8 +121,8 @@ void __init kasan_init(void)
 
				 	register_die_notifier(&kasan_die_notifier);
			
 
				 #endif
			
 
				 
			
 
				-	memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
			
 
				-	load_cr3(early_level4_pgt);
			
 
				+	memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
			
 
				+	load_cr3(early_top_pgt);
			
 
				 	__flush_tlb_all();
			
 
				 
			
 
				 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
			
@@ -148,7 +148,7 @@ void __init kasan_init(void)
 
				 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
			
 
				 			(void *)KASAN_SHADOW_END);
			
 
				 
			
 
				-	load_cr3(init_level4_pgt);
			
 
				+	load_cr3(init_top_pgt);
			
 
				 	__flush_tlb_all();
			
 
				 
			
 
				 	/*
			
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
 
				  *
			
 
				  * Entropy is generated using the KASLR early boot functions now shared in
			
 
				  * the lib directory (originally written by Kees Cook). Randomization is
			
 
				- * done on PGD & PUD page table levels to increase possible addresses. The
			
 
				- * physical memory mapping code was adapted to support PUD level virtual
			
 
				- * addresses. This implementation on the best configuration provides 30,000
			
 
				- * possible virtual addresses in average for each memory region. An additional
			
 
				- * low memory page is used to ensure each CPU can start with a PGD aligned
			
 
				- * virtual address (for realmode).
			
 
				+ * done on PGD & P4D/PUD page table levels to increase possible addresses.
			
 
				+ * The physical memory mapping code was adapted to support P4D/PUD level
			
 
				+ * virtual addresses. This implementation on the best configuration provides
			
 
				+ * 30,000 possible virtual addresses on average for each memory region.
			
 
				+ * An additional low memory page is used to ensure each CPU can start with
			
 
				+ * a PGD aligned virtual address (for realmode).
			
 
				  *
			
 
				  * The order of each memory region is not changed. The feature looks at
			
 
				  * the available space for the regions based on different configuration
			
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
 
				 	unsigned long *base;
			
 
				 	unsigned long size_tb;
			
 
				 } kaslr_regions[] = {
			
 
				-	{ &page_offset_base, 64/* Maximum */ },
			
 
				+	{ &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
			
 
				 	{ &vmalloc_base, VMALLOC_SIZE_TB },
			
 
				 	{ &vmemmap_base, 1 },
			
 
				 };
			
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
 
				 		 */
			
 
				 		entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
			
 
				 		prandom_bytes_state(&rand_state, &rand, sizeof(rand));
			
 
				-		entropy = (rand % (entropy + 1)) & PUD_MASK;
			
 
				+		if (IS_ENABLED(CONFIG_X86_5LEVEL))
			
 
				+			entropy = (rand % (entropy + 1)) & P4D_MASK;
			
 
				+		else
			
 
				+			entropy = (rand % (entropy + 1)) & PUD_MASK;
			
 
				 		vaddr += entropy;
			
 
				 		*kaslr_regions[i].base = vaddr;
			
 
				 
			
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
 
				 		 * randomization alignment.
			
 
				 		 */
			
 
				 		vaddr += get_padding(&kaslr_regions[i]);
			
 
				-		vaddr = round_up(vaddr + 1, PUD_SIZE);
			
 
				+		if (IS_ENABLED(CONFIG_X86_5LEVEL))
			
 
				+			vaddr = round_up(vaddr + 1, P4D_SIZE);
			
 
				+		else
			
 
				+			vaddr = round_up(vaddr + 1, PUD_SIZE);
			
 
				 		remain_entropy -= entropy;
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-/*
			
 
				- * Create PGD aligned trampoline table to allow real mode initialization
			
 
				- * of additional CPUs. Consume only 1 low memory page.
			
 
				- */
			
 
				-void __meminit init_trampoline(void)
			
 
				+static void __meminit init_trampoline_pud(void)
			
 
				 {
			
 
				 	unsigned long paddr, paddr_next;
			
 
				 	pgd_t *pgd;
			
 
				 	pud_t *pud_page, *pud_page_tramp;
			
 
				 	int i;
			
 
				 
			
 
				-	if (!kaslr_memory_enabled()) {
			
 
				-		init_trampoline_default();
			
 
				-		return;
			
 
				-	}
			
 
				-
			
 
				 	pud_page_tramp = alloc_low_page();
			
 
				 
			
 
				 	paddr = 0;
			
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
 
				 	set_pgd(&trampoline_pgd_entry,
			
 
				 		__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
			
 
				 }
			
 
				+
			
 
				+static void __meminit init_trampoline_p4d(void)
			
 
				+{
			
 
				+	unsigned long paddr, paddr_next;
			
 
				+	pgd_t *pgd;
			
 
				+	p4d_t *p4d_page, *p4d_page_tramp;
			
 
				+	int i;
			
 
				+
			
 
				+	p4d_page_tramp = alloc_low_page();
			
 
				+
			
 
				+	paddr = 0;
			
 
				+	pgd = pgd_offset_k((unsigned long)__va(paddr));
			
 
				+	p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
			
 
				+
			
 
				+	for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
			
 
				+		p4d_t *p4d, *p4d_tramp;
			
 
				+		unsigned long vaddr = (unsigned long)__va(paddr);
			
 
				+
			
 
				+		p4d_tramp = p4d_page_tramp + p4d_index(paddr);
			
 
				+		p4d = p4d_page + p4d_index(vaddr);
			
 
				+		paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
			
 
				+
			
 
				+		*p4d_tramp = *p4d;
			
 
				+	}
			
 
				+
			
 
				+	set_pgd(&trampoline_pgd_entry,
			
 
				+		__pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Create PGD aligned trampoline table to allow real mode initialization
			
 
				+ * of additional CPUs. Consume only 1 low memory page.
			
 
				+ */
			
 
				+void __meminit init_trampoline(void)
			
 
				+{
			
 
				+
			
 
				+	if (!kaslr_memory_enabled()) {
			
 
				+		init_trampoline_default();
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	if (IS_ENABLED(CONFIG_X86_5LEVEL))
			
 
				+		init_trampoline_p4d();
			
 
				+	else
			
 
				+		init_trampoline_pud();
			
 
				+}
			
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
 
				 	if (current->personality & ADDR_COMPAT_LAYOUT)
			
 
				 		return 1;
			
 
				 
			
 
				-	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
			
 
				-		return 1;
			
 
				-
			
 
				 	return sysctl_legacy_va_layout;
			
 
				 }
			
 
				 
			
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@
 
				 #include <linux/debugfs.h>
			
 
				 
			
 
				 /*
			
 
				- *	Smarter SMP flushing macros.
			
 
				+ *	TLB flushing, formerly SMP-only
			
 
				  *		c/o Linus Torvalds.
			
 
				  *
			
 
				  *	These mean you can really definitely utterly forget about
			
@@ -28,39 +28,28 @@
 
				  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
			
 
				  */
			
 
				 
			
 
				-#ifdef CONFIG_SMP
			
 
				-
			
 
				-struct flush_tlb_info {
			
 
				-	struct mm_struct *flush_mm;
			
 
				-	unsigned long flush_start;
			
 
				-	unsigned long flush_end;
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- * We cannot call mmdrop() because we are in interrupt context,
			
 
				- * instead update mm->cpu_vm_mask.
			
 
				- */
			
 
				 void leave_mm(int cpu)
			
 
				 {
			
 
				-	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
			
 
				+	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
			
 
				+
			
 
				+	/*
			
 
				+	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
			
 
				+	 * If so, our callers still expect us to flush the TLB, but there
			
 
				+	 * aren't any user TLB entries in init_mm to worry about.
			
 
				+	 *
			
 
				+	 * This needs to happen before any other sanity checks due to
			
 
				+	 * intel_idle's shenanigans.
			
 
				+	 */
			
 
				+	if (loaded_mm == &init_mm)
			
 
				+		return;
			
 
				+
			
 
				 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
			
 
				 		BUG();
			
 
				-	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
			
 
				-		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
			
 
				-		load_cr3(swapper_pg_dir);
			
 
				-		/*
			
 
				-		 * This gets called in the idle path where RCU
			
 
				-		 * functions differently.  Tracing normally
			
 
				-		 * uses RCU, so we have to call the tracepoint
			
 
				-		 * specially here.
			
 
				-		 */
			
 
				-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			
 
				-	}
			
 
				+
			
 
				+	switch_mm(NULL, &init_mm, NULL);
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(leave_mm);
			
 
				 
			
 
				-#endif /* CONFIG_SMP */
			
 
				-
			
 
				 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			
 
				 	       struct task_struct *tsk)
			
 
				 {
			
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 
				 			struct task_struct *tsk)
			
 
				 {
			
 
				 	unsigned cpu = smp_processor_id();
			
 
				+	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
			
 
				 
			
 
				-	if (likely(prev != next)) {
			
 
				-		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			
 
				-			/*
			
 
				-			 * If our current stack is in vmalloc space and isn't
			
 
				-			 * mapped in the new pgd, we'll double-fault.  Forcibly
			
 
				-			 * map it.
			
 
				-			 */
			
 
				-			unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
			
 
				-
			
 
				-			pgd_t *pgd = next->pgd + stack_pgd_index;
			
 
				-
			
 
				-			if (unlikely(pgd_none(*pgd)))
			
 
				-				set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
			
 
				-		}
			
 
				+	/*
			
 
				+	 * NB: The scheduler will call us with prev == next when
			
 
				+	 * switching from lazy TLB mode to normal mode if active_mm
			
 
				+	 * isn't changing.  When this happens, there is no guarantee
			
 
				+	 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
			
 
				+	 *
			
 
				+	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
			
 
				+	 */
			
 
				 
			
 
				-#ifdef CONFIG_SMP
			
 
				-		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
			
 
				-		this_cpu_write(cpu_tlbstate.active_mm, next);
			
 
				-#endif
			
 
				+	this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
			
 
				 
			
 
				-		cpumask_set_cpu(cpu, mm_cpumask(next));
			
 
				+	if (real_prev == next) {
			
 
				+		/*
			
 
				+		 * There's nothing to do: we always keep the per-mm control
			
 
				+		 * regs in sync with cpu_tlbstate.loaded_mm.  Just
			
 
				+		 * sanity-check mm_cpumask.
			
 
				+		 */
			
 
				+		if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
			
 
				+			cpumask_set_cpu(cpu, mm_cpumask(next));
			
 
				+		return;
			
 
				+	}
			
 
				 
			
 
				+	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			
 
				 		/*
			
 
				-		 * Re-load page tables.
			
 
				-		 *
			
 
				-		 * This logic has an ordering constraint:
			
 
				-		 *
			
 
				-		 *  CPU 0: Write to a PTE for 'next'
			
 
				-		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
			
 
				-		 *  CPU 1: set bit 1 in next's mm_cpumask
			
 
				-		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
			
 
				-		 *
			
 
				-		 * We need to prevent an outcome in which CPU 1 observes
			
 
				-		 * the new PTE value and CPU 0 observes bit 1 clear in
			
 
				-		 * mm_cpumask.  (If that occurs, then the IPI will never
			
 
				-		 * be sent, and CPU 0's TLB will contain a stale entry.)
			
 
				-		 *
			
 
				-		 * The bad outcome can occur if either CPU's load is
			
 
				-		 * reordered before that CPU's store, so both CPUs must
			
 
				-		 * execute full barriers to prevent this from happening.
			
 
				-		 *
			
 
				-		 * Thus, switch_mm needs a full barrier between the
			
 
				-		 * store to mm_cpumask and any operation that could load
			
 
				-		 * from next->pgd.  TLB fills are special and can happen
			
 
				-		 * due to instruction fetches or for no reason at all,
			
 
				-		 * and neither LOCK nor MFENCE orders them.
			
 
				-		 * Fortunately, load_cr3() is serializing and gives the
			
 
				-		 * ordering guarantee we need.
			
 
				-		 *
			
 
				+		 * If our current stack is in vmalloc space and isn't
			
 
				+		 * mapped in the new pgd, we'll double-fault.  Forcibly
			
 
				+		 * map it.
			
 
				 		 */
			
 
				-		load_cr3(next->pgd);
			
 
				+		unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
			
 
				 
			
 
				-		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			
 
				+		pgd_t *pgd = next->pgd + stack_pgd_index;
			
 
				 
			
 
				-		/* Stop flush ipis for the previous mm */
			
 
				-		cpumask_clear_cpu(cpu, mm_cpumask(prev));
			
 
				+		if (unlikely(pgd_none(*pgd)))
			
 
				+			set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
			
 
				+	}
			
 
				 
			
 
				-		/* Load per-mm CR4 state */
			
 
				-		load_mm_cr4(next);
			
 
				+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
			
 
				 
			
 
				-#ifdef CONFIG_MODIFY_LDT_SYSCALL
			
 
				-		/*
			
 
				-		 * Load the LDT, if the LDT is different.
			
 
				-		 *
			
 
				-		 * It's possible that prev->context.ldt doesn't match
			
 
				-		 * the LDT register.  This can happen if leave_mm(prev)
			
 
				-		 * was called and then modify_ldt changed
			
 
				-		 * prev->context.ldt but suppressed an IPI to this CPU.
			
 
				-		 * In this case, prev->context.ldt != NULL, because we
			
 
				-		 * never set context.ldt to NULL while the mm still
			
 
				-		 * exists.  That means that next->context.ldt !=
			
 
				-		 * prev->context.ldt, because mms never share an LDT.
			
 
				-		 */
			
 
				-		if (unlikely(prev->context.ldt != next->context.ldt))
			
 
				-			load_mm_ldt(next);
			
 
				-#endif
			
 
				+	WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
			
 
				+	cpumask_set_cpu(cpu, mm_cpumask(next));
			
 
				+
			
 
				+	/*
			
 
				+	 * Re-load page tables.
			
 
				+	 *
			
 
				+	 * This logic has an ordering constraint:
			
 
				+	 *
			
 
				+	 *  CPU 0: Write to a PTE for 'next'
			
 
				+	 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
			
 
				+	 *  CPU 1: set bit 1 in next's mm_cpumask
			
 
				+	 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
			
 
				+	 *
			
 
				+	 * We need to prevent an outcome in which CPU 1 observes
			
 
				+	 * the new PTE value and CPU 0 observes bit 1 clear in
			
 
				+	 * mm_cpumask.  (If that occurs, then the IPI will never
			
 
				+	 * be sent, and CPU 0's TLB will contain a stale entry.)
			
 
				+	 *
			
 
				+	 * The bad outcome can occur if either CPU's load is
			
 
				+	 * reordered before that CPU's store, so both CPUs must
			
 
				+	 * execute full barriers to prevent this from happening.
			
 
				+	 *
			
 
				+	 * Thus, switch_mm needs a full barrier between the
			
 
				+	 * store to mm_cpumask and any operation that could load
			
 
				+	 * from next->pgd.  TLB fills are special and can happen
			
 
				+	 * due to instruction fetches or for no reason at all,
			
 
				+	 * and neither LOCK nor MFENCE orders them.
			
 
				+	 * Fortunately, load_cr3() is serializing and gives the
			
 
				+	 * ordering guarantee we need.
			
 
				+	 */
			
 
				+	load_cr3(next->pgd);
			
 
				+
			
 
				+	/*
			
 
				+	 * This gets called via leave_mm() in the idle path where RCU
			
 
				+	 * functions differently.  Tracing normally uses RCU, so we have to
			
 
				+	 * call the tracepoint specially here.
			
 
				+	 */
			
 
				+	trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			
 
				+
			
 
				+	/* Stop flush ipis for the previous mm */
			
 
				+	WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
			
 
				+		     real_prev != &init_mm);
			
 
				+	cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
			
 
				+
			
 
				+	/* Load per-mm CR4 and LDTR state */
			
 
				+	load_mm_cr4(next);
			
 
				+	switch_ldt(real_prev, next);
			
 
				+}
			
 
				+
			
 
				+static void flush_tlb_func_common(const struct flush_tlb_info *f,
			
 
				+				  bool local, enum tlb_flush_reason reason)
			
 
				+{
			
 
				+	/* This code cannot presently handle being reentered. */
			
 
				+	VM_WARN_ON(!irqs_disabled());
			
 
				+
			
 
				+	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
			
 
				+		leave_mm(smp_processor_id());
			
 
				+		return;
			
 
				 	}
			
 
				-#ifdef CONFIG_SMP
			
 
				-	  else {
			
 
				-		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
			
 
				-		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
			
 
				-
			
 
				-		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
			
 
				-			/*
			
 
				-			 * On established mms, the mm_cpumask is only changed
			
 
				-			 * from irq context, from ptep_clear_flush() while in
			
 
				-			 * lazy tlb mode, and here. Irqs are blocked during
			
 
				-			 * schedule, protecting us from simultaneous changes.
			
 
				-			 */
			
 
				-			cpumask_set_cpu(cpu, mm_cpumask(next));
			
 
				 
			
 
				-			/*
			
 
				-			 * We were in lazy tlb mode and leave_mm disabled
			
 
				-			 * tlb flush IPI delivery. We must reload CR3
			
 
				-			 * to make sure to use no freed page tables.
			
 
				-			 *
			
 
				-			 * As above, load_cr3() is serializing and orders TLB
			
 
				-			 * fills with respect to the mm_cpumask write.
			
 
				-			 */
			
 
				-			load_cr3(next->pgd);
			
 
				-			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			
 
				-			load_mm_cr4(next);
			
 
				-			load_mm_ldt(next);
			
 
				+	if (f->end == TLB_FLUSH_ALL) {
			
 
				+		local_flush_tlb();
			
 
				+		if (local)
			
 
				+			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
			
 
				+		trace_tlb_flush(reason, TLB_FLUSH_ALL);
			
 
				+	} else {
			
 
				+		unsigned long addr;
			
 
				+		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
			
 
				+		addr = f->start;
			
 
				+		while (addr < f->end) {
			
 
				+			__flush_tlb_single(addr);
			
 
				+			addr += PAGE_SIZE;
			
 
				 		}
			
 
				+		if (local)
			
 
				+			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
			
 
				+		trace_tlb_flush(reason, nr_pages);
			
 
				 	}
			
 
				-#endif
			
 
				 }
			
 
				 
			
 
				-#ifdef CONFIG_SMP
			
 
				+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
			
 
				+{
			
 
				+	const struct flush_tlb_info *f = info;
			
 
				 
			
 
				-/*
			
 
				- * The flush IPI assumes that a thread switch happens in this order:
			
 
				- * [cpu0: the cpu that switches]
			
 
				- * 1) switch_mm() either 1a) or 1b)
			
 
				- * 1a) thread switch to a different mm
			
 
				- * 1a1) set cpu_tlbstate to TLBSTATE_OK
			
 
				- *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
			
 
				- *	if cpu0 was in lazy tlb mode.
			
 
				- * 1a2) update cpu active_mm
			
 
				- *	Now cpu0 accepts tlb flushes for the new mm.
			
 
				- * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
			
 
				- *	Now the other cpus will send tlb flush ipis.
			
 
				- * 1a4) change cr3.
			
 
				- * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
			
 
				- *	Stop ipi delivery for the old mm. This is not synchronized with
			
 
				- *	the other cpus, but flush_tlb_func ignore flush ipis for the wrong
			
 
				- *	mm, and in the worst case we perform a superfluous tlb flush.
			
 
				- * 1b) thread switch without mm change
			
 
				- *	cpu active_mm is correct, cpu0 already handles flush ipis.
			
 
				- * 1b1) set cpu_tlbstate to TLBSTATE_OK
			
 
				- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
			
 
				- *	Atomically set the bit [other cpus will start sending flush ipis],
			
 
				- *	and test the bit.
			
 
				- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
			
 
				- * 2) switch %%esp, ie current
			
 
				- *
			
 
				- * The interrupt must handle 2 special cases:
			
 
				- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
			
 
				- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
			
 
				- *   runs in kernel space, the cpu could load tlb entries for user space
			
 
				- *   pages.
			
 
				- *
			
 
				- * The good news is that cpu_tlbstate is local to each cpu, no
			
 
				- * write/read ordering problems.
			
 
				- */
			
 
				+	flush_tlb_func_common(f, true, reason);
			
 
				+}
			
 
				 
			
 
				-/*
			
 
				- * TLB flush funcation:
			
 
				- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
			
 
				- * 2) Leave the mm if we are in the lazy tlb mode.
			
 
				- */
			
 
				-static void flush_tlb_func(void *info)
			
 
				+static void flush_tlb_func_remote(void *info)
			
 
				 {
			
 
				-	struct flush_tlb_info *f = info;
			
 
				+	const struct flush_tlb_info *f = info;
			
 
				 
			
 
				 	inc_irq_stat(irq_tlb_count);
			
 
				 
			
 
				-	if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
			
 
				+	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
			
 
				 		return;
			
 
				 
			
 
				 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
			
 
				-	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
			
 
				-		if (f->flush_end == TLB_FLUSH_ALL) {
			
 
				-			local_flush_tlb();
			
 
				-			trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
			
 
				-		} else {
			
 
				-			unsigned long addr;
			
 
				-			unsigned long nr_pages =
			
 
				-				(f->flush_end - f->flush_start) / PAGE_SIZE;
			
 
				-			addr = f->flush_start;
			
 
				-			while (addr < f->flush_end) {
			
 
				-				__flush_tlb_single(addr);
			
 
				-				addr += PAGE_SIZE;
			
 
				-			}
			
 
				-			trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
			
 
				-		}
			
 
				-	} else
			
 
				-		leave_mm(smp_processor_id());
			
 
				-
			
 
				+	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
			
 
				 }
			
 
				 
			
 
				 void native_flush_tlb_others(const struct cpumask *cpumask,
			
 
				-				 struct mm_struct *mm, unsigned long start,
			
 
				-				 unsigned long end)
			
 
				+			     const struct flush_tlb_info *info)
			
 
				 {
			
 
				-	struct flush_tlb_info info;
			
 
				-
			
 
				-	info.flush_mm = mm;
			
 
				-	info.flush_start = start;
			
 
				-	info.flush_end = end;
			
 
				-
			
 
				 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
			
 
				-	if (end == TLB_FLUSH_ALL)
			
 
				+	if (info->end == TLB_FLUSH_ALL)
			
 
				 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
			
 
				 	else
			
 
				 		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
			
 
				-				(end - start) >> PAGE_SHIFT);
			
 
				+				(info->end - info->start) >> PAGE_SHIFT);
			
 
				 
			
 
				 	if (is_uv_system()) {
			
 
				 		unsigned int cpu;
			
 
				 
			
 
				 		cpu = smp_processor_id();
			
 
				-		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
			
 
				+		cpumask = uv_flush_tlb_others(cpumask, info);
			
 
				 		if (cpumask)
			
 
				-			smp_call_function_many(cpumask, flush_tlb_func,
			
 
				-								&info, 1);
			
 
				+			smp_call_function_many(cpumask, flush_tlb_func_remote,
			
 
				+					       (void *)info, 1);
			
 
				 		return;
			
 
				 	}
			
 
				-	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
			
 
				+	smp_call_function_many(cpumask, flush_tlb_func_remote,
			
 
				+			       (void *)info, 1);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
				 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			
 
				 				unsigned long end, unsigned long vmflag)
			
 
				 {
			
 
				-	unsigned long addr;
			
 
				-	/* do a global flush by default */
			
 
				-	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
			
 
				-
			
 
				-	preempt_disable();
			
 
				+	int cpu;
			
 
				 
			
 
				-	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
			
 
				-		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
			
 
				-	if (base_pages_to_flush > tlb_single_page_flush_ceiling)
			
 
				-		base_pages_to_flush = TLB_FLUSH_ALL;
			
 
				+	struct flush_tlb_info info = {
			
 
				+		.mm = mm,
			
 
				+	};
			
 
				 
			
 
				-	if (current->active_mm != mm) {
			
 
				-		/* Synchronize with switch_mm. */
			
 
				-		smp_mb();
			
 
				+	cpu = get_cpu();
			
 
				 
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	if (!current->mm) {
			
 
				-		leave_mm(smp_processor_id());
			
 
				+	/* Synchronize with switch_mm. */
			
 
				+	smp_mb();
			
 
				 
			
 
				-		/* Synchronize with switch_mm. */
			
 
				-		smp_mb();
			
 
				-
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * Both branches below are implicit full barriers (MOV to CR or
			
 
				-	 * INVLPG) that synchronize with switch_mm.
			
 
				-	 */
			
 
				-	if (base_pages_to_flush == TLB_FLUSH_ALL) {
			
 
				-		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
			
 
				-		local_flush_tlb();
			
 
				+	/* Should we flush just the requested range? */
			
 
				+	if ((end != TLB_FLUSH_ALL) &&
			
 
				+	    !(vmflag & VM_HUGETLB) &&
			
 
				+	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
			
 
				+		info.start = start;
			
 
				+		info.end = end;
			
 
				 	} else {
			
 
				-		/* flush range by one by one 'invlpg' */
			
 
				-		for (addr = start; addr < end;	addr += PAGE_SIZE) {
			
 
				-			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
			
 
				-			__flush_tlb_single(addr);
			
 
				-		}
			
 
				-	}
			
 
				-	trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
			
 
				-out:
			
 
				-	if (base_pages_to_flush == TLB_FLUSH_ALL) {
			
 
				-		start = 0UL;
			
 
				-		end = TLB_FLUSH_ALL;
			
 
				+		info.start = 0UL;
			
 
				+		info.end = TLB_FLUSH_ALL;
			
 
				 	}
			
 
				-	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
			
 
				-		flush_tlb_others(mm_cpumask(mm), mm, start, end);
			
 
				-	preempt_enable();
			
 
				-}
			
 
				 
			
 
				-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
			
 
				-{
			
 
				-	struct mm_struct *mm = vma->vm_mm;
			
 
				-
			
 
				-	preempt_disable();
			
 
				-
			
 
				-	if (current->active_mm == mm) {
			
 
				-		if (current->mm) {
			
 
				-			/*
			
 
				-			 * Implicit full barrier (INVLPG) that synchronizes
			
 
				-			 * with switch_mm.
			
 
				-			 */
			
 
				-			__flush_tlb_one(start);
			
 
				-		} else {
			
 
				-			leave_mm(smp_processor_id());
			
 
				-
			
 
				-			/* Synchronize with switch_mm. */
			
 
				-			smp_mb();
			
 
				-		}
			
 
				+	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
			
 
				+		VM_WARN_ON(irqs_disabled());
			
 
				+		local_irq_disable();
			
 
				+		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
			
 
				+		local_irq_enable();
			
 
				 	}
			
 
				 
			
 
				-	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
			
 
				-		flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
			
 
				-
			
 
				-	preempt_enable();
			
 
				+	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
			
 
				+		flush_tlb_others(mm_cpumask(mm), &info);
			
 
				+	put_cpu();
			
 
				 }
			
 
				 
			
 
				+
			
 
				 static void do_flush_tlb_all(void *info)
			
 
				 {
			
 
				 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
			
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
 
				 	unsigned long addr;
			
 
				 
			
 
				 	/* flush range by one by one 'invlpg' */
			
 
				-	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
			
 
				+	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
			
 
				 		__flush_tlb_single(addr);
			
 
				 }
			
 
				 
			
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 
				 
			
 
				 	/* Balance as user space task's flush, a bit conservative */
			
 
				 	if (end == TLB_FLUSH_ALL ||
			
 
				-	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
			
 
				+	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
			
 
				 		on_each_cpu(do_flush_tlb_all, NULL, 1);
			
 
				 	} else {
			
 
				 		struct flush_tlb_info info;
			
 
				-		info.flush_start = start;
			
 
				-		info.flush_end = end;
			
 
				+		info.start = start;
			
 
				+		info.end = end;
			
 
				 		on_each_cpu(do_kernel_range_flush, &info, 1);
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
			
 
				+{
			
 
				+	struct flush_tlb_info info = {
			
 
				+		.mm = NULL,
			
 
				+		.start = 0UL,
			
 
				+		.end = TLB_FLUSH_ALL,
			
 
				+	};
			
 
				+
			
 
				+	int cpu = get_cpu();
			
 
				+
			
 
				+	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
			
 
				+		VM_WARN_ON(irqs_disabled());
			
 
				+		local_irq_disable();
			
 
				+		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
			
 
				+		local_irq_enable();
			
 
				+	}
			
 
				+
			
 
				+	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
			
 
				+		flush_tlb_others(&batch->cpumask, &info);
			
 
				+	cpumask_clear(&batch->cpumask);
			
 
				+
			
 
				+	put_cpu();
			
 
				+}
			
 
				+
			
 
				 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
			
 
				 			     size_t count, loff_t *ppos)
			
 
				 {
			
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
 
				 	return 0;
			
 
				 }
			
 
				 late_initcall(create_tlb_single_page_flush_ceiling);
			
 
				-
			
 
				-#endif /* CONFIG_SMP */
			
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
 
				 	int n_pgds, i, j;
			
 
				 
			
 
				 	if (!efi_enabled(EFI_OLD_MEMMAP)) {
			
 
				-		save_pgd = (pgd_t *)read_cr3();
			
 
				+		save_pgd = (pgd_t *)__read_cr3();
			
 
				 		write_cr3((unsigned long)efi_scratch.efi_pgt);
			
 
				 		goto out;
			
 
				 	}
			
@@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
 
				 	efi_sync_low_kernel_mappings();
			
 
				 	local_irq_save(flags);
			
 
				 
			
 
				-	efi_scratch.prev_cr3 = read_cr3();
			
 
				+	efi_scratch.prev_cr3 = __read_cr3();
			
 
				 	write_cr3((unsigned long)efi_scratch.efi_pgt);
			
 
				 	__flush_tlb_all();
			
 
				 
			
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
 
				 
			
 
				 asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
			
 
				 {
			
 
				-	void *pgd_addr = __va(read_cr3());
			
 
				+	void *pgd_addr = __va(read_cr3_pa());
			
 
				 
			
 
				 	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
			
 
				 	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
			
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
 
				  * done.  The returned pointer is valid till preemption is re-enabled.
			
 
				  */
			
 
				 const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
			
 
				-						struct mm_struct *mm,
			
 
				-						unsigned long start,
			
 
				-						unsigned long end,
			
 
				-						unsigned int cpu)
			
 
				+					  const struct flush_tlb_info *info)
			
 
				 {
			
 
				+	unsigned int cpu = smp_processor_id();
			
 
				 	int locals = 0, remotes = 0, hubs = 0;
			
 
				 	struct bau_desc *bau_desc;
			
 
				 	struct cpumask *flush_mask;
			
@@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 
				 
			
 
				 	record_send_statistics(stat, locals, hubs, remotes, bau_desc);
			
 
				 
			
 
				-	if (!end || (end - start) <= PAGE_SIZE)
			
 
				-		address = start;
			
 
				+	if (!info->end || (info->end - info->start) <= PAGE_SIZE)
			
 
				+		address = info->start;
			
 
				 	else
			
 
				 		address = TLB_FLUSH_ALL;
			
 
				 
			
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
 
				 	 */
			
 
				 	ctxt->cr0 = read_cr0();
			
 
				 	ctxt->cr2 = read_cr2();
			
 
				-	ctxt->cr3 = read_cr3();
			
 
				+	ctxt->cr3 = __read_cr3();
			
 
				 	ctxt->cr4 = __read_cr4();
			
 
				 #ifdef CONFIG_X86_64
			
 
				 	ctxt->cr8 = read_cr8();
			
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
 
				 	memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
			
 
				 
			
 
				 	/* Make the page containing the relocated code executable */
			
 
				-	pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
			
 
				+	pgd = (pgd_t *)__va(read_cr3_pa()) +
			
 
				+		pgd_index(relocated_restore_code);
			
 
				 	p4d = p4d_offset(pgd, relocated_restore_code);
			
 
				 	if (p4d_large(*p4d)) {
			
 
				 		set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
			
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
 
				 
			
 
				 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
			
 
				 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
			
 
				-	trampoline_pgd[511] = init_level4_pgt[511].pgd;
			
 
				+	trampoline_pgd[511] = init_top_pgt[511].pgd;
			
 
				 #endif
			
 
				 }
			
 
				 
			
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 
				 	spin_unlock(&mm->page_table_lock);
			
 
				 }
			
 
				 
			
 
				-
			
 
				-#ifdef CONFIG_SMP
			
 
				-/* Another cpu may still have their %cr3 pointing at the pagetable, so
			
 
				-   we need to repoint it somewhere else before we can unpin it. */
			
 
				-static void drop_other_mm_ref(void *info)
			
 
				+static void drop_mm_ref_this_cpu(void *info)
			
 
				 {
			
 
				 	struct mm_struct *mm = info;
			
 
				-	struct mm_struct *active_mm;
			
 
				-
			
 
				-	active_mm = this_cpu_read(cpu_tlbstate.active_mm);
			
 
				 
			
 
				-	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
			
 
				+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
			
 
				 		leave_mm(smp_processor_id());
			
 
				 
			
 
				-	/* If this cpu still has a stale cr3 reference, then make sure
			
 
				-	   it has been flushed. */
			
 
				+	/*
			
 
				+	 * If this cpu still has a stale cr3 reference, then make sure
			
 
				+	 * it has been flushed.
			
 
				+	 */
			
 
				 	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
			
 
				-		load_cr3(swapper_pg_dir);
			
 
				+		xen_mc_flush();
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_SMP
			
 
				+/*
			
 
				+ * Another cpu may still have their %cr3 pointing at the pagetable, so
			
 
				+ * we need to repoint it somewhere else before we can unpin it.
			
 
				+ */
			
 
				 static void xen_drop_mm_ref(struct mm_struct *mm)
			
 
				 {
			
 
				 	cpumask_var_t mask;
			
 
				 	unsigned cpu;
			
 
				 
			
 
				-	if (current->active_mm == mm) {
			
 
				-		if (current->mm == mm)
			
 
				-			load_cr3(swapper_pg_dir);
			
 
				-		else
			
 
				-			leave_mm(smp_processor_id());
			
 
				-	}
			
 
				+	drop_mm_ref_this_cpu(mm);
			
 
				 
			
 
				 	/* Get the "official" set of cpus referring to our pagetable. */
			
 
				 	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
			
@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 
				 			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
			
 
				 			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
			
 
				 				continue;
			
 
				-			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
			
 
				+			smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
			
 
				 		}
			
 
				 		return;
			
 
				 	}
			
 
				 	cpumask_copy(mask, mm_cpumask(mm));
			
 
				 
			
 
				-	/* It's possible that a vcpu may have a stale reference to our
			
 
				-	   cr3, because its in lazy mode, and it hasn't yet flushed
			
 
				-	   its set of pending hypercalls yet.  In this case, we can
			
 
				-	   look at its actual current cr3 value, and force it to flush
			
 
				-	   if needed. */
			
 
				+	/*
			
 
				+	 * It's possible that a vcpu may have a stale reference to our
			
 
				+	 * cr3, because its in lazy mode, and it hasn't yet flushed
			
 
				+	 * its set of pending hypercalls yet.  In this case, we can
			
 
				+	 * look at its actual current cr3 value, and force it to flush
			
 
				+	 * if needed.
			
 
				+	 */
			
 
				 	for_each_online_cpu(cpu) {
			
 
				 		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			
 
				 			cpumask_set_cpu(cpu, mask);
			
 
				 	}
			
 
				 
			
 
				-	if (!cpumask_empty(mask))
			
 
				-		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
			
 
				+	smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
			
 
				 	free_cpumask_var(mask);
			
 
				 }
			
 
				 #else
			
 
				 static void xen_drop_mm_ref(struct mm_struct *mm)
			
 
				 {
			
 
				-	if (current->active_mm == mm)
			
 
				-		load_cr3(swapper_pg_dir);
			
 
				+	drop_mm_ref_this_cpu(mm);
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
 
				 }
			
 
				 
			
 
				 static void xen_flush_tlb_others(const struct cpumask *cpus,
			
 
				-				 struct mm_struct *mm, unsigned long start,
			
 
				-				 unsigned long end)
			
 
				+				 const struct flush_tlb_info *info)
			
 
				 {
			
 
				 	struct {
			
 
				 		struct mmuext_op op;
			
@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 
				 	} *args;
			
 
				 	struct multicall_space mcs;
			
 
				 
			
 
				-	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
			
 
				+	trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
			
 
				 
			
 
				 	if (cpumask_empty(cpus))
			
 
				 		return;		/* nothing to do */
			
@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 
				 	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
			
 
				 
			
 
				 	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
			
 
				-	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
			
 
				+	if (info->end != TLB_FLUSH_ALL &&
			
 
				+	    (info->end - info->start) <= PAGE_SIZE) {
			
 
				 		args->op.cmd = MMUEXT_INVLPG_MULTI;
			
 
				-		args->op.arg1.linear_addr = start;
			
 
				+		args->op.arg1.linear_addr = info->start;
			
 
				 	}
			
 
				 
			
 
				 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
			
@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
 
				  * At the start of the day - when Xen launches a guest, it has already
			
 
				  * built pagetables for the guest. We diligently look over them
			
 
				  * in xen_setup_kernel_pagetable and graft as appropriate them in the
			
 
				- * init_level4_pgt and its friends. Then when we are happy we load
			
 
				- * the new init_level4_pgt - and continue on.
			
 
				+ * init_top_pgt and its friends. Then when we are happy we load
			
 
				+ * the new init_top_pgt - and continue on.
			
 
				  *
			
 
				  * The generic code starts (start_kernel) and 'init_mem_mapping' sets
			
 
				  * up the rest of the pagetables. When it has completed it loads the cr3.
			
@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 
				 	pt_end = pt_base + xen_start_info->nr_pt_frames;
			
 
				 
			
 
				 	/* Zap identity mapping */
			
 
				-	init_level4_pgt[0] = __pgd(0);
			
 
				+	init_top_pgt[0] = __pgd(0);
			
 
				 
			
 
				 	/* Pre-constructed entries are in pfn, so convert to mfn */
			
 
				 	/* L4[272] -> level3_ident_pgt  */
			
 
				 	/* L4[511] -> level3_kernel_pgt */
			
 
				-	convert_pfn_mfn(init_level4_pgt);
			
 
				+	convert_pfn_mfn(init_top_pgt);
			
 
				 
			
 
				 	/* L3_i[0] -> level2_ident_pgt */
			
 
				 	convert_pfn_mfn(level3_ident_pgt);
			
@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 
				 	/* Copy the initial P->M table mappings if necessary. */
			
 
				 	i = pgd_index(xen_start_info->mfn_list);
			
 
				 	if (i && i < pgd_index(__START_KERNEL_map))
			
 
				-		init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
			
 
				+		init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
			
 
				 
			
 
				 	/* Make pagetable pieces RO */
			
 
				-	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
			
 
				+	set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
			
 
				 	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
			
 
				 	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
			
 
				 	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
			
@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 
				 
			
 
				 	/* Pin down new L4 */
			
 
				 	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
			
 
				-			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
			
 
				+			  PFN_DOWN(__pa_symbol(init_top_pgt)));
			
 
				 
			
 
				 	/* Unpin Xen-provided one */
			
 
				 	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
			
@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
 
				 	 * attach it to, so make sure we just set kernel pgd.
			
 
				 	 */
			
 
				 	xen_mc_batch();
			
 
				-	__xen_write_cr3(true, __pa(init_level4_pgt));
			
 
				+	__xen_write_cr3(true, __pa(init_top_pgt));
			
 
				 	xen_mc_issue(PARAVIRT_LAZY_CPU);
			
 
				 
			
 
				 	/* We can't that easily rip out L3 and L2, as the Xen pagetables are
			
@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
 
				 	pmd_t pmd;
			
 
				 	pte_t pte;
			
 
				 
			
 
				-	pa = read_cr3();
			
 
				+	pa = read_cr3_pa();
			
 
				 	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
			
 
				 						       sizeof(pgd)));
			
 
				 	if (!pgd_present(pgd))
			
@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
 
				 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
			
 
				 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
			
 
				 
			
 
				-	pgd = __va(read_cr3());
			
 
				+	pgd = __va(read_cr3_pa());
			
 
				 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
			
 
				 	idx_p4d = 0;
			
 
				 	save_pud = n_pud;
			
@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
 
				 {
			
 
				 	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
			
 
				 
			
 
				-	BUG_ON(read_cr3() != __pa(initial_page_table));
			
 
				+	BUG_ON(read_cr3_pa() != __pa(initial_page_table));
			
 
				 	BUG_ON(cr3 != __pa(swapper_pg_dir));
			
 
				 
			
 
				 	/*
			
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
 
				 	wrmsr
			
 
				 
			
 
				 	/* Enable pre-constructed page tables. */
			
 
				-	mov $_pa(init_level4_pgt), %eax
			
 
				+	mov $_pa(init_top_pgt), %eax
			
 
				 	mov %eax, %cr3
			
 
				 	mov $(X86_CR0_PG | X86_CR0_PE), %eax
			
 
				 	mov %eax, %cr0
			
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -14,6 +14,10 @@
 
				 
			
 
				 #include <asm/page.h>
			
 
				 
			
 
				+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
			
 
				+#include <asm/tlbbatch.h>
			
 
				+#endif
			
 
				+
			
 
				 #define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
			
 
				 #define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
			
 
				 		IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK))
			
@@ -67,12 +71,15 @@ struct page_frag {
 
				 struct tlbflush_unmap_batch {
			
 
				 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
			
 
				 	/*
			
 
				-	 * Each bit set is a CPU that potentially has a TLB entry for one of
			
 
				-	 * the PFNs being flushed. See set_tlb_ubc_flush_pending().
			
 
				+	 * The arch code makes the following promise: generic code can modify a
			
 
				+	 * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
			
 
				+	 * needed barriers), then call arch_tlbbatch_flush(), and the entries
			
 
				+	 * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
			
 
				+	 * returns.
			
 
				 	 */
			
 
				-	struct cpumask cpumask;
			
 
				+	struct arch_tlbflush_unmap_batch arch;
			
 
				 
			
 
				-	/* True if any bit in cpumask is set */
			
 
				+	/* True if a flush is needed. */
			
 
				 	bool flush_required;
			
 
				 
			
 
				 	/*
			
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -93,10 +93,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 
				 #endif
			
 
				 #endif
			
 
				 #ifdef CONFIG_DEBUG_TLBFLUSH
			
 
				-#ifdef CONFIG_SMP
			
 
				 		NR_TLB_REMOTE_FLUSH,	/* cpu tried to flush others' tlbs */
			
 
				 		NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
			
 
				-#endif /* CONFIG_SMP */
			
 
				 		NR_TLB_LOCAL_FLUSH_ALL,
			
 
				 		NR_TLB_LOCAL_FLUSH_ONE,
			
 
				 #endif /* CONFIG_DEBUG_TLBFLUSH */
			
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP
 
				 config HAVE_MEMBLOCK_PHYS_MAP
			
 
				 	bool
			
 
				 
			
 
				-config HAVE_GENERIC_RCU_GUP
			
 
				+config HAVE_GENERIC_GUP
			
 
				 	bool
			
 
				 
			
 
				 config ARCH_DISCARD_MEMBLOCK
			
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1146,7 +1146,7 @@ struct page *get_dump_page(unsigned long addr)
 
				 #endif /* CONFIG_ELF_CORE */
			
 
				 
			
 
				 /*
			
 
				- * Generic RCU Fast GUP
			
 
				+ * Generic Fast GUP
			
 
				  *
			
 
				  * get_user_pages_fast attempts to pin user pages by walking the page
			
 
				  * tables directly and avoids taking locks. Thus the walker needs to be
			
@@ -1167,8 +1167,8 @@ struct page *get_dump_page(unsigned long addr)
 
				  * Before activating this code, please be aware that the following assumptions
			
 
				  * are currently made:
			
 
				  *
			
 
				- *  *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
			
 
				- *      pages containing page tables.
			
 
				+ *  *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
			
 
				+ *  free pages containing page tables or TLB flushing requires IPI broadcast.
			
 
				  *
			
 
				  *  *) ptes can be read atomically by the architecture.
			
 
				  *
			
@@ -1178,7 +1178,7 @@ struct page *get_dump_page(unsigned long addr)
 
				  *
			
 
				  * This code is based heavily on the PowerPC implementation by Nick Piggin.
			
 
				  */
			
 
				-#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
			
 
				+#ifdef CONFIG_HAVE_GENERIC_GUP
			
 
				 
			
 
				 #ifndef gup_get_pte
			
 
				 /*
			
@@ -1668,4 +1668,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
			
 
				+#endif /* CONFIG_HAVE_GENERIC_GUP */
			
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -579,25 +579,13 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 
				 void try_to_unmap_flush(void)
			
 
				 {
			
 
				 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
			
 
				-	int cpu;
			
 
				 
			
 
				 	if (!tlb_ubc->flush_required)
			
 
				 		return;
			
 
				 
			
 
				-	cpu = get_cpu();
			
 
				-
			
 
				-	if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
			
 
				-		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
			
 
				-		local_flush_tlb();
			
 
				-		trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
			
 
				-	}
			
 
				-
			
 
				-	if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
			
 
				-		flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
			
 
				-	cpumask_clear(&tlb_ubc->cpumask);
			
 
				+	arch_tlbbatch_flush(&tlb_ubc->arch);
			
 
				 	tlb_ubc->flush_required = false;
			
 
				 	tlb_ubc->writable = false;
			
 
				-	put_cpu();
			
 
				 }
			
 
				 
			
 
				 /* Flush iff there are potentially writable TLB entries that can race with IO */
			
@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
 
				 {
			
 
				 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
			
 
				 
			
 
				-	cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
			
 
				+	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
			
 
				 	tlb_ubc->flush_required = true;
			
 
				 
			
 
				 	/*