
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
Linus Torvalds committed 6 years ago
commit 99792e0cea

+ 120 - 51
Documentation/x86/x86_64/mm.txt

@@ -1,55 +1,124 @@
+====================================================
+Complete virtual memory map with 4-level page tables
+====================================================
 
-Virtual memory map with 4 level page tables:
-
-0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [47:63] sign extension
-ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
-ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
-ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
-ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
-ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
-... unused hole ...
-ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
-... unused hole ...
-				    vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
-
-Virtual memory map with 5 level page tables:
-
-0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
-hole caused by [56:63] sign extension
-ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
-ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
-ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
-ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
-ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
-... unused hole ...
-ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
-... unused hole ...
-				    vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-... unused hole ...
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+Notes:
+
+ - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
+   from the top of the 64-bit address space. It's easier to understand the layout
+   when seen both in absolute addresses and in distance-from-top notation.
+
+   For example, 0xffffe90000000000 == -23 TB: it is 23 TB lower than the top of
+   the 64-bit address space (ffffffffffffffff).
+
+   Note that as we get closer to the top of the address space, the notation changes
+   from TB to GB and then MB/KB.
+
+ - "16M TB" might look weird at first sight, but it's an easier to visualize size
+   notation than "16 EB", which few will recognize at first sight as 16 exabytes.
+   It also shows it nicely how incredibly large 64-bit address space is.
+
+========================================================================================================================
+    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+========================================================================================================================
+                  |            |                  |         |
+ 0000000000000000 |    0       | 00007fffffffffff |  128 TB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+                  |            |                  |         |
+ 0000800000000000 | +128    TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+                  |            |                  |         |     virtual memory addresses up to the -128 TB
+                  |            |                  |         |     starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+                                                            |
+                                                            | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+                  |            |                  |         |
+ ffff800000000000 | -128    TB | ffff87ffffffffff |    8 TB | ... guard hole, also reserved for hypervisor
+ ffff880000000000 | -120    TB | ffffc7ffffffffff |   64 TB | direct mapping of all physical memory (page_offset_base)
+ ffffc80000000000 |  -56    TB | ffffc8ffffffffff |    1 TB | ... unused hole
+ ffffc90000000000 |  -55    TB | ffffe8ffffffffff |   32 TB | vmalloc/ioremap space (vmalloc_base)
+ ffffe90000000000 |  -23    TB | ffffe9ffffffffff |    1 TB | ... unused hole
+ ffffea0000000000 |  -22    TB | ffffeaffffffffff |    1 TB | virtual memory map (vmemmap_base)
+ ffffeb0000000000 |  -21    TB | ffffebffffffffff |    1 TB | ... unused hole
+ ffffec0000000000 |  -20    TB | fffffbffffffffff |   16 TB | KASAN shadow memory
+ fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+                  |            |                  |         | vaddr_end for KASLR
+ fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | LDT remap for PTI
+ ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+                                                            |
+                                                            | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+                  |            |                  |         |
+ ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048    MB |                  |         |
+ ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 |  -16    MB |                  |         |
+    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
+
+
+====================================================
+Complete virtual memory map with 5-level page tables
+====================================================
+
+Notes:
+
+ - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
+   from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PB starting
+   offset and many of the regions expand to accommodate the much larger physical
+   memory that can be supported.
+
+========================================================================================================================
+    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+========================================================================================================================
+                  |            |                  |         |
+ 0000000000000000 |    0       | 00ffffffffffffff |   64 PB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+                  |            |                  |         |
+ 0000800000000000 |  +64    PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
+                  |            |                  |         |     virtual memory addresses up to the -128 TB
+                  |            |                  |         |     starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+                                                            |
+                                                            | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+                  |            |                  |         |
+ ff00000000000000 |  -64    PB | ff0fffffffffffff |    4 PB | ... guard hole, also reserved for hypervisor
+ ff10000000000000 |  -60    PB | ff8fffffffffffff |   32 PB | direct mapping of all physical memory (page_offset_base)
+ ff90000000000000 |  -28    PB | ff9fffffffffffff |    4 PB | LDT remap for PTI
+ ffa0000000000000 |  -24    PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
+ ffd2000000000000 |  -11.5  PB | ffd3ffffffffffff |  0.5 PB | ... unused hole
+ ffd4000000000000 |  -11    PB | ffd5ffffffffffff |  0.5 PB | virtual memory map (vmemmap_base)
+ ffd6000000000000 |  -10.5  PB | ffdeffffffffffff | 2.25 PB | ... unused hole
+ ffdf000000000000 |   -8.25 PB | fffffdffffffffff |   ~8 PB | KASAN shadow memory
+ fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+                  |            |                  |         | vaddr_end for KASLR
+ fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 |   -1.5  TB | fffffeffffffffff |  0.5 TB | ... unused hole
+ ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+                                                            |
+                                                            | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+                  |            |                  |         |
+ ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048    MB |                  |         |
+ ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 |  -16    MB |                  |         |
+    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
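
The "Offset" column in the new tables uses distance-from-top notation, as the notes above explain. The arithmetic is easy to check: in 64-bit unsigned arithmetic, 0 - addr wraps to 2^64 - addr, which is exactly the "-N TB/GB/MB" value shown for addr. A small standalone C sketch, using addresses taken from the 4-level table:

  #include <stdio.h>
  #include <stdint.h>

  /* Print how far below the top of the 64-bit address space an address lies.
   * (0 - addr) in uint64_t arithmetic equals 2^64 - addr, i.e. the "-N TB/GB/MB"
   * offsets used in the tables above. */
  static void offset_from_top(uint64_t addr)
  {
      uint64_t off = 0 - addr;    /* wraps around: 2^64 - addr */

      if (off >= (1ULL << 40))
          printf("%016llx = -%llu TB\n", (unsigned long long)addr,
                 (unsigned long long)(off >> 40));
      else if (off >= (1ULL << 30))
          printf("%016llx = -%llu GB\n", (unsigned long long)addr,
                 (unsigned long long)(off >> 30));
      else
          printf("%016llx = -%llu MB\n", (unsigned long long)addr,
                 (unsigned long long)(off >> 20));
  }

  int main(void)
  {
      offset_from_top(0xffffe90000000000ULL);  /* -23 TB, unused hole        */
      offset_from_top(0xffff800000000000ULL);  /* -128 TB, start of kernel   */
      offset_from_top(0xffffffff80000000ULL);  /* -2 GB, kernel text mapping */
      offset_from_top(0xffffffffff600000ULL);  /* -10 MB, legacy vsyscall    */
      return 0;
  }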

+ 8 - 0
arch/x86/Kconfig

@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
 	  supports them), so don't confuse the user by printing
 	  that we have them enabled.
 
+config X86_CPA_STATISTICS
+	bool "Enable statistics for Change Page Attribute"
+	depends on DEBUG_FS
+	---help---
+	  Expose statistics about the Change Page Attribute mechanism, which
+	  helps to determine the effectiveness of preserving large and huge
+	  page mappings when mapping protections are changed.
+
 config ARCH_HAS_MEM_ENCRYPT
 	def_bool y
 

+ 2 - 1
arch/x86/include/asm/io.h

@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
 #define ioremap_nocache ioremap_nocache
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 #define ioremap_uc ioremap_uc
-
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap     -   map bus memory into CPU space

+ 1 - 1
arch/x86/include/asm/kexec.h

@@ -67,7 +67,7 @@ struct kimage;
 
 /* Memory to backup during crash kdump */
 #define KEXEC_BACKUP_SRC_START	(0UL)
-#define KEXEC_BACKUP_SRC_END	(640 * 1024UL)	/* 640K */
+#define KEXEC_BACKUP_SRC_END	(640 * 1024UL - 1)	/* 640K */
 
 /*
  * CPU does not save ss and sp on stack if execution is already
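
The off-by-one fix above ties in with the "resource: Include resource end in walk_*() interfaces" change in this pull: resource ranges treat the end address as inclusive, so a 640K backup region starting at 0 has to end at 640*1024 - 1. A trivial standalone check of that arithmetic:

  #include <assert.h>
  #include <stdio.h>

  int main(void)
  {
      unsigned long start = 0UL;
      unsigned long end   = 640 * 1024UL - 1;  /* inclusive end, as in the fix */

      /* For inclusive [start, end] ranges the size is end - start + 1. */
      assert(end - start + 1 == 640 * 1024UL);
      printf("backup region: [%#lx, %#lx], %lu KiB\n",
             start, end, (end - start + 1) / 1024);
      return 0;
  }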

+ 9 - 6
arch/x86/include/asm/page_64_types.h

@@ -59,13 +59,16 @@
 #endif
 
 /*
- * Kernel image size is limited to 1GiB due to the fixmap living in the
- * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
- * 512MiB by default, leaving 1.5GiB for modules once the page tables
- * are fully set up. If kernel ASLR is configured, it can extend the
- * kernel page table mapping, reducing the size of the modules area.
+ * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
+ * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
+ *
+ * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
+ * page tables are fully set up.
+ *
+ * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
+ * of the modules area to 1.5 GiB.
  */
-#if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE
 #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
 #else
 #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
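
The reworded comment above pairs with the mm.txt table earlier in this series: kernel text and modules share the 2032 MB window between ffffffff80000000 and fffffffffeffffff, and whatever KERNEL_IMAGE_SIZE does not claim is left for modules. A small sketch of that arithmetic (addresses from the table; the MB figures follow directly):

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
      /* From the mm.txt table: text mapping starts at -2 GB, the module
       * area ends (inclusive) at fffffffffeffffff. */
      uint64_t region_start = 0xffffffff80000000ULL;
      uint64_t region_end   = 0xfffffffffeffffffULL;
      uint64_t total_mb     = (region_end - region_start + 1) >> 20;

      uint64_t image_kaslr_mb   = 1024;   /* CONFIG_RANDOMIZE_BASE=y */
      uint64_t image_default_mb = 512;    /* CONFIG_RANDOMIZE_BASE=n */

      printf("text+modules window:   %llu MB\n", (unsigned long long)total_mb);
      printf("modules with KASLR:    %llu MB (~1 GiB)\n",
             (unsigned long long)(total_mb - image_kaslr_mb));
      printf("modules without KASLR: %llu MB\n",
             (unsigned long long)(total_mb - image_default_mb));
      return 0;
  }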

+ 14 - 7
arch/x86/include/asm/tlb.h

@@ -6,16 +6,23 @@
 #define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
-#define tlb_flush(tlb)							\
-{									\
-	if (!tlb->fullmm && !tlb->need_flush_all) 			\
-		flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL);	\
-	else								\
-		flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL);	\
-}
+static inline void tlb_flush(struct mmu_gather *tlb);
 
 #include <asm-generic/tlb.h>
 
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+	unsigned long start = 0UL, end = TLB_FLUSH_ALL;
+	unsigned int stride_shift = tlb_get_unmap_shift(tlb);
+
+	if (!tlb->fullmm && !tlb->need_flush_all) {
+		start = tlb->start;
+		end = tlb->end;
+	}
+
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+}
+
 /*
  * While x86 architecture in general requires an IPI to perform TLB
  * shootdown, enablement code for several hypervisors overrides
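
The rewritten tlb_flush() above passes a stride_shift derived from the unmap granularity instead of the old vm_flags argument. The payoff is plain arithmetic: the number of individual invalidations for a range is roughly (end - start) >> stride_shift, so a 2 MB hugetlb mapping flushed with a 2 MB stride is one operation instead of 512. A standalone illustration (the shift values are the usual x86 4 KB and 2 MB page shifts):

  #include <stdio.h>

  /* Rough number of per-entry invalidations for [start, end) at a given
   * stride; this mirrors the idea behind the new stride_shift parameter. */
  static unsigned long nr_invalidations(unsigned long start, unsigned long end,
                                        unsigned int stride_shift)
  {
      return (end - start) >> stride_shift;
  }

  int main(void)
  {
      unsigned long start = 0x200000UL;
      unsigned long end   = start + (2UL << 20);   /* one 2 MB mapping */

      printf("4 KB stride: %lu invalidations\n", nr_invalidations(start, end, 12));
      printf("2 MB stride: %lu invalidations\n", nr_invalidations(start, end, 21));
      return 0;
  }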

+ 12 - 21
arch/x86/include/asm/tlbflush.h

@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast.  If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely.  The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic.  The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -547,23 +531,30 @@ struct flush_tlb_info {
 	unsigned long		start;
 	unsigned long		end;
 	u64			new_tlb_gen;
+	unsigned int		stride_shift;
+	bool			freed_tables;
 };
 
 #define local_flush_tlb() __flush_tlb()
 
-#define flush_tlb_mm(mm)	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+#define flush_tlb_mm(mm)						\
+		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
 
-#define flush_tlb_range(vma, start, end)	\
-		flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+#define flush_tlb_range(vma, start, end)				\
+	flush_tlb_mm_range((vma)->vm_mm, start, end,			\
+			   ((vma)->vm_flags & VM_HUGETLB)		\
+				? huge_page_shift(hstate_vma(vma))	\
+				: PAGE_SHIFT, false)
 
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag);
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 {
-	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
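
The header change above is where the two new flush_tlb_mm_range() parameters become part of the API: each caller now states its stride (PAGE_SHIFT, or huge_page_shift() for hugetlb VMAs) and whether page tables themselves were freed (flush_tlb_mm() passes true, the per-page helpers pass false). A minimal mock of the new signature, only to show how the updated callers in this series (ldt.c, vm86_32.c) map onto it; the real implementation lives in arch/x86/mm/tlb.c:

  #include <stdio.h>
  #include <stdbool.h>

  #define PAGE_SHIFT    12
  #define TLB_FLUSH_ALL (-1UL)

  struct mm_struct;   /* opaque here; only the call shape matters */

  /* Mock with the post-series signature; it merely reports its arguments. */
  static void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                 unsigned long end, unsigned int stride_shift,
                                 bool freed_tables)
  {
      printf("flush mm=%p [%#lx, %#lx] stride_shift=%u freed_tables=%d\n",
             (void *)mm, start, end, stride_shift, freed_tables);
  }

  int main(void)
  {
      struct mm_struct *mm = NULL;

      /* flush_tlb_mm() style: whole address space, tables may have been freed. */
      flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true);

      /* vm86_32.c style: a small range, 4 KB stride, no freed tables. */
      flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32 * 4096, PAGE_SHIFT, false);
      return 0;
  }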

+ 41 - 19
arch/x86/kernel/crash_dump_64.c

@@ -11,40 +11,62 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- *	space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- *	otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-		size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+				  unsigned long offset, int userbuf,
+				  bool encrypted)
 {
 	void  *vaddr;
 
 	if (!csize)
 		return 0;
 
-	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (encrypted)
+		vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+	else
+		vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+
 	if (!vaddr)
 		return -ENOMEM;
 
 	if (userbuf) {
-		if (copy_to_user(buf, vaddr + offset, csize)) {
-			iounmap(vaddr);
+		if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+			iounmap((void __iomem *)vaddr);
 			return -EFAULT;
 		}
 	} else
 		memcpy(buf, vaddr + offset, csize);
 
 	set_iounmap_nonlazy();
-	iounmap(vaddr);
+	iounmap((void __iomem *)vaddr);
 	return csize;
 }
+
+/**
+ * copy_oldmem_page - copy one page of memory
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from the old kernel's memory. For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+			 unsigned long offset, int userbuf)
+{
+	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+}
+
+/**
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
+ * memory with the encryption mask set to accommodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+				   unsigned long offset, int userbuf)
+{
+	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+}
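
The refactor above folds the mapping choice into a single parameterized helper and keeps copy_oldmem_page() / copy_oldmem_page_encrypted() as thin wrappers around it. A compressed userspace mock of that shape, with the two ioremap variants stubbed out; the names are reused from the diff purely for illustration, none of this is kernel API usage:

  #include <stdio.h>
  #include <string.h>
  #include <stdbool.h>

  static char fake_page[4096];   /* stands in for a page of the old kernel */

  /* Stubs standing in for ioremap_cache() / ioremap_encrypted(). */
  static void *map_plain(unsigned long pfn)     { (void)pfn; return fake_page; }
  static void *map_encrypted(unsigned long pfn) { (void)pfn; return fake_page; }

  static long do_copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
                                  unsigned long offset, bool encrypted)
  {
      void *vaddr = encrypted ? map_encrypted(pfn) : map_plain(pfn);

      if (!vaddr)
          return -1;
      memcpy(buf, (char *)vaddr + offset, csize);   /* unmap omitted in the mock */
      return (long)csize;
  }

  static long copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
                               unsigned long offset)
  {
      return do_copy_oldmem_page(pfn, buf, csize, offset, false);
  }

  static long copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
                                         unsigned long offset)
  {
      return do_copy_oldmem_page(pfn, buf, csize, offset, true);
  }

  int main(void)
  {
      char buf[16];

      strcpy(fake_page, "old kernel data");
      printf("plain:     %ld bytes\n", copy_oldmem_page(0, buf, 15, 0));
      printf("encrypted: %ld bytes\n", copy_oldmem_page_encrypted(0, buf, 15, 0));
      return 0;
  }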

+ 1 - 1
arch/x86/kernel/ldt.c

@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 	map_ldt_struct_to_user(mm);
 
 	va = (unsigned long)ldt_slot_va(slot);
-	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
 
 	ldt->slot = slot;
 	return 0;

+ 1 - 1
arch/x86/kernel/vm86_32.c

@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
 	pte_unmap_unlock(pte, ptl);
 out:
 	up_write(&mm->mmap_sem);
-	flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
+	flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
 }
 
 

+ 27 - 8
arch/x86/mm/dump_pagetables.c

@@ -19,7 +19,9 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
+#include <linux/pci.h>
 
+#include <asm/e820/types.h>
 #include <asm/pgtable.h>
 
 /*
@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
 	return (signed long)(u << shift) >> shift;
 }
 
+static void note_wx(struct pg_state *st)
+{
+	unsigned long npages;
+
+	npages = (st->current_address - st->start_address) / PAGE_SIZE;
+
+#ifdef CONFIG_PCI_BIOS
+	/*
+	 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
+	 * Inform about it, but avoid the warning.
+	 */
+	if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
+	    st->current_address <= PAGE_OFFSET + BIOS_END) {
+		pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
+		return;
+	}
+#endif
+	/* Account the WX pages */
+	st->wx_pages += npages;
+	WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
+		  (void *)st->start_address);
+}
+
 /*
  * This function gets called on a break in a continuous series
  * of PTE entries; the next one is different so we need to
@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		unsigned long delta;
 		int width = sizeof(unsigned long) * 2;
 
-		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
-			WARN_ONCE(1,
-				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
-				  (void *)st->start_address,
-				  (void *)st->start_address);
-			st->wx_pages += (st->current_address -
-					 st->start_address) / PAGE_SIZE;
-		}
+		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
+			note_wx(st);
 
 		/*
 		 * Now print the actual finished series
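
note_wx() above is only entered when the predicate in note_page() fires: a range is flagged when its effective protections have _PAGE_RW set and _PAGE_NX clear. A standalone version of that predicate; the bit positions below are the conventional x86 ones and are stated here as an assumption, they are not part of this diff:

  #include <stdio.h>
  #include <stdint.h>
  #include <stdbool.h>

  /* Assumed x86 PTE bits, for illustration only: RW is bit 1, NX is bit 63. */
  #define _PAGE_RW (1ULL << 1)
  #define _PAGE_NX (1ULL << 63)

  static bool is_wx(uint64_t eff)
  {
      return (eff & _PAGE_RW) && !(eff & _PAGE_NX);
  }

  int main(void)
  {
      printf("%d\n", is_wx(_PAGE_RW | _PAGE_NX));  /* 0: writable, non-executable data */
      printf("%d\n", is_wx(0));                    /* 0: read-only, executable text    */
      printf("%d\n", is_wx(_PAGE_RW));             /* 1: writable and executable, W+X  */
      return 0;
  }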

+ 189 - 99
arch/x86/mm/fault.c

@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	show_opcodes(regs, loglvl);
 }
 
+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		       unsigned long address, u32 *pkey, int si_code)
@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		if (is_errata100(regs, address))
 			return;
 
-#ifdef CONFIG_X86_64
-		/*
-		 * Instruction fetch faults in the vsyscall page might need
-		 * emulation.
-		 */
-		if (unlikely((error_code & X86_PF_INSTR) &&
-			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
-			if (emulate_vsyscall(regs, address))
-				return;
-		}
-#endif
-
 		/*
 		 * To avoid leaking information about the kernel page table
 		 * layout, pretend that user-mode accesses to kernel addresses
@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	}
 }
 
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
 	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
 
 	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
-	/*
-	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set X86_PF_PK.
-	 */
-	if ((error_code & X86_PF_PK))
-		return 1;
 
 	return 1;
 }
@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * (Optional Invalidation).
  */
 static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
 		return 0;
 
 	if (p4d_large(*p4d))
-		return spurious_fault_check(error_code, (pte_t *) p4d);
+		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
 
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
 
 	if (pud_large(*pud))
-		return spurious_fault_check(error_code, (pte_t *) pud);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pud);
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
 		return 0;
 
 	if (pmd_large(*pmd))
-		return spurious_fault_check(error_code, (pte_t *) pmd);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 
 	pte = pte_offset_kernel(pmd, address);
 	if (!pte_present(*pte))
 		return 0;
 
-	ret = spurious_fault_check(error_code, pte);
+	ret = spurious_kernel_fault_check(error_code, pte);
 	if (!ret)
 		return 0;
 
@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * Make sure we have permissions in PMD.
 	 * If not, then there's a bug in the page tables:
 	 */
-	ret = spurious_fault_check(error_code, (pte_t *) pmd);
+	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 
 	return ret;
 }
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
 
 int show_unhandled_signals = 1;
 
@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 
 static int fault_in_kernel_space(unsigned long address)
 {
+	/*
+	 * On 64-bit systems, the vsyscall page is at an address above
+	 * TASK_SIZE_MAX, but is not considered part of the kernel
+	 * address space.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+		return false;
+
 	return address >= TASK_SIZE_MAX;
 }
 
@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 }
 
 /*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Called for all faults where 'address' is part of the kernel address
+ * space.  Might get called for faults that originate from *code* that
+ * ran in userspace or the kernel.
  */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
-		unsigned long address)
+static void
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		   unsigned long address)
 {
-	struct vm_area_struct *vma;
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	vm_fault_t fault, major = 0;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-	u32 pkey;
-
-	tsk = current;
-	mm = tsk->mm;
-
-	prefetchw(&mm->mmap_sem);
-
-	if (unlikely(kmmio_fault(regs, address)))
-		return;
+	/*
+	 * Protection keys exceptions only happen on user pages.  We
+	 * have no user pages in the kernel portion of the address
+	 * space, so do not expect them here.
+	 */
+	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
 	/*
-	 * We fault-in kernel-space virtual memory on-demand. The
+	 * We can fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
 	 *
 	 * NOTE! We MUST NOT take any locks for this case. We may
@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * only copy the information from the master page table,
 	 * nothing more.
 	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
+	 * Before doing this on-demand faulting, ensure that the
+	 * fault is not any of the following:
+	 * 1. A fault on a PTE with a reserved bit set.
+	 * 2. A fault caused by a user-mode access.  (Do not demand-
+	 *    fault kernel memory due to user-mode accesses).
+	 * 3. A fault caused by a page-level protection violation.
+	 *    (A demand fault would be on a non-present page which
+	 *     would have X86_PF_PROT==0).
 	 */
-	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-
-		/* Can handle a stale RO->RW TLB: */
-		if (spurious_fault(error_code, address))
+	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (vmalloc_fault(address) >= 0)
 			return;
+	}
 
-		/* kprobes don't want to hook the spurious faults: */
-		if (kprobes_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock:
-		 */
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+	/* Was the fault spurious, caused by lazy TLB invalidation? */
+	if (spurious_kernel_fault(hw_error_code, address))
+		return;
 
+	/* kprobes don't want to hook the spurious faults: */
+	if (kprobes_fault(regs))
 		return;
-	}
+
+	/*
+	 * Note, despite being a "bad area", there are quite a few
+	 * acceptable reasons to get here, such as erratum fixups
+	 * and handling kernel code that can fault, like get_user().
+	 *
+	 * Don't take the mm semaphore here. If we fixup a prefetch
+	 * fault we could otherwise deadlock:
+	 */
+	bad_area_nosemaphore(regs, hw_error_code, address, NULL);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+			unsigned long hw_error_code,
+			unsigned long address)
+{
+	unsigned long sw_error_code;
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	vm_fault_t fault, major = 0;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
+
+	tsk = current;
+	mm = tsk->mm;
 
 	/* kprobes don't want to hook the spurious faults: */
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
-	if (unlikely(error_code & X86_PF_RSVD))
-		pgtable_bad(regs, error_code, address);
+	/*
+	 * Reserved bits are never expected to be set on
+	 * entries in the user portion of the page tables.
+	 */
+	if (unlikely(hw_error_code & X86_PF_RSVD))
+		pgtable_bad(regs, hw_error_code, address);
 
-	if (unlikely(smap_violation(error_code, regs))) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+	/*
+	 * Check for invalid kernel (supervisor) access to user
+	 * pages in the user address space.
+	 */
+	if (unlikely(smap_violation(hw_error_code, regs))) {
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
@@ -1289,10 +1313,17 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * in a region with pagefaults disabled then we must not take the fault
 	 */
 	if (unlikely(faulthandler_disabled() || !mm)) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
+	/*
+	 * hw_error_code is literally the "page fault error code" passed to
+	 * the kernel directly from the hardware.  But, we will shortly be
+	 * modifying it in software, so give it a new name.
+	 */
+	sw_error_code = hw_error_code;
+
 	/*
 	 * It's safe to allow irq's after cr2 has been saved and the
 	 * vmalloc fault has been handled.
@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= X86_PF_USER;
+		/*
+		 * Up to this point, X86_PF_USER set in hw_error_code
+		 * indicated a user-mode access.  But, after this,
+		 * X86_PF_USER in sw_error_code will indicate either
+		 * that, *or* an implicit kernel(supervisor)-mode access
+		 * which originated from user mode.
+		 */
+		if (!(hw_error_code & X86_PF_USER)) {
+			/*
+			 * The CPU was in user mode, but the CPU says
+			 * the fault was not a user-mode access.
+			 * Must be an implicit kernel-mode access,
+			 * which we do not expect to happen in the
+			 * user address space.
+			 */
+			pr_warn_once("kernel-mode error from user-mode: %lx\n",
+					hw_error_code);
+
+			sw_error_code |= X86_PF_USER;
+		}
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-	if (error_code & X86_PF_WRITE)
+	if (sw_error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & X86_PF_INSTR)
+	if (sw_error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_X86_64
 	/*
-	 * When running in the kernel we expect faults to occur only to
-	 * addresses in user space.  All other faults represent errors in
-	 * the kernel and should generate an OOPS.  Unfortunately, in the
-	 * case of an erroneous fault occurring in a code path which already
-	 * holds mmap_sem we will deadlock attempting to validate the fault
-	 * against the address space.  Luckily the kernel only validly
-	 * references user space from well defined areas of code, which are
-	 * listed in the exceptions table.
+	 * Instruction fetch faults in the vsyscall page might need
+	 * emulation.  The vsyscall page is at a high address
+	 * (>PAGE_OFFSET), but is considered to be part of the user
+	 * address space.
 	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a
-	 * deadlock. Attempt to lock the address space, if we cannot we then
-	 * validate the source. If this is invalid we can skip the address
-	 * space check, thus avoiding the deadlock:
+	 * The vsyscall page does not have a "real" VMA, so do this
+	 * emulation before we go searching for VMAs.
+	 */
+	if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+		if (emulate_vsyscall(regs, address))
+			return;
+	}
+#endif
+
+	/*
+	 * Kernel-mode access to the user address space should only occur
+	 * on well-defined single instructions listed in the exception
+	 * tables.  But, an erroneous kernel fault occurring outside one of
+	 * those areas which also holds mmap_sem might deadlock attempting
+	 * to validate the fault against the address space.
+	 *
+	 * Only do the expensive exception table search when we might be at
+	 * risk of a deadlock.  This happens if we
+	 * 1. Failed to acquire mmap_sem, and
+	 * 2. The access did not originate in userspace.  Note: either the
+	 *    hardware or earlier page fault code may set X86_PF_USER
+	 *    in sw_error_code.
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if (!(error_code & X86_PF_USER) &&
+		if (!(sw_error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
-			bad_area_nosemaphore(regs, error_code, address, NULL);
+			/*
+			 * Fault from code in kernel from
+			 * which we do not expect faults.
+			 */
+			bad_area_nosemaphore(regs, sw_error_code, address, NULL);
 			return;
 		}
 retry:
@@ -1351,16 +1419,16 @@ retry:
 
 	vma = find_vma(mm, address);
 	if (unlikely(!vma)) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 	if (likely(vma->vm_start <= address))
 		goto good_area;
 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
-	if (error_code & X86_PF_USER) {
+	if (sw_error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
@@ -1368,12 +1436,12 @@ retry:
 		 * 32 pointers and then decrements %sp by 65535.)
 		 */
 		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-			bad_area(regs, error_code, address);
+			bad_area(regs, sw_error_code, address);
 			return;
 		}
 	}
 	if (unlikely(expand_stack(vma, address))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 
@@ -1382,8 +1450,8 @@ retry:
 	 * we can handle it..
 	 */
 good_area:
-	if (unlikely(access_error(error_code, vma))) {
-		bad_area_access_error(regs, error_code, address, vma);
+	if (unlikely(access_error(sw_error_code, vma))) {
+		bad_area_access_error(regs, sw_error_code, address, vma);
 		return;
 	}
 
@@ -1425,13 +1493,13 @@ good_area:
 			return;
 
 		/* Not returning to user mode? Handle exceptions or die: */
-		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
 
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, &pkey, fault);
+		mm_fault_error(regs, sw_error_code, address, &pkey, fault);
 		return;
 	}
 
@@ -1449,6 +1517,28 @@ good_area:
 
 	check_v8086_mode(regs, address, tsk);
 }
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		unsigned long address)
+{
+	prefetchw(&current->mm->mmap_sem);
+
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
+
+	/* Was the fault on kernel-controlled part of the address space? */
+	if (unlikely(fault_in_kernel_space(address)))
+		do_kern_addr_fault(regs, hw_error_code, address);
+	else
+		do_user_addr_fault(regs, hw_error_code, address);
+}
 NOKPROBE_SYMBOL(__do_page_fault);
 
 static nokprobe_inline void
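
The new is_vsyscall_vaddr() helper above keys off the one fixed address of the legacy vsyscall page, which the mm.txt table earlier in this series places at ffffffffff600000 with a size of 4 kB; that is also why fault_in_kernel_space() now treats this address, although above TASK_SIZE_MAX, as part of the user address space. A standalone version of the same check:

  #include <stdio.h>
  #include <stdint.h>
  #include <stdbool.h>

  #define PAGE_SIZE     4096ULL
  #define PAGE_MASK     (~(PAGE_SIZE - 1))
  #define VSYSCALL_ADDR 0xffffffffff600000ULL   /* from the mm.txt table */

  static bool is_vsyscall_vaddr(uint64_t vaddr)
  {
      return (vaddr & PAGE_MASK) == VSYSCALL_ADDR;
  }

  int main(void)
  {
      /* Any address inside the 4 kB page matches; the next page does not. */
      printf("%d\n", is_vsyscall_vaddr(0xffffffffff600800ULL));   /* 1 */
      printf("%d\n", is_vsyscall_vaddr(0xffffffffff601000ULL));   /* 0 */
      return 0;
  }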

+ 4 - 19
arch/x86/mm/init_32.c

@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
-	unsigned long size = PFN_ALIGN(_etext) - start;
+	unsigned long size = (unsigned long)__end_rodata - start;
 
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG
-	printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
-		start, start+size);
-	set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
-
-	printk(KERN_INFO "Testing CPA: write protecting again\n");
-	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
-
-	start += size;
-	size = (unsigned long)__end_rodata - start;
-	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
-		size >> 10);
-
-#ifdef CONFIG_CPA_DEBUG
-	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+	pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
 	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
 
-	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	pr_info("Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
 	mark_nxdata_nx();

+ 16 - 8
arch/x86/mm/ioremap.c

@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-		unsigned long size, enum page_cache_mode pcm, void *caller)
+		unsigned long size, enum page_cache_mode pcm,
+		void *caller, bool encrypted)
 {
 	unsigned long offset, vaddr;
 	resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 * resulting mapping.
 	 */
 	prot = PAGE_KERNEL_IO;
-	if (sev_active() && mem_flags.desc_other)
+	if ((sev_active() && mem_flags.desc_other) || encrypted)
 		prot = pgprot_encrypted(prot);
 
 	switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
 	return __ioremap_caller(phys_addr, size, pcm,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
 	return __ioremap_caller(phys_addr, size, pcm,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-					__builtin_return_address(0));
+					__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-					__builtin_return_address(0));
+					__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+				__builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 {
 	return __ioremap_caller(phys_addr, size,
 				pgprot2cachemode(__pgprot(prot_val)),
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
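
The only difference between the new ioremap_encrypted() and ioremap_cache() above is the extra 'encrypted' flag, which makes __ioremap_caller() run the protections through pgprot_encrypted(), i.e. OR the SME encryption mask into the page protection. A rough userspace sketch of that decision; the mask bit and the PAGE_KERNEL_IO value below are placeholders, on real hardware the mask (sme_me_mask) is a physical-address bit discovered at boot:

  #include <stdio.h>
  #include <stdint.h>
  #include <stdbool.h>

  /* Placeholders for illustration only; not the real kernel values. */
  #define FAKE_SME_MASK  (1ULL << 47)
  #define PAGE_KERNEL_IO 0x63ULL

  static uint64_t pgprot_encrypted(uint64_t prot)
  {
      return prot | FAKE_SME_MASK;
  }

  /* Shape of the protection choice added to __ioremap_caller() in this series. */
  static uint64_t ioremap_prot_for(bool sev_forces_encryption, bool encrypted)
  {
      uint64_t prot = PAGE_KERNEL_IO;

      if (sev_forces_encryption || encrypted)
          prot = pgprot_encrypted(prot);
      return prot;
  }

  int main(void)
  {
      printf("ioremap_cache-style:     %#llx\n",
             (unsigned long long)ioremap_prot_for(false, false));
      printf("ioremap_encrypted-style: %#llx\n",
             (unsigned long long)ioremap_prot_for(false, true));
      return 0;
  }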
 

+ 425 - 202
arch/x86/mm/pageattr.c

@@ -37,11 +37,20 @@ struct cpa_data {
 	unsigned long	numpages;
 	int		flags;
 	unsigned long	pfn;
-	unsigned	force_split : 1;
+	unsigned	force_split		: 1,
+			force_static_prot	: 1;
 	int		curpage;
 	struct page	**pages;
 };
 
+enum cpa_warn {
+	CPA_CONFLICT,
+	CPA_PROTECT,
+	CPA_DETECT,
+};
+
+static const int cpa_warn_level = CPA_PROTECT;
+
 /*
  * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
  * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
 static inline void split_page_count(int level) { }
 #endif
 
+#ifdef CONFIG_X86_CPA_STATISTICS
+
+static unsigned long cpa_1g_checked;
+static unsigned long cpa_1g_sameprot;
+static unsigned long cpa_1g_preserved;
+static unsigned long cpa_2m_checked;
+static unsigned long cpa_2m_sameprot;
+static unsigned long cpa_2m_preserved;
+static unsigned long cpa_4k_install;
+
+static inline void cpa_inc_1g_checked(void)
+{
+	cpa_1g_checked++;
+}
+
+static inline void cpa_inc_2m_checked(void)
+{
+	cpa_2m_checked++;
+}
+
+static inline void cpa_inc_4k_install(void)
+{
+	cpa_4k_install++;
+}
+
+static inline void cpa_inc_lp_sameprot(int level)
+{
+	if (level == PG_LEVEL_1G)
+		cpa_1g_sameprot++;
+	else
+		cpa_2m_sameprot++;
+}
+
+static inline void cpa_inc_lp_preserved(int level)
+{
+	if (level == PG_LEVEL_1G)
+		cpa_1g_preserved++;
+	else
+		cpa_2m_preserved++;
+}
+
+static int cpastats_show(struct seq_file *m, void *p)
+{
+	seq_printf(m, "1G pages checked:     %16lu\n", cpa_1g_checked);
+	seq_printf(m, "1G pages sameprot:    %16lu\n", cpa_1g_sameprot);
+	seq_printf(m, "1G pages preserved:   %16lu\n", cpa_1g_preserved);
+	seq_printf(m, "2M pages checked:     %16lu\n", cpa_2m_checked);
+	seq_printf(m, "2M pages sameprot:    %16lu\n", cpa_2m_sameprot);
+	seq_printf(m, "2M pages preserved:   %16lu\n", cpa_2m_preserved);
+	seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
+	return 0;
+}
+
+static int cpastats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, cpastats_show, NULL);
+}
+
+static const struct file_operations cpastats_fops = {
+	.open		= cpastats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init cpa_stats_init(void)
+{
+	debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
+			    &cpastats_fops);
+	return 0;
+}
+late_initcall(cpa_stats_init);
+#else
+static inline void cpa_inc_1g_checked(void) { }
+static inline void cpa_inc_2m_checked(void) { }
+static inline void cpa_inc_4k_install(void) { }
+static inline void cpa_inc_lp_sameprot(int level) { }
+static inline void cpa_inc_lp_preserved(int level) { }
+#endif
+
+
 static inline int
 within(unsigned long addr, unsigned long start, unsigned long end)
 {
@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
 	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
-static void __cpa_flush_range(void *arg)
+static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
 {
-	/*
-	 * We could optimize that further and do individual per page
-	 * tlb invalidates for a low number of pages. Caveat: we must
-	 * flush the high aliases on 64bit as well.
-	 */
-	__flush_tlb_all();
+	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
+	WARN_ON(PAGE_ALIGN(start) != start);
+
+	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+		cpa_flush_all(cache);
+		return true;
+	}
+
+	flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
+
+	return !cache;
 }
 
 static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	unsigned int i, level;
 	unsigned long addr;
 
-	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
-	WARN_ON(PAGE_ALIGN(start) != start);
-
-	on_each_cpu(__cpa_flush_range, NULL, 1);
-
-	if (!cache)
+	if (__cpa_flush_range(start, numpages, cache))
 		return;
 
 	/*
@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
 	}
 }
 
-static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+static void cpa_flush_array(unsigned long baddr, unsigned long *start,
+			    int numpages, int cache,
 			    int in_flags, struct page **pages)
 {
 	unsigned int i, level;
-#ifdef CONFIG_PREEMPT
-	/*
-	 * Avoid wbinvd() because it causes latencies on all CPUs,
-	 * regardless of any CPU isolation that may be in effect.
-	 *
-	 * This should be extended for CAT enabled systems independent of
-	 * PREEMPT because wbinvd() does not respect the CAT partitions and
-	 * this is exposed to unpriviledged users through the graphics
-	 * subsystem.
-	 */
-	unsigned long do_wbinvd = 0;
-#else
-	unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
-#endif
-
-	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
-	on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
-
-	if (!cache || do_wbinvd)
+	if (__cpa_flush_range(baddr, numpages, cache))
 		return;
 
 	/*
@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
 	}
 }
 
-/*
- * Certain areas of memory on x86 require very specific protection flags,
- * for example the BIOS area or kernel text. Callers don't always get this
- * right (again, ioremap() on BIOS memory is not uncommon) so this function
- * checks and fixes these known static required protection bits.
- */
-static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
-				   unsigned long pfn)
+static bool overlaps(unsigned long r1_start, unsigned long r1_end,
+		     unsigned long r2_start, unsigned long r2_end)
 {
-	pgprot_t forbidden = __pgprot(0);
+	return (r1_start <= r2_end && r1_end >= r2_start) ||
+		(r2_start <= r1_end && r2_end >= r1_start);
+}
 
-	/*
-	 * The BIOS area between 640k and 1Mb needs to be executable for
-	 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
-	 */
 #ifdef CONFIG_PCI_BIOS
-	if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
-		pgprot_val(forbidden) |= _PAGE_NX;
+/*
+ * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
+ * based config access (CONFIG_PCI_GOBIOS) support.
+ */
+#define BIOS_PFN	PFN_DOWN(BIOS_BEGIN)
+#define BIOS_PFN_END	PFN_DOWN(BIOS_END - 1)
+
+static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
+{
+	if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
+		return _PAGE_NX;
+	return 0;
+}
+#else
+static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
+{
+	return 0;
+}
 #endif
 
-	/*
-	 * The kernel text needs to be executable for obvious reasons
-	 * Does not cover __inittext since that is gone later on. On
-	 * 64bit we do not enforce !NX on the low mapping
-	 */
-	if (within(address, (unsigned long)_text, (unsigned long)_etext))
-		pgprot_val(forbidden) |= _PAGE_NX;
+/*
+ * The .rodata section needs to be read-only. Using the pfn catches all
+ * aliases.  This also includes __ro_after_init, so do not enforce until
+ * kernel_set_to_readonly is true.
+ */
+static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
+{
+	unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
 
 	/*
-	 * The .rodata section needs to be read-only. Using the pfn
-	 * catches all aliases.  This also includes __ro_after_init,
-	 * so do not enforce until kernel_set_to_readonly is true.
+	 * Note: __end_rodata is page aligned and not inclusive, so
+	 * subtract 1 to get the last enforced PFN in the rodata area.
 	 */
-	if (kernel_set_to_readonly &&
-	    within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
-		   __pa_symbol(__end_rodata) >> PAGE_SHIFT))
-		pgprot_val(forbidden) |= _PAGE_RW;
+	epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
+
+	if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
+		return _PAGE_RW;
+	return 0;
+}
+
+/*
+ * Protect kernel text against becoming non executable by forbidding
+ * _PAGE_NX.  This protects only the high kernel mapping (_text -> _etext)
+ * out of which the kernel actually executes.  Do not protect the low
+ * mapping.
+ *
+ * This does not cover __inittext since that is gone after boot.
+ */
+static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
+{
+	unsigned long t_end = (unsigned long)_etext - 1;
+	unsigned long t_start = (unsigned long)_text;
+
+	if (overlaps(start, end, t_start, t_end))
+		return _PAGE_NX;
+	return 0;
+}
 
 #if defined(CONFIG_X86_64)
+/*
+ * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+ * kernel text mappings for the large-page-aligned text and rodata sections
+ * will always be read-only. The kernel identity mappings covering the
+ * holes caused by this alignment can be anything the user asks for.
+ *
+ * This will preserve the large page mappings for kernel text/data at no
+ * extra cost.
+ */
+static pgprotval_t protect_kernel_text_ro(unsigned long start,
+					  unsigned long end)
+{
+	unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
+	unsigned long t_start = (unsigned long)_text;
+	unsigned int level;
+
+	if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
+		return 0;
 	/*
-	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
-	 * kernel text mappings for the large page aligned text, rodata sections
-	 * will be always read-only. For the kernel identity mappings covering
-	 * the holes caused by this alignment can be anything that user asks.
+	 * Don't enforce the !RW mapping for the kernel text mapping, if
+	 * the current mapping is already using small page mapping.  No
+	 * need to work hard to preserve large page mappings in this case.
 	 *
-	 * This will preserve the large page mappings for kernel text/data
-	 * at no extra cost.
+	 * This also fixes the Linux Xen paravirt guest boot failure caused
+	 * by unexpected read-only mappings for kernel identity
+	 * mappings. In this paravirt guest case, the kernel text mapping
+	 * and the kernel identity mapping share the same page-table pages,
+	 * so the protections for kernel text and identity mappings have to
+	 * be the same.
 	 */
-	if (kernel_set_to_readonly &&
-	    within(address, (unsigned long)_text,
-		   (unsigned long)__end_rodata_hpage_align)) {
-		unsigned int level;
-
-		/*
-		 * Don't enforce the !RW mapping for the kernel text mapping,
-		 * if the current mapping is already using small page mapping.
-		 * No need to work hard to preserve large page mappings in this
-		 * case.
-		 *
-		 * This also fixes the Linux Xen paravirt guest boot failure
-		 * (because of unexpected read-only mappings for kernel identity
-		 * mappings). In this paravirt guest case, the kernel text
-		 * mapping and the kernel identity mapping share the same
-		 * page-table pages. Thus we can't really use different
-		 * protections for the kernel text and identity mappings. Also,
-		 * these shared mappings are made of small page mappings.
-		 * Thus this don't enforce !RW mapping for small page kernel
-		 * text mapping logic will help Linux Xen parvirt guest boot
-		 * as well.
-		 */
-		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
-			pgprot_val(forbidden) |= _PAGE_RW;
-	}
+	if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
+		return _PAGE_RW;
+	return 0;
+}
+#else
+static pgprotval_t protect_kernel_text_ro(unsigned long start,
+					  unsigned long end)
+{
+	return 0;
+}
 #endif
 
-	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+static inline bool conflicts(pgprot_t prot, pgprotval_t val)
+{
+	return (pgprot_val(prot) & ~val) != pgprot_val(prot);
+}
 
-	return prot;
+static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
+				  unsigned long start, unsigned long end,
+				  unsigned long pfn, const char *txt)
+{
+	static const char *lvltxt[] = {
+		[CPA_CONFLICT]	= "conflict",
+		[CPA_PROTECT]	= "protect",
+		[CPA_DETECT]	= "detect",
+	};
+
+	if (warnlvl > cpa_warn_level || !conflicts(prot, val))
+		return;
+
+	pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
+		lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
+		(unsigned long long)val);
+}
+
+/*
+ * Certain areas of memory on x86 require very specific protection flags,
+ * for example the BIOS area or kernel text. Callers don't always get this
+ * right (again, ioremap() on BIOS memory is not uncommon) so this function
+ * checks and fixes these known static required protection bits.
+ */
+static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
+					  unsigned long pfn, unsigned long npg,
+					  int warnlvl)
+{
+	pgprotval_t forbidden, res;
+	unsigned long end;
+
+	/*
+	 * There is no point in checking RW/NX conflicts when the requested
+	 * mapping is setting the page !PRESENT.
+	 */
+	if (!(pgprot_val(prot) & _PAGE_PRESENT))
+		return prot;
+
+	/* Operate on the virtual address */
+	end = start + npg * PAGE_SIZE - 1;
+
+	res = protect_kernel_text(start, end);
+	check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
+	forbidden = res;
+
+	res = protect_kernel_text_ro(start, end);
+	check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
+	forbidden |= res;
+
+	/* Check the PFN directly */
+	res = protect_pci_bios(pfn, pfn + npg - 1);
+	check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
+	forbidden |= res;
+
+	res = protect_rodata(pfn, pfn + npg - 1);
+	check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
+	forbidden |= res;
+
+	return __pgprot(pgprot_val(prot) & ~forbidden);
 }
 
 /*
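The static_protections() rework above follows a simple pattern: each checker returns the pgprot bits that are forbidden for the given virtual range or PFN window, the results are OR-ed into one mask, and that mask is cleared from the requested protection. A minimal userspace sketch of that pattern follows; the bit values, address windows and checker names are illustrative stand-ins, not the kernel's _PAGE_* constants or section boundaries.

#include <stdio.h>

/* Illustrative protection bits; the real _PAGE_* values live in the kernel. */
#define DEMO_PAGE_RW	0x2UL
#define DEMO_PAGE_NX	0x4UL

/* Inclusive range overlap test, mirroring the overlaps() helper above. */
static int overlaps(unsigned long s1, unsigned long e1,
		    unsigned long s2, unsigned long e2)
{
	return (s1 >= s2 && s1 <= e2) || (s2 >= s1 && s2 <= e1);
}

/* Hypothetical text window: NX is forbidden there, text must stay executable. */
static unsigned long protect_text_nx(unsigned long start, unsigned long end)
{
	return overlaps(start, end, 0x1000, 0x7fff) ? DEMO_PAGE_NX : 0;
}

/* Hypothetical rodata window: RW is forbidden there, rodata stays read-only. */
static unsigned long protect_rodata(unsigned long start, unsigned long end)
{
	return overlaps(start, end, 0x8000, 0xffff) ? DEMO_PAGE_RW : 0;
}

int main(void)
{
	unsigned long req = DEMO_PAGE_RW | DEMO_PAGE_NX;	/* requested prot */
	unsigned long start = 0x6000, end = 0x7fff;
	unsigned long forbidden = 0;

	/* Accumulate the bits each checker objects to ... */
	forbidden |= protect_text_nx(start, end);
	forbidden |= protect_rodata(start, end);

	/* ... then strip them from the request, as static_protections() does. */
	printf("req %#lx forbidden %#lx granted %#lx\n",
	       req, forbidden, req & ~forbidden);
	return 0;
}

For the range used in main() only the text checker fires, so NX is removed from the request while RW passes through; the real code above combines four such checkers in exactly the same way.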
@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
  */
 pte_t *lookup_address(unsigned long address, unsigned int *level)
 {
-        return lookup_address_in_pgd(pgd_offset_k(address), address, level);
+	return lookup_address_in_pgd(pgd_offset_k(address), address, level);
 }
 EXPORT_SYMBOL_GPL(lookup_address);
 
 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
 				  unsigned int *level)
 {
-        if (cpa->pgd)
+	if (cpa->pgd)
 		return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
 					       address, level);
 
-        return lookup_address(address, level);
+	return lookup_address(address, level);
 }
 
 /*
@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
 	return prot;
 }
 
-static int
-try_preserve_large_page(pte_t *kpte, unsigned long address,
-			struct cpa_data *cpa)
+static int __should_split_large_page(pte_t *kpte, unsigned long address,
+				     struct cpa_data *cpa)
 {
-	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
+	unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
+	pgprot_t old_prot, new_prot, req_prot, chk_prot;
 	pte_t new_pte, old_pte, *tmp;
-	pgprot_t old_prot, new_prot, req_prot;
-	int i, do_split = 1;
 	enum pg_level level;
 
-	if (cpa->force_split)
-		return 1;
-
-	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up already:
 	 */
 	tmp = _lookup_address_cpa(cpa, address, &level);
 	if (tmp != kpte)
-		goto out_unlock;
+		return 1;
 
 	switch (level) {
 	case PG_LEVEL_2M:
 		old_prot = pmd_pgprot(*(pmd_t *)kpte);
 		old_pfn = pmd_pfn(*(pmd_t *)kpte);
+		cpa_inc_2m_checked();
 		break;
 	case PG_LEVEL_1G:
 		old_prot = pud_pgprot(*(pud_t *)kpte);
 		old_pfn = pud_pfn(*(pud_t *)kpte);
+		cpa_inc_1g_checked();
 		break;
 	default:
-		do_split = -EINVAL;
-		goto out_unlock;
+		return -EINVAL;
 	}
 
 	psize = page_level_size(level);
@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * Calculate the number of pages, which fit into this large
 	 * page starting at address:
 	 */
-	nextpage_addr = (address + psize) & pmask;
-	numpages = (nextpage_addr - address) >> PAGE_SHIFT;
+	lpaddr = (address + psize) & pmask;
+	numpages = (lpaddr - address) >> PAGE_SHIFT;
 	if (numpages < cpa->numpages)
 		cpa->numpages = numpages;
 
@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
 		pgprot_val(req_prot) |= _PAGE_PSE;
 
 	/*
-	 * old_pfn points to the large page base pfn. So we need
-	 * to add the offset of the virtual address:
+	 * old_pfn points to the large page base pfn. So we need to add the
+	 * offset of the virtual address:
 	 */
 	pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
 	cpa->pfn = pfn;
 
-	new_prot = static_protections(req_prot, address, pfn);
+	/*
+	 * Calculate the large page base address and the number of 4K pages
+	 * in the large page
+	 */
+	lpaddr = address & pmask;
+	numpages = psize >> PAGE_SHIFT;
 
 	/*
-	 * We need to check the full range, whether
-	 * static_protection() requires a different pgprot for one of
-	 * the pages in the range we try to preserve:
+	 * Sanity check that the existing mapping is correct versus the static
+	 * protections. static_protections() guards against !PRESENT, so no
+	 * extra conditional required here.
 	 */
-	addr = address & pmask;
-	pfn = old_pfn;
-	for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
-		pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
+	chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
+				      CPA_CONFLICT);
 
-		if (pgprot_val(chk_prot) != pgprot_val(new_prot))
-			goto out_unlock;
+	if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
+		/*
+		 * Split the large page and tell the split code to
+		 * enforce static protections.
+		 */
+		cpa->force_static_prot = 1;
+		return 1;
 	}
 
 	/*
-	 * If there are no changes, return. maxpages has been updated
-	 * above:
+	 * Optimization: If the requested pgprot is the same as the current
+	 * pgprot, then the large page can be preserved and no updates are
+	 * required independent of alignment and length of the requested
+	 * range. The above already established that the current pgprot is
+	 * correct, which in consequence makes the requested pgprot correct
+	 * as well if it is the same. The static protection scan below will
+	 * not come to a different conclusion.
 	 */
-	if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
-		do_split = 0;
-		goto out_unlock;
+	if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
+		cpa_inc_lp_sameprot(level);
+		return 0;
 	}
 
 	/*
-	 * We need to change the attributes. Check, whether we can
-	 * change the large page in one go. We request a split, when
-	 * the address is not aligned and the number of pages is
-	 * smaller than the number of pages in the large page. Note
-	 * that we limited the number of possible pages already to
-	 * the number of pages in the large page.
+	 * If the requested range does not cover the full page, split it up
 	 */
-	if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
-		/*
-		 * The address is aligned and the number of pages
-		 * covers the full page.
-		 */
-		new_pte = pfn_pte(old_pfn, new_prot);
-		__set_pmd_pte(kpte, address, new_pte);
-		cpa->flags |= CPA_FLUSHTLB;
-		do_split = 0;
-	}
+	if (address != lpaddr || cpa->numpages != numpages)
+		return 1;
 
-out_unlock:
+	/*
+	 * Check whether the requested pgprot is conflicting with a static
+	 * protection requirement in the large page.
+	 */
+	new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
+				      CPA_DETECT);
+
+	/*
+	 * If there is a conflict, split the large page.
+	 *
+	 * There used to be a 4k-wise evaluation trying really hard to
+	 * preserve the large pages, but experimentation has shown that this
+	 * does not help at all. There might be corner cases which would
+	 * preserve one large page occasionally, but it's really not worth the
+	 * extra code and cycles for the common case.
+	 */
+	if (pgprot_val(req_prot) != pgprot_val(new_prot))
+		return 1;
+
+	/* All checks passed. Update the large page mapping. */
+	new_pte = pfn_pte(old_pfn, new_prot);
+	__set_pmd_pte(kpte, address, new_pte);
+	cpa->flags |= CPA_FLUSHTLB;
+	cpa_inc_lp_preserved(level);
+	return 0;
+}
+
+static int should_split_large_page(pte_t *kpte, unsigned long address,
+				   struct cpa_data *cpa)
+{
+	int do_split;
+
+	if (cpa->force_split)
+		return 1;
+
+	spin_lock(&pgd_lock);
+	do_split = __should_split_large_page(kpte, address, cpa);
 	spin_unlock(&pgd_lock);
 
 	return do_split;
 }
 
+static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
+			  pgprot_t ref_prot, unsigned long address,
+			  unsigned long size)
+{
+	unsigned int npg = PFN_DOWN(size);
+	pgprot_t prot;
+
+	/*
+	 * If should_split_large_page() discovered an inconsistent mapping,
+	 * remove the invalid protection in the split mapping.
+	 */
+	if (!cpa->force_static_prot)
+		goto set;
+
+	prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
+
+	if (pgprot_val(prot) == pgprot_val(ref_prot))
+		goto set;
+
+	/*
+	 * If this is splitting a PMD, fix it up. PUD splits cannot be
+	 * fixed trivially as that would require rescanning the newly
+	 * installed PMD mappings after returning from split_large_page()
+	 * so an eventual further split can allocate the necessary PTE
+	 * pages. Warn for now and revisit it in case this actually
+	 * happens.
+	 */
+	if (size == PAGE_SIZE)
+		ref_prot = prot;
+	else
+		pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
+set:
+	set_pte(pte, pfn_pte(pfn, ref_prot));
+}
+
 static int
 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		   struct page *base)
 {
+	unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
 	pte_t *pbase = (pte_t *)page_address(base);
-	unsigned long ref_pfn, pfn, pfninc = 1;
 	unsigned int i, level;
-	pte_t *tmp;
 	pgprot_t ref_prot;
+	pte_t *tmp;
 
 	spin_lock(&pgd_lock);
 	/*
@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		 * PAT bit to correct position.
 		 */
 		ref_prot = pgprot_large_2_4k(ref_prot);
-
 		ref_pfn = pmd_pfn(*(pmd_t *)kpte);
+		lpaddr = address & PMD_MASK;
+		lpinc = PAGE_SIZE;
 		break;
 
 	case PG_LEVEL_1G:
 		ref_prot = pud_pgprot(*(pud_t *)kpte);
 		ref_pfn = pud_pfn(*(pud_t *)kpte);
 		pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
-
+		lpaddr = address & PUD_MASK;
+		lpinc = PMD_SIZE;
 		/*
 		 * Clear the PSE flags if the PRESENT flag is not set
 		 * otherwise pmd_present/pmd_huge will return true
@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	 * Get the target pfn from the original entry:
 	 */
 	pfn = ref_pfn;
-	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
-		set_pte(&pbase[i], pfn_pte(pfn, ref_prot));
+	for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
+		split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
 
 	if (virt_addr_valid(address)) {
 		unsigned long pfn = PFN_DOWN(__pa(address));
@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
 
 	/*
-	 * Intel Atom errata AAH41 workaround.
+	 * Do a global TLB flush after splitting the large page and before
+	 * we do the actual page attribute change in the PTE.
+	 *
+	 * Without this, we violate the TLB application note, which says:
+	 * "The TLBs may contain both ordinary and large-page
+	 *  translations for a 4-KByte range of linear addresses. This
+	 *  may occur if software modifies the paging structures so that
+	 *  the page size used for the address range changes. If the two
+	 *  translations differ with respect to page frame or attributes
+	 *  (e.g., permissions), processor behavior is undefined and may
+	 *  be implementation-specific."
 	 *
-	 * The real fix should be in hw or in a microcode update, but
-	 * we also probabilistically try to reduce the window of having
-	 * a large TLB mixed with 4K TLBs while instruction fetches are
-	 * going on.
+	 * We do this global TLB flush inside the cpa_lock so that no other
+	 * CPU with stale TLB entries can change the page attributes in
+	 * parallel for the just-split large page entry.
 	 */
-	__flush_tlb_all();
+	flush_tlb_all();
 	spin_unlock(&pgd_lock);
 
 	return 0;
@@ -1247,7 +1494,9 @@ repeat:
 		pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
 		pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
 
-		new_prot = static_protections(new_prot, address, pfn);
+		cpa_inc_4k_install();
+		new_prot = static_protections(new_prot, address, pfn, 1,
+					      CPA_PROTECT);
 
 		new_prot = pgprot_clear_protnone_bits(new_prot);
 
@@ -1273,7 +1522,7 @@ repeat:
 	 * Check, whether we can keep the large page intact
 	 * and just change the pte:
 	 */
-	do_split = try_preserve_large_page(kpte, address, cpa);
+	do_split = should_split_large_page(kpte, address, cpa);
 	/*
 	 * When the range fits into the existing large page,
 	 * return. cp->numpages and cpa->tlbflush have been updated in
@@ -1286,28 +1535,8 @@ repeat:
 	 * We have to split the large page:
 	 */
 	err = split_large_page(cpa, kpte, address);
-	if (!err) {
-		/*
-	 	 * Do a global flush tlb after splitting the large page
-	 	 * and before we do the actual change page attribute in the PTE.
-	 	 *
-	 	 * With out this, we violate the TLB application note, that says
-	 	 * "The TLBs may contain both ordinary and large-page
-		 *  translations for a 4-KByte range of linear addresses. This
-		 *  may occur if software modifies the paging structures so that
-		 *  the page size used for the address range changes. If the two
-		 *  translations differ with respect to page frame or attributes
-		 *  (e.g., permissions), processor behavior is undefined and may
-		 *  be implementation-specific."
-	 	 *
-	 	 * We do this global tlb flush inside the cpa_lock, so that we
-		 * don't allow any other cpu, with stale tlb entries change the
-		 * page attribute in parallel, that also falls into the
-		 * just split large page entry.
-	 	 */
-		flush_tlb_all();
+	if (!err)
 		goto repeat;
-	}
 
 	return err;
 }
@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	cache = !!pgprot2cachemode(mask_set);
 
 	/*
-	 * On success we use CLFLUSH, when the CPU supports it to
-	 * avoid the WBINVD. If the CPU does not support it and in the
-	 * error case we fall back to cpa_flush_all (which uses
-	 * WBINVD):
+	 * On error, flush everything to be sure.
 	 */
-	if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
-		if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
-			cpa_flush_array(addr, numpages, cache,
-					cpa.flags, pages);
-		} else
-			cpa_flush_range(baddr, numpages, cache);
-	} else
+	if (ret) {
 		cpa_flush_all(cache);
+		goto out;
+	}
+
+	if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+		cpa_flush_array(baddr, addr, numpages, cache,
+				cpa.flags, pages);
+	} else {
+		cpa_flush_range(baddr, numpages, cache);
+	}
 
 out:
 	return ret;
@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 	/*
 	 * Before changing the encryption attribute, we need to flush caches.
 	 */
-	if (static_cpu_has(X86_FEATURE_CLFLUSH))
-		cpa_flush_range(start, numpages, 1);
-	else
-		cpa_flush_all(1);
+	cpa_flush_range(start, numpages, 1);
 
 	ret = __change_page_attr_set_clr(&cpa, 1);
 
@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 	 * in case TLB flushing gets optimized in the cpa_flush_range()
 	 * path use the same logic as above.
 	 */
-	if (static_cpu_has(X86_FEATURE_CLFLUSH))
-		cpa_flush_range(start, numpages, 0);
-	else
-		cpa_flush_all(0);
+	cpa_flush_range(start, numpages, 0);
 
 	return ret;
 }

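The large page preservation rewrite earlier in this file boils down to a few arithmetic checks: compute the large page base address and its 4K page count, then keep the mapping only if the request starts at that base, covers the whole large page, and does not conflict with the static protections. Below is a small standalone sketch of just the alignment and length test, assuming the usual 4K/2M geometry; the constants are redefined locally for the example and the pgprot comparison and locking are left out.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)	/* 2 MiB large page */
#define PMD_MASK	(~(PMD_SIZE - 1))

/*
 * Decide whether a request of @numpages 4K pages starting at @address can
 * keep the enclosing 2M mapping intact. Mirrors the alignment/length test
 * in __should_split_large_page().
 */
static int must_split(unsigned long address, unsigned long numpages)
{
	unsigned long lpaddr = address & PMD_MASK;		/* large page base */
	unsigned long lp_pages = PMD_SIZE >> PAGE_SHIFT;	/* 512 pages */

	return address != lpaddr || numpages != lp_pages;
}

int main(void)
{
	/* Request covers the whole 2M page: can be preserved. */
	printf("aligned full request : %s\n",
	       must_split(0x200000, 512) ? "split" : "preserve");

	/* Partial request inside the 2M page: must be split. */
	printf("partial request      : %s\n",
	       must_split(0x201000, 16) ? "split" : "preserve");
	return 0;
}

Anything smaller or misaligned forces a split, which is exactly the case where __should_split_large_page() now returns 1.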
+ 102 - 65
arch/x86/mm/tlb.c

@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
+	bool need_flush;
+	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 
 		/*
-		 * We don't currently support having a real mm loaded without
-		 * our cpu set in mm_cpumask().  We have all the bookkeeping
-		 * in place to figure out whether we would need to flush
-		 * if our cpu were cleared in mm_cpumask(), but we don't
-		 * currently use it.
+		 * Even in lazy TLB mode, the CPU should stay set in the
+		 * mm_cpumask. The TLB shootdown code can figure out from
+		 * cpu_tlbstate.is_lazy whether or not to send an IPI.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		return;
+		/*
+		 * If the CPU is not in lazy TLB mode, we are just switching
+		 * from one thread in a process to another thread in the same
+		 * process. No TLB flush required.
+		 */
+		if (!was_lazy)
+			return;
+
+		/*
+		 * Read the tlb_gen to check whether a flush is needed.
+		 * If the TLB is up to date, just use it.
+		 * The barrier synchronizes with the tlb_gen increment in
+		 * the TLB shootdown code.
+		 */
+		smp_mb();
+		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
+				next_tlb_gen)
+			return;
+
+		/*
+		 * TLB contents went out of date while we were in lazy
+		 * mode. Fall through to the TLB switching code below.
+		 */
+		new_asid = prev_asid;
+		need_flush = true;
 	} else {
-		u16 new_asid;
-		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		/* Let nmi_uaccess_okay() know that we're changing CR3. */
 		this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
 		barrier();
+	}
 
-		if (need_flush) {
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			load_new_mm_cr3(next->pgd, new_asid, true);
-
-			/*
-			 * NB: This gets called via leave_mm() in the idle path
-			 * where RCU functions differently.  Tracing normally
-			 * uses RCU, so we need to use the _rcuidle variant.
-			 *
-			 * (There is no good reason for this.  The idle code should
-			 *  be rearranged to call this before rcu_idle_enter().)
-			 */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-		} else {
-			/* The new ASID is already up to date. */
-			load_new_mm_cr3(next->pgd, new_asid, false);
-
-			/* See above wrt _rcuidle. */
-			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
-		}
+	if (need_flush) {
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+		load_new_mm_cr3(next->pgd, new_asid, true);
 
 		/*
-		 * Record last user mm's context id, so we can avoid
-		 * flushing branch buffer with IBPB if we switch back
-		 * to the same user.
+		 * NB: This gets called via leave_mm() in the idle path
+		 * where RCU functions differently.  Tracing normally
+		 * uses RCU, so we need to use the _rcuidle variant.
+		 *
+		 * (There is no good reason for this.  The idle code should
+		 *  be rearranged to call this before rcu_idle_enter().)
 		 */
-		if (next != &init_mm)
-			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-		/* Make sure we write CR3 before loaded_mm. */
-		barrier();
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+	} else {
+		/* The new ASID is already up to date. */
+		load_new_mm_cr3(next->pgd, new_asid, false);
 
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+		/* See above wrt _rcuidle. */
+		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
 	}
 
-	load_mm_cr4(next);
-	switch_ldt(real_prev, next);
+	/*
+	 * Record last user mm's context id, so we can avoid
+	 * flushing branch buffer with IBPB if we switch back
+	 * to the same user.
+	 */
+	if (next != &init_mm)
+		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+
+	/* Make sure we write CR3 before loaded_mm. */
+	barrier();
+
+	this_cpu_write(cpu_tlbstate.loaded_mm, next);
+	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
+
+	if (next != real_prev) {
+		load_mm_cr4(next);
+		switch_ldt(real_prev, next);
+	}
 }
 
 /*
@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	if (tlb_defer_switch_to_init_mm()) {
-		/*
-		 * There's a significant optimization that may be possible
-		 * here.  We have accurate enough TLB flush tracking that we
-		 * don't need to maintain coherence of TLB per se when we're
-		 * lazy.  We do, however, need to maintain coherence of
-		 * paging-structure caches.  We could, in principle, leave our
-		 * old mm loaded and only switch to init_mm when
-		 * tlb_remove_page() happens.
-		 */
-		this_cpu_write(cpu_tlbstate.is_lazy, true);
-	} else {
-		switch_mm(NULL, &init_mm, NULL);
-	}
+	this_cpu_write(cpu_tlbstate.is_lazy, true);
 }
 
 /*
@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
+		 *
+		 * This should be rare, with native_flush_tlb_others skipping
+		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	    f->new_tlb_gen == local_tlb_gen + 1 &&
 	    f->new_tlb_gen == mm_tlb_gen) {
 		/* Partial flush */
-		unsigned long addr;
-		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
+		unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
+		unsigned long addr = f->start;
 
-		addr = f->start;
 		while (addr < f->end) {
 			__flush_tlb_one_user(addr);
-			addr += PAGE_SIZE;
+			addr += 1UL << f->stride_shift;
 		}
 		if (local)
-			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
-		trace_tlb_flush(reason, nr_pages);
+			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
+		trace_tlb_flush(reason, nr_invalidate);
 	} else {
 		/* Full flush. */
 		local_flush_tlb();
@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
 }
 
+static bool tlb_is_not_lazy(int cpu, void *data)
+{
+	return !per_cpu(cpu_tlbstate.is_lazy, cpu);
+}
+
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 					       (void *)info, 1);
 		return;
 	}
-	smp_call_function_many(cpumask, flush_tlb_func_remote,
+
+	/*
+	 * If no page tables were freed, we can skip sending IPIs to
+	 * CPUs in lazy TLB mode. They will flush their TLB at the next
+	 * context switch.
+	 *
+	 * However, if page tables are getting freed, we need to send the
+	 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
+	 * up on the new contents of what used to be page tables, while
+	 * doing a speculative memory access.
+	 */
+	if (info->freed_tables)
+		smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
+	else
+		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
+				(void *)info, 1, GFP_ATOMIC, cpumask);
 }
 
 /*
@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag)
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables)
 {
 	int cpu;
 
 	struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
 		.mm = mm,
+		.stride_shift = stride_shift,
+		.freed_tables = freed_tables,
 	};
 
 	cpu = get_cpu();
@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	/* Should we flush just the requested range? */
 	if ((end != TLB_FLUSH_ALL) &&
-	    !(vmflag & VM_HUGETLB) &&
-	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
+	    ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
 		info.start = start;
 		info.end = end;
 	} else {

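Two ideas carry the lazy TLB changes in this file: a CPU that went lazy keeps its old mm loaded and only flushes at the next real context switch if the mm's tlb_gen has moved on, and native_flush_tlb_others() skips IPIs to lazy CPUs unless page tables were freed. The toy model below shows those two decisions in isolation; the structures and field names are illustrative stand-ins, not the kernel's cpu_tlbstate or mm->context.

#include <stdbool.h>
#include <stdio.h>

struct toy_mm  { unsigned long long tlb_gen; };
struct toy_cpu { bool is_lazy; unsigned long long flushed_gen; };

/* Would switch_mm_irqs_off() need to flush when leaving lazy mode? */
static bool lazy_cpu_needs_flush(const struct toy_cpu *cpu,
				 const struct toy_mm *mm)
{
	return cpu->flushed_gen != mm->tlb_gen;
}

/* Would native_flush_tlb_others() send this CPU an IPI? */
static bool needs_ipi(const struct toy_cpu *cpu, bool freed_tables)
{
	/* Freed page tables force an IPI even to lazy TLB CPUs. */
	return freed_tables || !cpu->is_lazy;
}

int main(void)
{
	struct toy_mm  mm  = { .tlb_gen = 5 };
	struct toy_cpu cpu = { .is_lazy = true, .flushed_gen = 5 };

	printf("IPI needed (no freed tables): %d\n", needs_ipi(&cpu, false));
	printf("IPI needed (freed tables)   : %d\n", needs_ipi(&cpu, true));

	mm.tlb_gen++;	/* a flush happened while this CPU was lazy */
	printf("flush at next switch        : %d\n",
	       lazy_cpu_needs_flush(&cpu, &mm));
	return 0;
}

When page tables were freed the laziness predicate is bypassed, because a lazy CPU could still speculatively walk the stale tables.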
+ 2 - 0
arch/x86/xen/smp_pv.c

@@ -22,6 +22,7 @@
 #include <linux/tick.h>
 #include <linux/nmi.h>
 #include <linux/cpuhotplug.h>
+#include <linux/stackprotector.h>
 
 #include <asm/paravirt.h>
 #include <asm/desc.h>
@@ -88,6 +89,7 @@ static void cpu_bringup(void)
 asmlinkage __visible void cpu_bringup_and_idle(void)
 {
 	cpu_bringup();
+	boot_init_stack_canary();
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
 }
 

+ 12 - 2
drivers/iommu/amd_iommu_init.c

@@ -902,12 +902,22 @@ static bool copy_device_table(void)
 		}
 	}
 
-	old_devtb_phys = entry & PAGE_MASK;
+	/*
+	 * When SME is enabled in the first kernel, the entry includes the
+	 * memory encryption mask (sme_me_mask); we must remove it to obtain
+	 * the true physical address in the kdump kernel.
+	 */
+	old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
+
 	if (old_devtb_phys >= 0x100000000ULL) {
 		pr_err("The address of old device table is above 4G, not trustworthy!\n");
 		return false;
 	}
-	old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+	old_devtb = (sme_active() && is_kdump_kernel())
+		    ? (__force void *)ioremap_encrypted(old_devtb_phys,
+							dev_table_size)
+		    : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
+
 	if (!old_devtb)
 		return false;
 

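The key step above is stripping the SME encryption mask from the old device table entry before treating it as a physical address. A hedged sketch of that arithmetic follows; the mask bit and addresses are made up for illustration (SME uses the platform's C-bit, exposed in the kernel as sme_me_mask).

#include <stdio.h>

/* Hypothetical encryption mask; bit 47 is chosen purely for the demo. */
#define DEMO_SME_MASK	(1ULL << 47)
#define DEMO_PAGE_MASK	(~0xfffULL)

/* Mirror of the clear-mask-then-PAGE_MASK step in copy_device_table(). */
static unsigned long long devtb_phys(unsigned long long entry)
{
	return (entry & ~DEMO_SME_MASK) & DEMO_PAGE_MASK;
}

int main(void)
{
	/* Entry as written by the first kernel: phys addr | C-bit | low flags. */
	unsigned long long entry = 0x12345000ULL | DEMO_SME_MASK | 0x3;

	printf("raw entry      : %#llx\n", entry);
	printf("true phys addr : %#llx\n", devtb_phys(entry));
	return 0;
}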
+ 28 - 6
fs/proc/vmcore.c

@@ -24,6 +24,8 @@
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
+#include <linux/mem_encrypt.h>
+#include <asm/pgtable.h>
 #include <asm/io.h>
 #include "internal.h"
 
@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
 
 /* Reads a page from the oldmem device from given offset. */
 static ssize_t read_from_oldmem(char *buf, size_t count,
-				u64 *ppos, int userbuf)
+				u64 *ppos, int userbuf,
+				bool encrypted)
 {
 	unsigned long pfn, offset;
 	size_t nr_bytes;
@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
 		if (pfn_is_ram(pfn) == 0)
 			memset(buf, 0, nr_bytes);
 		else {
-			tmp = copy_oldmem_page(pfn, buf, nr_bytes,
-						offset, userbuf);
+			if (encrypted)
+				tmp = copy_oldmem_page_encrypted(pfn, buf,
+								 nr_bytes,
+								 offset,
+								 userbuf);
+			else
+				tmp = copy_oldmem_page(pfn, buf, nr_bytes,
+						       offset, userbuf);
+
 			if (tmp < 0)
 				return tmp;
 		}
@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
  */
 ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0);
+	return read_from_oldmem(buf, count, ppos, 0, false);
 }
 
 /*
@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
  */
 ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
 {
-	return read_from_oldmem(buf, count, ppos, 0);
+	return read_from_oldmem(buf, count, ppos, 0, sme_active());
 }
 
 /*
@@ -173,9 +183,20 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
 				  unsigned long from, unsigned long pfn,
 				  unsigned long size, pgprot_t prot)
 {
+	prot = pgprot_encrypted(prot);
 	return remap_pfn_range(vma, from, pfn, size, prot);
 }
 
+/*
+ * Architectures which support memory encryption override this.
+ */
+ssize_t __weak
+copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+			   unsigned long offset, int userbuf)
+{
+	return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
+}
+
 /*
  * Copy to either kernel or user space
  */
@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
 					    m->offset + m->size - *fpos,
 					    buflen);
 			start = m->paddr + *fpos - m->offset;
-			tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
+			tmp = read_from_oldmem(buffer, tsz, &start,
+					       userbuf, sme_active());
 			if (tmp < 0)
 				return tmp;
 			buflen -= tsz;

+ 4 - 0
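copy_oldmem_page_encrypted() above is defined __weak so the generic vmcore code keeps working on architectures without memory encryption, while x86 can provide a strong definition. A minimal illustration of that weak-symbol fallback with GCC/Clang; the function names below are invented for the example.

#include <stdio.h>

static long copy_plain(unsigned long pfn)
{
	return (long)pfn;	/* pretend this copies one page */
}

/*
 * Weak default: used unless another object file links in a strong
 * definition with the same name, which is how an architecture overrides
 * the generic copy_oldmem_page_encrypted() fallback.
 */
__attribute__((weak))
long copy_encrypted(unsigned long pfn)
{
	/* No encryption support: behave exactly like the plain copy. */
	return copy_plain(pfn);
}

int main(void)
{
	printf("copied pfn %ld\n", copy_encrypted(42));
	return 0;
}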
include/linux/crash_dump.h

@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
 
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
+extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
+					  size_t csize, unsigned long offset,
+					  int userbuf);
+
 void vmcore_cleanup(void);
 
 /* Architecture code defines this if there are other possible ELF

+ 4 - 0
include/linux/smp.h

@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 		smp_call_func_t func, void *info, bool wait,
 		gfp_t gfp_flags);
 
+void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
+		smp_call_func_t func, void *info, bool wait,
+		gfp_t gfp_flags, const struct cpumask *mask);
+
 int smp_call_function_single_async(int cpu, call_single_data_t *csd);
 
 #ifdef CONFIG_SMP

+ 6 - 0
kernel/kexec_core.c

@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 		}
 	}
 
+	/* Ensure that these pages are decrypted if SME is enabled. */
+	if (pages)
+		arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
+
 	return pages;
 }
 
@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 			result  = -ENOMEM;
 			goto out;
 		}
+		arch_kexec_post_alloc_pages(page_address(page), 1, 0);
 		ptr = kmap(page);
 		ptr += maddr & ~PAGE_MASK;
 		mchunk = min_t(size_t, mbytes,
@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 			result = copy_from_user(ptr, buf, uchunk);
 		kexec_flush_icache_page(page);
 		kunmap(page);
+		arch_kexec_pre_free_pages(page_address(page), 1);
 		if (result) {
 			result = -EFAULT;
 			goto out;

+ 63 - 78
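The kexec change brackets each crash page between a post-allocation hook and a pre-free hook, so an SME-capable architecture can map the page decrypted for the duration of the copy. A toy sketch of that bracket pattern; the hook bodies here only print, whereas the kernel's arch_kexec_post_alloc_pages()/arch_kexec_pre_free_pages() adjust the page protections.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative hooks standing in for the arch_kexec_*_pages() callbacks. */
static void post_alloc_hook(void *page) { printf("map %p decrypted\n", page); }
static void pre_free_hook(void *page)   { printf("restore %p\n", page); }

static int load_segment(const char *src, size_t len)
{
	void *page = malloc(len);

	if (!page)
		return -1;

	post_alloc_hook(page);		/* e.g. clear the encryption bit */
	memcpy(page, src, len);		/* copy the segment data */
	pre_free_hook(page);		/* undo before releasing the mapping */

	free(page);
	return 0;
}

int main(void)
{
	return load_segment("crash segment payload", 22);
}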
kernel/resource.c

@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
 
 EXPORT_SYMBOL(release_resource);
 
-/*
- * Finds the lowest iomem resource existing within [res->start.res->end).
- * The caller must specify res->start, res->end, res->flags, and optionally
- * desc.  If found, returns 0, res is overwritten, if not found, returns -1.
- * This function walks the whole tree and not just first level children until
- * and unless first_level_children_only is true.
+/**
+ * Finds the lowest iomem resource that covers part of [start..end].  The
+ * caller must specify start, end, flags, and desc (which may be
+ * IORES_DESC_NONE).
+ *
+ * If a resource is found, returns 0 and *res is overwritten with the part
+ * of the resource that's within [start..end]; if none is found, returns
+ * -1.
+ *
+ * This function walks the whole tree and not just first level children
+ * unless @first_lvl is true.
  */
-static int find_next_iomem_res(struct resource *res, unsigned long desc,
-			       bool first_level_children_only)
+static int find_next_iomem_res(resource_size_t start, resource_size_t end,
+			       unsigned long flags, unsigned long desc,
+			       bool first_lvl, struct resource *res)
 {
-	resource_size_t start, end;
 	struct resource *p;
-	bool sibling_only = false;
 
-	BUG_ON(!res);
-
-	start = res->start;
-	end = res->end;
-	BUG_ON(start >= end);
+	if (!res)
+		return -EINVAL;
 
-	if (first_level_children_only)
-		sibling_only = true;
+	if (start >= end)
+		return -EINVAL;
 
 	read_lock(&resource_lock);
 
-	for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
-		if ((p->flags & res->flags) != res->flags)
+	for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
+		if ((p->flags & flags) != flags)
 			continue;
 		if ((desc != IORES_DESC_NONE) && (desc != p->desc))
 			continue;
@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
 			p = NULL;
 			break;
 		}
-		if ((p->end >= start) && (p->start < end))
+		if ((p->end >= start) && (p->start <= end))
 			break;
 	}
 
 	read_unlock(&resource_lock);
 	if (!p)
 		return -1;
+
 	/* copy data */
-	if (res->start < p->start)
-		res->start = p->start;
-	if (res->end > p->end)
-		res->end = p->end;
+	res->start = max(start, p->start);
+	res->end = min(end, p->end);
 	res->flags = p->flags;
 	res->desc = p->desc;
 	return 0;
 }
 
-static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
-				 bool first_level_children_only,
-				 void *arg,
+static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
+				 unsigned long flags, unsigned long desc,
+				 bool first_lvl, void *arg,
 				 int (*func)(struct resource *, void *))
 {
-	u64 orig_end = res->end;
+	struct resource res;
 	int ret = -1;
 
-	while ((res->start < res->end) &&
-	       !find_next_iomem_res(res, desc, first_level_children_only)) {
-		ret = (*func)(res, arg);
+	while (start < end &&
+	       !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
+		ret = (*func)(&res, arg);
 		if (ret)
 			break;
 
-		res->start = res->end + 1;
-		res->end = orig_end;
+		start = res.end + 1;
 	}
 
 	return ret;
 }
 
-/*
+/**
  * Walks through iomem resources and calls func() with matching resource
  * ranges. This walks through whole tree and not just first level children.
  * All the memory ranges which overlap start,end and also match flags and
@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
 int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
 		u64 end, void *arg, int (*func)(struct resource *, void *))
 {
-	struct resource res;
-
-	res.start = start;
-	res.end = end;
-	res.flags = flags;
-
-	return __walk_iomem_res_desc(&res, desc, false, arg, func);
+	return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
 }
 EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
 
@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
  * ranges.
  */
 int walk_system_ram_res(u64 start, u64 end, void *arg,
-				int (*func)(struct resource *, void *))
+			int (*func)(struct resource *, void *))
 {
-	struct resource res;
+	unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
 
-	res.start = start;
-	res.end = end;
-	res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-	return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+	return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
 				     arg, func);
 }
 
@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 int walk_mem_res(u64 start, u64 end, void *arg,
 		 int (*func)(struct resource *, void *))
 {
-	struct resource res;
-
-	res.start = start;
-	res.end = end;
-	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 
-	return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+	return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
 				     arg, func);
 }
 
@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
  * It is to be used only for System RAM.
  */
 int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
-		void *arg, int (*func)(unsigned long, unsigned long, void *))
+			  void *arg, int (*func)(unsigned long, unsigned long, void *))
 {
+	resource_size_t start, end;
+	unsigned long flags;
 	struct resource res;
 	unsigned long pfn, end_pfn;
-	u64 orig_end;
 	int ret = -1;
 
-	res.start = (u64) start_pfn << PAGE_SHIFT;
-	res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
-	res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-	orig_end = res.end;
-	while ((res.start < res.end) &&
-		(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
+	start = (u64) start_pfn << PAGE_SHIFT;
+	end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
+	flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+	while (start < end &&
+	       !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
+				    true, &res)) {
 		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		end_pfn = (res.end + 1) >> PAGE_SHIFT;
 		if (end_pfn > pfn)
 			ret = (*func)(pfn, end_pfn - pfn, arg);
 		if (ret)
 			break;
-		res.start = res.end + 1;
-		res.end = orig_end;
+		start = res.end + 1;
 	}
 	return ret;
 }
@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
  * @constraint: the size and alignment constraints to be met.
  */
 static int reallocate_resource(struct resource *root, struct resource *old,
-			resource_size_t newsize,
-			struct resource_constraint  *constraint)
+			       resource_size_t newsize,
+			       struct resource_constraint *constraint)
 {
 	int err=0;
 	struct resource new = *old;
@@ -972,7 +957,7 @@ skip:
  * Existing children of the resource are assumed to be immutable.
  */
 int adjust_resource(struct resource *res, resource_size_t start,
-			resource_size_t size)
+		    resource_size_t size)
 {
 	int result;
 
@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
 }
 EXPORT_SYMBOL(adjust_resource);
 
-static void __init __reserve_region_with_split(struct resource *root,
-		resource_size_t start, resource_size_t end,
-		const char *name)
+static void __init
+__reserve_region_with_split(struct resource *root, resource_size_t start,
+			    resource_size_t end, const char *name)
 {
 	struct resource *parent = root;
 	struct resource *conflict;
@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
 
 }
 
-void __init reserve_region_with_split(struct resource *root,
-		resource_size_t start, resource_size_t end,
-		const char *name)
+void __init
+reserve_region_with_split(struct resource *root, resource_size_t start,
+			  resource_size_t end, const char *name)
 {
 	int abort = 0;
 
@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
  * The described resource region must match a currently busy region.
  */
 void __release_region(struct resource *parent, resource_size_t start,
-			resource_size_t n)
+		      resource_size_t n)
 {
 	struct resource **p;
 	resource_size_t end;
@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
  *   simplicity.  Enhance this logic when necessary.
  */
 int release_mem_region_adjustable(struct resource *parent,
-			resource_size_t start, resource_size_t size)
+				  resource_size_t start, resource_size_t size)
 {
 	struct resource **p;
 	struct resource *res;
@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
 		this->start == match->start && this->n == match->n;
 }
 
-struct resource * __devm_request_region(struct device *dev,
-				struct resource *parent, resource_size_t start,
-				resource_size_t n, const char *name)
+struct resource *
+__devm_request_region(struct device *dev, struct resource *parent,
+		      resource_size_t start, resource_size_t n, const char *name)
 {
 	struct region_devres *dr = NULL;
 	struct resource *res;

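The reworked find_next_iomem_res() treats [start..end] as an inclusive range (note the p->start <= end test) and the walk helpers advance with start = res.end + 1. The small userspace sketch below, over a made-up resource list, shows why the inclusive comparison matters: a resource that begins exactly at end is still visited.

#include <stdio.h>

/* Toy resource list; names and addresses are illustrative only. */
struct toy_res { unsigned long long start, end; const char *name; };

static const struct toy_res resources[] = {
	{ 0x1000, 0x1fff, "ram0" },
	{ 0x3000, 0x3fff, "ram1" },
};

/*
 * Find the lowest resource overlapping the inclusive range [start..end],
 * clipped to that range; the same contract as find_next_iomem_res().
 */
static int find_next(unsigned long long start, unsigned long long end,
		     struct toy_res *out)
{
	for (unsigned int i = 0; i < sizeof(resources) / sizeof(resources[0]); i++) {
		const struct toy_res *p = &resources[i];

		if (p->end >= start && p->start <= end) {	/* end-inclusive */
			out->start = p->start > start ? p->start : start;
			out->end   = p->end   < end   ? p->end   : end;
			out->name  = p->name;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	unsigned long long start = 0, end = 0x3000;	/* ends exactly at ram1 */
	struct toy_res res;

	/* Walk the range the way __walk_iomem_res_desc() does. */
	while (start < end && !find_next(start, end, &res)) {
		printf("%s: [%#llx..%#llx]\n", res.name, res.start, res.end);
		start = res.end + 1;
	}
	return 0;
}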
+ 0 - 15
kernel/sched/idle.c

@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
 
 void cpu_startup_entry(enum cpuhp_state state)
 {
-	/*
-	 * This #ifdef needs to die, but it's too late in the cycle to
-	 * make this generic (ARM and SH have never invoked the canary
-	 * init for the non boot CPUs!). Will be fixed in 3.11
-	 */
-#ifdef CONFIG_X86
-	/*
-	 * If we're the non-boot CPU, nothing set the stack canary up
-	 * for us. The boot CPU already has it initialized but no harm
-	 * in doing it again. This is a good place for updating it, as
-	 * we wont ever return from this function (so the invalid
-	 * canaries already on the stack wont ever trigger).
-	 */
-	boot_init_stack_canary();
-#endif
 	arch_cpu_idle_prepare();
 	cpuhp_online_idle(state);
 	while (1)

+ 0 - 1
kernel/sched/sched.h

@@ -56,7 +56,6 @@
 #include <linux/profile.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
-#include <linux/stackprotector.h>
 #include <linux/stop_machine.h>
 #include <linux/suspend.h>
 #include <linux/swait.h>

+ 14 - 5
kernel/smp.c

@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
  * You must not call this function with disabled interrupts or
  * from a hardware interrupt handler or from a bottom half handler.
  */
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
 			smp_call_func_t func, void *info, bool wait,
-			gfp_t gfp_flags)
+			gfp_t gfp_flags, const struct cpumask *mask)
 {
 	cpumask_var_t cpus;
 	int cpu, ret;
@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 
 	if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
 		preempt_disable();
-		for_each_online_cpu(cpu)
+		for_each_cpu(cpu, mask)
 			if (cond_func(cpu, info))
-				cpumask_set_cpu(cpu, cpus);
+				__cpumask_set_cpu(cpu, cpus);
 		on_each_cpu_mask(cpus, func, info, wait);
 		preempt_enable();
 		free_cpumask_var(cpus);
@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 		 * just have to IPI them one by one.
 		 */
 		preempt_disable();
-		for_each_online_cpu(cpu)
+		for_each_cpu(cpu, mask)
 			if (cond_func(cpu, info)) {
 				ret = smp_call_function_single(cpu, func,
 								info, wait);
@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 		preempt_enable();
 	}
 }
+EXPORT_SYMBOL(on_each_cpu_cond_mask);
+
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+			smp_call_func_t func, void *info, bool wait,
+			gfp_t gfp_flags)
+{
+	on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
+				cpu_online_mask);
+}
 EXPORT_SYMBOL(on_each_cpu_cond);
 
 static void do_nothing(void *unused)

+ 11 - 3
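on_each_cpu_cond_mask() runs the callback on every CPU in the mask for which the predicate returns true, and on_each_cpu_cond() becomes a thin wrapper passing cpu_online_mask. Below is a userspace model of that contract with CPUs reduced to bits in an integer mask; it is only meant to show the filtering behaviour the TLB flush code relies on, not the IPI machinery.

#include <stdbool.h>
#include <stdio.h>

#define NR_TOY_CPUS 8

typedef bool (*cond_fn)(int cpu, void *info);
typedef void (*work_fn)(int cpu, void *info);

/* Call @func on every CPU in @mask for which @cond returns true. */
static void toy_on_each_cpu_cond_mask(cond_fn cond, work_fn func, void *info,
				      unsigned int mask)
{
	for (int cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		if ((mask & (1u << cpu)) && cond(cpu, info))
			func(cpu, info);
}

/* Pretend odd-numbered CPUs are in lazy TLB mode and can be skipped. */
static bool toy_not_lazy(int cpu, void *info) { (void)info; return !(cpu & 1); }
static void toy_flush(int cpu, void *info)    { (void)info; printf("flush cpu %d\n", cpu); }

int main(void)
{
	unsigned int mask = 0x3f;	/* CPUs 0-5 */

	toy_on_each_cpu_cond_mask(toy_not_lazy, toy_flush, NULL, mask);
	return 0;
}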
kernel/up.c

@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
  * Preemption is disabled here to make sure the cond_func is called under the
 * same conditions in UP and SMP.
  */
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
-		      smp_call_func_t func, void *info, bool wait,
-		      gfp_t gfp_flags)
+void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
+			   smp_call_func_t func, void *info, bool wait,
+			   gfp_t gfp_flags, const struct cpumask *mask)
 {
 	unsigned long flags;
 
@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
 	}
 	preempt_enable();
 }
+EXPORT_SYMBOL(on_each_cpu_cond_mask);
+
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+		      smp_call_func_t func, void *info, bool wait,
+		      gfp_t gfp_flags)
+{
+	on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
+}
 EXPORT_SYMBOL(on_each_cpu_cond);
 
 int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)

+ 1 - 0
mm/pgtable-generic.c

@@ -8,6 +8,7 @@
  */
 
 #include <linux/pagemap.h>
+#include <linux/hugetlb.h>
 #include <asm/tlb.h>
 #include <asm-generic/pgtable.h>