
Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:

 - misc fixes

 - ocfs2 updates

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (118 commits)
  mm: remove PG_highmem description
  tools, vm: new option to specify kpageflags file
  mm/swap.c: make functions and their kernel-doc agree
  mm, memory_hotplug: fix memmap initialization
  mm: correct comments regarding do_fault_around()
  mm: numa: do not trap faults on shared data section pages.
  hugetlb, mbind: fall back to default policy if vma is NULL
  hugetlb, mempolicy: fix the mbind hugetlb migration
  mm, hugetlb: further simplify hugetlb allocation API
  mm, hugetlb: get rid of surplus page accounting tricks
  mm, hugetlb: do not rely on overcommit limit during migration
  mm, hugetlb: integrate giga hugetlb more naturally to the allocation path
  mm, hugetlb: unify core page allocation accounting and initialization
  mm/memcontrol.c: try harder to decrease [memory,memsw].limit_in_bytes
  mm/memcontrol.c: make local symbol static
  mm/hmm: fix uninitialized use of 'entry' in hmm_vma_walk_pmd()
  include/linux/mmzone.h: fix explanation of lower bits in the SPARSEMEM mem_map pointer
  mm/compaction.c: fix comment for try_to_compact_pages()
  mm/page_ext.c: make page_ext_init a noop when CONFIG_PAGE_EXTENSION but nothing uses it
  zsmalloc: use U suffix for negative literals being shifted
  ...
Linus Torvalds, 7 years ago
commit 73da9e1a9f
100 changed files with 2026 additions and 1445 deletions
    1. Documentation/sysctl/vm.txt (+0, -25)
    2. Documentation/vm/hugetlbpage.txt (+18, -9)
    3. arch/arc/include/asm/hugepage.h (+3, -0)
    4. arch/arm/include/asm/pgtable-3level.h (+3, -0)
    5. arch/arm64/include/asm/pgtable.h (+7, -0)
    6. arch/m32r/kernel/traps.c (+0, -8)
    7. arch/mips/include/asm/pgtable.h (+3, -0)
    8. arch/powerpc/Kconfig (+0, -1)
    9. arch/powerpc/include/asm/book3s/64/hash-4k.h (+0, -2)
   10. arch/powerpc/include/asm/book3s/64/hash-64k.h (+0, -2)
   11. arch/powerpc/include/asm/book3s/64/pgtable.h (+2, -11)
   12. arch/powerpc/include/asm/book3s/64/radix.h (+0, -6)
   13. arch/powerpc/mm/pgtable-book3s64.c (+5, -2)
   14. arch/powerpc/mm/pgtable-hash64.c (+0, -22)
   15. arch/s390/Kconfig (+0, -1)
   16. arch/s390/include/asm/pgtable.h (+2, -2)
   17. arch/sparc/include/asm/pgtable_64.h (+1, -1)
   18. arch/sparc/mm/tlb.c (+18, -5)
   19. arch/x86/Kconfig (+0, -1)
   20. arch/x86/include/asm/pgtable-3level.h (+36, -1)
   21. arch/x86/include/asm/pgtable.h (+15, -0)
   22. drivers/infiniband/hw/hfi1/mmu_rb.c (+1, -0)
   23. drivers/iommu/amd_iommu_v2.c (+1, -0)
   24. drivers/iommu/intel-svm.c (+1, -0)
   25. drivers/misc/sgi-gru/grutlbpurge.c (+1, -0)
   26. fs/dax.c (+7, -14)
   27. fs/fcntl.c (+1, -1)
   28. fs/hugetlbfs/inode.c (+27, -12)
   29. fs/ocfs2/acl.c (+6, -0)
   30. fs/ocfs2/alloc.c (+242, -19)
   31. fs/ocfs2/alloc.h (+1, -0)
   32. fs/ocfs2/aops.c (+9, -1)
   33. fs/ocfs2/cluster/quorum.c (+3, -2)
   34. fs/ocfs2/cluster/tcp_internal.h (+1, -1)
   35. fs/ocfs2/dir.c (+1, -1)
   36. fs/ocfs2/dlm/dlmmaster.c (+0, -7)
   37. fs/ocfs2/dlmglue.c (+131, -5)
   38. fs/ocfs2/dlmglue.h (+34, -1)
   39. fs/ocfs2/extent_map.c (+45, -0)
   40. fs/ocfs2/extent_map.h (+3, -0)
   41. fs/ocfs2/file.c (+80, -21)
   42. fs/ocfs2/journal.c (+12, -11)
   43. fs/ocfs2/mmap.c (+1, -1)
   44. fs/ocfs2/ocfs2.h (+1, -0)
   45. fs/ocfs2/ocfs2_lockid.h (+5, -0)
   46. fs/ocfs2/ocfs2_trace.h (+6, -4)
   47. fs/ocfs2/suballoc.c (+5, -3)
   48. fs/ocfs2/super.c (+8, -5)
   49. fs/ocfs2/xattr.c (+4, -1)
   50. fs/proc/task_mmu.c (+12, -7)
   51. fs/userfaultfd.c (+13, -62)
   52. include/asm-generic/pgtable.h (+16, -9)
   53. include/linux/hugetlb.h (+17, -4)
   54. include/linux/memcontrol.h (+101, -64)
   55. include/linux/mm.h (+16, -10)
   56. include/linux/mm_types.h (+74, -80)
   57. include/linux/mmu_notifier.h (+27, -3)
   58. include/linux/mmzone.h (+10, -2)
   59. include/linux/page-flags.h (+0, -5)
   60. include/linux/pagevec.h (+3, -3)
   61. include/linux/sched/mm.h (+2, -22)
   62. include/linux/shmem_fs.h (+2, -4)
   63. include/linux/swap.h (+0, -2)
   64. include/linux/vmstat.h (+0, -17)
   65. include/linux/zpool.h (+2, -0)
   66. include/trace/events/vmscan.h (+10, -13)
   67. kernel/fork.c (+236, -212)
   68. kernel/sysctl.c (+0, -7)
   69. mm/Kconfig (+1, -6)
   70. mm/compaction.c (+1, -1)
   71. mm/fadvise.c (+9, -1)
   72. mm/filemap.c (+0, -1)
   73. mm/hmm.c (+1, -3)
   74. mm/huge_memory.c (+37, -45)
   75. mm/hugetlb.c (+215, -162)
   76. mm/interval_tree.c (+1, -1)
   77. mm/khugepaged.c (+9, -6)
   78. mm/kmemleak.c (+0, -1)
   79. mm/memcontrol.c (+95, -176)
   80. mm/memory.c (+53, -27)
   81. mm/memory_hotplug.c (+3, -6)
   82. mm/mempolicy.c (+29, -10)
   83. mm/migrate.c (+1, -2)
   84. mm/mmu_notifier.c (+31, -0)
   85. mm/mprotect.c (+5, -0)
   86. mm/nommu.c (+0, -7)
   87. mm/oom_kill.c (+11, -10)
   88. mm/page_alloc.c (+93, -81)
   89. mm/page_ext.c (+2, -0)
   90. mm/page_owner.c (+8, -12)
   91. mm/pgtable-generic.c (+3, -3)
   92. mm/shmem.c (+33, -26)
   93. mm/slab.c (+0, -4)
   94. mm/slab.h (+0, -3)
   95. mm/slab_common.c (+29, -27)
   96. mm/slub.c (+6, -6)
   97. mm/sparse.c (+5, -1)
   98. mm/swap.c (+13, -14)
   99. mm/truncate.c (+7, -16)
  100. mm/vmscan.c (+34, -52)

+ 0 - 25
Documentation/sysctl/vm.txt

@@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm:
 - dirty_writeback_centisecs
 - drop_caches
 - extfrag_threshold
-- hugepages_treat_as_movable
 - hugetlb_shm_group
 - laptop_mode
 - legacy_va_layout
@@ -261,30 +260,6 @@ any throttling.
 
 ==============================================================
 
-hugepages_treat_as_movable
-
-This parameter controls whether we can allocate hugepages from ZONE_MOVABLE
-or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE.
-ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified,
-so this parameter has no effect if used without kernelcore=.
-
-Hugepage migration is now available in some situations which depend on the
-architecture and/or the hugepage size. If a hugepage supports migration,
-allocation from ZONE_MOVABLE is always enabled for the hugepage regardless
-of the value of this parameter.
-IOW, this parameter affects only non-migratable hugepages.
-
-Assuming that hugepages are not migratable in your system, one usecase of
-this parameter is that users can make hugepage pool more extensible by
-enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE
-page reclaim/migration/compaction work more and you can get contiguous
-memory more likely. Note that using ZONE_MOVABLE for non-migratable
-hugepages can do harm to other features like memory hotremove (because
-memory hotremove expects that memory blocks on ZONE_MOVABLE are always
-removable,) so it's a trade-off responsible for the users.
-
-==============================================================
-
 hugetlb_shm_group
 
 hugetlb_shm_group contains group id that is allowed to create SysV

+ 18 - 9
Documentation/vm/hugetlbpage.txt

@@ -20,19 +20,20 @@ options.
 
 The /proc/meminfo file provides information about the total number of
 persistent hugetlb pages in the kernel's huge page pool.  It also displays
-information about the number of free, reserved and surplus huge pages and the
-default huge page size.  The huge page size is needed for generating the
-proper alignment and size of the arguments to system calls that map huge page
-regions.
+default huge page size and information about the number of free, reserved
+and surplus huge pages in the pool of huge pages of default size.
+The huge page size is needed for generating the proper alignment and
+size of the arguments to system calls that map huge page regions.
 
 The output of "cat /proc/meminfo" will include lines like:
 
 .....
-HugePages_Total: vvv
-HugePages_Free:  www
-HugePages_Rsvd:  xxx
-HugePages_Surp:  yyy
-Hugepagesize:    zzz kB
+HugePages_Total: uuu
+HugePages_Free:  vvv
+HugePages_Rsvd:  www
+HugePages_Surp:  xxx
+Hugepagesize:    yyy kB
+Hugetlb:         zzz kB
 
 where:
 HugePages_Total is the size of the pool of huge pages.
@@ -47,6 +48,14 @@ HugePages_Surp  is short for "surplus," and is the number of huge pages in
                 the pool above the value in /proc/sys/vm/nr_hugepages. The
                 maximum number of surplus huge pages is controlled by
                 /proc/sys/vm/nr_overcommit_hugepages.
+Hugepagesize    is the default hugepage size (in kB).
+Hugetlb         is the total amount of memory (in kB), consumed by huge
+                pages of all sizes.
+                If huge pages of different sizes are in use, this number
+                will exceed HugePages_Total * Hugepagesize. To get more
+                detailed information, please refer to
+                /sys/kernel/mm/hugepages (described below).
+
 
 /proc/filesystems should also show a filesystem of type "hugetlbfs" configured
 in the kernel.
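
For illustration of the new Hugetlb line (hypothetical numbers): on a system whose default huge page size is 2048 kB, with 10 default-size pages in the pool and two additional 1 GiB pages allocated, /proc/meminfo would show HugePages_Total: 10 and Hugepagesize: 2048 kB, so HugePages_Total * Hugepagesize is only 20480 kB, while Hugetlb reports 20480 + 2 * 1048576 = 2117632 kB, because it counts huge pages of every size.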

+ 3 - 0
arch/arc/include/asm/hugepage.h

@@ -74,4 +74,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 				unsigned long end);
 
+/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/
+#define pmdp_establish generic_pmdp_establish
+
 #endif
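
The generic_pmdp_establish() helper referenced here is added elsewhere in this merge (mm/pgtable-generic.c is among the changed files). Roughly, for architectures without hardware dirty/accessed bits it can be a plain, non-atomic swap; a sketch, not a verbatim quote:

pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address,
			     pmd_t *pmdp, pmd_t pmd)
{
	/* Safe only when hardware never sets dirty/accessed bits behind our
	 * back, so a non-atomic read-then-write cannot lose updates. */
	pmd_t old_pmd = *pmdp;

	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
	return old_pmd;
}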

+ 3 - 0
arch/arm/include/asm/pgtable-3level.h

@@ -249,6 +249,9 @@ PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 #define pfn_pmd(pfn,prot)	(__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)))
 #define mk_pmd(page,prot)	pfn_pmd(page_to_pfn(page),prot)
 
+/* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */
+#define pmdp_establish generic_pmdp_establish
+
 /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {

+ 7 - 0
arch/arm64/include/asm/pgtable.h

@@ -706,6 +706,13 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 {
 	ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
 }
+
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+	return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
+}
 #endif
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];

+ 0 - 8
arch/m32r/kernel/traps.c

@@ -115,14 +115,6 @@ static void set_eit_vector_entries(void)
 	_flush_cache_copyback_all();
 }
 
-void abort(void)
-{
-	BUG();
-
-	/* if that doesn't kill us, halt */
-	panic("Oops failed to kill thread");
-}
-
 void __init trap_init(void)
 {
 	set_eit_vector_entries();

+ 3 - 0
arch/mips/include/asm/pgtable.h

@@ -534,6 +534,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
+/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/
+#define pmdp_establish generic_pmdp_establish
+
 #define has_transparent_hugepage has_transparent_hugepage
 extern int has_transparent_hugepage(void);
 

+ 0 - 1
arch/powerpc/Kconfig

@@ -151,7 +151,6 @@ config PPC
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select ARCH_SUPPORTS_ATOMIC_RMW
-	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_WANT_IPC_PARSE_VERSION

+ 0 - 2
arch/powerpc/include/asm/book3s/64/hash-4k.h

@@ -101,8 +101,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
 extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 					 pgtable_t pgtable);
 extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-				      unsigned long address, pmd_t *pmdp);
 extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 				       unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);

+ 0 - 2
arch/powerpc/include/asm/book3s/64/hash-64k.h

@@ -203,8 +203,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
 extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 					 pgtable_t pgtable);
 extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-				      unsigned long address, pmd_t *pmdp);
 extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 				       unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);

+ 2 - 11
arch/powerpc/include/asm/book3s/64/pgtable.h

@@ -1137,17 +1137,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm,
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-			    pmd_t *pmdp);
-
-#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-					   unsigned long address, pmd_t *pmdp)
-{
-	if (radix_enabled())
-		return radix__pmdp_huge_split_prepare(vma, address, pmdp);
-	return hash__pmdp_huge_split_prepare(vma, address, pmdp);
-}
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+			     pmd_t *pmdp);
 
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;

+ 0 - 6
arch/powerpc/include/asm/book3s/64/radix.h

@@ -269,12 +269,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
 		return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
 	return __pmd(pmd_val(pmd) | _PAGE_PTE);
 }
-static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-					    unsigned long address, pmd_t *pmdp)
-{
-	/* Nothing to do for radix. */
-	return;
-}
 
 extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
 					  pmd_t *pmdp, unsigned long clr,

+ 5 - 2
arch/powerpc/mm/pgtable-book3s64.c

@@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
  */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
-	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+	unsigned long old_pmd;
+
+	old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	/*
 	 * This ensures that generic code that rely on IRQ disabling
 	 * to prevent a parallel THP split work as expected.
 	 */
 	serialize_against_pte_lookup(vma->vm_mm);
+	return __pmd(old_pmd);
 }
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)

+ 0 - 22
arch/powerpc/mm/pgtable-hash64.c

@@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 	return pgtable;
 }
 
-void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-			       unsigned long address, pmd_t *pmdp)
-{
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
-	VM_BUG_ON(pmd_devmap(*pmdp));
-
-	/*
-	 * We can't mark the pmd none here, because that will cause a race
-	 * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
-	 * we spilt, but at the same time we wan't rest of the ppc64 code
-	 * not to insert hash pte on this, because we will be modifying
-	 * the deposited pgtable in the caller of this function. Hence
-	 * clear the _PAGE_USER so that we move the fault handling to
-	 * higher level function and that will serialize against ptl.
-	 * We need to flush existing hash pte entries here even though,
-	 * the translation is still valid, because we will withdraw
-	 * pgtable_t after this.
-	 */
-	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
-}
-
 /*
  * A linux hugepage PMD was changed and the corresponding hash table entries
  * neesd to be flushed.

+ 0 - 1
arch/s390/Kconfig

@@ -108,7 +108,6 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
 	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_SUPPORTS_ATOMIC_RMW
-	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF

+ 2 - 2
arch/s390/include/asm/pgtable.h

@@ -1505,12 +1505,12 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-static inline void pmdp_invalidate(struct vm_area_struct *vma,
+static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
 
-	pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
+	return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd);
 }
 
 #define __HAVE_ARCH_PMDP_SET_WRPROTECT

+ 1 - 1
arch/sparc/include/asm/pgtable_64.h

@@ -1010,7 +1010,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 			  pmd_t *pmd);
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 			    pmd_t *pmdp);
 
 #define __HAVE_ARCH_PGTABLE_DEPOSIT

+ 18 - 5
arch/sparc/mm/tlb.c

@@ -219,17 +219,28 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	}
 }
 
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+	pmd_t old;
+
+	do {
+		old = *pmdp;
+	} while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
+
+	return old;
+}
+
 /*
  * This routine is only called when splitting a THP
  */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
-	pmd_t entry = *pmdp;
-
-	pmd_val(entry) &= ~_PAGE_VALID;
+	pmd_t old, entry;
 
-	set_pmd_at(vma->vm_mm, address, pmdp, entry);
+	entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID);
+	old = pmdp_establish(vma, address, pmdp, entry);
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 
 	/*
@@ -240,6 +251,8 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 	if ((pmd_val(entry) & _PAGE_PMD_HUGE) &&
 	    !is_huge_zero_page(pmd_page(entry)))
 		(vma->vm_mm)->context.thp_pte_count--;
+
+	return old;
 }
 
 void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,

+ 0 - 1
arch/x86/Kconfig

@@ -69,7 +69,6 @@ config X86
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select ARCH_SUPPORTS_ATOMIC_RMW
-	select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_QUEUED_RWLOCKS

+ 36 - 1
arch/x86/include/asm/pgtable-3level.h

@@ -158,7 +158,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
 #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
 #endif
 
-#ifdef CONFIG_SMP
 union split_pmd {
 	struct {
 		u32 pmd_low;
@@ -166,6 +165,8 @@ union split_pmd {
 	};
 	pmd_t pmd;
 };
+
+#ifdef CONFIG_SMP
 static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 {
 	union split_pmd res, *orig = (union split_pmd *)pmdp;
@@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+	pmd_t old;
+
+	/*
+	 * If pmd has present bit cleared we can get away without expensive
+	 * cmpxchg64: we can update pmdp half-by-half without racing with
+	 * anybody.
+	 */
+	if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+		union split_pmd old, new, *ptr;
+
+		ptr = (union split_pmd *)pmdp;
+
+		new.pmd = pmd;
+
+		/* xchg acts as a barrier before setting of the high bits */
+		old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
+		old.pmd_high = ptr->pmd_high;
+		ptr->pmd_high = new.pmd_high;
+		return old.pmd;
+	}
+
+	do {
+		old = *pmdp;
+	} while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
+
+	return old;
+}
+#endif
+
 #ifdef CONFIG_SMP
 union split_pud {
 	struct {

+ 15 - 0
arch/x86/include/asm/pgtable.h

@@ -1109,6 +1109,21 @@ static inline int pud_write(pud_t pud)
 	return pud_flags(pud) & _PAGE_RW;
 }
 
+#ifndef pmdp_establish
+#define pmdp_establish pmdp_establish
+static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmdp, pmd_t pmd)
+{
+	if (IS_ENABLED(CONFIG_SMP)) {
+		return xchg(pmdp, pmd);
+	} else {
+		pmd_t old = *pmdp;
+		*pmdp = pmd;
+		return old;
+	}
+}
+#endif
+
 /*
  * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
  *
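
Several architectures above convert pmdp_invalidate() to return the old pmd and gain a pmdp_establish() helper. For context, a sketch of how the generic code is expected to tie the two together (modelled on the mm/pgtable-generic.c change listed in this merge; simplified, not a verbatim quote):

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pmd_t *pmdp)
{
	/* Atomically install a not-present pmd and hand back the old value,
	 * so callers no longer lose dirty/accessed bits that hardware sets
	 * between clearing and re-reading the entry. */
	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));

	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return old;
}
#endif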

+ 1 - 0
drivers/infiniband/hw/hfi1/mmu_rb.c

@@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler,
 static void handle_remove(struct work_struct *work);
 
 static const struct mmu_notifier_ops mn_opts = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start = mmu_notifier_range_start,
 };
 

+ 1 - 0
drivers/iommu/amd_iommu_v2.c

@@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops iommu_mn = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.release		= mn_release,
 	.clear_flush_young      = mn_clear_flush_young,
 	.invalidate_range       = mn_invalidate_range,

+ 1 - 0
drivers/iommu/intel-svm.c

@@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
 }
 
 static const struct mmu_notifier_ops intel_mmuops = {
+	.flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.release = intel_mm_release,
 	.change_pte = intel_change_pte,
 	.invalidate_range = intel_invalidate_range,

+ 1 - 0
drivers/misc/sgi-gru/grutlbpurge.c

@@ -258,6 +258,7 @@ static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
 
 
 static const struct mmu_notifier_ops gru_mmuops = {
+	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
 	.invalidate_range_start	= gru_invalidate_range_start,
 	.invalidate_range_end	= gru_invalidate_range_end,
 	.release		= gru_release,
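
The hfi1, amd_iommu_v2, intel-svm and sgi-gru hunks all set the same new ops flag. Its definition lands in include/linux/mmu_notifier.h (also changed in this merge); sketched here, with the numeric value shown only as an assumption:

/* Set by drivers whose invalidate callbacks never sleep; this lets the
 * oom reaper safely tear down the mappings of a dying process while such
 * notifiers are registered. Value assumed for illustration. */
#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)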

+ 7 - 14
fs/dax.c

@@ -44,6 +44,7 @@
 
 /* The 'colour' (ie low bits) within a PMD of a page offset.  */
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
+#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)
 
 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 
@@ -375,8 +376,8 @@ restart:
 		 * unmapped.
 		 */
 		if (pmd_downgrade && dax_is_zero_entry(entry))
-			unmap_mapping_range(mapping,
-				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
+							PG_PMD_NR, false);
 
 		err = radix_tree_preload(
 				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
@@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
 		/* we are replacing a zero page with block mapping */
 		if (dax_is_pmd_entry(entry))
-			unmap_mapping_range(mapping,
-					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
-					PMD_SIZE, 0);
+			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
+							PG_PMD_NR, false);
 		else /* pte entry */
-			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-					PAGE_SIZE, 0);
+			unmap_mapping_pages(mapping, vmf->pgoff, 1, false);
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
@@ -636,8 +635,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 			pmd = pmd_mkclean(pmd);
 			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
 unlock_pmd:
-			spin_unlock(ptl);
 #endif
+			spin_unlock(ptl);
 		} else {
 			if (pfn != pte_pfn(*ptep))
 				goto unlock_pte;
@@ -1269,12 +1268,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 }
 
 #ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
-
 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 		void *entry)
 {
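
The dax.c hunks above switch from byte-based unmap_mapping_range() calls to the new unmap_mapping_pages() helper (declared in include/linux/mm.h, also changed in this merge). Inferred from the call sites, the relationship is roughly:

void unmap_mapping_pages(struct address_space *mapping,
			 pgoff_t start, pgoff_t nr, bool even_cows);

/* unmap_mapping_pages(mapping, index, nr, even_cows) covers the same range as
 *
 *	unmap_mapping_range(mapping, (loff_t)index << PAGE_SHIFT,
 *			    (loff_t)nr << PAGE_SHIFT, even_cows);
 *
 * but takes page-cache indices directly, so callers such as the DAX code
 * above no longer convert offsets to bytes themselves. */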

+ 1 - 1
fs/fcntl.c

@@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		break;
 	case F_ADD_SEALS:
 	case F_GET_SEALS:
-		err = shmem_fcntl(filp, cmd, arg);
+		err = memfd_fcntl(filp, cmd, arg);
 		break;
 	case F_GET_RW_HINT:
 	case F_SET_RW_HINT:

+ 27 - 12
fs/hugetlbfs/inode.c

@@ -55,16 +55,6 @@ struct hugetlbfs_config {
 	umode_t			mode;
 };
 
-struct hugetlbfs_inode_info {
-	struct shared_policy policy;
-	struct inode vfs_inode;
-};
-
-static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
-{
-	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
-}
-
 int sysctl_hugetlb_shm_group;
 
 enum {
@@ -520,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 	if (hole_end > hole_start) {
 		struct address_space *mapping = inode->i_mapping;
+		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
 		inode_lock(inode);
+
+		/* protected by i_mutex */
+		if (info->seals & F_SEAL_WRITE) {
+			inode_unlock(inode);
+			return -EPERM;
+		}
+
 		i_mmap_lock_write(mapping);
 		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
 			hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -539,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t len)
 {
 	struct inode *inode = file_inode(file);
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct hstate *h = hstate_inode(inode);
 	struct vm_area_struct pseudo_vma;
@@ -570,6 +569,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 	if (error)
 		goto out;
 
+	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
+		error = -EPERM;
+		goto out;
+	}
+
 	/*
 	 * Initialize a pseudo vma as this is required by the huge page
 	 * allocation routines.  If NUMA is configured, use page index
@@ -660,6 +664,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 	struct hstate *h = hstate_inode(inode);
 	int error;
 	unsigned int ia_valid = attr->ia_valid;
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
 	BUG_ON(!inode);
 
@@ -668,9 +673,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return error;
 
 	if (ia_valid & ATTR_SIZE) {
-		if (attr->ia_size & ~huge_page_mask(h))
+		loff_t oldsize = inode->i_size;
+		loff_t newsize = attr->ia_size;
+
+		if (newsize & ~huge_page_mask(h))
 			return -EINVAL;
-		error = hugetlb_vmtruncate(inode, attr->ia_size);
+		/* protected by i_mutex */
+		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
+		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
+			return -EPERM;
+		error = hugetlb_vmtruncate(inode, newsize);
 		if (error)
 			return error;
 	}
@@ -722,6 +734,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 
 	inode = new_inode(sb);
 	if (inode) {
+		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
@@ -729,6 +743,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 		inode->i_mapping->private_data = resv_map;
+		info->seals = F_SEAL_SEAL;
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
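
Together with the fcntl.c hunk above (shmem_fcntl() becoming memfd_fcntl()), these changes let hugetlbfs-backed memfds honour file seals. A minimal user-space sketch of the behaviour the new checks enforce; it assumes glibc exposes memfd_create() (2.27 or later), a kernel with MFD_HUGETLB support, a 2 MiB default huge page size and at least one free huge page:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("huge", MFD_HUGETLB | MFD_ALLOW_SEALING);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	/* size must be a multiple of the huge page size */
	if (ftruncate(fd, 2 * 1024 * 1024) < 0) {
		perror("ftruncate");
		return 1;
	}

	/* forbid any further growth of the file */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW) < 0) {
		perror("F_ADD_SEALS");
		return 1;
	}

	/* with the hugetlbfs_setattr() check above, this now fails with EPERM */
	if (ftruncate(fd, 4 * 1024 * 1024) < 0)
		perror("ftruncate after F_SEAL_GROW");

	close(fd);
	return 0;
}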

+ 6 - 0
fs/ocfs2/acl.c

@@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 	if (had_lock < 0)
 		return ERR_PTR(had_lock);
 
+	down_read(&OCFS2_I(inode)->ip_xattr_sem);
 	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+	up_read(&OCFS2_I(inode)->ip_xattr_sem);
 
 	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
 	brelse(di_bh);
@@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
 	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
 		return 0;
 
+	down_read(&OCFS2_I(inode)->ip_xattr_sem);
 	acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
+	up_read(&OCFS2_I(inode)->ip_xattr_sem);
 	if (IS_ERR(acl) || !acl)
 		return PTR_ERR(acl);
 	ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
@@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle,
 
 	if (!S_ISLNK(inode->i_mode)) {
 		if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+			down_read(&OCFS2_I(dir)->ip_xattr_sem);
 			acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
 						   dir_bh);
+			up_read(&OCFS2_I(dir)->ip_xattr_sem);
 			if (IS_ERR(acl))
 				return PTR_ERR(acl);
 		}

+ 242 - 19
fs/ocfs2/alloc.c

@@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
 				     struct ocfs2_extent_rec *rec);
 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+					struct ocfs2_extent_tree *et,
+					struct buffer_head **new_eb_bh,
+					int blk_wanted, int *blk_given);
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
+
 static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
@@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 	if (!obj)
 		obj = (void *)bh->b_data;
 	et->et_object = obj;
+	et->et_dealloc = NULL;
 
 	et->et_ops->eo_fill_root_el(et);
 	if (!et->et_ops->eo_fill_max_leaf_clusters)
@@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle,
 			    struct buffer_head **last_eb_bh,
 			    struct ocfs2_alloc_context *meta_ac)
 {
-	int status, new_blocks, i;
+	int status, new_blocks, i, block_given = 0;
 	u64 next_blkno, new_last_eb_blk;
 	struct buffer_head *bh;
 	struct buffer_head **new_eb_bhs = NULL;
@@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle,
 		goto bail;
 	}
 
-	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
-					   meta_ac, new_eb_bhs);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
+	/* Firstly, try to reuse dealloc since we have already estimated how
+	 * many extent blocks we may use.
+	 */
+	if (!ocfs2_is_dealloc_empty(et)) {
+		status = ocfs2_reuse_blk_from_dealloc(handle, et,
+						      new_eb_bhs, new_blocks,
+						      &block_given);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+	}
+
+	BUG_ON(block_given > new_blocks);
+
+	if (block_given < new_blocks) {
+		BUG_ON(!meta_ac);
+		status = ocfs2_create_new_meta_bhs(handle, et,
+						   new_blocks - block_given,
+						   meta_ac,
+						   &new_eb_bhs[block_given]);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
 	}
 
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
 				  struct ocfs2_alloc_context *meta_ac,
 				  struct buffer_head **ret_new_eb_bh)
 {
-	int status, i;
+	int status, i, block_given = 0;
 	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
-	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
-					   &new_eb_bh);
+	if (!ocfs2_is_dealloc_empty(et)) {
+		status = ocfs2_reuse_blk_from_dealloc(handle, et,
+						      &new_eb_bh, 1,
+						      &block_given);
+	} else if (meta_ac) {
+		status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
+						   &new_eb_bh);
+
+	} else {
+		BUG();
+	}
+
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
 	int depth = le16_to_cpu(el->l_tree_depth);
 	struct buffer_head *bh = NULL;
 
-	BUG_ON(meta_ac == NULL);
+	BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
 
 	shift = ocfs2_find_branch_target(et, &bh);
 	if (shift < 0) {
@@ -2598,11 +2636,8 @@ static void ocfs2_unlink_subtree(handle_t *handle,
 	int i;
 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
-	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_block *eb;
 
-	el = path_leaf_el(left_path);
-
 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
 
 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
@@ -3938,7 +3973,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
 					   struct ocfs2_path *path,
 					   struct ocfs2_extent_rec *insert_rec)
 {
-	int ret, i, next_free;
+	int i, next_free;
 	struct buffer_head *bh;
 	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
@@ -3955,7 +3990,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle,
 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
 				    "Owner %llu has a bad extent list\n",
 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
-			ret = -EIO;
 			return;
 		}
 
@@ -5057,7 +5091,6 @@ int ocfs2_split_extent(handle_t *handle,
 	struct buffer_head *last_eb_bh = NULL;
 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
 	struct ocfs2_merge_ctxt ctxt;
-	struct ocfs2_extent_list *rightmost_el;
 
 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
@@ -5093,9 +5126,7 @@ int ocfs2_split_extent(handle_t *handle,
 		}
 
 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		rightmost_el = &eb->h_list;
-	} else
-		rightmost_el = path_root_el(path);
+	}
 
 	if (rec->e_cpos == split_rec->e_cpos &&
 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
@@ -6585,6 +6616,154 @@ ocfs2_find_per_slot_free_list(int type,
 	return fl;
 }
 
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_preferred_free_list(int type,
+			       int preferred_slot,
+			       int *real_slot,
+			       struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+	while (fl) {
+		if (fl->f_inode_type == type && fl->f_slot == preferred_slot) {
+			*real_slot = fl->f_slot;
+			return fl;
+		}
+
+		fl = fl->f_next_suballocator;
+	}
+
+	/* If we can't find any free list matching preferred slot, just use
+	 * the first one.
+	 */
+	fl = ctxt->c_first_suballocator;
+	*real_slot = fl->f_slot;
+
+	return fl;
+}
+
+/* Return Value 1 indicates empty */
+static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
+{
+	struct ocfs2_per_slot_free_list *fl = NULL;
+
+	if (!et->et_dealloc)
+		return 1;
+
+	fl = et->et_dealloc->c_first_suballocator;
+	if (!fl)
+		return 1;
+
+	if (!fl->f_first)
+		return 1;
+
+	return 0;
+}
+
+/* If extents were deleted from the tree due to extent rotation and merging,
+ * and no metadata was reserved ahead of time, try to reuse some of the
+ * extents just deleted. This is only used to reuse extent blocks.
+ * It is supposed to find enough extent blocks in dealloc if our estimation
+ * on metadata is accurate.
+ */
+static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
+					struct ocfs2_extent_tree *et,
+					struct buffer_head **new_eb_bh,
+					int blk_wanted, int *blk_given)
+{
+	int i, status = 0, real_slot;
+	struct ocfs2_cached_dealloc_ctxt *dealloc;
+	struct ocfs2_per_slot_free_list *fl;
+	struct ocfs2_cached_block_free *bf;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_super *osb =
+		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
+
+	*blk_given = 0;
+
+	/* If extent tree doesn't have a dealloc, this is not faulty. Just
+	 * tell upper caller dealloc can't provide any block and it should
+	 * ask for alloc to claim more space.
+	 */
+	dealloc = et->et_dealloc;
+	if (!dealloc)
+		goto bail;
+
+	for (i = 0; i < blk_wanted; i++) {
+		/* Prefer to use local slot */
+		fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE,
+						    osb->slot_num, &real_slot,
+						    dealloc);
+		/* If no more block can be reused, we should claim more
+		 * from alloc. Just return here normally.
+		 */
+		if (!fl) {
+			status = 0;
+			break;
+		}
+
+		bf = fl->f_first;
+		fl->f_first = bf->free_next;
+
+		new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
+		if (new_eb_bh[i] == NULL) {
+			status = -ENOMEM;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		mlog(0, "Reusing block(%llu) from "
+		     "dealloc(local slot:%d, real slot:%d)\n",
+		     bf->free_blk, osb->slot_num, real_slot);
+
+		ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
+
+		status = ocfs2_journal_access_eb(handle, et->et_ci,
+						 new_eb_bh[i],
+						 OCFS2_JOURNAL_ACCESS_CREATE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
+		eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
+
+		/* We can't guarantee that buffer head is still cached, so
+		 * populate the extent block again.
+		 */
+		strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+		eb->h_blkno = cpu_to_le64(bf->free_blk);
+		eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+		eb->h_suballoc_slot = cpu_to_le16(real_slot);
+		eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
+		eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
+		eb->h_list.l_count =
+			cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+		/* We'll also be dirtied by the caller, so
+		 * this isn't absolutely necessary.
+		 */
+		ocfs2_journal_dirty(handle, new_eb_bh[i]);
+
+		if (!fl->f_first) {
+			dealloc->c_first_suballocator = fl->f_next_suballocator;
+			kfree(fl);
+		}
+		kfree(bf);
+	}
+
+	*blk_given = i;
+
+bail:
+	if (unlikely(status < 0)) {
+		for (i = 0; i < blk_wanted; i++)
+			brelse(new_eb_bh[i]);
+	}
+
+	return status;
+}
+
 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 			      int type, int slot, u64 suballoc,
 			      u64 blkno, unsigned int bit)
@@ -7382,6 +7561,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 	struct buffer_head *gd_bh = NULL;
 	struct ocfs2_dinode *main_bm;
 	struct ocfs2_group_desc *gd = NULL;
+	struct ocfs2_trim_fs_info info, *pinfo = NULL;
 
 	start = range->start >> osb->s_clustersize_bits;
 	len = range->len >> osb->s_clustersize_bits;
@@ -7419,6 +7599,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 
 	trace_ocfs2_trim_fs(start, len, minlen);
 
+	ocfs2_trim_fs_lock_res_init(osb);
+	ret = ocfs2_trim_fs_lock(osb, NULL, 1);
+	if (ret < 0) {
+		if (ret != -EAGAIN) {
+			mlog_errno(ret);
+			ocfs2_trim_fs_lock_res_uninit(osb);
+			goto out_unlock;
+		}
+
+		mlog(ML_NOTICE, "Wait for trim on device (%s) to "
+		     "finish, which is running from another node.\n",
+		     osb->dev_str);
+		ret = ocfs2_trim_fs_lock(osb, &info, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_trim_fs_lock_res_uninit(osb);
+			goto out_unlock;
+		}
+
+		if (info.tf_valid && info.tf_success &&
+		    info.tf_start == start && info.tf_len == len &&
+		    info.tf_minlen == minlen) {
+			/* Avoid sending duplicated trim to a shared device */
+			mlog(ML_NOTICE, "The same trim on device (%s) was "
+			     "just done from node (%u), return.\n",
+			     osb->dev_str, info.tf_nodenum);
+			range->len = info.tf_trimlen;
+			goto out_trimunlock;
+		}
+	}
+
+	info.tf_nodenum = osb->node_num;
+	info.tf_start = start;
+	info.tf_len = len;
+	info.tf_minlen = minlen;
+
 	/* Determine first and last group to examine based on start and len */
 	first_group = ocfs2_which_cluster_group(main_bm_inode, start);
 	if (first_group == osb->first_cluster_group_blkno)
@@ -7463,6 +7679,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 			group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
 	}
 	range->len = trimmed * sb->s_blocksize;
+
+	info.tf_trimlen = range->len;
+	info.tf_success = (ret ? 0 : 1);
+	pinfo = &info;
+out_trimunlock:
+	ocfs2_trim_fs_unlock(osb, pinfo);
+	ocfs2_trim_fs_lock_res_uninit(osb);
 out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 0);
 	brelse(main_bm_bh);
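
The trim-fs locking added here is driven by the standard FITRIM ioctl (what the fstrim utility issues). For context, a minimal sketch of such a request against an ocfs2 mount point; it uses only the generic <linux/fs.h> definitions, nothing ocfs2-specific:

#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <mount point>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* trim the whole filesystem */

	/* With the dlmglue changes above, a concurrent trim from another
	 * node either waits or reuses the other node's cached result. */
	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}

	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}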

+ 1 - 0
fs/ocfs2/alloc.h

@@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
 	ocfs2_journal_access_func		et_root_journal_access;
 	void					*et_object;
 	unsigned int				et_max_leaf_clusters;
+	struct ocfs2_cached_dealloc_ctxt	*et_dealloc;
 };
 
 /*

+ 9 - 1
fs/ocfs2/aops.c

@@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
 	struct ocfs2_cached_dealloc_ctxt w_dealloc;
 
 	struct list_head		w_unwritten_list;
+	unsigned int			w_unwritten_count;
 };
 
 void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
@@ -1386,6 +1387,7 @@ retry:
 	desc->c_clear_unwritten = 0;
 	list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
 	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
+	wc->w_unwritten_count++;
 	new = NULL;
 unlock:
 	spin_unlock(&oi->ip_lock);
@@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
 		ue->ue_phys = desc->c_phys;
 
 		list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
-		dwc->dw_zero_count++;
+		dwc->dw_zero_count += wc->w_unwritten_count;
 	}
 
 	ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
@@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
 
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 
+	/* Attach dealloc with extent tree in case that we may reuse extents
+	 * which are already unlinked from current extent tree due to extent
+	 * rotation and merging.
+	 */
+	et.et_dealloc = &dealloc;
+
 	ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
 				    &data_ac, &meta_ac);
 	if (ret) {

+ 3 - 2
fs/ocfs2/cluster/quorum.c

@@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node)
 				node, qs->qs_connected);
 
 		clear_bit(node, qs->qs_conn_bm);
+
+		if (test_bit(node, qs->qs_hb_bm))
+			o2quo_set_hold(qs, node);
 	}
 
 	mlog(0, "node %u, %d total\n", node, qs->qs_connected);
 
-	if (test_bit(node, qs->qs_hb_bm))
-		o2quo_set_hold(qs, node);
 
 	spin_unlock(&qs->qs_lock);
 }

+ 1 - 1
fs/ocfs2/cluster/tcp_internal.h

@@ -196,7 +196,7 @@ struct o2net_msg_handler {
 	u32			nh_msg_type;
 	u32			nh_key;
 	o2net_msg_handler_func	*nh_func;
-	o2net_msg_handler_func	*nh_func_data;
+	void			*nh_func_data;
 	o2net_post_msg_handler_func
 				*nh_post_func;
 	struct kref		nh_kref;

+ 1 - 1
fs/ocfs2/dir.c

@@ -1958,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx)
 
 	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
-	error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level);
+	error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1);
 	if (lock_level && error >= 0) {
 		/* We release EX lock which used to update atime
 		 * and get PR lock again to reduce contention

+ 0 - 7
fs/ocfs2/dlm/dlmmaster.c

@@ -1122,13 +1122,6 @@ recheck:
 	/* sleep if we haven't finished voting yet */
 	if (sleep) {
 		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
-
-		/*
-		if (kref_read(&mle->mle_refs) < 2)
-			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
-			kref_read(&mle->mle_refs),
-			res->lockname.len, res->lockname.name);
-		*/
 		atomic_set(&mle->woken, 0);
 		(void)wait_event_timeout(mle->wq,
 					 (atomic_read(&mle->woken) == 1),

+ 131 - 5
fs/ocfs2/dlmglue.c

@@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = {
+	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
+};
+
 static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = {
 	.flags		= LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB,
 };
@@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
 				   &ocfs2_nfs_sync_lops, osb);
 }
 
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+	ocfs2_lock_res_init_once(lockres);
+	ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
+	ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
+				   &ocfs2_trim_fs_lops, osb);
+}
+
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb)
+{
+	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+	ocfs2_simple_drop_lockres(osb, lockres);
+	ocfs2_lock_res_free(lockres);
+}
+
 static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
 					    struct ocfs2_super *osb)
 {
@@ -1742,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 	return status;
 }
 
+int ocfs2_try_rw_lock(struct inode *inode, int write)
+{
+	int status, level;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog(0, "inode %llu try to take %s RW lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	lockres = &OCFS2_I(inode)->ip_rw_lockres;
+
+	level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
+
+	status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
+	return status;
+}
+
 void ocfs2_rw_unlock(struct inode *inode, int write)
 {
 	int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
@@ -2486,6 +2529,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
 	if (ret == -EAGAIN) {
 		unlock_page(page);
+		/*
+		 * If we can't get inode lock immediately, we should not return
+		 * directly here, since this will lead to a softlockup problem.
+		 * The method is to get a blocking lock and immediately unlock
+		 * before returning, this can avoid CPU resource waste due to
+		 * lots of retries, and benefits fairness in getting lock.
+		 */
+		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
+			ocfs2_inode_unlock(inode, ex);
 		ret = AOP_TRUNCATED_PAGE;
 	}
 
@@ -2494,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 
 int ocfs2_inode_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
-			  int *level)
+			  int *level, int wait)
 {
 	int ret;
 
-	ret = ocfs2_inode_lock(inode, NULL, 0);
+	if (wait)
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+	else
+		ret = ocfs2_try_inode_lock(inode, NULL, 0);
+
 	if (ret < 0) {
-		mlog_errno(ret);
+		if (ret != -EAGAIN)
+			mlog_errno(ret);
 		return ret;
 	}
 
@@ -2512,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode,
 		struct buffer_head *bh = NULL;
 
 		ocfs2_inode_unlock(inode, 0);
-		ret = ocfs2_inode_lock(inode, &bh, 1);
+		if (wait)
+			ret = ocfs2_inode_lock(inode, &bh, 1);
+		else
+			ret = ocfs2_try_inode_lock(inode, &bh, 1);
+
 		if (ret < 0) {
-			mlog_errno(ret);
+			if (ret != -EAGAIN)
+				mlog_errno(ret);
 			return ret;
 		}
 		*level = 1;
@@ -2745,6 +2807,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
 				     ex ? LKM_EXMODE : LKM_PRMODE);
 }
 
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+		       struct ocfs2_trim_fs_info *info, int trylock)
+{
+	int status;
+	struct ocfs2_trim_fs_lvb *lvb;
+	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+	if (info)
+		info->tf_valid = 0;
+
+	if (ocfs2_is_hard_readonly(osb))
+		return -EROFS;
+
+	if (ocfs2_mount_local(osb))
+		return 0;
+
+	status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX,
+				    trylock ? DLM_LKF_NOQUEUE : 0, 0);
+	if (status < 0) {
+		if (status != -EAGAIN)
+			mlog_errno(status);
+		return status;
+	}
+
+	if (info) {
+		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+		if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) &&
+		    lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) {
+			info->tf_valid = 1;
+			info->tf_success = lvb->lvb_success;
+			info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum);
+			info->tf_start = be64_to_cpu(lvb->lvb_start);
+			info->tf_len = be64_to_cpu(lvb->lvb_len);
+			info->tf_minlen = be64_to_cpu(lvb->lvb_minlen);
+			info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen);
+		}
+	}
+
+	return status;
+}
+
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+			  struct ocfs2_trim_fs_info *info)
+{
+	struct ocfs2_trim_fs_lvb *lvb;
+	struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
+
+	if (ocfs2_mount_local(osb))
+		return;
+
+	if (info) {
+		lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+		lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION;
+		lvb->lvb_success = info->tf_success;
+		lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum);
+		lvb->lvb_start = cpu_to_be64(info->tf_start);
+		lvb->lvb_len = cpu_to_be64(info->tf_len);
+		lvb->lvb_minlen = cpu_to_be64(info->tf_minlen);
+		lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen);
+	}
+
+	ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
+}
+
 int ocfs2_dentry_lock(struct dentry *dentry, int ex)
 {
 	int ret;

+ 34 - 1
fs/ocfs2/dlmglue.h

@@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb {
 	__be32	lvb_os_seqno;
 };
 
+#define OCFS2_TRIMFS_LVB_VERSION 1
+
+struct ocfs2_trim_fs_lvb {
+	__u8	lvb_version;
+	__u8	lvb_success;
+	__u8	lvb_reserved[2];
+	__be32	lvb_nodenum;
+	__be64	lvb_start;
+	__be64	lvb_len;
+	__be64	lvb_minlen;
+	__be64	lvb_trimlen;
+};
+
+struct ocfs2_trim_fs_info {
+	u8	tf_valid;	/* lvb is valid, or not */
+	u8	tf_success;	/* trim is successful, or not */
+	u32	tf_nodenum;	/* osb node number */
+	u64	tf_start;	/* trim start offset in clusters */
+	u64	tf_len;		/* trim end offset in clusters */
+	u64	tf_minlen;	/* trim minimum contiguous free clusters */
+	u64	tf_trimlen;	/* trimmed length in bytes */
+};
+
 struct ocfs2_lock_holder {
 	struct list_head oh_list;
 	struct pid *oh_owner_pid;
@@ -116,13 +139,14 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
 int ocfs2_create_new_inode_locks(struct inode *inode);
 int ocfs2_drop_inode_locks(struct inode *inode);
 int ocfs2_rw_lock(struct inode *inode, int write);
+int ocfs2_try_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
 int ocfs2_open_lock(struct inode *inode);
 int ocfs2_try_open_lock(struct inode *inode, int write);
 void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_inode_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
-			  int *level);
+			  int *level, int wait);
 int ocfs2_inode_lock_full_nested(struct inode *inode,
 			 struct buffer_head **ret_bh,
 			 int ex,
@@ -140,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 /* 99% of the time we don't want to supply any additional flags --
  * those are for very specific cases only. */
 #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
+#define ocfs2_try_inode_lock(i, b, e)\
+		ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\
+		OI_LS_NORMAL)
 void ocfs2_inode_unlock(struct inode *inode,
 		       int ex);
 int ocfs2_super_lock(struct ocfs2_super *osb,
@@ -153,6 +180,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb);
 void ocfs2_rename_unlock(struct ocfs2_super *osb);
 int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
 void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
+void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb);
+void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb);
+int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
+		       struct ocfs2_trim_fs_info *info, int trylock);
+void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
+			  struct ocfs2_trim_fs_info *info);
 int ocfs2_dentry_lock(struct dentry *dentry, int ex);
 void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
 int ocfs2_file_lock(struct file *file, int ex, int trylock);

+ 45 - 0
fs/ocfs2/extent_map.c

@@ -38,6 +38,7 @@
 #include "inode.h"
 #include "super.h"
 #include "symlink.h"
+#include "aops.h"
 #include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
@@ -832,6 +833,50 @@ out:
 	return ret;
 }
 
+/* Is IO overwriting allocated blocks? */
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+		       u64 map_start, u64 map_len)
+{
+	int ret = 0, is_last;
+	u32 mapping_end, cpos;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_extent_rec rec;
+
+	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+		if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len))
+			return ret;
+		else
+			return -EAGAIN;
+	}
+
+	cpos = map_start >> osb->s_clustersize_bits;
+	mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+					       map_start + map_len);
+	is_last = 0;
+	while (cpos < mapping_end && !is_last) {
+		ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+						 NULL, &rec, &is_last);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (rec.e_blkno == 0ULL)
+			break;
+
+		if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+			break;
+
+		cpos = le32_to_cpu(rec.e_cpos) +
+			le16_to_cpu(rec.e_leaf_clusters);
+	}
+
+	if (cpos < mapping_end)
+		ret = -EAGAIN;
+out:
+	return ret;
+}
+
 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;

+ 3 - 0
fs/ocfs2/extent_map.h

@@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
 int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 u64 map_start, u64 map_len);
 
+int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh,
+		       u64 map_start, u64 map_len);
+
 int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
 
 int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,

+ 80 - 21
fs/ocfs2/file.c

@@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 		spin_unlock(&oi->ip_lock);
 	}
 
+	file->f_mode |= FMODE_NOWAIT;
+
 leave:
 	return status;
 }
@@ -2132,12 +2134,12 @@ out:
 }
 
 static int ocfs2_prepare_inode_for_write(struct file *file,
-					 loff_t pos,
-					 size_t count)
+					 loff_t pos, size_t count, int wait)
 {
-	int ret = 0, meta_level = 0;
+	int ret = 0, meta_level = 0, overwrite_io = 0;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = d_inode(dentry);
+	struct buffer_head *di_bh = NULL;
 	loff_t end;
 
 	/*
@@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 	 * if we need to make modifications here.
 	 */
 	for(;;) {
-		ret = ocfs2_inode_lock(inode, NULL, meta_level);
+		if (wait)
+			ret = ocfs2_inode_lock(inode, NULL, meta_level);
+		else
+			ret = ocfs2_try_inode_lock(inode,
+				overwrite_io ? NULL : &di_bh, meta_level);
 		if (ret < 0) {
 			meta_level = -1;
-			mlog_errno(ret);
+			if (ret != -EAGAIN)
+				mlog_errno(ret);
 			goto out;
 		}
 
+		/*
+		 * Check if IO will overwrite allocated blocks in case
+		 * IOCB_NOWAIT flag is set.
+		 */
+		if (!wait && !overwrite_io) {
+			overwrite_io = 1;
+			if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
+				ret = -EAGAIN;
+				goto out_unlock;
+			}
+
+			ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
+			brelse(di_bh);
+			di_bh = NULL;
+			up_read(&OCFS2_I(inode)->ip_alloc_sem);
+			if (ret < 0) {
+				if (ret != -EAGAIN)
+					mlog_errno(ret);
+				goto out_unlock;
+			}
+		}
+
 		/* Clear suid / sgid if necessary. We do this here
 		 * instead of later in the write path because
 		 * remove_suid() calls ->setattr without any hint that
@@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 
 out_unlock:
 	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
-					    pos, count);
+					    pos, count, wait);
+
+	brelse(di_bh);
 
 	if (meta_level >= 0)
 		ocfs2_inode_unlock(inode, meta_level);
@@ -2211,7 +2242,7 @@ out:
 static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 				    struct iov_iter *from)
 {
-	int direct_io, rw_level;
+	int rw_level;
 	ssize_t written = 0;
 	ssize_t ret;
 	size_t count = iov_iter_count(from);
@@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	void *saved_ki_complete = NULL;
 	int append_write = ((iocb->ki_pos + count) >=
 			i_size_read(inode) ? 1 : 0);
+	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
 
 	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
 		(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 		file->f_path.dentry->d_name.name,
 		(unsigned int)from->nr_segs);	/* GRRRRR */
 
+	if (!direct_io && nowait)
+		return -EOPNOTSUPP;
+
 	if (count == 0)
 		return 0;
 
-	direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
-
-	inode_lock(inode);
+	if (nowait) {
+		if (!inode_trylock(inode))
+			return -EAGAIN;
+	} else
+		inode_lock(inode);
 
 	/*
 	 * Concurrent O_DIRECT writes are allowed with
@@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	 */
 	rw_level = (!direct_io || full_coherency || append_write);
 
-	ret = ocfs2_rw_lock(inode, rw_level);
+	if (nowait)
+		ret = ocfs2_try_rw_lock(inode, rw_level);
+	else
+		ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
-		mlog_errno(ret);
+		if (ret != -EAGAIN)
+			mlog_errno(ret);
 		goto out_mutex;
 	}
 
@@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 		 * other nodes to drop their caches.  Buffered I/O
 		 * already does this in write_begin().
 		 */
-		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (nowait)
+			ret = ocfs2_try_inode_lock(inode, NULL, 1);
+		else
+			ret = ocfs2_inode_lock(inode, NULL, 1);
 		if (ret < 0) {
-			mlog_errno(ret);
+			if (ret != -EAGAIN)
+				mlog_errno(ret);
 			goto out;
 		}
 
@@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	}
 	count = ret;
 
-	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
+	ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait);
 	if (ret < 0) {
-		mlog_errno(ret);
+		if (ret != -EAGAIN)
+			mlog_errno(ret);
 		goto out;
 	}
 
@@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 	int ret = 0, rw_level = -1, lock_level = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = file_inode(filp);
+	int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
+	int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
 
 	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 		goto bail;
 	}
 
+	if (!direct_io && nowait)
+		return -EOPNOTSUPP;
+
 	/*
 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
 	 * need locks to protect pending reads from racing with truncate.
 	 */
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		ret = ocfs2_rw_lock(inode, 0);
+	if (direct_io) {
+		if (nowait)
+			ret = ocfs2_try_rw_lock(inode, 0);
+		else
+			ret = ocfs2_rw_lock(inode, 0);
+
 		if (ret < 0) {
-			mlog_errno(ret);
+			if (ret != -EAGAIN)
+				mlog_errno(ret);
 			goto bail;
 		}
 		rw_level = 0;
@@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
 	 * like i_size. This allows the checks down below
 	 * generic_file_aio_read() a chance of actually working.
 	 */
-	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level);
+	ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
+				     !nowait);
 	if (ret < 0) {
-		mlog_errno(ret);
+		if (ret != -EAGAIN)
+			mlog_errno(ret);
 		goto bail;
 	}
 	ocfs2_inode_unlock(inode, lock_level);

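The fs/ocfs2/file.c hunks above add RWF_NOWAIT support: it is honoured only for O_DIRECT, every blocking lock acquisition gains a trylock variant, and -EAGAIN is returned (and deliberately not logged as an error) when a lock cannot be taken immediately. A minimal userspace sketch of that lock-or-bail shape, using a pthread mutex in place of the ocfs2 inode/rw cluster locks (the names here are illustrative, not kernel API):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

/* Illustrative nowait pattern: block on the lock in the normal case,
 * trylock and bail out with -EAGAIN when the caller asked not to wait.
 * pthread_mutex_t stands in for the ocfs2 inode/rw cluster locks. */
static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static int prepare_for_write(int nowait)
{
	if (nowait) {
		if (pthread_mutex_trylock(&inode_lock))
			return -EAGAIN;	/* caller retries from a context that may sleep */
	} else {
		pthread_mutex_lock(&inode_lock);
	}

	/* ... overwrite check and suid clearing would go here ... */

	pthread_mutex_unlock(&inode_lock);
	return 0;
}

int main(void)
{
	printf("blocking: %d  nowait: %d\n",
	       prepare_for_write(0), prepare_for_write(1));
	return 0;
}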
+ 12 - 11
fs/ocfs2/journal.c

@@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle,
 	/* we can safely remove this assertion after testing. */
 	if (!buffer_uptodate(bh)) {
 		mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n");
-		mlog(ML_ERROR, "b_blocknr=%llu\n",
-		     (unsigned long long)bh->b_blocknr);
+		mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n",
+		     (unsigned long long)bh->b_blocknr, bh->b_state);
 
 		lock_buffer(bh);
 		/*
-		 * A previous attempt to write this buffer head failed.
-		 * Nothing we can do but to retry the write and hope for
-		 * the best.
+		 * A previous transaction with a couple of buffer heads failed
+		 * to checkpoint, so all of its bhs are marked BH_Write_EIO.
+		 * For the current transaction, this bh is just one of the
+		 * error bhs that the previous transaction handled. We can't
+		 * simply clear its BH_Write_EIO and reuse it, since the other
+		 * bhs have not been written to disk yet and that would cause
+		 * metadata inconsistency. So set the fs read-only to avoid
+		 * further damage.
 		 */
 		if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) {
-			clear_buffer_write_io_error(bh);
-			set_buffer_uptodate(bh);
-		}
-
-		if (!buffer_uptodate(bh)) {
 			unlock_buffer(bh);
-			return -EIO;
+			return ocfs2_error(osb->sb, "A previous attempt to "
+					"write this buffer head failed\n");
 		}
 		unlock_buffer(bh);
 	}

+ 1 - 1
fs/ocfs2/mmap.c

@@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret = 0, lock_level = 0;
 
 	ret = ocfs2_inode_lock_atime(file_inode(file),
-				    file->f_path.mnt, &lock_level);
+				    file->f_path.mnt, &lock_level, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;

+ 1 - 0
fs/ocfs2/ocfs2.h

@@ -404,6 +404,7 @@ struct ocfs2_super
 	struct ocfs2_lock_res osb_super_lockres;
 	struct ocfs2_lock_res osb_rename_lockres;
 	struct ocfs2_lock_res osb_nfs_sync_lockres;
+	struct ocfs2_lock_res osb_trim_fs_lockres;
 	struct ocfs2_dlm_debug *osb_dlm_debug;
 
 	struct dentry *osb_debug_root;

+ 5 - 0
fs/ocfs2/ocfs2_lockid.h

@@ -50,6 +50,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_NFS_SYNC,
 	OCFS2_LOCK_TYPE_ORPHAN_SCAN,
 	OCFS2_LOCK_TYPE_REFCOUNT,
+	OCFS2_LOCK_TYPE_TRIM_FS,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_REFCOUNT:
 			c = 'T';
 			break;
+		case OCFS2_LOCK_TYPE_TRIM_FS:
+			c = 'I';
+			break;
 		default:
 			c = '\0';
 	}
@@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = {
 	[OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync",
 	[OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan",
 	[OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount",
+	[OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)

+ 6 - 4
fs/ocfs2/ocfs2_trace.h

@@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
 
 TRACE_EVENT(ocfs2_prepare_inode_for_write,
 	TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
-		 unsigned long count),
-	TP_ARGS(ino, saved_pos, count),
+		 unsigned long count, int wait),
+	TP_ARGS(ino, saved_pos, count, wait),
 	TP_STRUCT__entry(
 		__field(unsigned long long, ino)
 		__field(unsigned long long, saved_pos)
 		__field(unsigned long, count)
+		__field(int, wait)
 	),
 	TP_fast_assign(
 		__entry->ino = ino;
 		__entry->saved_pos = saved_pos;
 		__entry->count = count;
+		__entry->wait = wait;
 	),
-	TP_printk("%llu %llu %lu", __entry->ino,
-		  __entry->saved_pos, __entry->count)
+	TP_printk("%llu %llu %lu %d", __entry->ino,
+		  __entry->saved_pos, __entry->count, __entry->wait)
 );
 
 DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);

+ 5 - 3
fs/ocfs2/suballoc.c

@@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
 	}
 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
 	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
+		if (undo_fn)
+			jbd_unlock_bh_state(group_bh);
 		return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
 				   (unsigned long long)le64_to_cpu(bg->bg_blkno),
 				   le16_to_cpu(bg->bg_bits),
@@ -2563,16 +2565,16 @@ static int _ocfs2_free_clusters(handle_t *handle,
 	int status;
 	u16 bg_start_bit;
 	u64 bg_blkno;
-	struct ocfs2_dinode *fe;
 
 	/* You can't ever have a contiguous set of clusters
 	 * bigger than a block group bitmap so we never have to worry
 	 * about looping on them.
 	 * This is expensive. We can safely remove once this stuff has
 	 * gotten tested really well. */
-	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
+	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb,
+				ocfs2_blocks_to_clusters(bitmap_inode->i_sb,
+							 start_blk)));
 
-	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
 
 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
 				     &bg_start_bit);

+ 8 - 5
fs/ocfs2/super.c

@@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
-			status = -EINVAL;
+			status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
 			mlog_errno(status);
-			/* FIXME: Should ERROR_RO_FS */
 			mlog(ML_ERROR, "Unable to load system inode %d, "
 			     "possibly corrupt fs?", i);
 			goto bail;
@@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 		new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
 		if (!new) {
 			ocfs2_release_system_inodes(osb);
-			status = -EINVAL;
+			status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL;
 			mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n",
 			     status, i, osb->slot_num);
 			goto bail;
@@ -1208,14 +1207,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 read_super_error:
 	brelse(bh);
 
+	if (status)
+		mlog_errno(status);
+
 	if (osb) {
 		atomic_set(&osb->vol_state, VOLUME_DISABLED);
 		wake_up(&osb->osb_mount_event);
 		ocfs2_dismount_volume(sb, 1);
 	}
 
-	if (status)
-		mlog_errno(status);
 	return status;
 }
 
@@ -1843,6 +1843,9 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	status = ocfs2_dlm_init(osb);
 	if (status < 0) {
 		mlog_errno(status);
+		if (status == -EBADR && ocfs2_userspace_stack(osb))
+			mlog(ML_ERROR, "couldn't mount because cluster name on"
+			" disk does not match the running cluster name.\n");
 		goto leave;
 	}
 

+ 4 - 1
fs/ocfs2/xattr.c

@@ -638,14 +638,17 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 						     si->value_len);
 
 	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+		down_read(&OCFS2_I(dir)->ip_xattr_sem);
 		acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
 					OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
 					"", NULL, 0);
+		up_read(&OCFS2_I(dir)->ip_xattr_sem);
 		if (acl_len > 0) {
 			a_size = ocfs2_xattr_entry_real_size(0, acl_len);
 			if (S_ISDIR(mode))
 				a_size <<= 1;
 		} else if (acl_len != 0 && acl_len != -ENODATA) {
+			ret = acl_len;
 			mlog_errno(ret);
 			return ret;
 		}
@@ -6415,7 +6418,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
 		 * and then insert the extents one by one.
 		 */
 		if (xv->xr_list.l_tree_depth) {
-			memcpy(new_xv, &def_xv, sizeof(def_xv));
+			memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE);
 			vb->vb_xv = new_xv;
 			vb->vb_bh = value_bh;
 			ocfs2_init_xattr_value_extent_tree(&data_et,

+ 12 - 7
fs/proc/task_mmu.c

@@ -47,8 +47,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	if (hiwater_rss < mm->hiwater_rss)
 		hiwater_rss = mm->hiwater_rss;
 
-	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
-	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	/* split executable areas between text and lib */
+	text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK);
+	text = min(text, mm->exec_vm << PAGE_SHIFT);
+	lib = (mm->exec_vm << PAGE_SHIFT) - text;
+
 	swap = get_mm_counter(mm, MM_SWAPENTS);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
@@ -76,7 +79,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		file << (PAGE_SHIFT-10),
 		shmem << (PAGE_SHIFT-10),
 		mm->data_vm << (PAGE_SHIFT-10),
-		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
+		mm->stack_vm << (PAGE_SHIFT-10),
+		text >> 10,
+		lib >> 10,
 		mm_pgtables_bytes(mm) >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
@@ -977,14 +982,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
-	pmd_t pmd = *pmdp;
+	pmd_t old, pmd = *pmdp;
 
 	if (pmd_present(pmd)) {
 		/* See comment in change_huge_pmd() */
-		pmdp_invalidate(vma, addr, pmdp);
-		if (pmd_dirty(*pmdp))
+		old = pmdp_invalidate(vma, addr, pmdp);
+		if (pmd_dirty(old))
 			pmd = pmd_mkdirty(pmd);
-		if (pmd_young(*pmdp))
+		if (pmd_young(old))
 			pmd = pmd_mkyoung(pmd);
 
 		pmd = pmd_wrprotect(pmd);

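The clear_soft_dirty_pmd() fix above depends on pmdp_invalidate() now returning the pmd it replaced (see the include/asm-generic/pgtable.h hunk further down): the dirty/young bits have to come from the value that was actually swapped out, because re-reading *pmdp after the invalidation can miss updates made concurrently by the hardware. A tiny stand-alone illustration of why an exchange that returns the old value is the safe primitive here (plain C atomics, not the kernel helpers):

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative only: an exchange hands back the value it actually
 * replaced, so bits set concurrently by another CPU between our read
 * and our write cannot be lost.  This is the property the soft-dirty
 * code now relies on by consuming pmdp_invalidate()'s return value
 * instead of re-reading *pmdp afterwards. */
int main(void)
{
	_Atomic unsigned long pmd = 0x5;	/* pretend "present | dirty" */
	unsigned long old = atomic_exchange(&pmd, 0x0);	/* invalidate */

	printf("old=%#lx now=%#lx\n", old, atomic_load(&pmd));
	return 0;
}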
+ 13 - 62
fs/userfaultfd.c

@@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
 	 * pmd_trans_unstable) of the pmd.
 	 */
 	_pmd = READ_ONCE(*pmd);
-	if (!pmd_present(_pmd))
+	if (pmd_none(_pmd))
 		goto out;
 
 	ret = false;
+	if (!pmd_present(_pmd))
+		goto out;
+
 	if (pmd_trans_huge(_pmd))
 		goto out;
 
@@ -985,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
 				  struct uffd_msg *msg)
 {
 	int fd;
-	struct file *file;
-	unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
 
-	fd = get_unused_fd_flags(flags);
+	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
+			      O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
 	if (fd < 0)
 		return fd;
 
-	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
-				  O_RDWR | flags);
-	if (IS_ERR(file)) {
-		put_unused_fd(fd);
-		return PTR_ERR(file);
-	}
-
-	fd_install(fd, file);
 	msg->arg.reserved.reserved1 = 0;
 	msg->arg.fork.ufd = fd;
-
 	return 0;
 }
 
@@ -1884,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem)
 	seqcount_init(&ctx->refile_seq);
 }
 
-/**
- * userfaultfd_file_create - Creates a userfaultfd file pointer.
- * @flags: Flags for the userfaultfd file.
- *
- * This function creates a userfaultfd file pointer, w/out installing
- * it into the fd table. This is useful when the userfaultfd file is
- * used during the initialization of data structures that require
- * extra setup after the userfaultfd creation. So the userfaultfd
- * creation is split into the file pointer creation phase, and the
- * file descriptor installation phase.  In this way races with
- * userspace closing the newly installed file descriptor can be
- * avoided.  Returns a userfaultfd file pointer, or a proper error
- * pointer.
- */
-static struct file *userfaultfd_file_create(int flags)
+SYSCALL_DEFINE1(userfaultfd, int, flags)
 {
-	struct file *file;
 	struct userfaultfd_ctx *ctx;
+	int fd;
 
 	BUG_ON(!current->mm);
 
@@ -1909,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags)
 	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
 
-	file = ERR_PTR(-EINVAL);
 	if (flags & ~UFFD_SHARED_FCNTL_FLAGS)
-		goto out;
+		return -EINVAL;
 
-	file = ERR_PTR(-ENOMEM);
 	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
 	if (!ctx)
-		goto out;
+		return -ENOMEM;
 
 	atomic_set(&ctx->refcount, 1);
 	ctx->flags = flags;
@@ -1927,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags)
 	/* prevent the mm struct to be freed */
 	mmgrab(ctx->mm);
 
-	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
-				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
-	if (IS_ERR(file)) {
+	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
+			      O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+	if (fd < 0) {
 		mmdrop(ctx->mm);
 		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
 	}
-out:
-	return file;
-}
-
-SYSCALL_DEFINE1(userfaultfd, int, flags)
-{
-	int fd, error;
-	struct file *file;
-
-	error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS);
-	if (error < 0)
-		return error;
-	fd = error;
-
-	file = userfaultfd_file_create(flags);
-	if (IS_ERR(file)) {
-		error = PTR_ERR(file);
-		goto err_put_unused_fd;
-	}
-	fd_install(fd, file);
-
 	return fd;
-
-err_put_unused_fd:
-	put_unused_fd(fd);
-
-	return error;
 }
 
 static int __init userfaultfd_init(void)

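The userfaultfd rework above replaces the open-coded get_unused_fd_flags() + anon_inode_getfile() + fd_install() sequence with a single anon_inode_getfd() call, which removes the reserved-fd error unwinding entirely; the BUILD_BUG_ON()s guarantee that UFFD_CLOEXEC and UFFD_NONBLOCK can be passed straight through as O_CLOEXEC and O_NONBLOCK. A minimal caller of the resulting syscall (assuming a libc new enough to define SYS_userfaultfd):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* UFFD_CLOEXEC == O_CLOEXEC and UFFD_NONBLOCK == O_NONBLOCK, which is
	 * exactly what the BUILD_BUG_ON()s in the hunk above assert. */
	long fd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (fd < 0)
		perror("userfaultfd");
	else
		printf("userfaultfd fd=%ld\n", fd);
	return 0;
}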
+ 16 - 9
include/asm-generic/pgtable.h

@@ -309,19 +309,26 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #endif
 
-#ifndef __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-			    pmd_t *pmdp);
-#endif
-
-#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-					   unsigned long address, pmd_t *pmdp)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * This is an implementation of pmdp_establish() that is only suitable for an
+ * architecture that doesn't have hardware dirty/accessed bits. In this case we
+ * can't race with a CPU setting these bits, so a non-atomic approach is fine.
+ */
+static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
+		unsigned long address, pmd_t *pmdp, pmd_t pmd)
 {
-
+	pmd_t old_pmd = *pmdp;
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+	return old_pmd;
 }
 #endif
 
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+			    pmd_t *pmdp);
+#endif
+
 #ifndef __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t pte_a, pte_t pte_b)
 {

+ 17 - 4
include/linux/hugetlb.h

@@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
 						long freed);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
@@ -129,7 +130,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
 
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
 
-extern int hugepages_treat_as_movable;
 extern int sysctl_hugetlb_shm_group;
 extern struct list_head huge_boot_pages;
 
@@ -158,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
 
 bool is_hugetlb_entry_migration(pte_t pte);
+
 #else /* !CONFIG_HUGETLB_PAGE */
 
 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -198,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
 	return false;
 }
 #define putback_active_hugepage(p)	do {} while (0)
+#define move_hugetlb_state(old, new, reason)	do {} while (0)
 
 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot)
@@ -271,6 +273,17 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+struct hugetlbfs_inode_info {
+	struct shared_policy policy;
+	struct inode vfs_inode;
+	unsigned int seals;
+};
+
+static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
+{
+	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
+}
+
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
@@ -343,10 +356,10 @@ struct huge_bootmem_page {
 struct page *alloc_huge_page(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
-				unsigned long addr, int avoid_reserve);
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask);
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+				unsigned long address);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 			pgoff_t idx);
 
@@ -524,7 +537,7 @@ struct hstate {};
 #define alloc_huge_page(v, a, r) NULL
 #define alloc_huge_page_node(h, nid) NULL
 #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
-#define alloc_huge_page_noerr(v, a, r) NULL
+#define alloc_huge_page_vma(h, vma, address) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL
 #define hstate_sizelog(s) NULL

+ 101 - 64
include/linux/memcontrol.h

@@ -108,7 +108,10 @@ struct lruvec_stat {
  */
 struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
-	struct lruvec_stat __percpu *lruvec_stat;
+
+	struct lruvec_stat __percpu *lruvec_stat_cpu;
+	atomic_long_t		lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
 	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
@@ -227,10 +230,10 @@ struct mem_cgroup {
 	spinlock_t		move_lock;
 	struct task_struct	*move_lock_task;
 	unsigned long		move_lock_flags;
-	/*
-	 * percpu counter.
-	 */
-	struct mem_cgroup_stat_cpu __percpu *stat;
+
+	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+	atomic_long_t		stat[MEMCG_NR_STAT];
+	atomic_long_t		events[MEMCG_NR_EVENTS];
 
 	unsigned long		socket_pressure;
 
@@ -265,6 +268,12 @@ struct mem_cgroup {
 	/* WARNING: nodeinfo must be the last member here */
 };
 
+/*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: it may be necessary to use larger batch sizes on big machines.
+ */
+#define MEMCG_CHARGE_BATCH 32U
+
 extern struct mem_cgroup *root_mem_cgroup;
 
 static inline bool mem_cgroup_disabled(void)
@@ -272,13 +281,6 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-static inline void mem_cgroup_event(struct mem_cgroup *memcg,
-				    enum memcg_event_item event)
-{
-	this_cpu_inc(memcg->stat->events[event]);
-	cgroup_file_notify(&memcg->events_file);
-}
-
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
@@ -492,32 +494,38 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
-	long val = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->count[idx], cpu);
-
-	if (val < 0)
-		val = 0;
-
-	return val;
+	long x = atomic_long_read(&memcg->stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 				     int idx, int val)
 {
-	if (!mem_cgroup_disabled())
-		__this_cpu_add(memcg->stat->count[idx], val);
+	long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->stat_cpu->count[idx], x);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
 				   int idx, int val)
 {
-	if (!mem_cgroup_disabled())
-		this_cpu_add(memcg->stat->count[idx], val);
+	preempt_disable();
+	__mod_memcg_state(memcg, idx, val);
+	preempt_enable();
 }
 
 /**
@@ -555,87 +563,108 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
 					      enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
-	long val = 0;
-	int cpu;
+	long x;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	for_each_possible_cpu(cpu)
-		val += per_cpu(pn->lruvec_stat->count[idx], cpu);
-
-	if (val < 0)
-		val = 0;
-
-	return val;
+	x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
 }
 
 static inline void __mod_lruvec_state(struct lruvec *lruvec,
 				      enum node_stat_item idx, int val)
 {
 	struct mem_cgroup_per_node *pn;
+	long x;
 
+	/* Update node */
 	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
 	if (mem_cgroup_disabled())
 		return;
+
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+
+	/* Update memcg */
 	__mod_memcg_state(pn->memcg, idx, val);
-	__this_cpu_add(pn->lruvec_stat->count[idx], val);
+
+	/* Update lruvec */
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &pn->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
 				    enum node_stat_item idx, int val)
 {
-	struct mem_cgroup_per_node *pn;
-
-	mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-	if (mem_cgroup_disabled())
-		return;
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	mod_memcg_state(pn->memcg, idx, val);
-	this_cpu_add(pn->lruvec_stat->count[idx], val);
+	preempt_disable();
+	__mod_lruvec_state(lruvec, idx, val);
+	preempt_enable();
 }
 
 static inline void __mod_lruvec_page_state(struct page *page,
 					   enum node_stat_item idx, int val)
 {
-	struct mem_cgroup_per_node *pn;
+	pg_data_t *pgdat = page_pgdat(page);
+	struct lruvec *lruvec;
 
-	__mod_node_page_state(page_pgdat(page), idx, val);
-	if (mem_cgroup_disabled() || !page->mem_cgroup)
+	/* Untracked pages have no memcg, no lruvec. Update only the node */
+	if (!page->mem_cgroup) {
+		__mod_node_page_state(pgdat, idx, val);
 		return;
-	__mod_memcg_state(page->mem_cgroup, idx, val);
-	pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
-	__this_cpu_add(pn->lruvec_stat->count[idx], val);
+	}
+
+	lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup);
+	__mod_lruvec_state(lruvec, idx, val);
 }
 
 static inline void mod_lruvec_page_state(struct page *page,
 					 enum node_stat_item idx, int val)
 {
-	struct mem_cgroup_per_node *pn;
-
-	mod_node_page_state(page_pgdat(page), idx, val);
-	if (mem_cgroup_disabled() || !page->mem_cgroup)
-		return;
-	mod_memcg_state(page->mem_cgroup, idx, val);
-	pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
-	this_cpu_add(pn->lruvec_stat->count[idx], val);
+	preempt_disable();
+	__mod_lruvec_page_state(page, idx, val);
+	preempt_enable();
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+/* idx can be of type enum memcg_event_item or vm_event_item */
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+					int idx, unsigned long count)
+{
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->events[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->stat_cpu->events[idx], x);
+}
+
 static inline void count_memcg_events(struct mem_cgroup *memcg,
-				      enum vm_event_item idx,
-				      unsigned long count)
+				      int idx, unsigned long count)
 {
-	if (!mem_cgroup_disabled())
-		this_cpu_add(memcg->stat->events[idx], count);
+	preempt_disable();
+	__count_memcg_events(memcg, idx, count);
+	preempt_enable();
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
+/* idx can be of type enum memcg_event_item or vm_event_item */
 static inline void count_memcg_page_event(struct page *page,
 					  int idx)
 {
@@ -654,12 +683,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 	if (likely(memcg)) {
-		this_cpu_inc(memcg->stat->events[idx]);
+		count_memcg_events(memcg, idx, 1);
 		if (idx == OOM_KILL)
 			cgroup_file_notify(&memcg->events_file);
 	}
 	rcu_read_unlock();
 }
+
+static inline void mem_cgroup_event(struct mem_cgroup *memcg,
+				    enum memcg_event_item event)
+{
+	count_memcg_events(memcg, event, 1);
+	cgroup_file_notify(&memcg->events_file);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif

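The memcontrol.h rework replaces the "sum every per-cpu counter on each read" scheme with batched counters: each CPU accumulates a small local delta and only folds it into a shared atomic once the delta exceeds MEMCG_CHARGE_BATCH, so reads become a single atomic_long_read() with bounded error. A userspace sketch of that batching idea (thread-local storage standing in for per-cpu data; not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define CHARGE_BATCH 32			/* mirrors MEMCG_CHARGE_BATCH */

static atomic_long shared_stat;		/* the exact, shared counter */
static _Thread_local long cpu_delta;	/* stand-in for the per-cpu counter */

/* Accumulate locally; fold into the shared counter only once the local
 * error would exceed the batch size.  Readers of shared_stat see a value
 * that is at most CHARGE_BATCH per CPU off, with no per-cpu summing. */
static void mod_stat(int val)
{
	long x = cpu_delta + val;

	if (labs(x) > CHARGE_BATCH) {
		atomic_fetch_add(&shared_stat, x);
		x = 0;
	}
	cpu_delta = x;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		mod_stat(1);
	printf("shared=%ld local=%ld\n",
	       (long)atomic_load(&shared_stat), cpu_delta);
	return 0;
}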
+ 16 - 10
include/linux/mm.h

@@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
-void unmap_mapping_range(struct address_space *mapping,
-		loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
 			     unsigned long *start, unsigned long *end,
 			     pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
@@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address,
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
-static inline void unmap_shared_mapping_range(struct address_space *mapping,
-		loff_t const holebegin, loff_t const holelen)
-{
-	unmap_mapping_range(mapping, holebegin, holelen, 0);
-}
-
 extern void truncate_pagecache(struct inode *inode, loff_t new);
 extern void truncate_setsize(struct inode *inode, loff_t newsize);
 void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
@@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
+void unmap_mapping_pages(struct address_space *mapping,
+		pgoff_t start, pgoff_t nr, bool even_cows);
+void unmap_mapping_range(struct address_space *mapping,
+		loff_t const holebegin, loff_t const holelen, int even_cows);
 #else
 static inline int handle_mm_fault(struct vm_area_struct *vma,
 		unsigned long address, unsigned int flags)
@@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 	BUG();
 	return -EFAULT;
 }
+static inline void unmap_mapping_pages(struct address_space *mapping,
+		pgoff_t start, pgoff_t nr, bool even_cows) { }
+static inline void unmap_mapping_range(struct address_space *mapping,
+		loff_t const holebegin, loff_t const holelen, int even_cows) { }
 #endif
 
-extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len,
-		unsigned int gup_flags);
+static inline void unmap_shared_mapping_range(struct address_space *mapping,
+		loff_t const holebegin, loff_t const holelen)
+{
+	unmap_mapping_range(mapping, holebegin, holelen, 0);
+}
+
+extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
+		void *buf, int len, unsigned int gup_flags);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, unsigned int gup_flags);
 extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,

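unmap_mapping_pages() is a page-based variant of unmap_mapping_range(): callers that already work in page offsets (truncation, DAX) can pass a start pgoff and a page count directly instead of converting to byte offsets and back. A rough sketch of the byte-to-page conversion a byte-range wrapper has to do (illustrative only; the kernel additionally treats a zero length as "to the end of the file"):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Rough sketch, not the kernel code: a byte-range wrapper rounds the
 * start down and the length up to whole pages before calling the
 * page-based helper. */
static void unmap_bytes(unsigned long long holebegin, unsigned long long holelen)
{
	unsigned long long start = holebegin >> PAGE_SHIFT;
	unsigned long long nr = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;

	printf("unmap_mapping_pages(mapping, %llu, %llu, even_cows)\n", start, nr);
}

int main(void)
{
	unmap_bytes(4096, 10000);	/* -> start pgoff 1, 3 pages */
	return 0;
}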
+ 74 - 80
include/linux/mm_types.h

@@ -31,28 +31,56 @@ struct hmm;
  * it to keep track of whatever it is we are using the page for at the
  * moment. Note that we have no way to track which tasks are using
  * a page, though if it is a pagecache page, rmap structures can tell us
- * who is mapping it.
+ * who is mapping it. If you allocate the page using alloc_pages(), you
+ * can use some of the space in struct page for your own purposes.
  *
- * The objects in struct page are organized in double word blocks in
- * order to allows us to use atomic double word operations on portions
- * of struct page. That is currently only used by slub but the arrangement
- * allows the use of atomic double word operations on the flags/mapping
- * and lru list pointers also.
+ * Pages that were once in the page cache may be found under the RCU lock
+ * even after they have been recycled to a different purpose.  The page
+ * cache reads and writes some of the fields in struct page to pin the
+ * page before checking that it's still in the page cache.  It is vital
+ * that all users of struct page:
+ * 1. Use the first word as PageFlags.
+ * 2. Clear or preserve bit 0 of page->compound_head.  It is used as
+ *    PageTail for compound pages, and the page cache must not see false
+ *    positives.  Some users put a pointer here (guaranteed to be at least
+ *    4-byte aligned), other users avoid using the field altogether.
+ * 3. page->_refcount must either not be used, or must be used in such a
+ *    way that other CPUs temporarily incrementing and then decrementing the
+ *    refcount does not cause problems.  On receiving the page from
+ *    alloc_pages(), the refcount will be positive.
+ * 4. Either preserve page->_mapcount or restore it to -1 before freeing it.
+ *
+ * If you allocate pages of order > 0, you can use the fields in the struct
+ * page associated with each page, but bear in mind that the pages may have
+ * been inserted individually into the page cache, so you must use the above
+ * four fields in a compatible way for each struct page.
+ *
+ * SLUB uses cmpxchg_double() to atomically update its freelist and
+ * counters.  That requires that freelist & counters be adjacent and
+ * double-word aligned.  We align all struct pages to double-word
+ * boundaries, and ensure that 'freelist' is aligned within the
+ * struct.
  */
+#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
+#define _struct_page_alignment	__aligned(2 * sizeof(unsigned long))
+#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
+#define _slub_counter_t		unsigned long
+#else
+#define _slub_counter_t		unsigned int
+#endif
+#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
+#define _struct_page_alignment
+#define _slub_counter_t		unsigned int
+#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */
+
 struct page {
 	/* First double word block */
 	unsigned long flags;		/* Atomic flags, some possibly
 					 * updated asynchronously */
 	union {
-		struct address_space *mapping;	/* If low bit clear, points to
-						 * inode address_space, or NULL.
-						 * If page mapped as anonymous
-						 * memory, low bit is set, and
-						 * it points to anon_vma object
-						 * or KSM private structure. See
-						 * PAGE_MAPPING_ANON and
-						 * PAGE_MAPPING_KSM.
-						 */
+		/* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */
+		struct address_space *mapping;
+
 		void *s_mem;			/* slab first object */
 		atomic_t compound_mapcount;	/* first tail page */
 		/* page_deferred_list().next	 -- second tail page */
@@ -66,40 +94,27 @@ struct page {
 	};
 
 	union {
-#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
-	defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
-		/* Used for cmpxchg_double in slub */
-		unsigned long counters;
-#else
-		/*
-		 * Keep _refcount separate from slub cmpxchg_double data.
-		 * As the rest of the double word is protected by slab_lock
-		 * but _refcount is not.
-		 */
-		unsigned counters;
-#endif
-		struct {
+		_slub_counter_t counters;
+		unsigned int active;		/* SLAB */
+		struct {			/* SLUB */
+			unsigned inuse:16;
+			unsigned objects:15;
+			unsigned frozen:1;
+		};
+		int units;			/* SLOB */
+
+		struct {			/* Page cache */
+			/*
+			 * Count of ptes mapped in mms, to show when
+			 * page is mapped & limit reverse map searches.
+			 *
+			 * Extra information about page type may be
+			 * stored here for pages that are never mapped,
+			 * in which case the value MUST BE <= -2.
+			 * See page-flags.h for more details.
+			 */
+			atomic_t _mapcount;
 
-			union {
-				/*
-				 * Count of ptes mapped in mms, to show when
-				 * page is mapped & limit reverse map searches.
-				 *
-				 * Extra information about page type may be
-				 * stored here for pages that are never mapped,
-				 * in which case the value MUST BE <= -2.
-				 * See page-flags.h for more details.
-				 */
-				atomic_t _mapcount;
-
-				unsigned int active;		/* SLAB */
-				struct {			/* SLUB */
-					unsigned inuse:16;
-					unsigned objects:15;
-					unsigned frozen:1;
-				};
-				int units;			/* SLOB */
-			};
 			/*
 			 * Usage count, *USE WRAPPER FUNCTION* when manual
 			 * accounting. See page_ref.h
@@ -109,8 +124,6 @@ struct page {
 	};
 
 	/*
-	 * Third double word block
-	 *
 	 * WARNING: bit 0 of the first word encode PageTail(). That means
 	 * the rest users of the storage space MUST NOT use the bit to
 	 * avoid collision and false-positive PageTail().
@@ -145,19 +158,9 @@ struct page {
 			unsigned long compound_head; /* If bit zero is set */
 
 			/* First tail page only */
-#ifdef CONFIG_64BIT
-			/*
-			 * On 64 bit system we have enough space in struct page
-			 * to encode compound_dtor and compound_order with
-			 * unsigned int. It can help compiler generate better or
-			 * smaller code on some archtectures.
-			 */
-			unsigned int compound_dtor;
-			unsigned int compound_order;
-#else
-			unsigned short int compound_dtor;
-			unsigned short int compound_order;
-#endif
+			unsigned char compound_dtor;
+			unsigned char compound_order;
+			/* two/six bytes available here */
 		};
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
@@ -171,15 +174,14 @@ struct page {
 #endif
 	};
 
-	/* Remainder is not double word aligned */
 	union {
-		unsigned long private;		/* Mapping-private opaque data:
-					 	 * usually used for buffer_heads
-						 * if PagePrivate set; used for
-						 * swp_entry_t if PageSwapCache;
-						 * indicates order in the buddy
-						 * system if PG_buddy is set.
-						 */
+		/*
+		 * Mapping-private opaque data:
+		 * Usually used for buffer_heads if PagePrivate
+		 * Used for swp_entry_t if PageSwapCache
+		 * Indicates order in the buddy system if PageBuddy
+		 */
+		unsigned long private;
 #if USE_SPLIT_PTE_PTLOCKS
 #if ALLOC_SPLIT_PTLOCKS
 		spinlock_t *ptl;
@@ -212,15 +214,7 @@ struct page {
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	int _last_cpupid;
 #endif
-}
-/*
- * The struct page can be forced to be double word aligned so that atomic ops
- * on double words work. The SLUB allocator can make use of such a feature.
- */
-#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
-	__aligned(2 * sizeof(unsigned long))
-#endif
-;
+} _struct_page_alignment;
 
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)

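The struct page reshuffle above pulls the SLAB/SLUB/SLOB fields up one union level and replaces the trailing __aligned() block with a _struct_page_alignment macro; the double-word alignment exists so that SLUB's cmpxchg_double() on the freelist/counters pair is legal. A small stand-alone check of what such an alignment annotation buys (a toy struct, not the real struct page):

#include <stdio.h>

/* Toy struct, not the real struct page: _Alignas here plays the role of
 * the kernel's __aligned(2 * sizeof(unsigned long)), giving the
 * double-word alignment that cmpxchg_double() on freelist/counters needs. */
struct toy_page {
	_Alignas(2 * sizeof(unsigned long)) unsigned long flags;
	void *freelist;
	unsigned long counters;
	unsigned long compound_head;
};

_Static_assert(_Alignof(struct toy_page) == 2 * sizeof(unsigned long),
	       "double-word alignment");

int main(void)
{
	printf("sizeof=%zu alignof=%zu\n",
	       sizeof(struct toy_page), _Alignof(struct toy_page));
	return 0;
}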
+ 27 - 3
include/linux/mmu_notifier.h

@@ -2,6 +2,7 @@
 #ifndef _LINUX_MMU_NOTIFIER_H
 #define _LINUX_MMU_NOTIFIER_H
 
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/mm_types.h>
@@ -10,6 +11,9 @@
 struct mmu_notifier;
 struct mmu_notifier_ops;
 
+/* mmu_notifier_ops flags */
+#define MMU_INVALIDATE_DOES_NOT_BLOCK	(0x01)
+
 #ifdef CONFIG_MMU_NOTIFIER
 
 /*
@@ -26,6 +30,15 @@ struct mmu_notifier_mm {
 };
 
 struct mmu_notifier_ops {
+	/*
+	 * Flags to specify behavior of callbacks for this MMU notifier.
+	 * Used to determine which context an operation may be called.
+	 *
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not
+	 *	block
+	 */
+	int flags;
+
 	/*
 	 * Called either by mmu_notifier_unregister or when the mm is
 	 * being destroyed by exit_mmap, always before all pages are
@@ -137,6 +150,10 @@ struct mmu_notifier_ops {
 	 * page. Pages will no longer be referenced by the linux
 	 * address space but may still be referenced by sptes until
 	 * the last refcount is dropped.
+	 *
+	 * If both of these callbacks cannot block, and invalidate_range
+	 * cannot block, mmu_notifier_ops.flags should have
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range_start)(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
@@ -159,12 +176,13 @@ struct mmu_notifier_ops {
 	 * external TLB range needs to be flushed. For more in depth
 	 * discussion on this see Documentation/vm/mmu_notifier.txt
 	 *
-	 * The invalidate_range() function is called under the ptl
-	 * spin-lock and not allowed to sleep.
-	 *
 	 * Note that this function might be called with just a sub-range
 	 * of what was passed to invalidate_range_start()/end(), if
 	 * called between those functions.
+	 *
+	 * If this callback cannot block, and invalidate_range_{start,end}
+	 * cannot block, mmu_notifier_ops.flags should have
+	 * MMU_INVALIDATE_DOES_NOT_BLOCK set.
 	 */
 	void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
 				 unsigned long start, unsigned long end);
@@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
 				  bool only_end);
 extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 				  unsigned long start, unsigned long end);
+extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm);
 
 static inline void mmu_notifier_release(struct mm_struct *mm)
 {
@@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
 {
 }
 
+static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	return false;
+}
+
 static inline void mmu_notifier_mm_init(struct mm_struct *mm)
 {
 }

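The new MMU_INVALIDATE_DOES_NOT_BLOCK flag lets a notifier declare that its invalidation callbacks never sleep, and mm_has_blockable_invalidate_notifiers() reports whether any registered notifier lacks that guarantee. A hypothetical stand-in for that scan, just to show the flag test (toy data structures, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define DOES_NOT_BLOCK	0x01	/* mirrors MMU_INVALIDATE_DOES_NOT_BLOCK */

struct toy_notifier_ops {
	int flags;
};

/* Hypothetical stand-in for mm_has_blockable_invalidate_notifiers():
 * true as soon as any registered notifier has not promised to be
 * non-blocking. */
static bool has_blockable(const struct toy_notifier_ops *ops, int n)
{
	for (int i = 0; i < n; i++)
		if (!(ops[i].flags & DOES_NOT_BLOCK))
			return true;
	return false;
}

int main(void)
{
	struct toy_notifier_ops ops[] = { { DOES_NOT_BLOCK }, { 0 } };

	printf("blockable notifier registered: %d\n", has_blockable(ops, 2));
	return 0;
}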
+ 10 - 2
include/linux/mmzone.h

@@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void);
 
 /*
  * We use the lower bits of the mem_map pointer to store
- * a little bit of information.  There should be at least
- * 3 bits here due to 32-bit alignment.
+ * a little bit of information.  The pointer is calculated
+ * as mem_map - section_nr_to_pfn(pnum).  The result is
+ * aligned to the minimum alignment of the two values:
+ *   1. All mem_map arrays are page-aligned.
+ *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
+ *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
+ *      (equal to SECTION_SIZE_BITS - PAGE_SHIFT), and the
+ *      worst combination is powerpc with 256k pages,
+ *      which results in PFN_SECTION_SHIFT equal to 6.
+ * To sum it up, at least 6 bits are available.
  */
 #define	SECTION_MARKED_PRESENT	(1UL<<0)
 #define SECTION_HAS_MEM_MAP	(1UL<<1)

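For the worst case named in the comment: 256 KB pages mean PAGE_SHIFT is 18, and with a 16 MB powerpc section size (SECTION_SIZE_BITS of 24, an assumption for this illustration) PFN_SECTION_SHIFT comes out to 24 - 18 = 6, which is where the "at least 6 bits" figure comes from. The arithmetic, spelled out:

#include <stdio.h>

int main(void)
{
	int page_shift = 18;		/* 256 KB pages */
	int section_size_bits = 24;	/* 16 MB sections (assumed value) */

	printf("PFN_SECTION_SHIFT = %d\n", section_size_bits - page_shift);
	return 0;
}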
+ 0 - 5
include/linux/page-flags.h

@@ -46,11 +46,6 @@
  * guarantees that this bit is cleared for a page when it first is entered into
  * the page cache.
  *
- * PG_highmem pages are not permanently mapped into the kernel virtual address
- * space, they need to be kmapped separately for doing IO on the pages.  The
- * struct page (these bits with information) are always mapped into kernel
- * address space...
- *
  * PG_hwpoison indicates that a page got corrupted in hardware and contains
  * data with incorrect ECC bits that triggered a machine check. Accessing is
  * not safe since it may cause another machine check. Don't touch!

+ 3 - 3
include/linux/pagevec.h

@@ -9,14 +9,14 @@
 #ifndef _LINUX_PAGEVEC_H
 #define _LINUX_PAGEVEC_H
 
-/* 14 pointers + two long's align the pagevec structure to a power of two */
-#define PAGEVEC_SIZE	14
+/* 15 pointers + header align the pagevec structure to a power of two */
+#define PAGEVEC_SIZE	15
 
 struct page;
 struct address_space;
 
 struct pagevec {
-	unsigned long nr;
+	unsigned char nr;
 	bool percpu_pvec_drained;
 	struct page *pages[PAGEVEC_SIZE];
 };

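Shrinking pagevec.nr to an unsigned char is what makes room for the fifteenth page pointer while keeping the structure a power of two in size (1 + 1 + 6 bytes of padding + 15 * 8 = 128 bytes on 64-bit). A quick check with a toy copy of the layout (not the kernel struct itself):

#include <stdbool.h>
#include <stdio.h>

/* Toy copy of the new layout: one byte for nr, one for the drained flag,
 * implicit padding, then 15 page pointers.  On a 64-bit build that is
 * 8 + 15 * 8 = 128 bytes, a power of two. */
struct toy_pagevec {
	unsigned char nr;
	bool percpu_pvec_drained;
	void *pages[15];
};

int main(void)
{
	printf("sizeof(struct toy_pagevec) = %zu\n", sizeof(struct toy_pagevec));
	return 0;
}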
+ 2 - 22
include/linux/sched/mm.h

@@ -11,7 +11,7 @@
 /*
  * Routines for handling mm_structs
  */
-extern struct mm_struct * mm_alloc(void);
+extern struct mm_struct *mm_alloc(void);
 
 /**
  * mmgrab() - Pin a &struct mm_struct.
@@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm)
 	atomic_inc(&mm->mm_count);
 }
 
-/* mmdrop drops the mm and the page tables */
-extern void __mmdrop(struct mm_struct *);
-static inline void mmdrop(struct mm_struct *mm)
-{
-	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
-		__mmdrop(mm);
-}
-
-static inline void mmdrop_async_fn(struct work_struct *work)
-{
-	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
-	__mmdrop(mm);
-}
-
-static inline void mmdrop_async(struct mm_struct *mm)
-{
-	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
-		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
-		schedule_work(&mm->async_put_work);
-	}
-}
+extern void mmdrop(struct mm_struct *mm);
 
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.

+ 2 - 4
include/linux/shmem_fs.h

@@ -112,13 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages);
 
 #ifdef CONFIG_TMPFS
 
-extern int shmem_add_seals(struct file *file, unsigned int seals);
-extern int shmem_get_seals(struct file *file);
-extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
+extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
 
 #else
 
-static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a)
+static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a)
 {
 	return -EINVAL;
 }

+ 0 - 2
include/linux/swap.h

@@ -332,7 +332,6 @@ extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
-extern void lru_add_drain_all_cpuslocked(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
@@ -345,7 +344,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 
 /* linux/mm/vmscan.c */
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
-extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);

+ 0 - 17
include/linux/vmstat.h

@@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 	return x;
 }
 
-static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
-					enum node_stat_item item)
-{
-	long x = atomic_long_read(&pgdat->vm_stat[item]);
-
-#ifdef CONFIG_SMP
-	int cpu;
-	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
-
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
-
-
 #ifdef CONFIG_NUMA
 extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,

+ 2 - 0
include/linux/zpool.h

@@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver);
 
 int zpool_unregister_driver(struct zpool_driver *driver);
 
+bool zpool_evictable(struct zpool *pool);
+
 #endif

+ 10 - 13
include/trace/events/vmscan.h

@@ -192,12 +192,12 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re
 
 TRACE_EVENT(mm_shrink_slab_start,
 	TP_PROTO(struct shrinker *shr, struct shrink_control *sc,
-		long nr_objects_to_shrink, unsigned long pgs_scanned,
-		unsigned long lru_pgs, unsigned long cache_items,
-		unsigned long long delta, unsigned long total_scan),
+		long nr_objects_to_shrink, unsigned long cache_items,
+		unsigned long long delta, unsigned long total_scan,
+		int priority),
 
-	TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs,
-		cache_items, delta, total_scan),
+	TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan,
+		priority),
 
 	TP_STRUCT__entry(
 		__field(struct shrinker *, shr)
@@ -205,11 +205,10 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__field(int, nid)
 		__field(long, nr_objects_to_shrink)
 		__field(gfp_t, gfp_flags)
-		__field(unsigned long, pgs_scanned)
-		__field(unsigned long, lru_pgs)
 		__field(unsigned long, cache_items)
 		__field(unsigned long long, delta)
 		__field(unsigned long, total_scan)
+		__field(int, priority)
 	),
 
 	TP_fast_assign(
@@ -218,24 +217,22 @@ TRACE_EVENT(mm_shrink_slab_start,
 		__entry->nid = sc->nid;
 		__entry->nr_objects_to_shrink = nr_objects_to_shrink;
 		__entry->gfp_flags = sc->gfp_mask;
-		__entry->pgs_scanned = pgs_scanned;
-		__entry->lru_pgs = lru_pgs;
 		__entry->cache_items = cache_items;
 		__entry->delta = delta;
 		__entry->total_scan = total_scan;
+		__entry->priority = priority;
 	),
 
-	TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld",
+	TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d",
 		__entry->shrink,
 		__entry->shr,
 		__entry->nid,
 		__entry->nr_objects_to_shrink,
 		show_gfp_flags(__entry->gfp_flags),
-		__entry->pgs_scanned,
-		__entry->lru_pgs,
 		__entry->cache_items,
 		__entry->delta,
-		__entry->total_scan)
+		__entry->total_scan,
+		__entry->priority)
 );
 
 TRACE_EVENT(mm_shrink_slab_end,

+ 236 - 212
kernel/fork.c

@@ -77,6 +77,7 @@
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
+#include <linux/sched/mm.h>
 #include <linux/perf_event.h>
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
@@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
+#ifdef CONFIG_MMU
+static __latent_entropy int dup_mmap(struct mm_struct *mm,
+					struct mm_struct *oldmm)
+{
+	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+	struct rb_node **rb_link, *rb_parent;
+	int retval;
+	unsigned long charge;
+	LIST_HEAD(uf);
+
+	uprobe_start_dup_mmap();
+	if (down_write_killable(&oldmm->mmap_sem)) {
+		retval = -EINTR;
+		goto fail_uprobe_end;
+	}
+	flush_cache_dup_mm(oldmm);
+	uprobe_dup_mmap(oldmm, mm);
+	/*
+	 * Not linked in yet - no deadlock potential:
+	 */
+	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	/* No ordering required: file already has been exposed. */
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+
+	mm->total_vm = oldmm->total_vm;
+	mm->data_vm = oldmm->data_vm;
+	mm->exec_vm = oldmm->exec_vm;
+	mm->stack_vm = oldmm->stack_vm;
+
+	rb_link = &mm->mm_rb.rb_node;
+	rb_parent = NULL;
+	pprev = &mm->mmap;
+	retval = ksm_fork(mm, oldmm);
+	if (retval)
+		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
+
+	prev = NULL;
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+		struct file *file;
+
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
+			continue;
+		}
+		charge = 0;
+		if (mpnt->vm_flags & VM_ACCOUNT) {
+			unsigned long len = vma_pages(mpnt);
+
+			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
+				goto fail_nomem;
+			charge = len;
+		}
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		if (!tmp)
+			goto fail_nomem;
+		*tmp = *mpnt;
+		INIT_LIST_HEAD(&tmp->anon_vma_chain);
+		retval = vma_dup_policy(mpnt, tmp);
+		if (retval)
+			goto fail_nomem_policy;
+		tmp->vm_mm = mm;
+		retval = dup_userfaultfd(tmp, &uf);
+		if (retval)
+			goto fail_nomem_anon_vma_fork;
+		if (tmp->vm_flags & VM_WIPEONFORK) {
+			/* VM_WIPEONFORK gets a clean slate in the child. */
+			tmp->anon_vma = NULL;
+			if (anon_vma_prepare(tmp))
+				goto fail_nomem_anon_vma_fork;
+		} else if (anon_vma_fork(tmp, mpnt))
+			goto fail_nomem_anon_vma_fork;
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+		tmp->vm_next = tmp->vm_prev = NULL;
+		file = tmp->vm_file;
+		if (file) {
+			struct inode *inode = file_inode(file);
+			struct address_space *mapping = file->f_mapping;
+
+			get_file(file);
+			if (tmp->vm_flags & VM_DENYWRITE)
+				atomic_dec(&inode->i_writecount);
+			i_mmap_lock_write(mapping);
+			if (tmp->vm_flags & VM_SHARED)
+				atomic_inc(&mapping->i_mmap_writable);
+			flush_dcache_mmap_lock(mapping);
+			/* insert tmp into the share list, just after mpnt */
+			vma_interval_tree_insert_after(tmp, mpnt,
+					&mapping->i_mmap);
+			flush_dcache_mmap_unlock(mapping);
+			i_mmap_unlock_write(mapping);
+		}
+
+		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
+		 * Link in the new vma and copy the page table entries.
+		 */
+		*pprev = tmp;
+		pprev = &tmp->vm_next;
+		tmp->vm_prev = prev;
+		prev = tmp;
+
+		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+		rb_link = &tmp->vm_rb.rb_right;
+		rb_parent = &tmp->vm_rb;
+
+		mm->map_count++;
+		if (!(tmp->vm_flags & VM_WIPEONFORK))
+			retval = copy_page_range(mm, oldmm, mpnt);
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
+		if (retval)
+			goto out;
+	}
+	/* a new mm has just been created */
+	arch_dup_mmap(oldmm, mm);
+	retval = 0;
+out:
+	up_write(&mm->mmap_sem);
+	flush_tlb_mm(oldmm);
+	up_write(&oldmm->mmap_sem);
+	dup_userfaultfd_complete(&uf);
+fail_uprobe_end:
+	uprobe_end_dup_mmap();
+	return retval;
+fail_nomem_anon_vma_fork:
+	mpol_put(vma_policy(tmp));
+fail_nomem_policy:
+	kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+	retval = -ENOMEM;
+	vm_unacct_memory(charge);
+	goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct *mm)
+{
+	mm->pgd = pgd_alloc(mm);
+	if (unlikely(!mm->pgd))
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct *mm)
+{
+	pgd_free(mm, mm->pgd);
+}
+#else
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	down_write(&oldmm->mmap_sem);
+	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
+	up_write(&oldmm->mmap_sem);
+	return 0;
+}
+#define mm_alloc_pgd(mm)	(0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+static void check_mm(struct mm_struct *mm)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++) {
+		long x = atomic_long_read(&mm->rss_stat.count[i]);
+
+		if (unlikely(x))
+			printk(KERN_ALERT "BUG: Bad rss-counter state "
+					  "mm:%p idx:%d val:%ld\n", mm, i, x);
+	}
+
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
+#endif
+}
+
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+static void __mmdrop(struct mm_struct *mm)
+{
+	BUG_ON(mm == &init_mm);
+	mm_free_pgd(mm);
+	destroy_context(mm);
+	hmm_mm_destroy(mm);
+	mmu_notifier_mm_destroy(mm);
+	check_mm(mm);
+	put_user_ns(mm->user_ns);
+	free_mm(mm);
+}
+
+void mmdrop(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count)))
+		__mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmdrop);
+
+static void mmdrop_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm;
+
+	mm = container_of(work, struct mm_struct, async_put_work);
+	__mmdrop(mm);
+}
+
+static void mmdrop_async(struct mm_struct *mm)
+{
+	if (unlikely(atomic_dec_and_test(&mm->mm_count))) {
+		INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
+		schedule_work(&mm->async_put_work);
+	}
+}
+
 static inline void free_signal_struct(struct signal_struct *sig)
 {
 	taskstats_tgid_free(sig);
@@ -594,181 +830,8 @@ free_tsk:
 	return NULL;
 }
 
-#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
-					struct mm_struct *oldmm)
-{
-	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
-	struct rb_node **rb_link, *rb_parent;
-	int retval;
-	unsigned long charge;
-	LIST_HEAD(uf);
-
-	uprobe_start_dup_mmap();
-	if (down_write_killable(&oldmm->mmap_sem)) {
-		retval = -EINTR;
-		goto fail_uprobe_end;
-	}
-	flush_cache_dup_mm(oldmm);
-	uprobe_dup_mmap(oldmm, mm);
-	/*
-	 * Not linked in yet - no deadlock potential:
-	 */
-	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
-
-	/* No ordering required: file already has been exposed. */
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-
-	mm->total_vm = oldmm->total_vm;
-	mm->data_vm = oldmm->data_vm;
-	mm->exec_vm = oldmm->exec_vm;
-	mm->stack_vm = oldmm->stack_vm;
-
-	rb_link = &mm->mm_rb.rb_node;
-	rb_parent = NULL;
-	pprev = &mm->mmap;
-	retval = ksm_fork(mm, oldmm);
-	if (retval)
-		goto out;
-	retval = khugepaged_fork(mm, oldmm);
-	if (retval)
-		goto out;
-
-	prev = NULL;
-	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
-		struct file *file;
-
-		if (mpnt->vm_flags & VM_DONTCOPY) {
-			vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
-			continue;
-		}
-		charge = 0;
-		if (mpnt->vm_flags & VM_ACCOUNT) {
-			unsigned long len = vma_pages(mpnt);
-
-			if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
-				goto fail_nomem;
-			charge = len;
-		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-		if (!tmp)
-			goto fail_nomem;
-		*tmp = *mpnt;
-		INIT_LIST_HEAD(&tmp->anon_vma_chain);
-		retval = vma_dup_policy(mpnt, tmp);
-		if (retval)
-			goto fail_nomem_policy;
-		tmp->vm_mm = mm;
-		retval = dup_userfaultfd(tmp, &uf);
-		if (retval)
-			goto fail_nomem_anon_vma_fork;
-		if (tmp->vm_flags & VM_WIPEONFORK) {
-			/* VM_WIPEONFORK gets a clean slate in the child. */
-			tmp->anon_vma = NULL;
-			if (anon_vma_prepare(tmp))
-				goto fail_nomem_anon_vma_fork;
-		} else if (anon_vma_fork(tmp, mpnt))
-			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
-		tmp->vm_next = tmp->vm_prev = NULL;
-		file = tmp->vm_file;
-		if (file) {
-			struct inode *inode = file_inode(file);
-			struct address_space *mapping = file->f_mapping;
-
-			get_file(file);
-			if (tmp->vm_flags & VM_DENYWRITE)
-				atomic_dec(&inode->i_writecount);
-			i_mmap_lock_write(mapping);
-			if (tmp->vm_flags & VM_SHARED)
-				atomic_inc(&mapping->i_mmap_writable);
-			flush_dcache_mmap_lock(mapping);
-			/* insert tmp into the share list, just after mpnt */
-			vma_interval_tree_insert_after(tmp, mpnt,
-					&mapping->i_mmap);
-			flush_dcache_mmap_unlock(mapping);
-			i_mmap_unlock_write(mapping);
-		}
-
-		/*
-		 * Clear hugetlb-related page reserves for children. This only
-		 * affects MAP_PRIVATE mappings. Faults generated by the child
-		 * are not guaranteed to succeed, even if read-only
-		 */
-		if (is_vm_hugetlb_page(tmp))
-			reset_vma_resv_huge_pages(tmp);
-
-		/*
-		 * Link in the new vma and copy the page table entries.
-		 */
-		*pprev = tmp;
-		pprev = &tmp->vm_next;
-		tmp->vm_prev = prev;
-		prev = tmp;
-
-		__vma_link_rb(mm, tmp, rb_link, rb_parent);
-		rb_link = &tmp->vm_rb.rb_right;
-		rb_parent = &tmp->vm_rb;
-
-		mm->map_count++;
-		if (!(tmp->vm_flags & VM_WIPEONFORK))
-			retval = copy_page_range(mm, oldmm, mpnt);
-
-		if (tmp->vm_ops && tmp->vm_ops->open)
-			tmp->vm_ops->open(tmp);
-
-		if (retval)
-			goto out;
-	}
-	/* a new mm has just been created */
-	retval = arch_dup_mmap(oldmm, mm);
-out:
-	up_write(&mm->mmap_sem);
-	flush_tlb_mm(oldmm);
-	up_write(&oldmm->mmap_sem);
-	dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
-	uprobe_end_dup_mmap();
-	return retval;
-fail_nomem_anon_vma_fork:
-	mpol_put(vma_policy(tmp));
-fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
-fail_nomem:
-	retval = -ENOMEM;
-	vm_unacct_memory(charge);
-	goto out;
-}
-
-static inline int mm_alloc_pgd(struct mm_struct *mm)
-{
-	mm->pgd = pgd_alloc(mm);
-	if (unlikely(!mm->pgd))
-		return -ENOMEM;
-	return 0;
-}
-
-static inline void mm_free_pgd(struct mm_struct *mm)
-{
-	pgd_free(mm, mm->pgd);
-}
-#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-	down_write(&oldmm->mmap_sem);
-	RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-	up_write(&oldmm->mmap_sem);
-	return 0;
-}
-#define mm_alloc_pgd(mm)	(0)
-#define mm_free_pgd(mm)
-#endif /* CONFIG_MMU */
-
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
-#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
-
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
 
 static int __init coredump_filter_setup(char *s)
@@ -858,27 +921,6 @@ fail_nopgd:
 	return NULL;
 }
 
-static void check_mm(struct mm_struct *mm)
-{
-	int i;
-
-	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = atomic_long_read(&mm->rss_stat.count[i]);
-
-		if (unlikely(x))
-			printk(KERN_ALERT "BUG: Bad rss-counter state "
-					  "mm:%p idx:%d val:%ld\n", mm, i, x);
-	}
-
-	if (mm_pgtables_bytes(mm))
-		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
-				mm_pgtables_bytes(mm));
-
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
-	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
-#endif
-}
-
 /*
  * Allocate and initialize an mm_struct.
  */
@@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void)
 	return mm_init(mm, current, current_user_ns());
 }
 
-/*
- * Called when the last reference to the mm
- * is dropped: either by a lazy thread or by
- * mmput. Free the page directory and the mm.
- */
-void __mmdrop(struct mm_struct *mm)
-{
-	BUG_ON(mm == &init_mm);
-	mm_free_pgd(mm);
-	destroy_context(mm);
-	hmm_mm_destroy(mm);
-	mmu_notifier_mm_destroy(mm);
-	check_mm(mm);
-	put_user_ns(mm->user_ns);
-	free_mm(mm);
-}
-EXPORT_SYMBOL_GPL(__mmdrop);
-
 static inline void __mmput(struct mm_struct *mm)
 {
 	VM_BUG_ON(atomic_read(&mm->mm_users));

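The hunks above uninline the final-reference path: __mmdrop() becomes static here, mmdrop() is the out-of-line entry point that runs it when mm_count hits zero, and mmdrop_async() defers the same teardown to a workqueue for callers that cannot safely free a page directory in their current context. A minimal sketch of the two reference counts involved (the helper names below are hypothetical; mmgrab(), mmget_not_zero(), mmput() and mmdrop() are the existing <linux/sched/mm.h> interfaces):

    /* Pin only the mm_struct (mm_count), e.g. for lazy-TLB style bookkeeping. */
    static void example_borrow_mm(struct mm_struct *mm)
    {
            mmgrab(mm);     /* mm_count++: the structure cannot be freed      */
            /* ... no guarantee about page tables or VMAs at this point ...   */
            mmdrop(mm);     /* last reference ends up in __mmdrop() above     */
    }

    /* Pin the whole address space (mm_users) when page tables must stay. */
    static void example_borrow_address_space(struct mm_struct *mm)
    {
            if (!mmget_not_zero(mm))        /* mm_users++ unless already torn down */
                    return;
            /* ... VMAs and page tables are stable here ... */
            mmput(mm);                      /* last user triggers __mmput() below  */
    }
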
+ 0 - 7
kernel/sysctl.c

@@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	 },
-	 {
-		.procname	= "hugepages_treat_as_movable",
-		.data		= &hugepages_treat_as_movable,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 	{
 		.procname	= "nr_overcommit_hugepages",
 		.data		= NULL,

+ 1 - 6
mm/Kconfig

@@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB
 
 	  A sane initial value is 80 MB.
 
-# For architectures that support deferred memory initialisation
-config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-	bool
-
 config DEFERRED_STRUCT_PAGE_INIT
 	bool "Defer initialisation of struct pages to kthreads"
 	default n
-	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-	depends on NO_BOOTMEM && MEMORY_HOTPLUG
+	depends on NO_BOOTMEM
 	depends on !FLATMEM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a

+ 1 - 1
mm/compaction.c

@@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @alloc_flags: The allocation flags of the current allocation
  * @ac: The context of current allocation
- * @mode: The migration mode for async, sync light, or sync migration
+ * @prio: Determines how hard direct compaction should try to succeed
  *
  * This is the main entry point for direct page compaction.
  */

+ 9 - 1
mm/fadvise.c

@@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		 */
 		start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
 		end_index = (endbyte >> PAGE_SHIFT);
-		if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) {
+		/*
+		 * The page at end_index will be inclusively discarded by
+		 * invalidate_mapping_pages(), so subtracting 1 from
+		 * end_index means we will skip the last page.  But if endbyte
+		 * is page aligned or is at the end of file, we should not skip
+		 * that page - discarding the last page is safe enough.
+		 */
+		if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK &&
+				endbyte != inode->i_size - 1) {
 			/* First page is tricky as 0 - 1 = -1, but pgoff_t
 			 * is unsigned, so the end_index >= start_index
 			 * check below would be true and we'll discard the whole

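For what the new check buys in practice: when the advice range ends exactly at EOF, the trailing partially filled page is now invalidated as well instead of being kept because it is not page aligned. A hedged user-space illustration (error handling trimmed, file name up to the caller):

    #include <fcntl.h>
    #include <sys/stat.h>
    #include <unistd.h>

    /* Drop a whole file from the page cache, including a trailing partial page. */
    int drop_whole_file(const char *path)
    {
            struct stat st;
            int fd = open(path, O_RDONLY);

            if (fd < 0)
                    return -1;
            if (fstat(fd, &st) < 0) {
                    close(fd);
                    return -1;
            }
            /* endbyte == i_size - 1, so the last page is no longer skipped */
            int ret = posix_fadvise(fd, 0, st.st_size, POSIX_FADV_DONTNEED);

            close(fd);
            return ret;
    }
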
+ 0 - 1
mm/filemap.c

@@ -31,7 +31,6 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/cpuset.h>
-#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/hugetlb.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>

+ 1 - 3
mm/hmm.c

@@ -418,7 +418,7 @@ again:
 		}
 
 		if (!pte_present(pte)) {
-			swp_entry_t entry;
+			swp_entry_t entry = pte_to_swp_entry(pte);
 
 			if (!non_swap_entry(entry)) {
 				if (hmm_vma_walk->fault)
@@ -426,8 +426,6 @@ again:
 				continue;
 			}
 
-			entry = pte_to_swp_entry(pte);
-
 			/*
 			 * This is a special swap entry, ignore migration, use
 			 * device and report anything else as error.

+ 37 - 45
mm/huge_memory.c

@@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	 * pmdp_invalidate() is required to make sure we don't miss
 	 * dirty/young flags set by hardware.
 	 */
-	entry = *pmd;
-	pmdp_invalidate(vma, addr, pmd);
-
-	/*
-	 * Recover dirty/young flags.  It relies on pmdp_invalidate to not
-	 * corrupt them.
-	 */
-	if (pmd_dirty(*pmd))
-		entry = pmd_mkdirty(entry);
-	if (pmd_young(*pmd))
-		entry = pmd_mkyoung(entry);
+	entry = pmdp_invalidate(vma, addr, pmd);
 
 	entry = pmd_modify(entry, newprot);
 	if (preserve_write)
@@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	struct mm_struct *mm = vma->vm_mm;
 	struct page *page;
 	pgtable_t pgtable;
-	pmd_t _pmd;
-	bool young, write, dirty, soft_dirty, pmd_migration = false;
+	pmd_t old_pmd, _pmd;
+	bool young, write, soft_dirty, pmd_migration = false;
 	unsigned long addr;
 	int i;
 
@@ -2116,24 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
+	/*
+	 * Up to this point the pmd is present and huge and userland has the
+	 * whole access to the hugepage during the split (which happens in
+	 * place). If we overwrite the pmd with the not-huge version pointing
+	 * to the pte here (which of course we could if all CPUs were bug
+	 * free), userland could trigger a small page size TLB miss on the
+	 * small sized TLB while the hugepage TLB entry is still established in
+	 * the huge TLB. Some CPUs don't like that.
+	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+	 * 383 on page 93. Intel should be safe but also warns that it is
+	 * only safe if the permission and cache attributes of the two entries
+	 * loaded in the TLB are identical (which should be the case here).
+	 * But it is generally safer to never allow small and huge TLB entries
+	 * for the same virtual address to be loaded simultaneously. So instead
+	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+	 * current pmd notpresent (atomically because here the pmd_trans_huge
+	 * must remain set at all times on the pmd until the split is complete
+	 * for this pmd), then we flush the SMP TLB and finally we write the
+	 * non-huge version of the pmd entry with pmd_populate.
+	 */
+	old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-	pmd_migration = is_pmd_migration_entry(*pmd);
+	pmd_migration = is_pmd_migration_entry(old_pmd);
 	if (pmd_migration) {
 		swp_entry_t entry;
 
-		entry = pmd_to_swp_entry(*pmd);
+		entry = pmd_to_swp_entry(old_pmd);
 		page = pfn_to_page(swp_offset(entry));
 	} else
 #endif
-		page = pmd_page(*pmd);
+		page = pmd_page(old_pmd);
 	VM_BUG_ON_PAGE(!page_count(page), page);
 	page_ref_add(page, HPAGE_PMD_NR - 1);
-	write = pmd_write(*pmd);
-	young = pmd_young(*pmd);
-	dirty = pmd_dirty(*pmd);
-	soft_dirty = pmd_soft_dirty(*pmd);
+	if (pmd_dirty(old_pmd))
+		SetPageDirty(page);
+	write = pmd_write(old_pmd);
+	young = pmd_young(old_pmd);
+	soft_dirty = pmd_soft_dirty(old_pmd);
 
-	pmdp_huge_split_prepare(vma, haddr, pmd);
+	/*
+	 * Withdraw the table only after we mark the pmd entry invalid.
+	 * This is critical for some architectures (Power).
+	 */
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
@@ -2160,8 +2176,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 			if (soft_dirty)
 				entry = pte_mksoft_dirty(entry);
 		}
-		if (dirty)
-			SetPageDirty(page + i);
 		pte = pte_offset_map(&_pmd, addr);
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, entry);
@@ -2189,28 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 
 	smp_wmb(); /* make pte visible before pmd */
-	/*
-	 * Up to this point the pmd is present and huge and userland has the
-	 * whole access to the hugepage during the split (which happens in
-	 * place). If we overwrite the pmd with the not-huge version pointing
-	 * to the pte here (which of course we could if all CPUs were bug
-	 * free), userland could trigger a small page size TLB miss on the
-	 * small sized TLB while the hugepage TLB entry is still established in
-	 * the huge TLB. Some CPU doesn't like that.
-	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
-	 * 383 on page 93. Intel should be safe but is also warns that it's
-	 * only safe if the permission and cache attributes of the two entries
-	 * loaded in the two TLB is identical (which should be the case here).
-	 * But it is generally safer to never allow small and huge TLB entries
-	 * for the same virtual address to be loaded simultaneously. So instead
-	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
-	 * current pmd notpresent (atomically because here the pmd_trans_huge
-	 * and pmd_trans_splitting must remain set at all times on the pmd
-	 * until the split is complete for this pmd), then we flush the SMP TLB
-	 * and finally we write the non-huge version of the pmd entry with
-	 * pmd_populate.
-	 */
-	pmdp_invalidate(vma, haddr, pmd);
 	pmd_populate(mm, pmd, pgtable);
 
 	if (freeze) {

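The huge_memory.c changes hinge on an interface change made elsewhere in this series: pmdp_invalidate() now returns the old pmd value, including any dirty/young bits the hardware set concurrently, so callers stop re-reading *pmd afterwards and pmdp_huge_split_prepare() goes away. A minimal sketch of the new calling convention (not copied from any one caller):

    pmd_t old;

    /* Atomically make the pmd not-present and learn what it contained. */
    old = pmdp_invalidate(vma, haddr, pmdp);

    if (pmd_dirty(old))
            SetPageDirty(pmd_page(old));    /* dirtiness transferred once, for the whole page */
    /* ... decide young/write/soft-dirty from 'old', then pmd_populate() ... */
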
+ 215 - 162
mm/hugetlb.c

@@ -34,10 +34,9 @@
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
-int hugepages_treat_as_movable;
-
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
@@ -926,7 +925,7 @@ retry_cpuset:
 /* Movability of hugepages depends on migration support. */
 static inline gfp_t htlb_alloc_mask(struct hstate *h)
 {
-	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
+	if (hugepage_migration_supported(h))
 		return GFP_HIGHUSER_MOVABLE;
 	else
 		return GFP_HIGHUSER;
@@ -1108,7 +1107,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
 	return zone_spans_pfn(zone, last_pfn);
 }
 
-static struct page *alloc_gigantic_page(int nid, struct hstate *h)
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+		int nid, nodemask_t *nodemask)
 {
 	unsigned int order = huge_page_order(h);
 	unsigned long nr_pages = 1 << order;
@@ -1116,11 +1116,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
 	struct zonelist *zonelist;
 	struct zone *zone;
 	struct zoneref *z;
-	gfp_t gfp_mask;
 
-	gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 	zonelist = node_zonelist(nid, gfp_mask);
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
 		spin_lock_irqsave(&zone->lock, flags);
 
 		pfn = ALIGN(zone->zone_start_pfn, nr_pages);
@@ -1151,41 +1149,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h)
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
 static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 
-static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
-{
-	struct page *page;
-
-	page = alloc_gigantic_page(nid, h);
-	if (page) {
-		prep_compound_gigantic_page(page, huge_page_order(h));
-		prep_new_huge_page(h, page, nid);
-	}
-
-	return page;
-}
-
-static int alloc_fresh_gigantic_page(struct hstate *h,
-				nodemask_t *nodes_allowed)
-{
-	struct page *page = NULL;
-	int nr_nodes, node;
-
-	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-		page = alloc_fresh_gigantic_page_node(h, node);
-		if (page)
-			return 1;
-	}
-
-	return 0;
-}
-
 #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
 static inline bool gigantic_page_supported(void) { return false; }
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+		int nid, nodemask_t *nodemask) { return NULL; }
 static inline void free_gigantic_page(struct page *page, unsigned int order) { }
 static inline void destroy_compound_gigantic_page(struct page *page,
 						unsigned int order) { }
-static inline int alloc_fresh_gigantic_page(struct hstate *h,
-					nodemask_t *nodes_allowed) { return 0; }
 #endif
 
 static void update_and_free_page(struct hstate *h, struct page *page)
@@ -1250,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
 	ClearPagePrivate(&page[1]);
 }
 
+/*
+ * Internal hugetlb specific page flag. Do not use outside of the hugetlb
+ * code
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+	if (!PageHuge(page))
+		return false;
+
+	return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+	page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+	page[2].mapping = NULL;
+}
+
 void free_huge_page(struct page *page)
 {
 	/*
@@ -1284,7 +1276,11 @@ void free_huge_page(struct page *page)
 	if (restore_reserve)
 		h->resv_huge_pages++;
 
-	if (h->surplus_huge_pages_node[nid]) {
+	if (PageHugeTemporary(page)) {
+		list_del(&page->lru);
+		ClearPageHugeTemporary(page);
+		update_and_free_page(h, page);
+	} else if (h->surplus_huge_pages_node[nid]) {
 		/* remove the page from active list */
 		list_del(&page->lru);
 		update_and_free_page(h, page);
@@ -1306,7 +1302,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	h->nr_huge_pages++;
 	h->nr_huge_pages_node[nid]++;
 	spin_unlock(&hugetlb_lock);
-	put_page(page); /* free it into the hugepage allocator */
 }
 
 static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1383,41 +1378,70 @@ pgoff_t __basepage_index(struct page *page)
 	return (index << compound_order(page_head)) + compound_idx;
 }
 
-static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
+static struct page *alloc_buddy_huge_page(struct hstate *h,
+		gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
+	int order = huge_page_order(h);
 	struct page *page;
 
-	page = __alloc_pages_node(nid,
-		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-						__GFP_RETRY_MAYFAIL|__GFP_NOWARN,
-		huge_page_order(h));
-	if (page) {
-		prep_new_huge_page(h, page, nid);
-	}
+	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+	if (nid == NUMA_NO_NODE)
+		nid = numa_mem_id();
+	page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+	if (page)
+		__count_vm_event(HTLB_BUDDY_PGALLOC);
+	else
+		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+
+	return page;
+}
+
+/*
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ */
+static struct page *alloc_fresh_huge_page(struct hstate *h,
+		gfp_t gfp_mask, int nid, nodemask_t *nmask)
+{
+	struct page *page;
+
+	if (hstate_is_gigantic(h))
+		page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
+	else
+		page = alloc_buddy_huge_page(h, gfp_mask,
+				nid, nmask);
+	if (!page)
+		return NULL;
+
+	if (hstate_is_gigantic(h))
+		prep_compound_gigantic_page(page, huge_page_order(h));
+	prep_new_huge_page(h, page, page_to_nid(page));
 
 	return page;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+/*
+ * Allocates a fresh page to the hugetlb allocator pool in a node-interleaved
+ * manner.
+ */
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
 	struct page *page;
 	int nr_nodes, node;
-	int ret = 0;
+	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
 	for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
-		page = alloc_fresh_huge_page_node(h, node);
-		if (page) {
-			ret = 1;
+		page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+		if (page)
 			break;
-		}
 	}
 
-	if (ret)
-		count_vm_event(HTLB_BUDDY_PGALLOC);
-	else
-		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+	if (!page)
+		return 0;
 
-	return ret;
+	put_page(page); /* free it into the hugepage allocator */
+
+	return 1;
 }
 
 /*
@@ -1525,79 +1549,66 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 	return rc;
 }
 
-static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask)
-{
-	int order = huge_page_order(h);
-
-	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
-	if (nid == NUMA_NO_NODE)
-		nid = numa_mem_id();
-	return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
-}
-
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 		int nid, nodemask_t *nmask)
 {
-	struct page *page;
-	unsigned int r_nid;
+	struct page *page = NULL;
 
 	if (hstate_is_gigantic(h))
 		return NULL;
 
+	spin_lock(&hugetlb_lock);
+	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
+		goto out_unlock;
+	spin_unlock(&hugetlb_lock);
+
+	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+	if (!page)
+		return NULL;
+
+	spin_lock(&hugetlb_lock);
 	/*
-	 * Assume we will successfully allocate the surplus page to
-	 * prevent racing processes from causing the surplus to exceed
-	 * overcommit
-	 *
-	 * This however introduces a different race, where a process B
-	 * tries to grow the static hugepage pool while alloc_pages() is
-	 * called by process A. B will only examine the per-node
-	 * counters in determining if surplus huge pages can be
-	 * converted to normal huge pages in adjust_pool_surplus(). A
-	 * won't be able to increment the per-node counter, until the
-	 * lock is dropped by B, but B doesn't drop hugetlb_lock until
-	 * no more huge pages can be converted from surplus to normal
-	 * state (and doesn't try to convert again). Thus, we have a
-	 * case where a surplus huge page exists, the pool is grown, and
-	 * the surplus huge page still exists after, even though it
-	 * should just have been converted to a normal huge page. This
-	 * does not leak memory, though, as the hugepage will be freed
-	 * once it is out of use. It also does not allow the counters to
-	 * go out of whack in adjust_pool_surplus() as we don't modify
-	 * the node values until we've gotten the hugepage and only the
-	 * per-node value is checked there.
+	 * We could have raced with the pool size change.
+	 * Double check that and simply deallocate the new page
+	 * if we would end up overcommitting the surpluses. Abuse the
+	 * temporary page flag to work around the nasty free_huge_page
+	 * code flow.
 	 */
-	spin_lock(&hugetlb_lock);
 	if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+		SetPageHugeTemporary(page);
+		put_page(page);
+		page = NULL;
 	} else {
-		h->nr_huge_pages++;
 		h->surplus_huge_pages++;
+		h->nr_huge_pages_node[page_to_nid(page)]++;
 	}
+
+out_unlock:
 	spin_unlock(&hugetlb_lock);
 
-	page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+	return page;
+}
 
-	spin_lock(&hugetlb_lock);
-	if (page) {
-		INIT_LIST_HEAD(&page->lru);
-		r_nid = page_to_nid(page);
-		set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
-		set_hugetlb_cgroup(page, NULL);
-		/*
-		 * We incremented the global counters already
-		 */
-		h->nr_huge_pages_node[r_nid]++;
-		h->surplus_huge_pages_node[r_nid]++;
-		__count_vm_event(HTLB_BUDDY_PGALLOC);
-	} else {
-		h->nr_huge_pages--;
-		h->surplus_huge_pages--;
-		__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
-	}
-	spin_unlock(&hugetlb_lock);
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+		int nid, nodemask_t *nmask)
+{
+	struct page *page;
+
+	if (hstate_is_gigantic(h))
+		return NULL;
+
+	page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+	if (!page)
+		return NULL;
+
+	/*
+	 * We do not account these pages as surplus because they are only
+	 * temporary and will be released properly on the last reference
+	 */
+	SetPageHugeTemporary(page);
 
 	return page;
 }
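
Two allocators fall out of the rework above: alloc_surplus_huge_page() respects nr_overcommit_huge_pages, re-checking it after the allocation and handing a losing page straight back to the buddy allocator, while alloc_migrate_huge_page() bypasses surplus accounting entirely because its pages are temporary by construction. The back-out path relies on the PageHugeTemporary() flag defined earlier in this file; shown in isolation it is just:

    /* Lost the overcommit re-check: make the page bypass the hugetlb pool. */
    SetPageHugeTemporary(page);     /* free_huge_page() will not re-pool it          */
    put_page(page);                 /* -> free_huge_page() -> update_and_free_page() */
    page = NULL;
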
@@ -1606,7 +1617,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
 static
-struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
@@ -1616,17 +1627,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
 	nodemask_t *nodemask;
 
 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-	page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+	page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
 	mpol_cond_put(mpol);
 
 	return page;
 }
 
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1641,12 +1648,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
 	spin_unlock(&hugetlb_lock);
 
 	if (!page)
-		page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+		page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
 
 	return page;
 }
 
-
+/* page migration callback function */
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 		nodemask_t *nmask)
 {
@@ -1664,9 +1671,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	/* No reservations, try to overcommit */
+	return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
+/* mempolicy aware migration callback */
+struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+		unsigned long address)
+{
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	struct page *page;
+	gfp_t gfp_mask;
+	int node;
+
+	gfp_mask = htlb_alloc_mask(h);
+	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+	page = alloc_huge_page_nodemask(h, node, nodemask);
+	mpol_cond_put(mpol);
 
-	return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+	return page;
 }
 
 /*
@@ -1694,7 +1717,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+		page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
 				NUMA_NO_NODE, NULL);
 		if (!page) {
 			alloc_ok = false;
@@ -2031,7 +2054,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
-		page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
+		page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
 		if (!page)
 			goto out_uncharge_cgroup;
 		if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
@@ -2074,20 +2097,6 @@ out_subpool_put:
 	return ERR_PTR(-ENOSPC);
 }
 
-/*
- * alloc_huge_page()'s wrapper which simply returns the page if allocation
- * succeeds, otherwise NULL. This function is called from new_vma_page(),
- * where no ERR_VALUE is expected to be returned.
- */
-struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
-				unsigned long addr, int avoid_reserve)
-{
-	struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
-	if (IS_ERR(page))
-		page = NULL;
-	return page;
-}
-
 int alloc_bootmem_huge_page(struct hstate *h)
 	__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
 int __alloc_bootmem_huge_page(struct hstate *h)
@@ -2150,6 +2159,8 @@ static void __init gather_bootmem_prealloc(void)
 		prep_compound_huge_page(page, h->order);
 		WARN_ON(PageReserved(page));
 		prep_new_huge_page(h, page, page_to_nid(page));
+		put_page(page); /* free it into the hugepage allocator */
+
 		/*
 		 * If we had gigantic hugepages allocated at boot time, we need
 		 * to restore the 'stolen' pages to totalram_pages in order to
@@ -2169,7 +2180,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		if (hstate_is_gigantic(h)) {
 			if (!alloc_bootmem_huge_page(h))
 				break;
-		} else if (!alloc_fresh_huge_page(h,
+		} else if (!alloc_pool_huge_page(h,
 					 &node_states[N_MEMORY]))
 			break;
 		cond_resched();
@@ -2289,7 +2300,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	 * First take pages out of surplus state.  Then make up the
 	 * remaining difference by allocating fresh huge pages.
 	 *
-	 * We might race with __alloc_buddy_huge_page() here and be unable
+	 * We might race with alloc_surplus_huge_page() here and be unable
 	 * to convert a surplus huge page to a normal huge page. That is
 	 * not critical, though, it just means the overall size of the
 	 * pool might be one hugepage larger than it needs to be, but
@@ -2312,10 +2323,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 		/* yield cpu to avoid soft lockup */
 		cond_resched();
 
-		if (hstate_is_gigantic(h))
-			ret = alloc_fresh_gigantic_page(h, nodes_allowed);
-		else
-			ret = alloc_fresh_huge_page(h, nodes_allowed);
+		ret = alloc_pool_huge_page(h, nodes_allowed);
 		spin_lock(&hugetlb_lock);
 		if (!ret)
 			goto out;
@@ -2335,7 +2343,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	 * By placing pages into the surplus state independent of the
 	 * overcommit value, we are allowing the surplus pool size to
 	 * exceed overcommit. There are few sane options here. Since
-	 * __alloc_buddy_huge_page() is checking the global counter,
+	 * alloc_surplus_huge_page() is checking the global counter,
 	 * though, we'll note that we're not allowed to exceed surplus
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
@@ -2975,20 +2983,32 @@ out:
 
 void hugetlb_report_meminfo(struct seq_file *m)
 {
-	struct hstate *h = &default_hstate;
+	struct hstate *h;
+	unsigned long total = 0;
+
 	if (!hugepages_supported())
 		return;
-	seq_printf(m,
-			"HugePages_Total:   %5lu\n"
-			"HugePages_Free:    %5lu\n"
-			"HugePages_Rsvd:    %5lu\n"
-			"HugePages_Surp:    %5lu\n"
-			"Hugepagesize:   %8lu kB\n",
-			h->nr_huge_pages,
-			h->free_huge_pages,
-			h->resv_huge_pages,
-			h->surplus_huge_pages,
-			1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+
+	for_each_hstate(h) {
+		unsigned long count = h->nr_huge_pages;
+
+		total += (PAGE_SIZE << huge_page_order(h)) * count;
+
+		if (h == &default_hstate)
+			seq_printf(m,
+				   "HugePages_Total:   %5lu\n"
+				   "HugePages_Free:    %5lu\n"
+				   "HugePages_Rsvd:    %5lu\n"
+				   "HugePages_Surp:    %5lu\n"
+				   "Hugepagesize:   %8lu kB\n",
+				   count,
+				   h->free_huge_pages,
+				   h->resv_huge_pages,
+				   h->surplus_huge_pages,
+				   (PAGE_SIZE << huge_page_order(h)) / 1024);
+	}
+
+	seq_printf(m, "Hugetlb:        %8lu kB\n", total / 1024);
 }
 
 int hugetlb_report_node_meminfo(int nid, char *buf)
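
With the loop above, the per-hstate fields in /proc/meminfo still describe only the default huge page size, but a new "Hugetlb:" line reports memory consumed by every configured size. Purely illustrative numbers: with ten 2 MB pages and one 1 GB page set up, the total is 10 * 2048 kB + 1 * 1048576 kB, and the tail of /proc/meminfo would read roughly:

    HugePages_Total:      10
    HugePages_Free:       10
    HugePages_Rsvd:        0
    HugePages_Surp:        0
    Hugepagesize:       2048 kB
    Hugetlb:         1069056 kB
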
@@ -4799,3 +4819,36 @@ void putback_active_hugepage(struct page *page)
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
 }
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+	struct hstate *h = page_hstate(oldpage);
+
+	hugetlb_cgroup_migrate(oldpage, newpage);
+	set_page_owner_migrate_reason(newpage, reason);
+
+	/*
+	 * Transfer the temporary state from the new huge page. The
+	 * direction is the reverse of other transitions because the new
+	 * page is the one that survives while the old one will be freed,
+	 * so the old page takes over the temporary status.
+	 *
+	 * Also note that we have to transfer the per-node surplus state
+	 * here as well otherwise the global surplus count will not match
+	 * the per-node's.
+	 */
+	if (PageHugeTemporary(newpage)) {
+		int old_nid = page_to_nid(oldpage);
+		int new_nid = page_to_nid(newpage);
+
+		SetPageHugeTemporary(oldpage);
+		ClearPageHugeTemporary(newpage);
+
+		spin_lock(&hugetlb_lock);
+		if (h->surplus_huge_pages_node[old_nid]) {
+			h->surplus_huge_pages_node[old_nid]--;
+			h->surplus_huge_pages_node[new_nid]++;
+		}
+		spin_unlock(&hugetlb_lock);
+	}
+}

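Together with the mempolicy and migrate changes further down, hugetlb migration now allocates its target pages through alloc_huge_page_nodemask()/alloc_huge_page_vma() instead of the removed alloc_huge_page_noerr(), and move_hugetlb_state() carries the temporary flag and the per-node surplus accounting over to the surviving page. A minimal sketch of picking a migration target for an existing hugetlb page (the helper name is made up):

    static struct page *example_hugetlb_migration_target(struct page *old)
    {
            struct hstate *h = page_hstate(old);

            /* Prefer the node the old page sits on; no explicit nodemask. */
            return alloc_huge_page_nodemask(h, page_to_nid(old), NULL);
    }
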
+ 1 - 1
mm/interval_tree.c

@@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
 
 static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
 {
-	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+	return v->vm_pgoff + vma_pages(v) - 1;
 }
 
 INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,

+ 9 - 6
mm/khugepaged.c

@@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm,
 		}
 
 		if (page_mapped(page))
-			unmap_mapping_range(mapping, index << PAGE_SHIFT,
-					PAGE_SIZE, 0);
+			unmap_mapping_pages(mapping, index, 1, false);
 
 		spin_lock_irq(&mapping->tree_lock);
 
@@ -1674,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 	spin_unlock(&khugepaged_mm_lock);
 
 	mm = mm_slot->mm;
-	down_read(&mm->mmap_sem);
-	if (unlikely(khugepaged_test_exit(mm)))
-		vma = NULL;
-	else
+	/*
+	 * Don't wait for semaphore (to avoid long wait times).  Just move to
+	 * the next mm on the list.
+	 */
+	vma = NULL;
+	if (unlikely(!down_read_trylock(&mm->mmap_sem)))
+		goto breakouterloop_mmap_sem;
+	if (likely(!khugepaged_test_exit(mm)))
 		vma = find_vma(mm, khugepaged_scan.address);
 
 	progress++;

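Two independent khugepaged changes above: collapse_shmem() switches to the page-index based unmap_mapping_pages() introduced in mm/memory.c below, and the scan loop now skips an mm whose mmap_sem is contended instead of blocking on it. For a single page at 'index' the old and new unmap calls are equivalent:

    /* byte-based (old) */
    unmap_mapping_range(mapping, (loff_t)index << PAGE_SHIFT, PAGE_SIZE, 0);
    /* page-based (new) */
    unmap_mapping_pages(mapping, index, 1, false);
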
+ 0 - 1
mm/kmemleak.c

@@ -91,7 +91,6 @@
 #include <linux/stacktrace.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
-#include <linux/hardirq.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <linux/mmzone.h>

+ 95 - 176
mm/memcontrol.c

@@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
-/*
- * Return page count for single (non recursive) @memcg.
- *
- * Implementation Note: reading percpu statistics for memcg.
- *
- * Both of vmstat[] and percpu_counter has threshold and do periodic
- * synchronization to implement "quick" read. There are trade-off between
- * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronization of counter in memcg's counter.
- *
- * But this _read() function is used for user interface now. The user accounts
- * memory usage by memory cgroup and he _always_ requires exact value because
- * he accounts memory. Even if we provide quick-and-fuzzy read, we always
- * have to visit all online cpus and make sum. So, for now, unnecessary
- * synchronization is not implemented. (just implemented for cpu hotplug)
- *
- * If there are kernel internal actions which can make use of some not-exact
- * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threshold and synchronization as vmstat[] should be
- * implemented.
- *
- * The parameter idx can be of type enum memcg_event_item or vm_event_item.
- */
-
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 				      int event)
 {
-	unsigned long val = 0;
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->events[event], cpu);
-	return val;
+	return atomic_long_read(&memcg->events[event]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -586,27 +557,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
 	if (PageAnon(page))
-		__this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
 	else {
-		__this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
 		if (PageSwapBacked(page))
-			__this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
 	}
 
 	if (compound) {
 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		__this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
+		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
 	}
 
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
-		__this_cpu_inc(memcg->stat->events[PGPGIN]);
+		__count_memcg_events(memcg, PGPGIN, 1);
 	else {
-		__this_cpu_inc(memcg->stat->events[PGPGOUT]);
+		__count_memcg_events(memcg, PGPGOUT, 1);
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
+	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
 }
 
 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
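
The mechanical conversions in this file replace open-coded __this_cpu_add()/__this_cpu_sub() on memcg->stat with __mod_memcg_state() and __count_memcg_events(), helpers added to include/linux/memcontrol.h elsewhere in this series (not shown in this excerpt). Sketched from the surrounding code, and therefore approximate, the idea is that each CPU accumulates a small signed delta and folds it into a shared atomic_long_t only once it exceeds MEMCG_CHARGE_BATCH; memcg_hotplug_cpu_dead() further down flushes whatever delta remains when a CPU goes away:

    /* Approximate shape of the new helper; see memcontrol.h for the real one. */
    static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
    {
            long x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);

            if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
                    atomic_long_add(x, &memcg->stat[idx]);
                    x = 0;
            }
            __this_cpu_write(memcg->stat_cpu->count[idx], x);
    }
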
@@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat->nr_page_events);
-	next = __this_cpu_read(memcg->stat->targets[target]);
+	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
+	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->stat->targets[target], next);
+		__this_cpu_write(memcg->stat_cpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -1124,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
-unsigned int memcg1_stats[] = {
+static const unsigned int memcg1_stats[] = {
 	MEMCG_CACHE,
 	MEMCG_RSS,
 	MEMCG_RSS_HUGE,
@@ -1205,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	}
 }
 
-/*
- * This function returns the number of memcg under hierarchy tree. Returns
- * 1(self count) if no children.
- */
-static int mem_cgroup_count_children(struct mem_cgroup *memcg)
-{
-	int num = 0;
-	struct mem_cgroup *iter;
-
-	for_each_mem_cgroup_tree(iter, memcg)
-		num++;
-	return num;
-}
-
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
@@ -1707,11 +1664,6 @@ void unlock_page_memcg(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page_memcg);
 
-/*
- * size of first charge trial. "32" comes from vmscan.c's magic value.
- * TODO: maybe necessary to use big numbers in big irons.
- */
-#define CHARGE_BATCH	32U
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
@@ -1739,7 +1691,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	unsigned long flags;
 	bool ret = false;
 
-	if (nr_pages > CHARGE_BATCH)
+	if (nr_pages > MEMCG_CHARGE_BATCH)
 		return ret;
 
 	local_irq_save(flags);
@@ -1808,7 +1760,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	}
 	stock->nr_pages += nr_pages;
 
-	if (stock->nr_pages > CHARGE_BATCH)
+	if (stock->nr_pages > MEMCG_CHARGE_BATCH)
 		drain_stock(stock);
 
 	local_irq_restore(flags);
@@ -1858,9 +1810,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *memcg;
 
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
+
+	for_each_mem_cgroup(memcg) {
+		int i;
+
+		for (i = 0; i < MEMCG_NR_STAT; i++) {
+			int nid;
+			long x;
+
+			x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+			if (x)
+				atomic_long_add(x, &memcg->stat[i]);
+
+			if (i >= NR_VM_NODE_STAT_ITEMS)
+				continue;
+
+			for_each_node(nid) {
+				struct mem_cgroup_per_node *pn;
+
+				pn = mem_cgroup_nodeinfo(memcg, nid);
+				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
+				if (x)
+					atomic_long_add(x, &pn->lruvec_stat[i]);
+			}
+		}
+
+		for (i = 0; i < MEMCG_NR_EVENTS; i++) {
+			long x;
+
+			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+			if (x)
+				atomic_long_add(x, &memcg->events[i]);
+		}
+	}
+
 	return 0;
 }
 
@@ -1881,7 +1868,7 @@ static void high_work_func(struct work_struct *work)
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(work, struct mem_cgroup, high_work);
-	reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
 
 /*
@@ -1905,7 +1892,7 @@ void mem_cgroup_handle_over_high(void)
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 		      unsigned int nr_pages)
 {
-	unsigned int batch = max(CHARGE_BATCH, nr_pages);
+	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
@@ -2415,18 +2402,11 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 	for (i = 1; i < HPAGE_PMD_NR; i++)
 		head[i].mem_cgroup = head->mem_cgroup;
 
-	__this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
-		       HPAGE_PMD_NR);
+	__mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #ifdef CONFIG_MEMCG_SWAP
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
-				       int nr_entries)
-{
-	this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
-}
-
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  * @entry: swap entry to be moved
@@ -2450,8 +2430,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 	new_id = mem_cgroup_id(to);
 
 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
-		mem_cgroup_swap_statistics(from, -1);
-		mem_cgroup_swap_statistics(to, 1);
+		mod_memcg_state(from, MEMCG_SWAP, -1);
+		mod_memcg_state(to, MEMCG_SWAP, 1);
 		return 0;
 	}
 	return -EINVAL;
@@ -2467,23 +2447,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 static DEFINE_MUTEX(memcg_limit_mutex);
 
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
-				   unsigned long limit)
+				   unsigned long limit, bool memsw)
 {
-	unsigned long curusage;
-	unsigned long oldusage;
 	bool enlarge = false;
-	int retry_count;
 	int ret;
-
-	/*
-	 * For keeping hierarchical_reclaim simple, how long we should retry
-	 * is depends on callers. We set our retry-count to be function
-	 * of # of children which we should visit in this loop.
-	 */
-	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
-		      mem_cgroup_count_children(memcg);
-
-	oldusage = page_counter_read(&memcg->memory);
+	bool limits_invariant;
+	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
 
 	do {
 		if (signal_pending(current)) {
@@ -2492,79 +2461,31 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		}
 
 		mutex_lock(&memcg_limit_mutex);
-		if (limit > memcg->memsw.limit) {
+		/*
+		 * Make sure that the new limit (memsw or memory limit) doesn't
+		 * break our basic invariant rule memory.limit <= memsw.limit.
+		 */
+		limits_invariant = memsw ? limit >= memcg->memory.limit :
+					   limit <= memcg->memsw.limit;
+		if (!limits_invariant) {
 			mutex_unlock(&memcg_limit_mutex);
 			ret = -EINVAL;
 			break;
 		}
-		if (limit > memcg->memory.limit)
+		if (limit > counter->limit)
 			enlarge = true;
-		ret = page_counter_limit(&memcg->memory, limit);
+		ret = page_counter_limit(counter, limit);
 		mutex_unlock(&memcg_limit_mutex);
 
 		if (!ret)
 			break;
 
-		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
-
-		curusage = page_counter_read(&memcg->memory);
-		/* Usage is reduced ? */
-		if (curusage >= oldusage)
-			retry_count--;
-		else
-			oldusage = curusage;
-	} while (retry_count);
-
-	if (!ret && enlarge)
-		memcg_oom_recover(memcg);
-
-	return ret;
-}
-
-static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
-					 unsigned long limit)
-{
-	unsigned long curusage;
-	unsigned long oldusage;
-	bool enlarge = false;
-	int retry_count;
-	int ret;
-
-	/* see mem_cgroup_resize_res_limit */
-	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
-		      mem_cgroup_count_children(memcg);
-
-	oldusage = page_counter_read(&memcg->memsw);
-
-	do {
-		if (signal_pending(current)) {
-			ret = -EINTR;
+		if (!try_to_free_mem_cgroup_pages(memcg, 1,
+					GFP_KERNEL, !memsw)) {
+			ret = -EBUSY;
 			break;
 		}
-
-		mutex_lock(&memcg_limit_mutex);
-		if (limit < memcg->memory.limit) {
-			mutex_unlock(&memcg_limit_mutex);
-			ret = -EINVAL;
-			break;
-		}
-		if (limit > memcg->memsw.limit)
-			enlarge = true;
-		ret = page_counter_limit(&memcg->memsw, limit);
-		mutex_unlock(&memcg_limit_mutex);
-
-		if (!ret)
-			break;
-
-		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
-
-		curusage = page_counter_read(&memcg->memsw);
-		/* Usage is reduced ? */
-		if (curusage >= oldusage)
-			retry_count--;
-		else
-			oldusage = curusage;
-	} while (retry_count);
+	} while (true);
 
 	if (!ret && enlarge)
 		memcg_oom_recover(memcg);
@@ -3020,10 +2941,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
 		}
 		switch (MEMFILE_TYPE(of_cft(of)->private)) {
 		case _MEM:
-			ret = mem_cgroup_resize_limit(memcg, nr_pages);
+			ret = mem_cgroup_resize_limit(memcg, nr_pages, false);
 			break;
 		case _MEMSWAP:
-			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+			ret = mem_cgroup_resize_limit(memcg, nr_pages, true);
 			break;
 		case _KMEM:
 			ret = memcg_update_kmem_limit(memcg, nr_pages);
@@ -4168,8 +4089,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
-	pn->lruvec_stat = alloc_percpu(struct lruvec_stat);
-	if (!pn->lruvec_stat) {
+	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+	if (!pn->lruvec_stat_cpu) {
 		kfree(pn);
 		return 1;
 	}
@@ -4187,7 +4108,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
 
-	free_percpu(pn->lruvec_stat);
+	free_percpu(pn->lruvec_stat_cpu);
 	kfree(pn);
 }
 
@@ -4197,7 +4118,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
-	free_percpu(memcg->stat);
+	free_percpu(memcg->stat_cpu);
 	kfree(memcg);
 }
 
@@ -4226,8 +4147,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;
 
-	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!memcg->stat)
+	memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
+	if (!memcg->stat_cpu)
 		goto fail;
 
 	for_each_node(node)
@@ -4584,8 +4505,8 @@ static int mem_cgroup_move_account(struct page *page,
 	spin_lock_irqsave(&from->move_lock, flags);
 
 	if (!anon && page_mapped(page)) {
-		__this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
-		__this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
+		__mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
+		__mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
 	}
 
 	/*
@@ -4597,16 +4518,14 @@ static int mem_cgroup_move_account(struct page *page,
 		struct address_space *mapping = page_mapping(page);
 
 		if (mapping_cap_account_dirty(mapping)) {
-			__this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
-				       nr_pages);
-			__this_cpu_add(to->stat->count[NR_FILE_DIRTY],
-				       nr_pages);
+			__mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
+			__mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
 		}
 	}
 
 	if (PageWriteback(page)) {
-		__this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
-		__this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
+		__mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
+		__mod_memcg_state(to, NR_WRITEBACK, nr_pages);
 	}
 
 	/*
@@ -5642,12 +5561,12 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 	}
 
 	local_irq_save(flags);
-	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
-	__this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
-	__this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
-	__this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
-	__this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
-	__this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+	__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
+	__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
+	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
+	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
+	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
 	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 
@@ -5874,7 +5793,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 	if (in_softirq())
 		gfp_mask = GFP_NOWAIT;
 
-	this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
 
 	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
 		return true;
@@ -5895,7 +5814,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
 		return;
 	}
 
-	this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
 
 	refill_stock(memcg, nr_pages);
 }
@@ -6019,7 +5938,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
 				   nr_entries);
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(swap_memcg, nr_entries);
+	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
 
 	page->mem_cgroup = NULL;
 
@@ -6085,7 +6004,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 		mem_cgroup_id_get_many(memcg, nr_pages - 1);
 	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
 	VM_BUG_ON_PAGE(oldid, page);
-	mem_cgroup_swap_statistics(memcg, nr_pages);
+	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
 
 	return 0;
 }
@@ -6113,7 +6032,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 			else
 				page_counter_uncharge(&memcg->memsw, nr_pages);
 		}
-		mem_cgroup_swap_statistics(memcg, -nr_pages);
+		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
 		mem_cgroup_id_put_many(memcg, nr_pages);
 	}
 	rcu_read_unlock();

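The other substantive change in this file merges the memory and memsw resize paths into one mem_cgroup_resize_limit(memcg, limit, memsw) and drops the children-based retry counter: shrinking now loops until the usage fits under the new limit or a reclaim pass frees nothing, in which case the write fails with -EBUSY instead of silently giving up. A hedged user-space illustration (the cgroup-v1 mount point and group name are made up):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *limit = "104857600\n";      /* 100 MB */
            int fd = open("/sys/fs/cgroup/memory/demo/memory.limit_in_bytes",
                          O_WRONLY);

            if (fd < 0)
                    return 1;
            if (write(fd, limit, strlen(limit)) < 0 && errno == EBUSY)
                    fprintf(stderr, "shrink failed: reclaim made no progress\n");
            close(fd);
            return 0;
    }
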
+ 53 - 27
mm/memory.c

@@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 
-/* tlb_gather_mmu
- *	Called to initialize an (on-stack) mmu_gather structure for page-table
- *	tear-down from @mm. The @fullmm argument is used when @mm is without
- *	users and we're going to destroy the full address space (exit/execve).
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
  */
 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 			unsigned long start, unsigned long end)
@@ -2791,9 +2798,38 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
 	}
 }
 
+/**
+ * unmap_mapping_pages() - Unmap pages from processes.
+ * @mapping: The address space containing pages to be unmapped.
+ * @start: Index of first page to be unmapped.
+ * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
+ * @even_cows: Whether to unmap even private COWed pages.
+ *
+ * Unmap the pages in this address space from any userspace process which
+ * has them mmaped.  Generally, you want to remove COWed pages as well when
+ * a file is being truncated, but not when invalidating pages from the page
+ * cache.
+ */
+void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
+		pgoff_t nr, bool even_cows)
+{
+	struct zap_details details = { };
+
+	details.check_mapping = even_cows ? NULL : mapping;
+	details.first_index = start;
+	details.last_index = start + nr - 1;
+	if (details.last_index < details.first_index)
+		details.last_index = ULONG_MAX;
+
+	i_mmap_lock_write(mapping);
+	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	i_mmap_unlock_write(mapping);
+}
+
 /**
  * unmap_mapping_range - unmap the portion of all mmaps in the specified
- * address_space corresponding to the specified page range in the underlying
+ * address_space corresponding to the specified byte range in the underlying
  * file.
  *
  * @mapping: the address space containing mmaps to be unmapped.
@@ -2811,7 +2847,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows)
 {
-	struct zap_details details = { };
 	pgoff_t hba = holebegin >> PAGE_SHIFT;
 	pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
@@ -2823,16 +2858,7 @@ void unmap_mapping_range(struct address_space *mapping,
 			hlen = ULONG_MAX - hba + 1;
 	}
 
-	details.check_mapping = even_cows ? NULL : mapping;
-	details.first_index = hba;
-	details.last_index = hba + hlen - 1;
-	if (details.last_index < details.first_index)
-		details.last_index = ULONG_MAX;
-
-	i_mmap_lock_write(mapping);
-	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
-		unmap_mapping_range_tree(&mapping->i_mmap, &details);
-	i_mmap_unlock_write(mapping);
+	unmap_mapping_pages(mapping, hba, hlen, even_cows);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
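unmap_mapping_pages() is the page-index flavour of the operation; unmap_mapping_range() above is now just a byte-to-page wrapper around it. Hedged usage examples for the common cases (the call sites are illustrative, not taken from this patch):

    /* Truncation: also zap private COW copies of the affected pages. */
    unmap_mapping_pages(mapping, start, nr, true);

    /* Page cache invalidation: leave private COW copies alone. */
    unmap_mapping_pages(mapping, start, nr, false);

    /* nr == 0 means "from 'start' to the end of the file". */
    unmap_mapping_pages(mapping, start, 0, true);
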
@@ -3485,9 +3511,8 @@ static int fault_around_bytes_get(void *data, u64 *val)
 }
 
 /*
- * fault_around_pages() and fault_around_mask() expects fault_around_bytes
- * rounded down to nearest page order. It's what do_fault_around() expects to
- * see.
+ * fault_around_bytes must be rounded down to the nearest page order as it's
+ * what do_fault_around() expects to see.
  */
 static int fault_around_bytes_set(void *data, u64 val)
 {
@@ -3530,13 +3555,14 @@ late_initcall(fault_around_debugfs);
  * This function doesn't cross the VMA boundaries, in order to call map_pages()
  * only once.
  *
- * fault_around_pages() defines how many pages we'll try to map.
- * do_fault_around() expects it to return a power of two less than or equal to
- * PTRS_PER_PTE.
+ * fault_around_bytes defines how many bytes we'll try to map.
+ * do_fault_around() expects it to be set to a power of two less than or equal
+ * to PTRS_PER_PTE.
  *
- * The virtual address of the area that we map is naturally aligned to the
- * fault_around_pages() value (and therefore to page order).  This way it's
- * easier to guarantee that we don't cross page table boundaries.
+ * The virtual address of the area that we map is naturally aligned to
+ * fault_around_bytes rounded down to the machine page size
+ * (and therefore to page order).  This way it's easier to guarantee
+ * that we don't cross page table boundaries.
  */
 static int do_fault_around(struct vm_fault *vmf)
 {
@@ -3553,8 +3579,8 @@ static int do_fault_around(struct vm_fault *vmf)
 	start_pgoff -= off;
 
 	/*
-	 *  end_pgoff is either end of page table or end of vma
-	 *  or fault_around_pages() from start_pgoff, depending what is nearest.
+	 *  end_pgoff is either the end of the page table, the end of
+	 *  the vma or nr_pages from start_pgoff, depending what is nearest.
 	 */
 	end_pgoff = start_pgoff -
 		((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +

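To make the rewritten fault-around comments concrete: with 4 KB pages and the pre-existing default fault_around_bytes of 65536 (not changed by this patch), do_fault_around() pre-populates up to 16 page-table entries, and the window start is aligned down so it never straddles a page-table page. Illustrative arithmetic only, not code from this patch:

    nr_pages   = fault_around_bytes >> PAGE_SHIFT;              /* 65536 / 4096 == 16 */
    start_addr = vmf->address & ~(fault_around_bytes - 1);      /* 64 KB aligned      */
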
+ 3 - 6
mm/memory_hotplug.c

@@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 	for (i = 0; i < mapsize; i++, page++)
 		get_page_bootmem(section_nr, page, SECTION_INFO);
 
-	usemap = __nr_to_section(section_nr)->pageblock_flags;
+	usemap = ms->pageblock_flags;
 	page = virt_to_page(usemap);
 
 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 	struct mem_section *ms;
 	struct page *page, *memmap;
 
-	if (!pfn_valid(start_pfn))
-		return;
-
 	section_nr = pfn_to_section_nr(start_pfn);
 	ms = __nr_to_section(section_nr);
 
@@ -210,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 
 	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
 
-	usemap = __nr_to_section(section_nr)->pageblock_flags;
+	usemap = ms->pageblock_flags;
 	page = virt_to_page(usemap);
 
 	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
@@ -1637,7 +1634,7 @@ repeat:
 		goto failed_removal;
 
 	cond_resched();
-	lru_add_drain_all_cpuslocked();
+	lru_add_drain_all();
 	drain_all_pages(zone);
 
 	pfn = scan_movable_pages(start_pfn, end_pfn);

+ 29 - 10
mm/mempolicy.c

@@ -1121,8 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x)
 	}
 
 	if (PageHuge(page)) {
-		BUG_ON(!vma);
-		return alloc_huge_page_noerr(vma, address, 1);
+		return alloc_huge_page_vma(page_hstate(compound_head(page)),
+				vma, address);
 	} else if (thp_migration_supported() && PageTransHuge(page)) {
 		struct page *thp;
 
@@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
+	unsigned long t;
 	unsigned long nlongs;
 	unsigned long endmask;
 
@@ -1279,13 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	else
 		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 
-	/* When the user specified more nodes than supported just check
-	   if the non supported part is all zero. */
+	/*
+	 * When the user specified more nodes than supported just check
+	 * if the non supported part is all zero.
+	 *
+	 * If maxnode covers more longs than MAX_NUMNODES, check
+	 * the bits in that area first, and then go on to check the
+	 * remaining bits at or above MAX_NUMNODES.
+	 * Otherwise, just check bits [MAX_NUMNODES, maxnode).
+	 */
 	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-		if (nlongs > PAGE_SIZE/sizeof(long))
-			return -EINVAL;
 		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-			unsigned long t;
 			if (get_user(t, nmask + k))
 				return -EFAULT;
 			if (k == nlongs - 1) {
@@ -1298,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		endmask = ~0UL;
 	}
 
+	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
+		unsigned long valid_mask = endmask;
+
+		valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
+		if (get_user(t, nmask + nlongs - 1))
+			return -EFAULT;
+		if (t & valid_mask)
+			return -EINVAL;
+	}
+
 	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 		return -EFAULT;
 	nodes_addr(*nodes)[nlongs-1] &= endmask;
@@ -1418,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
 		goto out_put;
 	}
 
-	if (!nodes_subset(*new, node_states[N_MEMORY])) {
-		err = -EINVAL;
+	task_nodes = cpuset_mems_allowed(current);
+	nodes_and(*new, *new, task_nodes);
+	if (nodes_empty(*new))
+		goto out_put;
+
+	nodes_and(*new, *new, node_states[N_MEMORY]);
+	if (nodes_empty(*new))
 		goto out_put;
-	}
 
 	err = security_task_movememory(task);
 	if (err)
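
As an aside on the new get_nodes() check above: bits of the user nodemask at positions >= MAX_NUMNODES (within the last long that still overlaps MAX_NUMNODES) must now be clear, otherwise -EINVAL is returned. A stand-alone sketch of that mask arithmetic, assuming a 64-bit build; the MAX_NUMNODES and maxnode values are made up purely for the demo:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* Illustrative only: pretend the kernel was built with MAX_NUMNODES == 36. */
#define MAX_NUMNODES 36UL

int main(void)
{
	unsigned long maxnode = 64;	/* what user space passed */
	unsigned long endmask, valid_mask;

	if (maxnode % BITS_PER_LONG == 0)
		endmask = ~0UL;
	else
		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;

	/*
	 * This mirrors the guarded case in get_nodes():
	 * maxnode > MAX_NUMNODES and MAX_NUMNODES is not a multiple of
	 * BITS_PER_LONG, so the tail bits of the last long must be zero.
	 */
	valid_mask = endmask & ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);

	printf("endmask    = %#lx\n", endmask);
	printf("valid_mask = %#lx (these nmask bits must be clear)\n", valid_mask);
	return 0;
}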

+ 1 - 2
mm/migrate.c

@@ -1323,9 +1323,8 @@ put_anon:
 		put_anon_vma(anon_vma);
 
 	if (rc == MIGRATEPAGE_SUCCESS) {
-		hugetlb_cgroup_migrate(hpage, new_hpage);
+		move_hugetlb_state(hpage, new_hpage, reason);
 		put_new_page = NULL;
-		set_page_owner_migrate_reason(new_hpage, reason);
 	}
 
 	unlock_page(hpage);

+ 31 - 0
mm/mmu_notifier.c

@@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 }
 EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
 
+/*
+ * Must be called while holding mm->mmap_sem for either read or write.
+ * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ */
+bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+	int id;
+	bool ret = false;
+
+	WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
+
+	if (!mm_has_notifiers(mm))
+		return ret;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (!mn->ops->invalidate_range &&
+		    !mn->ops->invalidate_range_start &&
+		    !mn->ops->invalidate_range_end)
+				continue;
+
+		if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
+			ret = true;
+			break;
+		}
+	}
+	srcu_read_unlock(&srcu, id);
+	return ret;
+}
+
 static int do_mmu_notifier_register(struct mmu_notifier *mn,
 				    struct mm_struct *mm,
 				    int take_mmap_sem)
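
mm_has_blockable_invalidate_notifiers() treats a registration as blockable unless its ops say otherwise. A minimal sketch of how a driver with non-sleeping invalidate callbacks might opt out via the MMU_INVALIDATE_DOES_NOT_BLOCK flag introduced by this series; the mydrv_* names are hypothetical:

#include <linux/mmu_notifier.h>

/* Hypothetical driver whose device-TLB invalidation never sleeps. */
static void mydrv_invalidate_range(struct mmu_notifier *mn,
				   struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	/* flush device TLB entries for [start, end) without blocking */
}

static const struct mmu_notifier_ops mydrv_mmu_ops = {
	/* lets mm_has_blockable_invalidate_notifiers() skip this notifier */
	.flags			= MMU_INVALIDATE_DOES_NOT_BLOCK,
	.invalidate_range	= mydrv_invalidate_range,
};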

+ 5 - 0
mm/mprotect.c

@@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				if (!page || PageKsm(page))
 					continue;
 
+				/* Also skip shared copy-on-write pages */
+				if (is_cow_mapping(vma->vm_flags) &&
+				    page_mapcount(page) != 1)
+					continue;
+
 				/* Avoid TLB flush if possible */
 				if (pte_protnone(oldpte))
 					continue;
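
For context, is_cow_mapping() flags private mappings that could be written (and therefore COWed); combined with page_mapcount(page) != 1 this skips pages still shared across processes after fork(). The helper looked roughly like this at the time (see mm/internal.h):

/* Rough sketch of the helper this hunk relies on. */
static inline bool is_cow_mapping(vm_flags_t flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}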

+ 0 - 7
mm/nommu.c

@@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
 	return -ENOMEM;
 }
 
-void unmap_mapping_range(struct address_space *mapping,
-			 loff_t const holebegin, loff_t const holelen,
-			 int even_cows)
-{
-}
-EXPORT_SYMBOL(unmap_mapping_range);
-
 int filemap_fault(struct vm_fault *vmf)
 {
 	BUG();

+ 11 - 10
mm/oom_kill.c

@@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 	}
 
 	/*
-	 * If the mm has notifiers then we would need to invalidate them around
-	 * unmap_page_range and that is risky because notifiers can sleep and
-	 * what they do is basically undeterministic.  So let's have a short
+	 * If the mm has invalidate_{start,end}() notifiers that could block,
 	 * sleep to give the oom victim some more time.
 	 * TODO: we really want to get rid of this ugly hack and make sure that
-	 * notifiers cannot block for unbounded amount of time and add
-	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 * notifiers cannot block for an unbounded amount of time
 	 */
-	if (mm_has_notifiers(mm)) {
+	if (mm_has_blockable_invalidate_notifiers(mm)) {
 		up_read(&mm->mmap_sem);
 		schedule_timeout_idle(HZ);
 		goto unlock_oom;
@@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		 * count elevated without a good reason.
 		 */
 		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
-			tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
-			unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
-					 NULL);
-			tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+			const unsigned long start = vma->vm_start;
+			const unsigned long end = vma->vm_end;
+
+			tlb_gather_mmu(&tlb, mm, start, end);
+			mmu_notifier_invalidate_range_start(mm, start, end);
+			unmap_page_range(&tlb, vma, start, end, NULL);
+			mmu_notifier_invalidate_range_end(mm, start, end);
+			tlb_finish_mmu(&tlb, start, end);
 		}
 	}
 	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",

+ 93 - 81
mm/page_alloc.c

@@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 
 /*
- * Determine how many pages need to be initialized durig early boot
+ * Determine how many pages need to be initialized during early boot
  * (non-deferred initialization).
  * The value of first_deferred_pfn will be set later, once non-deferred pages
  * are initialized, but for now set it ULONG_MAX.
@@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 				unsigned long pfn, unsigned long zone_end,
 				unsigned long *nr_initialised)
 {
-	/* Always populate low zones for address-contrained allocations */
+	/* Always populate low zones for address-constrained allocations */
 	if (zone_end < pgdat_end_pfn(pgdat))
 		return true;
 	(*nr_initialised)++;
@@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone,
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
-				unsigned long zone, int nid)
+				unsigned long zone, int nid, bool zero)
 {
-	mm_zero_struct_page(page);
+	if (zero)
+		mm_zero_struct_page(page);
 	set_page_links(page, zone, nid, pfn);
 	init_page_count(page);
 	page_mapcount_reset(page);
@@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 }
 
 static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
-					int nid)
+					int nid, bool zero)
 {
-	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+	return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
 		if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
 			break;
 	}
-	__init_single_pfn(pfn, zid, nid);
+	__init_single_pfn(pfn, zid, nid, true);
 }
 #else
 static inline void init_reserved_page(unsigned long pfn)
@@ -1457,92 +1458,87 @@ static inline void __init pgdat_init_report_one_done(void)
 }
 
 /*
- * Helper for deferred_init_range, free the given range, reset the counters, and
- * return number of pages freed.
+ * Returns true if page needs to be initialized or freed to buddy allocator.
+ *
+ * First we check if pfn is valid on architectures where it is possible to have
+ * holes within pageblock_nr_pages. On systems where it is not possible, this
+ * function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the validity
+ * of the head pfn.
+ *
+ * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn is between start and end of a node, but does not belong
+ * to this memory node.
  */
-static inline unsigned long __init __def_free(unsigned long *nr_free,
-					      unsigned long *free_base_pfn,
-					      struct page **page)
+static inline bool __init
+deferred_pfn_valid(int nid, unsigned long pfn,
+		   struct mminit_pfnnid_cache *nid_init_state)
 {
-	unsigned long nr = *nr_free;
+	if (!pfn_valid_within(pfn))
+		return false;
+	if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+		return false;
+	if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
+		return false;
+	return true;
+}
 
-	deferred_free_range(*free_base_pfn, nr);
-	*free_base_pfn = 0;
-	*nr_free = 0;
-	*page = NULL;
+/*
+ * Free pages to buddy allocator. Try to free aligned pages in
+ * pageblock_nr_pages sizes.
+ */
+static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+				       unsigned long end_pfn)
+{
+	struct mminit_pfnnid_cache nid_init_state = { };
+	unsigned long nr_pgmask = pageblock_nr_pages - 1;
+	unsigned long nr_free = 0;
 
-	return nr;
+	for (; pfn < end_pfn; pfn++) {
+		if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+			deferred_free_range(pfn - nr_free, nr_free);
+			nr_free = 0;
+		} else if (!(pfn & nr_pgmask)) {
+			deferred_free_range(pfn - nr_free, nr_free);
+			nr_free = 1;
+			cond_resched();
+		} else {
+			nr_free++;
+		}
+	}
+	/* Free the last block of pages to allocator */
+	deferred_free_range(pfn - nr_free, nr_free);
 }
 
-static unsigned long __init deferred_init_range(int nid, int zid,
-						unsigned long start_pfn,
-						unsigned long end_pfn)
+/*
+ * Initialize struct pages.  We minimize pfn page lookups and scheduler checks
+ * by performing it only once every pageblock_nr_pages.
+ * Return number of pages initialized.
+ */
+static unsigned long  __init deferred_init_pages(int nid, int zid,
+						 unsigned long pfn,
+						 unsigned long end_pfn)
 {
 	struct mminit_pfnnid_cache nid_init_state = { };
 	unsigned long nr_pgmask = pageblock_nr_pages - 1;
-	unsigned long free_base_pfn = 0;
 	unsigned long nr_pages = 0;
-	unsigned long nr_free = 0;
 	struct page *page = NULL;
-	unsigned long pfn;
 
-	/*
-	 * First we check if pfn is valid on architectures where it is possible
-	 * to have holes within pageblock_nr_pages. On systems where it is not
-	 * possible, this function is optimized out.
-	 *
-	 * Then, we check if a current large page is valid by only checking the
-	 * validity of the head pfn.
-	 *
-	 * meminit_pfn_in_nid is checked on systems where pfns can interleave
-	 * within a node: a pfn is between start and end of a node, but does not
-	 * belong to this memory node.
-	 *
-	 * Finally, we minimize pfn page lookups and scheduler checks by
-	 * performing it only once every pageblock_nr_pages.
-	 *
-	 * We do it in two loops: first we initialize struct page, than free to
-	 * buddy allocator, becuse while we are freeing pages we can access
-	 * pages that are ahead (computing buddy page in __free_one_page()).
-	 */
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!pfn_valid_within(pfn))
+	for (; pfn < end_pfn; pfn++) {
+		if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+			page = NULL;
 			continue;
-		if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
-			if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
-				if (page && (pfn & nr_pgmask))
-					page++;
-				else
-					page = pfn_to_page(pfn);
-				__init_single_page(page, pfn, zid, nid);
-				cond_resched();
-			}
-		}
-	}
-
-	page = NULL;
-	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-		if (!pfn_valid_within(pfn)) {
-			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-		} else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
-			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-		} else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
-			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-		} else if (page && (pfn & nr_pgmask)) {
-			page++;
-			nr_free++;
-		} else {
-			nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+		} else if (!page || !(pfn & nr_pgmask)) {
 			page = pfn_to_page(pfn);
-			free_base_pfn = pfn;
-			nr_free = 1;
 			cond_resched();
+		} else {
+			page++;
 		}
+		__init_single_page(page, pfn, zid, nid, true);
+		nr_pages++;
 	}
-	/* Free the last block of pages to allocator */
-	nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
-
-	return nr_pages;
+	return (nr_pages);
 }
 
 /* Initialise remaining memory on a node */
@@ -1582,10 +1578,21 @@ static int __init deferred_init_memmap(void *data)
 	}
 	first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
 
+	/*
+	 * Initialize and free pages. We do it in two loops: first we initialize
+	 * struct page, then free to buddy allocator, because while we are
+	 * freeing pages we can access pages that are ahead (computing buddy
+	 * page in __free_one_page()).
+	 */
+	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+		nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
+	}
 	for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
 		spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
 		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
-		nr_pages += deferred_init_range(nid, zid, spfn, epfn);
+		deferred_free_pages(nid, zid, spfn, epfn);
 	}
 
 	/* Sanity check that the next zone really is unpopulated */
@@ -3391,7 +3398,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (gfp_mask & __GFP_THISNODE)
 		goto out;
 
-	/* Exhausted what can be done so it's blamo time */
+	/* Exhausted what can be done so it's blame time */
 	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 		*did_some_progress = 1;
 
@@ -4272,7 +4279,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 	struct page *page;
 
 	/*
-	 * __get_free_pages() returns a 32-bit address, which cannot represent
+	 * __get_free_pages() returns a virtual address, which cannot represent
 	 * a highmem page
 	 */
 	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
@@ -5393,15 +5400,20 @@ not_early:
 		 * can be created for invalid pages (for alignment)
 		 * check here not to call set_pageblock_migratetype() against
 		 * pfn out of zone.
+		 *
+		 * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+		 * because this is done early in sparse_add_one_section
 		 */
 		if (!(pfn & (pageblock_nr_pages - 1))) {
 			struct page *page = pfn_to_page(pfn);
 
-			__init_single_page(page, pfn, zone, nid);
+			__init_single_page(page, pfn, zone, nid,
+					context != MEMMAP_HOTPLUG);
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 			cond_resched();
 		} else {
-			__init_single_pfn(pfn, zone, nid);
+			__init_single_pfn(pfn, zone, nid,
+					context != MEMMAP_HOTPLUG);
 		}
 	}
 }
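
The new deferred_free_pages() pass above batches contiguous valid pfns and hands each run to deferred_free_range(), flushing the run at holes and at pageblock boundaries (where the kernel code also reschedules). A tiny user-space model of that run-length logic, with a fake hole standing in for deferred_pfn_valid():

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 8UL	/* tiny value just for the demo */

/* Stand-in for deferred_pfn_valid(): pretend pfns 20..23 are a hole. */
static bool pfn_ok(unsigned long pfn)
{
	return pfn < 20 || pfn > 23;
}

static void flush_range(unsigned long pfn, unsigned long nr_free)
{
	if (nr_free)
		printf("free [%lu, %lu)\n", pfn, pfn + nr_free);
}

int main(void)
{
	unsigned long nr_pgmask = PAGEBLOCK_NR_PAGES - 1;
	unsigned long nr_free = 0;
	unsigned long pfn;

	for (pfn = 10; pfn < 30; pfn++) {
		if (!pfn_ok(pfn)) {
			flush_range(pfn - nr_free, nr_free);
			nr_free = 0;
		} else if (!(pfn & nr_pgmask)) {
			/* pageblock boundary: flush the old run, start a new one */
			flush_range(pfn - nr_free, nr_free);
			nr_free = 1;
		} else {
			nr_free++;
		}
	}
	flush_range(pfn - nr_free, nr_free);	/* last run */
	return 0;
}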

+ 2 - 0
mm/page_ext.c

@@ -59,7 +59,9 @@
  */
 
 static struct page_ext_operations *page_ext_ops[] = {
+#ifdef CONFIG_DEBUG_PAGEALLOC
 	&debug_guardpage_ops,
+#endif
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif

+ 8 - 12
mm/page_owner.c

@@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 
 static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 {
-	struct page *page;
-	struct page_ext *page_ext;
-	unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
-	unsigned long end_pfn = pfn + zone->spanned_pages;
+	unsigned long pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone_end_pfn(zone);
 	unsigned long count = 0;
 
-	/* Scan block by block. First and last block may be incomplete */
-	pfn = zone->zone_start_pfn;
-
 	/*
 	 * Walk the zone in pageblock_nr_pages steps. If a page block spans
 	 * a zone boundary, it will be double counted between zones. This does
 	 * not matter as the mixed block count will still be correct
 	 */
 	for (; pfn < end_pfn; ) {
+		unsigned long block_end_pfn;
+
 		if (!pfn_valid(pfn)) {
 			pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
 			continue;
@@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		page = pfn_to_page(pfn);
-
 		for (; pfn < block_end_pfn; pfn++) {
+			struct page *page;
+			struct page_ext *page_ext;
+
 			if (!pfn_valid_within(pfn))
 				continue;
 
@@ -635,9 +633,7 @@ static int __init pageowner_init(void)
 
 	dentry = debugfs_create_file("page_owner", S_IRUSR, NULL,
 			NULL, &proc_page_owner_operations);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
 
-	return 0;
+	return PTR_ERR_OR_ZERO(dentry);
 }
 late_initcall(pageowner_init)

+ 3 - 3
mm/pgtable-generic.c

@@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
-	pmd_t entry = *pmdp;
-	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
+	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+	return old;
 }
 #endif
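
Since pmdp_invalidate() now hands back the pre-invalidation entry, a caller can carry over the dirty and accessed bits that hardware set up to that point instead of re-reading *pmdp racily. A hedged sketch of such a caller; the helper name is hypothetical, and the typical real users would be in the THP code:

#include <linux/mm.h>
#include <linux/page_idle.h>

/* Hypothetical caller: fold hardware-set bits back in after invalidation. */
static void fold_old_pmd_bits(struct vm_area_struct *vma, unsigned long haddr,
			      pmd_t *pmd, struct page *page)
{
	pmd_t old_pmd = pmdp_invalidate(vma, haddr, pmd);

	if (pmd_dirty(old_pmd))
		SetPageDirty(page);
	if (pmd_young(old_pmd))
		set_page_young(page);
}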
 

+ 33 - 26
mm/shmem.c

@@ -2717,15 +2717,28 @@ continue_resched:
 	return error;
 }
 
+static unsigned int *memfd_file_seals_ptr(struct file *file)
+{
+	if (file->f_op == &shmem_file_operations)
+		return &SHMEM_I(file_inode(file))->seals;
+
+#ifdef CONFIG_HUGETLBFS
+	if (file->f_op == &hugetlbfs_file_operations)
+		return &HUGETLBFS_I(file_inode(file))->seals;
+#endif
+
+	return NULL;
+}
+
 #define F_ALL_SEALS (F_SEAL_SEAL | \
 		     F_SEAL_SHRINK | \
 		     F_SEAL_GROW | \
 		     F_SEAL_WRITE)
 
-int shmem_add_seals(struct file *file, unsigned int seals)
+static int memfd_add_seals(struct file *file, unsigned int seals)
 {
 	struct inode *inode = file_inode(file);
-	struct shmem_inode_info *info = SHMEM_I(inode);
+	unsigned int *file_seals;
 	int error;
 
 	/*
@@ -2758,8 +2771,6 @@ int shmem_add_seals(struct file *file, unsigned int seals)
 	 * other file types.
 	 */
 
-	if (file->f_op != &shmem_file_operations)
-		return -EINVAL;
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EPERM;
 	if (seals & ~(unsigned int)F_ALL_SEALS)
@@ -2767,12 +2778,18 @@ int shmem_add_seals(struct file *file, unsigned int seals)
 
 	inode_lock(inode);
 
-	if (info->seals & F_SEAL_SEAL) {
+	file_seals = memfd_file_seals_ptr(file);
+	if (!file_seals) {
+		error = -EINVAL;
+		goto unlock;
+	}
+
+	if (*file_seals & F_SEAL_SEAL) {
 		error = -EPERM;
 		goto unlock;
 	}
 
-	if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) {
+	if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
 		error = mapping_deny_writable(file->f_mapping);
 		if (error)
 			goto unlock;
@@ -2784,25 +2801,22 @@ int shmem_add_seals(struct file *file, unsigned int seals)
 		}
 	}
 
-	info->seals |= seals;
+	*file_seals |= seals;
 	error = 0;
 
 unlock:
 	inode_unlock(inode);
 	return error;
 }
-EXPORT_SYMBOL_GPL(shmem_add_seals);
 
-int shmem_get_seals(struct file *file)
+static int memfd_get_seals(struct file *file)
 {
-	if (file->f_op != &shmem_file_operations)
-		return -EINVAL;
+	unsigned int *seals = memfd_file_seals_ptr(file);
 
-	return SHMEM_I(file_inode(file))->seals;
+	return seals ? *seals : -EINVAL;
 }
-EXPORT_SYMBOL_GPL(shmem_get_seals);
 
-long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	long error;
 
@@ -2812,10 +2826,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (arg > UINT_MAX)
 			return -EINVAL;
 
-		error = shmem_add_seals(file, arg);
+		error = memfd_add_seals(file, arg);
 		break;
 	case F_GET_SEALS:
-		error = shmem_get_seals(file);
+		error = memfd_get_seals(file);
 		break;
 	default:
 		error = -EINVAL;
@@ -3657,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create,
 		const char __user *, uname,
 		unsigned int, flags)
 {
-	struct shmem_inode_info *info;
+	unsigned int *file_seals;
 	struct file *file;
 	int fd, error;
 	char *name;
@@ -3667,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create,
 		if (flags & ~(unsigned int)MFD_ALL_FLAGS)
 			return -EINVAL;
 	} else {
-		/* Sealing not supported in hugetlbfs (MFD_HUGETLB) */
-		if (flags & MFD_ALLOW_SEALING)
-			return -EINVAL;
 		/* Allow huge page size encoding in flags. */
 		if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
 				(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
@@ -3722,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create,
 	file->f_flags |= O_RDWR | O_LARGEFILE;
 
 	if (flags & MFD_ALLOW_SEALING) {
-		/*
-		 * flags check at beginning of function ensures
-		 * this is not a hugetlbfs (MFD_HUGETLB) file.
-		 */
-		info = SHMEM_I(file_inode(file));
-		info->seals &= ~F_SEAL_SEAL;
+		file_seals = memfd_file_seals_ptr(file);
+		*file_seals &= ~F_SEAL_SEAL;
 	}
 
 	fd_install(fd, file);
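
With the seals reachable through memfd_file_seals_ptr(), F_ADD_SEALS/F_GET_SEALS also work on MFD_HUGETLB memfds. A small user-space check, assuming a glibc recent enough (2.27+) to provide the memfd_create() wrapper and MFD_* constants; error handling is trimmed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = memfd_create("sealed-huge", MFD_HUGETLB | MFD_ALLOW_SEALING);
	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	/* Previously -EINVAL for hugetlbfs-backed memfds; now allowed. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0)
		perror("F_ADD_SEALS");

	printf("seals = %#x\n", fcntl(fd, F_GET_SEALS));
	close(fd);
	return 0;
}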

+ 0 - 4
mm/slab.c

@@ -1316,8 +1316,6 @@ void __init kmem_cache_init_late(void)
 {
 	struct kmem_cache *cachep;
 
-	slab_state = UP;
-
 	/* 6) resize the head arrays to their final sizes */
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(cachep, &slab_caches, list)
@@ -1353,8 +1351,6 @@ static int __init cpucache_init(void)
 				slab_online_cpu, slab_offline_cpu);
 	WARN_ON(ret < 0);
 
-	/* Done! */
-	slab_state = FULL;
 	return 0;
 }
 __initcall(cpucache_init);

+ 0 - 3
mm/slab.h

@@ -78,9 +78,6 @@ extern const struct kmalloc_info_struct {
 	unsigned long size;
 } kmalloc_info[];
 
-unsigned long calculate_alignment(slab_flags_t flags,
-		unsigned long align, unsigned long size);
-
 #ifndef CONFIG_SLOB
 /* Kmalloc array related functions */
 void setup_kmalloc_cache_index_table(void);

+ 29 - 27
mm/slab_common.c

@@ -267,6 +267,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s)
 }
 #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
 
+/*
+ * Figure out what the alignment of the objects will be given a set of
+ * flags, a user specified alignment and the size of the objects.
+ */
+static unsigned long calculate_alignment(unsigned long flags,
+		unsigned long align, unsigned long size)
+{
+	/*
+	 * If the user wants hardware cache aligned objects then follow that
+	 * suggestion if the object is sufficiently large.
+	 *
+	 * The hardware cache alignment cannot override the specified
+	 * alignment though. If that is greater, then use it.
+	 */
+	if (flags & SLAB_HWCACHE_ALIGN) {
+		unsigned long ralign;
+
+		ralign = cache_line_size();
+		while (size <= ralign / 2)
+			ralign /= 2;
+		align = max(align, ralign);
+	}
+
+	if (align < ARCH_SLAB_MINALIGN)
+		align = ARCH_SLAB_MINALIGN;
+
+	return ALIGN(align, sizeof(void *));
+}
+
 /*
  * Find a mergeable slab cache
  */
@@ -337,33 +366,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
 	return NULL;
 }
 
-/*
- * Figure out what the alignment of the objects will be given a set of
- * flags, a user specified alignment and the size of the objects.
- */
-unsigned long calculate_alignment(slab_flags_t flags,
-		unsigned long align, unsigned long size)
-{
-	/*
-	 * If the user wants hardware cache aligned objects then follow that
-	 * suggestion if the object is sufficiently large.
-	 *
-	 * The hardware cache alignment cannot override the specified
-	 * alignment though. If that is greater then use it.
-	 */
-	if (flags & SLAB_HWCACHE_ALIGN) {
-		unsigned long ralign = cache_line_size();
-		while (size <= ralign / 2)
-			ralign /= 2;
-		align = max(align, ralign);
-	}
-
-	if (align < ARCH_SLAB_MINALIGN)
-		align = ARCH_SLAB_MINALIGN;
-
-	return ALIGN(align, sizeof(void *));
-}
-
 static struct kmem_cache *create_cache(const char *name,
 		size_t object_size, size_t size, size_t align,
 		slab_flags_t flags, void (*ctor)(void *),
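
calculate_alignment() itself is unchanged, only made static. As a worked example of what it returns, here is the same logic in a stand-alone program with a fixed 64-byte cache line and placeholder flag/minalign values (not the kernel's definitions):

#include <stdio.h>

#define ARCH_SLAB_MINALIGN	8UL	/* illustrative; arch dependent */
#define SLAB_HWCACHE_ALIGN	0x1UL	/* placeholder flag value */

static unsigned long cache_line_size(void) { return 64; }

static unsigned long calculate_alignment(unsigned long flags,
					 unsigned long align,
					 unsigned long size)
{
	if (flags & SLAB_HWCACHE_ALIGN) {
		unsigned long ralign = cache_line_size();

		/* don't waste a whole cache line on small objects */
		while (size <= ralign / 2)
			ralign /= 2;
		align = (align > ralign) ? align : ralign;
	}

	if (align < ARCH_SLAB_MINALIGN)
		align = ARCH_SLAB_MINALIGN;

	/* round up to a pointer-size multiple, as ALIGN() does */
	return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
}

int main(void)
{
	/* a 24-byte object asking for hardware cache alignment -> 32 */
	printf("%lu\n", calculate_alignment(SLAB_HWCACHE_ALIGN, 0, 24));
	return 0;
}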

+ 6 - 6
mm/slub.c

@@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 	u8 *start;
 	u8 *fault;
 	u8 *end;
+	u8 *pad;
 	int length;
 	int remainder;
 
@@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 	if (!remainder)
 		return 1;
 
+	pad = end - remainder;
 	metadata_access_enable();
-	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+	fault = memchr_inv(pad, POISON_INUSE, remainder);
 	metadata_access_disable();
 	if (!fault)
 		return 1;
@@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 		end--;
 
 	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
-	print_section(KERN_ERR, "Padding ", end - remainder, remainder);
+	print_section(KERN_ERR, "Padding ", pad, remainder);
 
-	restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
+	restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
 	return 0;
 }
 
@@ -2220,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 
 /*
  * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available. This is done without interrupts disabled and without
- * preemption disabled. The cmpxchg is racy and may put the partial page
- * onto a random cpus partial slot.
+ * slot if available.
  *
  * If we did not find a slot then simply move all the partials to the
  * per node partial list.

+ 5 - 1
mm/sparse.c

@@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
  */
 static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
 {
-	return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+	unsigned long coded_mem_map =
+		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
+	return coded_mem_map;
 }
 
 /*

+ 13 - 14
mm/swap.c

@@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page)
 }
 
 /**
- * lru_cache_add: add a page to the page lists
+ * lru_cache_add_anon - add a page to the page lists
  * @page: the page to add
  */
 void lru_cache_add_anon(struct page *page)
@@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
-void lru_add_drain_all_cpuslocked(void)
+/*
+ * Doesn't need any cpu hotplug locking because we do rely on per-cpu
+ * kworkers being shut down before our page_alloc_cpu_dead callback is
+ * executed on the offlined cpu.
+ * Calling this function with cpu hotplug locks held can actually lead
+ * to obscure indirect dependencies via WQ context.
+ */
+void lru_add_drain_all(void)
 {
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
@@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void)
 	mutex_unlock(&lock);
 }
 
-void lru_add_drain_all(void)
-{
-	get_online_cpus();
-	lru_add_drain_all_cpuslocked();
-	put_online_cpus();
-}
-
 /**
  * release_pages - batched put_page()
  * @pages: array of pages to release
@@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add);
  */
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				struct address_space *mapping,
-				pgoff_t start, unsigned nr_pages,
+				pgoff_t start, unsigned nr_entries,
 				pgoff_t *indices)
 {
-	pvec->nr = find_get_entries(mapping, start, nr_pages,
+	pvec->nr = find_get_entries(mapping, start, nr_entries,
 				    pvec->pages, indices);
 	return pagevec_count(pvec);
 }
@@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
  * @mapping:	The address_space to search
  * @start:	The starting page index
  * @end:	The final page index
- * @nr_pages:	The maximum number of pages
  *
- * pagevec_lookup_range() will search for and return a group of up to @nr_pages
+ * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
  * pages in the mapping starting from index @start and upto index @end
  * (inclusive).  The pages are placed in @pvec.  pagevec_lookup() takes a
  * reference against the pages in @pvec.
@@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec)
  * also update @start to index the next page for the traversal.
  *
  * pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * number is smaller than PAGEVEC_SIZE, the end of specified range has been
  * reached.
  */
 unsigned pagevec_lookup_range(struct pagevec *pvec,

+ 7 - 16
mm/truncate.c

@@ -179,12 +179,8 @@ static void
 truncate_cleanup_page(struct address_space *mapping, struct page *page)
 {
 	if (page_mapped(page)) {
-		loff_t holelen;
-
-		holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
-		unmap_mapping_range(mapping,
-				   (loff_t)page->index << PAGE_SHIFT,
-				   holelen, 0);
+		pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+		unmap_mapping_pages(mapping, page->index, nr, false);
 	}
 
 	if (page_has_private(page))
@@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 					/*
 					 * Zap the rest of the file in one hit.
 					 */
-					unmap_mapping_range(mapping,
-					   (loff_t)index << PAGE_SHIFT,
-					   (loff_t)(1 + end - index)
-							 << PAGE_SHIFT,
-							 0);
+					unmap_mapping_pages(mapping, index,
+						(1 + end - index), false);
 					did_range_unmap = 1;
 				} else {
 					/*
 					 * Just zap this page
 					 */
-					unmap_mapping_range(mapping,
-					   (loff_t)index << PAGE_SHIFT,
-					   PAGE_SIZE, 0);
+					unmap_mapping_pages(mapping, index,
+								1, false);
 				}
 			}
 			BUG_ON(page_mapped(page));
@@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	 * get remapped later.
 	 */
 	if (dax_mapping(mapping)) {
-		unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
-				    (loff_t)(end - start + 1) << PAGE_SHIFT, 0);
+		unmap_mapping_pages(mapping, start, end - start + 1, false);
 	}
 out:
 	cleancache_invalidate_inode(mapping);
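
The conversions above are mechanical: a (byte offset, byte length) pair becomes a (page index, page count) pair, with the final bool mapping onto even_cows. Assuming unmap_mapping_range() keeps its existing meaning, the new call is roughly equivalent to this wrapper (the _sketch name is not a real kernel symbol):

#include <linux/mm.h>

/*
 * Not a real kernel helper: just spells out what the unmap_mapping_pages()
 * calls above correspond to in the old byte-based interface.
 */
static inline void unmap_mapping_pages_sketch(struct address_space *mapping,
					      pgoff_t start, pgoff_t nr,
					      bool even_cows)
{
	unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
			    (loff_t)nr << PAGE_SHIFT, even_cows);
}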

+ 34 - 52
mm/vmscan.c

@@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 	return nr;
 }
 
-unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
-{
-	unsigned long nr;
-
-	nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
-	     node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
-	     node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);
-
-	if (get_nr_swap_pages() > 0)
-		nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
-		      node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
-		      node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);
-
-	return nr;
-}
-
 /**
  * lruvec_lru_size -  Returns the number of pages on the given LRU list.
  * @lruvec: lru vector
@@ -310,9 +294,7 @@ EXPORT_SYMBOL(unregister_shrinker);
 #define SHRINK_BATCH 128
 
 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
-				    struct shrinker *shrinker,
-				    unsigned long nr_scanned,
-				    unsigned long nr_eligible)
+				    struct shrinker *shrinker, int priority)
 {
 	unsigned long freed = 0;
 	unsigned long long delta;
@@ -337,9 +319,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 
 	total_scan = nr;
-	delta = (4 * nr_scanned) / shrinker->seeks;
-	delta *= freeable;
-	do_div(delta, nr_eligible + 1);
+	delta = freeable >> priority;
+	delta *= 4;
+	do_div(delta, shrinker->seeks);
 	total_scan += delta;
 	if (total_scan < 0) {
 		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -373,8 +355,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		total_scan = freeable * 2;
 
 	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-				   nr_scanned, nr_eligible,
-				   freeable, delta, total_scan);
+				   freeable, delta, total_scan, priority);
 
 	/*
 	 * Normally, we should not scan less than batch_size objects in one
@@ -434,8 +415,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * @gfp_mask: allocation context
  * @nid: node whose slab caches to target
  * @memcg: memory cgroup whose slab caches to target
- * @nr_scanned: pressure numerator
- * @nr_eligible: pressure denominator
+ * @priority: the reclaim priority
  *
  * Call the shrink functions to age shrinkable caches.
  *
@@ -447,20 +427,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  * objects from the memory cgroup specified. Otherwise, only unaware
  * shrinkers are called.
  *
- * @nr_scanned and @nr_eligible form a ratio that indicate how much of
- * the available objects should be scanned.  Page reclaim for example
- * passes the number of pages scanned and the number of pages on the
- * LRU lists that it considered on @nid, plus a bias in @nr_scanned
- * when it encountered mapped pages.  The ratio is further biased by
- * the ->seeks setting of the shrink function, which indicates the
- * cost to recreate an object relative to that of an LRU page.
+ * @priority is sc->priority, we take the number of objects and >> by priority
+ * in order to get the scan target.
  *
  * Returns the number of reclaimed slab objects.
  */
 static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 				 struct mem_cgroup *memcg,
-				 unsigned long nr_scanned,
-				 unsigned long nr_eligible)
+				 int priority)
 {
 	struct shrinker *shrinker;
 	unsigned long freed = 0;
@@ -468,9 +442,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 	if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
 		return 0;
 
-	if (nr_scanned == 0)
-		nr_scanned = SWAP_CLUSTER_MAX;
-
 	if (!down_read_trylock(&shrinker_rwsem)) {
 		/*
 		 * If we would return 0, our callers would understand that we
@@ -501,7 +472,16 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 			sc.nid = 0;
 
-		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+		freed += do_shrink_slab(&sc, shrinker, priority);
+		/*
+		 * Bail out if someone wants to register a new shrinker to
+		 * prevent the registration from being stalled for long periods
+		 * by parallel ongoing shrinking.
+		 */
+		if (rwsem_is_contended(&shrinker_rwsem)) {
+			freed = freed ? : 1;
+			break;
+		}
 	}
 
 	up_read(&shrinker_rwsem);
@@ -519,8 +499,7 @@ void drop_slab_node(int nid)
 
 		freed = 0;
 		do {
-			freed += shrink_slab(GFP_KERNEL, nid, memcg,
-					     1000, 1000);
+			freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 	} while (freed > 10);
 }
@@ -1436,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 
 		if (PageDirty(page)) {
 			struct address_space *mapping;
+			bool migrate_dirty;
 
 			/*
 			 * Only pages without mappings or that have a
 			 * ->migratepage callback are possible to migrate
-			 * without blocking
+			 * without blocking. However, we can be racing with
+			 * truncation so it's necessary to lock the page
+			 * to stabilise the mapping as truncation holds
+			 * the page lock until after the page is removed
+			 * from the page cache.
 			 */
+			if (!trylock_page(page))
+				return ret;
+
 			mapping = page_mapping(page);
-			if (mapping && !mapping->a_ops->migratepage)
+			migrate_dirty = mapping && mapping->a_ops->migratepage;
+			unlock_page(page);
+			if (!migrate_dirty)
 				return ret;
 		}
 	}
@@ -2615,14 +2604,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 
 			reclaimed = sc->nr_reclaimed;
 			scanned = sc->nr_scanned;
-
 			shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
 			node_lru_pages += lru_pages;
 
 			if (memcg)
 				shrink_slab(sc->gfp_mask, pgdat->node_id,
-					    memcg, sc->nr_scanned - scanned,
-					    lru_pages);
+					    memcg, sc->priority);
 
 			/* Record the group's reclaim efficiency */
 			vmpressure(sc->gfp_mask, memcg, false,
@@ -2646,14 +2633,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			}
 		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-		/*
-		 * Shrink the slab caches in the same proportion that
-		 * the eligible LRU pages were scanned.
-		 */
 		if (global_reclaim(sc))
 			shrink_slab(sc->gfp_mask, pgdat->node_id, NULL,
-				    sc->nr_scanned - nr_scanned,
-				    node_lru_pages);
+				    sc->priority);
 
 		if (reclaim_state) {
 			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
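
With the priority-based formula in do_shrink_slab(), the scan target is simply a slice of the freeable objects that doubles each time the reclaim priority drops by one, scaled by 4/seeks; it no longer depends on how many LRU pages happened to be scanned. A quick arithmetic model (DEFAULT_SEEKS of 2 and a starting priority of 12 are the usual kernel values, but the numbers are only illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long freeable = 1UL << 20;	/* 1M reclaimable objects */
	int seeks = 2;				/* DEFAULT_SEEKS */
	int priority;

	/* delta = (freeable >> priority) * 4 / seeks, per do_shrink_slab() */
	for (priority = 12; priority >= 0; priority--) {
		unsigned long long delta = freeable >> priority;

		delta *= 4;
		delta /= seeks;
		printf("priority %2d -> scan target %llu\n", priority, delta);
	}
	return 0;
}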

Some files were not shown because too many files changed in this diff