10 лет назад · 15a25b2ead
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -592,6 +592,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 
				 extern void pmdp_splitting_flush(struct vm_area_struct *vma,
			
 
				 				 unsigned long address, pmd_t *pmdp);
			
 
				 
			
 
				+extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
			
 
				+				 unsigned long address, pmd_t *pmdp);
			
 
				+#define pmdp_collapse_flush pmdp_collapse_flush
			
 
				+
			
 
				 #define __HAVE_ARCH_PGTABLE_DEPOSIT
			
 
				 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
			
 
				 				       pgtable_t pgtable);
			
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -560,41 +560,47 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
 
				 	pmd_t pmd;
			
 
				 
			
 
				 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
			
 
				-	if (pmd_trans_huge(*pmdp)) {
			
 
				-		pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
			
 
				-	} else {
			
 
				-		/*
			
 
				-		 * khugepaged calls this for normal pmd
			
 
				-		 */
			
 
				-		pmd = *pmdp;
			
 
				-		pmd_clear(pmdp);
			
 
				-		/*
			
 
				-		 * Wait for all pending hash_page to finish. This is needed
			
 
				-		 * in case of subpage collapse. When we collapse normal pages
			
 
				-		 * to hugepage, we first clear the pmd, then invalidate all
			
 
				-		 * the PTE entries. The assumption here is that any low level
			
 
				-		 * page fault will see a none pmd and take the slow path that
			
 
				-		 * will wait on mmap_sem. But we could very well be in a
			
 
				-		 * hash_page with local ptep pointer value. Such a hash page
			
 
				-		 * can result in adding new HPTE entries for normal subpages.
			
 
				-		 * That means we could be modifying the page content as we
			
 
				-		 * copy them to a huge page. So wait for parallel hash_page
			
 
				-		 * to finish before invalidating HPTE entries. We can do this
			
 
				-		 * by sending an IPI to all the cpus and executing a dummy
			
 
				-		 * function there.
			
 
				-		 */
			
 
				-		kick_all_cpus_sync();
			
 
				-		/*
			
 
				-		 * Now invalidate the hpte entries in the range
			
 
				-		 * covered by pmd. This make sure we take a
			
 
				-		 * fault and will find the pmd as none, which will
			
 
				-		 * result in a major fault which takes mmap_sem and
			
 
				-		 * hence wait for collapse to complete. Without this
			
 
				-		 * the __collapse_huge_page_copy can result in copying
			
 
				-		 * the old content.
			
 
				-		 */
			
 
				-		flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
			
 
				-	}
			
 
				+	VM_BUG_ON(!pmd_trans_huge(*pmdp));
			
 
				+	pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
			
 
				+	return pmd;
			
 
				+}
			
 
				+
			
 
				+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			
 
				+			  pmd_t *pmdp)
			
 
				+{
			
 
				+	pmd_t pmd;
			
 
				+
			
 
				+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
			
 
				+	VM_BUG_ON(pmd_trans_huge(*pmdp));	/* normal (non-huge) pmd only */
			
 
				+
			
 
				+	pmd = *pmdp;	/* old entry: used by the flush below and returned */
			
 
				+	pmd_clear(pmdp);
			
 
				+	/*
			
 
				+	 * Wait for all pending hash_page calls to finish. This is
			
 
				+	 * needed for subpage collapse. When we collapse normal pages
			
 
				+	 * to a hugepage, we first clear the pmd, then invalidate all
			
 
				+	 * the PTE entries. The assumption here is that any low-level
			
 
				+	 * page fault will see a none pmd and take the slow path that
			
 
				+	 * waits on mmap_sem. But a CPU could already be inside
			
 
				+	 * hash_page with a local ptep pointer value. Such a hash_page
			
 
				+	 * can end up adding new HPTE entries for normal subpages.
			
 
				+	 * That means the page content could be modified while we
			
 
				+	 * copy it to a huge page. So wait for parallel hash_page
			
 
				+	 * to finish before invalidating HPTE entries. We do this
			
 
				+	 * by sending an IPI to all the cpus and executing a dummy
			
 
				+	 * function there.
			
 
				+	 */
			
 
				+	kick_all_cpus_sync();
			
 
				+	/*
			
 
				+	 * Now invalidate the hpte entries in the range
			
 
				+	 * covered by pmd. This makes sure we take a
			
 
				+	 * fault and find the pmd as none, which will
			
 
				+	 * result in a major fault that takes mmap_sem and
			
 
				+	 * hence waits for collapse to complete. Without this,
			
 
				+	 * __collapse_huge_page_copy can end up copying
			
 
				+	 * the old content.
			
 
				+	 */
			
 
				+	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
			
 
				 	return pmd;
			
 
				 }
			
 
				 
			
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -189,6 +189,27 @@ extern void pmdp_splitting_flush(struct vm_area_struct *vma,
 
				 				 unsigned long address, pmd_t *pmdp);
			
 
				 #endif
			
 
				 
			
 
				+#ifndef pmdp_collapse_flush /* arch header did not #define its own */
			
 
				+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
			
 
				+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
			
 
				+					unsigned long address,
			
 
				+					pmd_t *pmdp)
			
 
				+{
			
 
				+	return pmdp_clear_flush(vma, address, pmdp); /* generic fallback */
			
 
				+}
			
 
				+#define pmdp_collapse_flush pmdp_collapse_flush
			
 
				+#else
			
 
				+static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
			
 
				+					unsigned long address,
			
 
				+					pmd_t *pmdp)
			
 
				+{
			
 
				+	BUILD_BUG(); /* NOTE(review): presumably breaks the build if used without THP */
			
 
				+	return *pmdp;
			
 
				+}
			
 
				+#define pmdp_collapse_flush pmdp_collapse_flush
			
 
				+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
			
 
				+#endif /* pmdp_collapse_flush */
			
 
				+
			
 
				 #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
			
 
				 extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
			
 
				 				       pgtable_t pgtable);
			
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2499,7 +2499,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 
				 	 * huge and small TLB entries for the same virtual address
			
 
				 	 * to avoid the risk of CPU bugs in that area.
			
 
				 	 */
			
 
				-	_pmd = pmdp_clear_flush(vma, address, pmd);
			
 
				+	_pmd = pmdp_collapse_flush(vma, address, pmd);
			
 
				 	spin_unlock(pmd_ptl);
			
 
				 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);