@@ -2063,7 +2063,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	struct mm_struct *mm = vma->vm_mm;
 	struct page *page;
 	pgtable_t pgtable;
-	pmd_t old, _pmd;
+	pmd_t old_pmd, _pmd;
 	bool young, write, soft_dirty, pmd_migration = false;
 	unsigned long addr;
 	int i;
@@ -2106,23 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
+	/*
+	 * Up to this point the pmd is present and huge and userland has
+	 * full access to the hugepage during the split (which happens in
+	 * place). If we overwrote the pmd with the not-huge version pointing
+	 * to the pte here (which of course we could if all CPUs were bug
+	 * free), userland could trigger a small page size TLB miss on the
+	 * small sized TLB while the hugepage TLB entry is still established in
+	 * the huge TLB. Some CPUs don't like that.
+	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+	 * 383 on page 93. Intel should be safe, but it also warns that it's
+	 * only safe if the permission and cache attributes of the two entries
+	 * loaded in the two TLBs are identical (which should be the case here).
+	 * But it is generally safer to never allow small and huge TLB entries
+	 * for the same virtual address to be loaded simultaneously. So instead
+	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+	 * current pmd not-present (atomically, because pmd_trans_huge must
+	 * remain set on the pmd at all times until the split is complete),
+	 * then we flush the SMP TLB, and finally we write the non-huge
+	 * version of the pmd entry with pmd_populate.
+	 */
+	old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-	pmd_migration = is_pmd_migration_entry(*pmd);
+	pmd_migration = is_pmd_migration_entry(old_pmd);
 	if (pmd_migration) {
 		swp_entry_t entry;
 
-		entry = pmd_to_swp_entry(*pmd);
+		entry = pmd_to_swp_entry(old_pmd);
 		page = pfn_to_page(swp_offset(entry));
 	} else
 #endif
-	page = pmd_page(*pmd);
+	page = pmd_page(old_pmd);
 	VM_BUG_ON_PAGE(!page_count(page), page);
 	page_ref_add(page, HPAGE_PMD_NR - 1);
-	write = pmd_write(*pmd);
-	young = pmd_young(*pmd);
-	soft_dirty = pmd_soft_dirty(*pmd);
+	if (pmd_dirty(old_pmd))
+		SetPageDirty(page);
+	write = pmd_write(old_pmd);
+	young = pmd_young(old_pmd);
+	soft_dirty = pmd_soft_dirty(old_pmd);
 
-	pmdp_huge_split_prepare(vma, haddr, pmd);
+	/*
+	 * Withdraw the table only after we mark the pmd entry invalid.
+	 * This is critical for some architectures (Power).
+	 */
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
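Review note: condensed, the order of operations this hunk establishes in __split_huge_pmd_locked() looks as follows. This is a sketch for review only, not compilable on its own; the pte-fill loop and locking are elided, and the behaviour noted for pmdp_invalidate() is that of the generic mm/pgtable-generic.c implementation (architectures may override it):

	old_pmd = pmdp_invalidate(vma, haddr, pmd);	/* pmd made not-present,
							   TLB flushed, old
							   value returned */
	pgtable = pgtable_trans_huge_withdraw(mm, pmd);	/* only after invalidate;
							   critical on Power */
	pmd_populate(mm, &_pmd, pgtable);		/* build the pte table */
	/* ... ptes filled from write/young/soft_dirty taken from old_pmd ... */
	smp_wmb();					/* ptes visible before pmd */
	pmd_populate(mm, pmd, pgtable);			/* install non-huge pmd */
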
@@ -2176,35 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 
 	smp_wmb(); /* make pte visible before pmd */
-	/*
-	 * Up to this point the pmd is present and huge and userland has the
-	 * whole access to the hugepage during the split (which happens in
-	 * place). If we overwrite the pmd with the not-huge version pointing
-	 * to the pte here (which of course we could if all CPUs were bug
-	 * free), userland could trigger a small page size TLB miss on the
-	 * small sized TLB while the hugepage TLB entry is still established in
-	 * the huge TLB. Some CPU doesn't like that.
-	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
-	 * 383 on page 93. Intel should be safe but is also warns that it's
-	 * only safe if the permission and cache attributes of the two entries
-	 * loaded in the two TLB is identical (which should be the case here).
-	 * But it is generally safer to never allow small and huge TLB entries
-	 * for the same virtual address to be loaded simultaneously. So instead
-	 * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
-	 * current pmd notpresent (atomically because here the pmd_trans_huge
-	 * must remain set at all times on the pmd until the split is complete
-	 * for this pmd), then we flush the SMP TLB and finally we write the
-	 * non-huge version of the pmd entry with pmd_populate.
-	 */
-	old = pmdp_invalidate(vma, haddr, pmd);
-
-	/*
-	 * Transfer dirty bit using value returned by pmd_invalidate() to be
-	 * sure we don't race with CPU that can set the bit under us.
-	 */
-	if (pmd_dirty(old))
-		SetPageDirty(page);
-
 	pmd_populate(mm, pmd, pgtable);
 
 	if (freeze) {
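A note on the dirty-bit transfer, which this patch moves up next to pmdp_invalidate(): per the comment deleted in the last hunk, the bit must be taken from the value pmdp_invalidate() returns, not from an earlier read of *pmd, or a concurrent CPU setting the bit under us could be missed. A sketch of that race; the interleaving is an assumed illustration, not taken from the patch:

	/*
	 * CPU0 (splitting the pmd)        CPU1 (userspace store)
	 * ------------------------        ----------------------
	 * reads *pmd, dirty bit clear
	 *                                 store hits the hugepage; the
	 *                                 MMU sets the dirty bit in the pmd
	 * pmdp_invalidate() clears the pmd and returns the old value;
	 * only that returned value (old_pmd) reliably carries the dirty
	 * bit that CPU1's store just set, so:
	 */
	if (pmd_dirty(old_pmd))
		SetPageDirty(page);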