@@ -1568,8 +1568,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		deactivate_page(page);
 	unlock_page(page);
 
 	if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
-		orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-			tlb->fullmm);
+		pmdp_invalidate(vma, addr, pmd);
 		orig_pmd = pmd_mkold(orig_pmd);
 		orig_pmd = pmd_mkclean(orig_pmd);
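The hunk above trades pmdp_huge_get_and_clear_full() for pmdp_invalidate(): the entry is marked invalid in place instead of passing through a pmd_none() state, so a concurrent lockless pmd_trans_huge() check still sees a huge pmd. A minimal userspace sketch of the window the old clear-then-restore pattern opened (the pthread harness and the atomic "pmd" stand-in are illustrative assumptions, not kernel interfaces):

	/* Sketch only: models the clear-then-restore window, not kernel code. */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdatomic.h>

	static atomic_long pmd = 0xdead;	/* stand-in for a populated huge pmd */

	/* Old madvise_free_huge_pmd() pattern: clear, rework flags, restore. */
	static void *clear_and_restore(void *arg)
	{
		long val = atomic_exchange(&pmd, 0);	/* pmdp_huge_get_and_clear_full() */
		/* ... flags reworked here while the pmd reads as none ... */
		atomic_store(&pmd, val);		/* set_pmd_at() re-establishes it */
		return NULL;
	}

	/* Models zap_pmd_range(): a lockless check that skips a "none" pmd. */
	static void *zap(void *arg)
	{
		if (atomic_load(&pmd) == 0)
			printf("MADV_DONTNEED missed the huge pmd\n");
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, clear_and_restore, NULL);
		pthread_create(&b, NULL, zap, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}

Depending on scheduling the message may or may not fire; with pmdp_invalidate() the stand-in would never read as zero, only as present-but-invalid.
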
@@ -1724,37 +1723,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
-	int ret = 0;
+	pmd_t entry;
+	bool preserve_write;
+	int ret;
 
 	ptl = __pmd_trans_huge_lock(pmd, vma);
-	if (ptl) {
-		pmd_t entry;
-		bool preserve_write = prot_numa && pmd_write(*pmd);
-		ret = 1;
+	if (!ptl)
+		return 0;
 
-		/*
-		 * Avoid trapping faults against the zero page. The read-only
-		 * data is likely to be read-cached on the local CPU and
-		 * local/remote hits to the zero page are not interesting.
-		 */
-		if (prot_numa && is_huge_zero_pmd(*pmd)) {
-			spin_unlock(ptl);
-			return ret;
-		}
+	preserve_write = prot_numa && pmd_write(*pmd);
+	ret = 1;
 
-		if (!prot_numa || !pmd_protnone(*pmd)) {
-			entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
-			entry = pmd_modify(entry, newprot);
-			if (preserve_write)
-				entry = pmd_mk_savedwrite(entry);
-			ret = HPAGE_PMD_NR;
-			set_pmd_at(mm, addr, pmd, entry);
-			BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
-					pmd_write(entry));
-		}
-		spin_unlock(ptl);
-	}
+	/*
+	 * Avoid trapping faults against the zero page. The read-only
+	 * data is likely to be read-cached on the local CPU and
+	 * local/remote hits to the zero page are not interesting.
+	 */
+	if (prot_numa && is_huge_zero_pmd(*pmd))
+		goto unlock;
+
+	if (prot_numa && pmd_protnone(*pmd))
+		goto unlock;
+
+	/*
+	 * In the prot_numa case we are under down_read(mmap_sem); it's
+	 * critical not to clear the pmd intermittently, to avoid racing
+	 * with MADV_DONTNEED, which is also under down_read(mmap_sem):
+	 *
+	 *	CPU0:				CPU1:
+	 *				change_huge_pmd(prot_numa=1)
+	 *				 pmdp_huge_get_and_clear_notify()
+	 * madvise_dontneed()
+	 *  zap_pmd_range()
+	 *   pmd_trans_huge(*pmd) == 0 (without ptl)
+	 *   // skip the pmd
+	 *				 set_pmd_at();
+	 *				 // pmd is re-established
+	 *
+	 * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
+	 * which may break userspace.
+	 *
+	 * pmdp_invalidate() is required to make sure we don't miss
+	 * dirty/young flags set by hardware.
+	 */
+	entry = *pmd;
+	pmdp_invalidate(vma, addr, pmd);
+
+	/*
+	 * Recover dirty/young flags.  This relies on pmdp_invalidate()
+	 * not corrupting them.
+	 */
+	if (pmd_dirty(*pmd))
+		entry = pmd_mkdirty(entry);
+	if (pmd_young(*pmd))
+		entry = pmd_mkyoung(entry);
+
+	entry = pmd_modify(entry, newprot);
+	if (preserve_write)
+		entry = pmd_mk_savedwrite(entry);
+	ret = HPAGE_PMD_NR;
+	set_pmd_at(mm, addr, pmd, entry);
+	BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+	spin_unlock(ptl);
 	return ret;
 }
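Taken together, the second hunk's write-protect path is now: snapshot the entry, invalidate it in place, fold back any dirty/young bits the hardware set in the meantime, then install the modified entry. A rough userspace model of that ordering, with an invented bit layout (PRESENT/DIRTY/YOUNG are assumptions for illustration, not the real page-table bits):

	/* Sketch only: invented bit layout, not the kernel's page-table bits. */
	#include <stdio.h>

	#define PRESENT	0x1UL
	#define DIRTY	0x2UL
	#define YOUNG	0x4UL

	static unsigned long pmd = PRESENT | YOUNG;	/* stand-in huge pmd */

	int main(void)
	{
		unsigned long entry = pmd;	/* entry = *pmd: snapshot */

		/* pmdp_invalidate(): non-none but invalid; hardware can
		 * still set DIRTY/YOUNG until the TLB entry is flushed. */
		pmd &= ~PRESENT;
		pmd |= DIRTY;			/* simulated late hardware update */

		/* Recover flags set after the snapshot, as in the hunk above. */
		if (pmd & DIRTY)
			entry |= DIRTY;
		if (pmd & YOUNG)
			entry |= YOUNG;

		/* entry = pmd_modify(entry, newprot); set_pmd_at(...); */
		printf("entry = %#lx (dirty recovered: %s)\n",
		       entry, (entry & DIRTY) ? "yes" : "no");
		return 0;
	}

Because the entry never reads as none, a racing zap_pmd_range() either sees the old valid entry or the invalid-but-present one, and in both cases takes the ptl before deciding to skip.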