|
@@ -1746,7 +1746,39 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
|
|
|
if (prot_numa && pmd_protnone(*pmd))
|
|
|
goto unlock;
|
|
|
|
|
|
- entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
|
|
|
+ /*
|
|
|
+ * In case prot_numa, we are under down_read(mmap_sem). It's critical
|
|
|
+ * to not clear pmd intermittently to avoid race with MADV_DONTNEED
|
|
|
+ * which is also under down_read(mmap_sem):
|
|
|
+ *
|
|
|
+ * CPU0: CPU1:
|
|
|
+ * change_huge_pmd(prot_numa=1)
|
|
|
+ * pmdp_huge_get_and_clear_notify()
|
|
|
+ * madvise_dontneed()
|
|
|
+ * zap_pmd_range()
|
|
|
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
|
|
|
+ * // skip the pmd
|
|
|
+ * set_pmd_at();
|
|
|
+ * // pmd is re-established
|
|
|
+ *
|
|
|
+ * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
|
|
|
+ * which may break userspace.
|
|
|
+ *
|
|
|
+ * pmdp_invalidate() is required to make sure we don't miss
|
|
|
+ * dirty/young flags set by hardware.
|
|
|
+ */
|
|
|
+ entry = *pmd;
|
|
|
+ pmdp_invalidate(vma, addr, pmd);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Recover dirty/young flags. It relies on pmdp_invalidate to not
|
|
|
+ * corrupt them.
|
|
|
+ */
|
|
|
+ if (pmd_dirty(*pmd))
|
|
|
+ entry = pmd_mkdirty(entry);
|
|
|
+ if (pmd_young(*pmd))
|
|
|
+ entry = pmd_mkyoung(entry);
|
|
|
+
|
|
|
entry = pmd_modify(entry, newprot);
|
|
|
if (preserve_write)
|
|
|
entry = pmd_mk_savedwrite(entry);
|