@@ -757,6 +757,60 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
+{
+        if (likely(vma->vm_flags & VM_WRITE))
+                pud = pud_mkwrite(pud);
+        return pud;
+}
+
+static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+                pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        pud_t entry;
+        spinlock_t *ptl;
+
+        ptl = pud_lock(mm, pud);
+        entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+        if (pfn_t_devmap(pfn))
+                entry = pud_mkdevmap(entry);
+        if (write) {
+                entry = pud_mkyoung(pud_mkdirty(entry));
+                entry = maybe_pud_mkwrite(entry, vma);
+        }
+        set_pud_at(mm, addr, pud, entry);
+        update_mmu_cache_pud(vma, addr, pud);
+        spin_unlock(ptl);
+}
+
+int vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
+                pud_t *pud, pfn_t pfn, bool write)
+{
+        pgprot_t pgprot = vma->vm_page_prot;
+        /*
+         * If we had pud_special, we could avoid all these restrictions,
+         * but we need to be consistent with PTEs and architectures that
+         * can't support a 'special' bit.
+         */
+        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+                                (VM_PFNMAP|VM_MIXEDMAP));
+        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+        BUG_ON(!pfn_t_devmap(pfn));
+
+        if (addr < vma->vm_start || addr >= vma->vm_end)
+                return VM_FAULT_SIGBUS;
+
+        track_pfn_insert(vma, &pgprot, pfn);
+
+        insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+        return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
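
As a usage illustration only (not part of this diff): a DAX-style huge_fault
handler is the kind of caller vmf_insert_pfn_pud() is written for. The sketch
below shows the calling convention under that assumption; my_dev_huge_fault_pud()
and my_dev_phys_base are made-up names, the offset math ignores vm_pgoff, and
the VMA is assumed to have been set up as a non-COW VM_PFNMAP mapping at mmap
time.

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pfn_t.h>

static phys_addr_t my_dev_phys_base;    /* hypothetical device base address */

/* Hypothetical ->huge_fault() style handler mapping a 1GB device region. */
static int my_dev_huge_fault_pud(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long haddr = vmf->address & HPAGE_PUD_MASK;
        phys_addr_t phys;
        pfn_t pfn;

        /* The whole aligned 1GB range must fit inside the VMA. */
        if (haddr < vma->vm_start || haddr + HPAGE_PUD_SIZE > vma->vm_end)
                return VM_FAULT_FALLBACK;

        phys = my_dev_phys_base + (haddr - vma->vm_start);
        pfn = phys_to_pfn_t(phys, PFN_DEV | PFN_MAP);

        return vmf_insert_pfn_pud(vma, haddr, vmf->pud, pfn,
                                  vmf->flags & FAULT_FLAG_WRITE);
}

Returning VM_FAULT_FALLBACK lets the core fault path retry at PMD or PTE
granularity when a 1GB mapping cannot be used.
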
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
                 pmd_t *pmd)
 {
@@ -887,6 +941,123 @@ out:
         return ret;
 }
 
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+                pud_t *pud)
+{
+        pud_t _pud;
+
+        /*
+         * We should set the dirty bit only for FOLL_WRITE, but for now
+         * the dirty bit in the pud is meaningless. And if the dirty bit
+         * ever becomes meaningful and we only set it with FOLL_WRITE, an
+         * atomic set_bit will be required on the pud to set the young
+         * bit, instead of the current set_pud_at.
+         */
+        _pud = pud_mkyoung(pud_mkdirty(*pud));
+        if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
+                                pud, _pud, 1))
+                update_mmu_cache_pud(vma, addr, pud);
+}
+
+struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
+                pud_t *pud, int flags)
+{
+        unsigned long pfn = pud_pfn(*pud);
+        struct mm_struct *mm = vma->vm_mm;
+        struct dev_pagemap *pgmap;
+        struct page *page;
+
+        assert_spin_locked(pud_lockptr(mm, pud));
+
+        if (flags & FOLL_WRITE && !pud_write(*pud))
+                return NULL;
+
+        if (pud_present(*pud) && pud_devmap(*pud))
+                /* pass */;
+        else
+                return NULL;
+
+        if (flags & FOLL_TOUCH)
+                touch_pud(vma, addr, pud);
+
+        /*
+         * Device mapped pages can only be returned if the
+         * caller will manage the page reference count.
+         */
+        if (!(flags & FOLL_GET))
+                return ERR_PTR(-EEXIST);
+
+        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+        pgmap = get_dev_pagemap(pfn, NULL);
+        if (!pgmap)
+                return ERR_PTR(-EFAULT);
+        page = pfn_to_page(pfn);
+        get_page(page);
+        put_dev_pagemap(pgmap);
+
+        return page;
+}
+
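
An illustrative sketch of the caller side of the FOLL_GET contract above (not
part of this diff): a get_user_pages()-style walker takes the pud lock, asks
for a reference, and gets back a page it must eventually put_page().
walk_devmap_pud() is a made-up helper name.

#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/err.h>

/* Hypothetical helper: look up one page of a device PUD mapping. */
static struct page *walk_devmap_pud(struct vm_area_struct *vma,
                unsigned long addr, pud_t *pud, int flags)
{
        spinlock_t *ptl;
        struct page *page;

        ptl = pud_lock(vma->vm_mm, pud);
        /*
         * follow_devmap_pud() asserts that the pud lock is held; with
         * FOLL_GET it returns the page with an extra reference taken,
         * without FOLL_GET it returns ERR_PTR(-EEXIST).
         */
        page = follow_devmap_pud(vma, addr, pud, flags | FOLL_GET);
        spin_unlock(ptl);

        return IS_ERR_OR_NULL(page) ? NULL : page;
}
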
+int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+                  struct vm_area_struct *vma)
+{
+        spinlock_t *dst_ptl, *src_ptl;
+        pud_t pud;
+        int ret;
+
+        dst_ptl = pud_lock(dst_mm, dst_pud);
+        src_ptl = pud_lockptr(src_mm, src_pud);
+        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+        ret = -EAGAIN;
+        pud = *src_pud;
+        if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
+                goto out_unlock;
+
+        /*
+         * When the page table lock is held, the huge zero pud should not
+         * be under splitting, since we don't split the page itself, only
+         * the pud into a page table.
+         */
+        if (is_huge_zero_pud(pud)) {
+                /* No huge zero pud yet */
+        }
+
+        pudp_set_wrprotect(src_mm, addr, src_pud);
+        pud = pud_mkold(pud_wrprotect(pud));
+        set_pud_at(dst_mm, addr, dst_pud, pud);
+
+        ret = 0;
+out_unlock:
+        spin_unlock(src_ptl);
+        spin_unlock(dst_ptl);
+        return ret;
+}
+
+void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
+{
+        pud_t entry;
+        unsigned long haddr;
+        bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+        vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
+        if (unlikely(!pud_same(*vmf->pud, orig_pud)))
+                goto unlock;
+
+        entry = pud_mkyoung(orig_pud);
+        if (write)
+                entry = pud_mkdirty(entry);
+        haddr = vmf->address & HPAGE_PUD_MASK;
+        if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
+                update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
+
+unlock:
+        spin_unlock(vmf->ptl);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
 void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 {
         pmd_t entry;
@@ -1601,6 +1772,84 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
         return NULL;
 }
 
+/*
+ * Returns the page table lock pointer if a given pud maps a thp, NULL
+ * otherwise.  Note that if it returns the lock pointer, this routine
+ * returns without unlocking the page table lock, so callers must
+ * unlock it.
+ */
+spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
+{
+        spinlock_t *ptl;
+
+        ptl = pud_lock(vma->vm_mm, pud);
+        if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
+                return ptl;
+        spin_unlock(ptl);
+        return NULL;
+}
+
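
The locking contract described in the comment above, sketched as a caller for
illustration (this mirrors how __pmd_trans_huge_lock() is used elsewhere in
mm/; example_pud_op() is a made-up name):

#include <linux/huge_mm.h>

/* Hypothetical caller: act on a huge pud if one is mapped at this entry. */
static int example_pud_op(struct vm_area_struct *vma, pud_t *pud,
                unsigned long addr)
{
        spinlock_t *ptl;

        ptl = __pud_trans_huge_lock(pud, vma);
        if (!ptl)
                return 0;       /* not huge: fall back to lower levels */

        /* ... operate on *pud while the page table lock is held ... */

        spin_unlock(ptl);       /* the helper left the lock held for us */
        return 1;
}
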
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                 pud_t *pud, unsigned long addr)
+{
+        pud_t orig_pud;
+        spinlock_t *ptl;
+
+        ptl = __pud_trans_huge_lock(pud, vma);
+        if (!ptl)
+                return 0;
+        /*
+         * For architectures like ppc64 we look at deposited pgtable
+         * when calling pudp_huge_get_and_clear. So do the
+         * pgtable_trans_huge_withdraw after finishing pudp related
+         * operations.
+         */
+        orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
+                        tlb->fullmm);
+        tlb_remove_pud_tlb_entry(tlb, pud, addr);
+        if (vma_is_dax(vma)) {
+                spin_unlock(ptl);
+                /* No zero page support yet */
+        } else {
+                /* No support for anonymous PUD pages yet */
+                BUG();
+        }
+        return 1;
+}
+
+static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
+                unsigned long haddr)
+{
+        VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
+        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
+        VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
+
+        count_vm_event(THP_SPLIT_PMD);
+
+        pudp_huge_clear_flush_notify(vma, haddr, pud);
+}
+
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+                unsigned long address)
+{
+        spinlock_t *ptl;
+        struct mm_struct *mm = vma->vm_mm;
+        unsigned long haddr = address & HPAGE_PUD_MASK;
+
+        mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
+        ptl = pud_lock(mm, pud);
+        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
+                goto out;
+        __split_huge_pud_locked(vma, pud, haddr);
+
+out:
+        spin_unlock(ptl);
+        mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+
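
These hunks only show the mm/huge_memory.c side; one would expect the header
to pair __split_huge_pud() with a split_huge_pud() wrapper in the style of
split_huge_pmd(). A sketch of what such a wrapper could look like (an
assumption, not taken from these hunks):

#define split_huge_pud(__vma, __pud, __address)                         \
        do {                                                            \
                pud_t *____pud = (__pud);                               \
                if (pud_trans_huge(*____pud) ||                         \
                                        pud_devmap(*____pud))           \
                        __split_huge_pud(__vma, ____pud, __address);    \
        } while (0)

Checking pud_trans_huge()/pud_devmap() before calling keeps the common
non-huge path cheap; __split_huge_pud() re-checks both under pud_lock(), as
seen above.
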
 static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
                 unsigned long haddr, pmd_t *pmd)
 {