@@ -2739,8 +2739,6 @@ static int do_anonymous_page(struct fault_env *fe)
 	struct page *page;
 	pte_t entry;
 
-	pte_unmap(fe->pte);
-
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
 		return VM_FAULT_SIGBUS;
@@ -2749,6 +2747,23 @@ static int do_anonymous_page(struct fault_env *fe)
 	if (check_stack_guard_page(vma, fe->address) < 0)
 		return VM_FAULT_SIGSEGV;
 
+	/*
+	 * Use pte_alloc() instead of pte_alloc_map(). We can't run
+	 * pte_offset_map() on pmds where a huge pmd might be created
+	 * from a different thread.
+	 *
+	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+	 * parallel threads are excluded by other means.
+	 *
+	 * Here we only have down_read(mmap_sem).
+	 */
+	if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+		return VM_FAULT_OOM;
+
+	/* See the comment in pte_alloc_one_map() */
+	if (unlikely(pmd_trans_unstable(fe->pmd)))
+		return 0;
+
 	/* Use the zero-page for reads */
 	if (!(fe->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm)) {
@@ -2865,23 +2880,76 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
 	return ret;
 }
 
+static int pte_alloc_one_map(struct fault_env *fe)
+{
+	struct vm_area_struct *vma = fe->vma;
+
+	if (!pmd_none(*fe->pmd))
+		goto map_pte;
+	if (fe->prealloc_pte) {
+		fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+		if (unlikely(!pmd_none(*fe->pmd))) {
+			spin_unlock(fe->ptl);
+			goto map_pte;
+		}
+
+		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+		spin_unlock(fe->ptl);
+		fe->prealloc_pte = 0;
+	} else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+		return VM_FAULT_OOM;
+	}
+map_pte:
+	/*
+	 * If a huge pmd materialized under us just retry later. Use
+	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
+	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
+	 * in a different thread of this mm, in turn leading to a misleading
+	 * pmd_trans_huge() retval. All we have to ensure is that it is a
+	 * regular pmd that we can walk with pte_offset_map() and we can do that
+	 * through an atomic read in C, which is what pmd_trans_unstable()
+	 * provides.
+	 */
+	if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+		return VM_FAULT_NOPAGE;
+
+	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+			&fe->ptl);
+	return 0;
+}
+
 /**
- * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ * alloc_set_pte - setup new PTE entry for given page and add reverse page
+ * mapping. If needed, the function allocates a page table or uses the pre-allocated one.
  *
  * @fe: fault environment
+ * @memcg: memcg to charge page (only for private mappings)
  * @page: page to map
  *
- * Caller must hold page table lock relevant for @fe->pte.
+ * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
  *
  * Target users are page handler itself and implementations of
  * vm_ops->map_pages.
  */
-void do_set_pte(struct fault_env *fe, struct page *page)
+int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+		struct page *page)
 {
 	struct vm_area_struct *vma = fe->vma;
 	bool write = fe->flags & FAULT_FLAG_WRITE;
 	pte_t entry;
 
+	if (!fe->pte) {
+		int ret = pte_alloc_one_map(fe);
+		if (ret)
+			return ret;
+	}
+
+	/* Re-check under ptl */
+	if (unlikely(!pte_none(*fe->pte)))
+		return VM_FAULT_NOPAGE;
+
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
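
The kerneldoc above states the new locking contract: alloc_set_pte() may map and lock the page table itself, and the caller unlocks only if fe->pte is non-NULL on return. As a rough illustration of that contract (not part of this patch; finish_one_page() is a hypothetical name, but the calls it makes are the ones introduced here), a caller is expected to look like this:

/*
 * Hypothetical caller, for illustration only: shows the unlock rule from the
 * alloc_set_pte() comment. The fault handlers converted later in this patch
 * (do_read_fault() and friends) follow the same pattern.
 */
static int finish_one_page(struct fault_env *fe, struct page *page)
{
	int ret;

	/* May allocate the page table and take fe->ptl on our behalf. */
	ret = alloc_set_pte(fe, NULL, page);
	if (fe->pte)
		pte_unmap_unlock(fe->pte, fe->ptl);
	/* VM_FAULT_NOPAGE here means somebody else already solved the fault. */
	return ret;
}
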
@@ -2890,6 +2958,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, fe->address, false);
+		mem_cgroup_commit_charge(page, memcg, false, false);
+		lru_cache_add_active_or_unevictable(page, vma);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page);
@@ -2898,6 +2968,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 
 	/* no need to invalidate: a not-present page won't be cached */
 	update_mmu_cache(vma, fe->address, fe->pte);
+
+	return 0;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
@@ -2964,19 +3036,17 @@ late_initcall(fault_around_debugfs);
  * fault_around_pages() value (and therefore to page order). This way it's
  * easier to guarantee that we don't cross page table boundaries.
  */
-static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 {
-	unsigned long address = fe->address, start_addr, nr_pages, mask;
-	pte_t *pte = fe->pte;
+	unsigned long address = fe->address, nr_pages, mask;
 	pgoff_t end_pgoff;
-	int off;
+	int off, ret = 0;
 
 	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
 	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
-	start_addr = max(fe->address & mask, fe->vma->vm_start);
-	off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-	fe->pte -= off;
+	fe->address = max(address & mask, fe->vma->vm_start);
+	off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 	start_pgoff -= off;
 
 	/*
@@ -2984,30 +3054,45 @@ static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 	 * or fault_around_pages() from start_pgoff, depending what is nearest.
 	 */
 	end_pgoff = start_pgoff -
-		((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
 		PTRS_PER_PTE - 1;
 	end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
 			start_pgoff + nr_pages - 1);
 
-	/* Check if it makes any sense to call ->map_pages */
-	fe->address = start_addr;
-	while (!pte_none(*fe->pte)) {
-		if (++start_pgoff > end_pgoff)
-			goto out;
-		fe->address += PAGE_SIZE;
-		if (fe->address >= fe->vma->vm_end)
-			goto out;
-		fe->pte++;
+	if (pmd_none(*fe->pmd)) {
+		fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
+		smp_wmb(); /* See comment in __pte_alloc() */
 	}
 
 	fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+
+	/* preallocated pagetable is unused: free it */
+	if (fe->prealloc_pte) {
+		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+		fe->prealloc_pte = 0;
+	}
+	/* Huge page is mapped? Page fault is solved */
+	if (pmd_trans_huge(*fe->pmd)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	/* ->map_pages() hasn't done anything useful. Cold page cache? */
+	if (!fe->pte)
+		goto out;
+
+	/* check if the page fault is solved */
+	fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+	if (!pte_none(*fe->pte))
+		ret = VM_FAULT_NOPAGE;
+	pte_unmap_unlock(fe->pte, fe->ptl);
 out:
-	/* restore fault_env */
-	fe->pte = pte;
 	fe->address = address;
+	fe->pte = NULL;
+	return ret;
 }
 
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page;
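
To make the fault-around window arithmetic above concrete, here is a minimal userspace model of it. It assumes 4K pages, 512 PTEs per page table and the default fault_around_bytes of 65536; the addresses and vma geometry are made-up example values, and nothing here is kernel API, the variable names merely mirror the patch.

/* Minimal userspace model of the do_fault_around() window arithmetic. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PTRS_PER_PTE	512UL

int main(void)
{
	unsigned long fault_around_bytes = 65536;	/* default: 16 pages */
	unsigned long address = 0x7f1234567000UL;	/* faulting address */
	unsigned long vm_start = 0x7f1234500000UL;	/* vma->vm_start */
	unsigned long vm_pgoff = 0;			/* vma->vm_pgoff */
	unsigned long vma_pages = 4096;			/* pages in the vma */
	unsigned long start_pgoff = vm_pgoff + ((address - vm_start) >> PAGE_SHIFT);

	unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
	unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	/* fe->address: rounded down to the window, clamped to the vma start */
	unsigned long fa_address = address & mask;
	if (fa_address < vm_start)
		fa_address = vm_start;
	unsigned long off = ((address - fa_address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/* end at the page-table boundary, the vma end or nr_pages, whichever is nearest */
	unsigned long end_pgoff = start_pgoff -
		((fa_address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	unsigned long vma_last = vma_pages + vm_pgoff - 1;
	unsigned long by_count = start_pgoff + nr_pages - 1;
	if (end_pgoff > vma_last)
		end_pgoff = vma_last;
	if (end_pgoff > by_count)
		end_pgoff = by_count;

	printf("map_pages window: address %#lx, pgoff %lu..%lu (%lu pages)\n",
	       fa_address, start_pgoff, end_pgoff, end_pgoff - start_pgoff + 1);
	return 0;
}

Compiled and run, this prints a 16-page window (pgoff 96..111) starting at 0x7f1234560000: the window is clamped both to one page table and to fault_around_bytes.
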
@@ -3019,36 +3104,25 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 	 * something).
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-				&fe->ptl);
-		if (!pte_same(*fe->pte, orig_pte))
-			goto unlock_out;
-		do_fault_around(fe, pgoff);
-		/* Check if the fault is handled by faultaround */
-		if (!pte_same(*fe->pte, orig_pte))
-			goto unlock_out;
-		pte_unmap_unlock(fe->pte, fe->ptl);
+		ret = do_fault_around(fe, pgoff);
+		if (ret)
+			return ret;
 	}
 
 	ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, NULL, fault_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
-		unlock_page(fault_page);
-		put_page(fault_page);
-		return ret;
-	}
-	do_set_pte(fe, fault_page);
 	unlock_page(fault_page);
-unlock_out:
-	pte_unmap_unlock(fe->pte, fe->ptl);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		put_page(fault_page);
 	return ret;
 }
 
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page, *new_page;
@@ -3077,29 +3151,17 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 	copy_user_highpage(new_page, fault_page, fe->address, vma);
 	__SetPageUptodate(new_page);
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, memcg, new_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
-		if (!(ret & VM_FAULT_DAX_LOCKED)) {
-			unlock_page(fault_page);
-			put_page(fault_page);
-		} else {
-			dax_unlock_mapping_entry(vma->vm_file->f_mapping,
-					pgoff);
-		}
-		goto uncharge_out;
-	}
-	do_set_pte(fe, new_page);
-	mem_cgroup_commit_charge(new_page, memcg, false, false);
-	lru_cache_add_active_or_unevictable(new_page, vma);
-	pte_unmap_unlock(fe->pte, fe->ptl);
 	if (!(ret & VM_FAULT_DAX_LOCKED)) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 	} else {
 		dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
 	}
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		goto uncharge_out;
 	return ret;
 uncharge_out:
 	mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -3107,7 +3169,7 @@ uncharge_out:
 	return ret;
 }
 
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page;
@@ -3133,16 +3195,15 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 		}
 	}
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, NULL, fault_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+					VM_FAULT_RETRY))) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(fe, fault_page);
-	pte_unmap_unlock(fe->pte, fe->ptl);
 
 	if (set_page_dirty(fault_page))
 		dirtied = 1;
@@ -3174,20 +3235,19 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
  * The mmap_sem may have been released depending on flags and our
  * return value. See filemap_fault() and __lock_page_or_retry().
  */
-static int do_fault(struct fault_env *fe, pte_t orig_pte)
+static int do_fault(struct fault_env *fe)
 {
 	struct vm_area_struct *vma = fe->vma;
 	pgoff_t pgoff = linear_page_index(vma, fe->address);
 
-	pte_unmap(fe->pte);
 	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
 	if (!vma->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 	if (!(fe->flags & FAULT_FLAG_WRITE))
-		return do_read_fault(fe, pgoff, orig_pte);
+		return do_read_fault(fe, pgoff);
 	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(fe, pgoff, orig_pte);
-	return do_shared_fault(fe, pgoff, orig_pte);
+		return do_cow_fault(fe, pgoff);
+	return do_shared_fault(fe, pgoff);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3327,37 +3387,63 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * concurrent faults).
  *
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * The mmap_sem may have been released depending on flags and our return value.
+ * See filemap_fault() and __lock_page_or_retry().
 */
 static int handle_pte_fault(struct fault_env *fe)
 {
 	pte_t entry;
 
-	/*
-	 * some architectures can have larger ptes than wordsize,
-	 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
-	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
-	 * The code below just needs a consistent view for the ifs and
-	 * we later double check anyway with the ptl lock held. So here
-	 * a barrier will do.
-	 */
-	entry = *fe->pte;
-	barrier();
-	if (!pte_present(entry)) {
+	if (unlikely(pmd_none(*fe->pmd))) {
+		/*
+		 * Leave __pte_alloc() until later: because vm_ops->fault may
+		 * want to allocate huge page, and if we expose page table
+		 * for an instant, it will be difficult to retract from
+		 * concurrent faults and from rmap lookups.
+		 */
+		fe->pte = NULL;
+	} else {
+		/* See comment in pte_alloc_one_map() */
+		if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+			return 0;
+		/*
+		 * A regular pmd is established and it can't morph into a huge
+		 * pmd from under us anymore at this point because we hold the
+		 * mmap_sem read mode and khugepaged takes it in write mode.
+		 * So now it's safe to run pte_offset_map().
+		 */
+		fe->pte = pte_offset_map(fe->pmd, fe->address);
+
+		entry = *fe->pte;
+
+		/*
+		 * some architectures can have larger ptes than wordsize,
+		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+		 * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
+		 * atomic accesses. The code below just needs a consistent
+		 * view for the ifs and we later double check anyway with the
+		 * ptl lock held. So here a barrier will do.
+		 */
+		barrier();
 		if (pte_none(entry)) {
-			if (vma_is_anonymous(fe->vma))
-				return do_anonymous_page(fe);
-			else
-				return do_fault(fe, entry);
+			pte_unmap(fe->pte);
+			fe->pte = NULL;
 		}
-		return do_swap_page(fe, entry);
 	}
 
+	if (!fe->pte) {
+		if (vma_is_anonymous(fe->vma))
+			return do_anonymous_page(fe);
+		else
+			return do_fault(fe);
+	}
+
+	if (!pte_present(entry))
+		return do_swap_page(fe, entry);
+
 	if (pte_protnone(entry))
 		return do_numa_page(fe, entry);
 
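
The hunk above turns "fe->pte == NULL" into the signal for "no usable page table yet". As a compact, clearly hypothetical summary of the new dispatch order, here is a toy userspace model; the enum and decide() are invented for illustration, and only the ordering of the checks reflects the kernel code above.

/* Toy model of the handle_pte_fault() dispatch order after this patch. */
#include <stdio.h>
#include <stdbool.h>

enum fault_state { PMD_NONE, PMD_UNSTABLE, PTE_NONE, PTE_NOT_PRESENT, PTE_PROTNONE };

static const char *decide(enum fault_state s, bool anon)
{
	switch (s) {
	case PMD_UNSTABLE:	/* huge pmd raced with us: bail out, retry */
		return "return 0";
	case PMD_NONE:		/* no page table yet: fe->pte left NULL */
	case PTE_NONE:		/* empty pte: unmapped, fe->pte set to NULL */
		return anon ? "do_anonymous_page()" : "do_fault()";
	case PTE_NOT_PRESENT:
		return "do_swap_page()";
	case PTE_PROTNONE:
		return "do_numa_page()";
	}
	return "pte-present tail (unchanged by this patch)";
}

int main(void)
{
	printf("anon vma, pmd_none      -> %s\n", decide(PMD_NONE, true));
	printf("file vma, pte_none      -> %s\n", decide(PTE_NONE, false));
	printf("any vma,  swap entry    -> %s\n", decide(PTE_NOT_PRESENT, true));
	printf("any vma,  prot_none pte -> %s\n", decide(PTE_PROTNONE, false));
	return 0;
}
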
@@ -3439,34 +3525,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		}
 	}
 
-	/*
-	 * Use pte_alloc() instead of pte_alloc_map, because we can't
-	 * run pte_offset_map on the pmd, if an huge pmd could
-	 * materialize from under us from a different thread.
-	 */
-	if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address)))
-		return VM_FAULT_OOM;
-	/*
-	 * If a huge pmd materialized under us just retry later. Use
-	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
-	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
-	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
-	 * in a different thread of this mm, in turn leading to a misleading
-	 * pmd_trans_huge() retval. All we have to ensure is that it is a
-	 * regular pmd that we can walk with pte_offset_map() and we can do that
-	 * through an atomic read in C, which is what pmd_trans_unstable()
-	 * provides.
-	 */
-	if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd)))
-		return 0;
-	/*
-	 * A regular pmd is established and it can't morph into a huge pmd
-	 * from under us anymore at this point because we hold the mmap_sem
-	 * read mode and khugepaged takes it in write mode. So now it's
-	 * safe to run pte_offset_map().
-	 */
-	fe.pte = pte_offset_map(fe.pmd, fe.address);
-
 	return handle_pte_fault(&fe);
 }
 