@@ -14,6 +14,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/swapops.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -42,7 +43,8 @@ enum scan_result {
 	SCAN_DEL_PAGE_LRU,
 	SCAN_ALLOC_HUGE_PAGE_FAIL,
 	SCAN_CGROUP_CHARGE_FAIL,
-	SCAN_EXCEED_SWAP_PTE
+	SCAN_EXCEED_SWAP_PTE,
+	SCAN_TRUNCATED,
 };
 
 #define CREATE_TRACE_POINTS
@@ -294,7 +296,7 @@ struct attribute_group khugepaged_attr_group = {
 	.name = "khugepaged",
 };
 
-#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
+#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
 
 int hugepage_madvise(struct vm_area_struct *vma,
 		     unsigned long *vm_flags, int advice)
@@ -816,6 +818,10 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
 	if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
 	    (vma->vm_flags & VM_NOHUGEPAGE))
 		return false;
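+	/*
+	 * shmem ranges are only collapsible when file offset and virtual
+	 * address are congruent modulo HPAGE_PMD_NR, i.e. when the range
+	 * can actually be mapped by a single PMD.
+	 */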
+	if (shmem_file(vma->vm_file)) {
+		return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+				HPAGE_PMD_NR);
+	}
 	if (!vma->anon_vma || vma->vm_ops)
 		return false;
 	if (is_vma_temporary_stack(vma))
@@ -1216,6 +1222,412 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 	}
 }
 
+#ifdef CONFIG_SHMEM
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+{
+	struct vm_area_struct *vma;
+	unsigned long addr;
+	pmd_t *pmd, _pmd;
+
+	i_mmap_lock_write(mapping);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+		/* probably overkill */
+		if (vma->anon_vma)
+			continue;
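+		/*
+		 * The huge page must map at a PMD-aligned address and lie
+		 * entirely inside this VMA, otherwise there is no page
+		 * table to retract here.
+		 */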
+		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		if (addr & ~HPAGE_PMD_MASK)
+			continue;
+		if (vma->vm_end < addr + HPAGE_PMD_SIZE)
+			continue;
+		pmd = mm_find_pmd(vma->vm_mm, addr);
+		if (!pmd)
+			continue;
+		/*
+		 * We need exclusive mmap_sem to retract the page table.
+		 * If trylock fails we would end up with a pte-mapped THP
+		 * after re-fault. Not ideal, but it's more important not to
+		 * disturb the system too much.
+		 */
+		if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
+			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
+			/* assume page table is clear */
+			_pmd = pmdp_collapse_flush(vma, addr, pmd);
+			spin_unlock(ptl);
+			up_write(&vma->vm_mm->mmap_sem);
+			atomic_long_dec(&vma->vm_mm->nr_ptes);
+			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+		}
+	}
+	i_mmap_unlock_write(mapping);
+}
+
+/**
+ * collapse_shmem - collapse small tmpfs/shmem pages into a huge one.
+ *
+ * Basic scheme is simple, details are more complex:
+ *  - allocate and freeze a new huge page;
+ *  - scan over the radix tree, replacing old pages with the new one
+ *    + swap in pages if necessary;
+ *    + fill in gaps;
+ *    + keep old pages around in case rollback is required;
+ *  - if replacing succeeds:
+ *    + copy data over;
+ *    + free old pages;
+ *    + unfreeze huge page;
+ *  - if replacing failed:
+ *    + put all pages back and unfreeze them;
+ *    + restore gaps in the radix tree;
+ *    + free huge page;
+ */
+static void collapse_shmem(struct mm_struct *mm,
+		struct address_space *mapping, pgoff_t start,
+		struct page **hpage, int node)
+{
+	gfp_t gfp;
+	struct page *page, *new_page, *tmp;
+	struct mem_cgroup *memcg;
+	pgoff_t index, end = start + HPAGE_PMD_NR;
+	LIST_HEAD(pagelist);
+	struct radix_tree_iter iter;
+	void **slot;
+	int nr_none = 0, result = SCAN_SUCCEED;
+
+	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
+
+	/* Only allocate from the target node */
+	gfp = alloc_hugepage_khugepaged_gfpmask() |
+		__GFP_OTHER_NODE | __GFP_THISNODE;
+
+	new_page = khugepaged_alloc_page(hpage, gfp, node);
+	if (!new_page) {
+		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+		goto out;
+	}
+
+	if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+		result = SCAN_CGROUP_CHARGE_FAIL;
+		goto out;
+	}
+
+	new_page->index = start;
+	new_page->mapping = mapping;
+	__SetPageSwapBacked(new_page);
+	__SetPageLocked(new_page);
+	BUG_ON(!page_ref_freeze(new_page, 1));
+
+	/*
+	 * At this point the new_page is 'frozen' (page_count() is zero),
+	 * locked and not up-to-date. It's safe to insert it into the radix
+	 * tree, because nobody can map it or use it in any other way until
+	 * we unfreeze it.
+	 */
+
+	index = start;
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		int n = min(iter.index, end) - index;
+
+		/*
+		 * Handle holes in the radix tree: charge them to shmem and
+		 * insert the relevant subpage of new_page into the radix
+		 * tree.
+		 */
+		if (n && !shmem_charge(mapping->host, n)) {
+			result = SCAN_FAIL;
+			break;
+		}
+		nr_none += n;
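+		/* Point the hole entries at the matching subpages of new_page. */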
+		for (; index < min(iter.index, end); index++) {
+			radix_tree_insert(&mapping->page_tree, index,
+					new_page + (index % HPAGE_PMD_NR));
+		}
+
+		/* We are done. */
+		if (index >= end)
+			break;
+
+		page = radix_tree_deref_slot_protected(slot,
+				&mapping->tree_lock);
+		if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
+			spin_unlock_irq(&mapping->tree_lock);
+			/* swap in or instantiate fallocated page */
+			if (shmem_getpage(mapping->host, index, &page,
+						SGP_NOHUGE)) {
+				result = SCAN_FAIL;
+				goto tree_unlocked;
+			}
+			spin_lock_irq(&mapping->tree_lock);
+		} else if (trylock_page(page)) {
+			get_page(page);
+		} else {
+			result = SCAN_PAGE_LOCK;
+			break;
+		}
+
+		/*
+		 * The page must be locked, so we can drop the tree_lock
+		 * without racing with truncate.
+		 */
+		VM_BUG_ON_PAGE(!PageLocked(page), page);
+		VM_BUG_ON_PAGE(!PageUptodate(page), page);
+		VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+		if (page_mapping(page) != mapping) {
+			result = SCAN_TRUNCATED;
+			goto out_unlock;
+		}
+		spin_unlock_irq(&mapping->tree_lock);
+
+		if (isolate_lru_page(page)) {
+			result = SCAN_DEL_PAGE_LRU;
+			goto out_isolate_failed;
+		}
+
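+		/*
+		 * Drop any remaining user mappings of this subpage so its
+		 * refcount can be frozen below.
+		 */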
+		if (page_mapped(page))
+			unmap_mapping_range(mapping, index << PAGE_SHIFT,
+					PAGE_SIZE, 0);
+
+		spin_lock_irq(&mapping->tree_lock);
+
+		VM_BUG_ON_PAGE(page_mapped(page), page);
+
+		/*
+		 * The page is expected to have page_count() == 3:
+		 *  - we hold a pin on it;
+		 *  - one reference from radix tree;
+		 *  - one from isolate_lru_page;
+		 */
+		if (!page_ref_freeze(page, 3)) {
+			result = SCAN_PAGE_COUNT;
+			goto out_lru;
+		}
+
+		/*
+		 * Add the page to the list to be able to undo the collapse
+		 * if something goes wrong.
+		 */
+		list_add_tail(&page->lru, &pagelist);
+
+		/* Finally, replace with the new page. */
+		radix_tree_replace_slot(slot,
+				new_page + (index % HPAGE_PMD_NR));
+
+		index++;
+		continue;
+out_lru:
+		spin_unlock_irq(&mapping->tree_lock);
+		putback_lru_page(page);
+out_isolate_failed:
+		unlock_page(page);
+		put_page(page);
+		goto tree_unlocked;
+out_unlock:
+		unlock_page(page);
+		put_page(page);
+		break;
+	}
+
+	/*
+	 * Handle the hole in the radix tree at the end of the range.
+	 * This code only triggers if there's nothing in the radix tree
+	 * beyond 'end'.
+	 */
+	if (result == SCAN_SUCCEED && index < end) {
+		int n = end - index;
+
+		if (!shmem_charge(mapping->host, n)) {
+			result = SCAN_FAIL;
+			goto tree_locked;
+		}
+
+		for (; index < end; index++) {
+			radix_tree_insert(&mapping->page_tree, index,
+					new_page + (index % HPAGE_PMD_NR));
+		}
+		nr_none += n;
+	}
+
+tree_locked:
+	spin_unlock_irq(&mapping->tree_lock);
+tree_unlocked:
+
+	if (result == SCAN_SUCCEED) {
+		unsigned long flags;
+		struct zone *zone = page_zone(new_page);
+
+		/*
+		 * Replacing the old pages with the new one has succeeded;
+		 * now copy the content over and free the old pages.
+		 */
+		list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+			copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
+					page);
+			list_del(&page->lru);
+			unlock_page(page);
+			page_ref_unfreeze(page, 1);
+			page->mapping = NULL;
+			ClearPageActive(page);
+			ClearPageUnevictable(page);
+			put_page(page);
+		}
+
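+		/*
+		 * new_page now also stands in for the nr_none holes that
+		 * were filled: account it as a shmem THP and bump the
+		 * file/shmem counters for pages that did not exist before.
+		 */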
+		local_irq_save(flags);
+		__inc_zone_page_state(new_page, NR_SHMEM_THPS);
+		if (nr_none) {
+			__mod_zone_page_state(zone, NR_FILE_PAGES, nr_none);
+			__mod_zone_page_state(zone, NR_SHMEM, nr_none);
+		}
+		local_irq_restore(flags);
+
+		/*
+		 * Remove the pte page tables, so we can re-fault
+		 * the page as huge.
+		 */
+		retract_page_tables(mapping, start);
+
+		/* Everything is ready, let's unfreeze the new_page */
+		set_page_dirty(new_page);
+		SetPageUptodate(new_page);
+		page_ref_unfreeze(new_page, HPAGE_PMD_NR);
+		mem_cgroup_commit_charge(new_page, memcg, false, true);
+		lru_cache_add_anon(new_page);
+		unlock_page(new_page);
+
+		*hpage = NULL;
+	} else {
+		/* Something went wrong: roll back changes to the radix tree */
+		shmem_uncharge(mapping->host, nr_none);
+		spin_lock_irq(&mapping->tree_lock);
+		radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
+				start) {
+			if (iter.index >= end)
+				break;
+			page = list_first_entry_or_null(&pagelist,
+					struct page, lru);
+			if (!page || iter.index < page->index) {
+				if (!nr_none)
+					break;
+				/* Put holes back where they were */
+				radix_tree_replace_slot(slot, NULL);
+				nr_none--;
+				continue;
+			}
+
+			VM_BUG_ON_PAGE(page->index != iter.index, page);
+
+			/* Unfreeze the page. */
+			list_del(&page->lru);
+			page_ref_unfreeze(page, 2);
+			radix_tree_replace_slot(slot, page);
+			spin_unlock_irq(&mapping->tree_lock);
+			putback_lru_page(page);
+			unlock_page(page);
+			spin_lock_irq(&mapping->tree_lock);
+		}
+		VM_BUG_ON(nr_none);
+		spin_unlock_irq(&mapping->tree_lock);
+
+		/* Unfreeze new_page; the caller will take care of freeing it */
+		page_ref_unfreeze(new_page, 1);
+		mem_cgroup_cancel_charge(new_page, memcg, true);
+		unlock_page(new_page);
+		new_page->mapping = NULL;
+	}
+out:
+	VM_BUG_ON(!list_empty(&pagelist));
+	/* TODO: tracepoints */
+}
+
+static void khugepaged_scan_shmem(struct mm_struct *mm,
+		struct address_space *mapping,
+		pgoff_t start, struct page **hpage)
+{
+	struct page *page = NULL;
+	struct radix_tree_iter iter;
+	void **slot;
+	int present, swap;
+	int node = NUMA_NO_NODE;
+	int result = SCAN_SUCCEED;
+
+	present = 0;
+	swap = 0;
+	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+	rcu_read_lock();
+	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+		if (iter.index >= start + HPAGE_PMD_NR)
+			break;
+
+		page = radix_tree_deref_slot(slot);
+		if (radix_tree_deref_retry(page)) {
+			slot = radix_tree_iter_retry(&iter);
+			continue;
+		}
+
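+		/*
+		 * Exceptional entries in a shmem mapping are swapped-out
+		 * pages: count them against khugepaged_max_ptes_swap, like
+		 * swap ptes in the anonymous scan.
+		 */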
+		if (radix_tree_exception(page)) {
+			if (++swap > khugepaged_max_ptes_swap) {
+				result = SCAN_EXCEED_SWAP_PTE;
+				break;
+			}
+			continue;
+		}
+
+		if (PageTransCompound(page)) {
+			result = SCAN_PAGE_COMPOUND;
+			break;
+		}
+
+		node = page_to_nid(page);
+		if (khugepaged_scan_abort(node)) {
+			result = SCAN_SCAN_ABORT;
+			break;
+		}
+		khugepaged_node_load[node]++;
+
+		if (!PageLRU(page)) {
+			result = SCAN_PAGE_LRU;
+			break;
+		}
+
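+		/*
+		 * Expect only the radix-tree reference plus one per mapping;
+		 * any extra pin would make the freeze in collapse_shmem()
+		 * fail anyway.
+		 */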
+		if (page_count(page) != 1 + page_mapcount(page)) {
+			result = SCAN_PAGE_COUNT;
+			break;
+		}
+
+		/*
+		 * We probably should check if the page is referenced here, but
+		 * nobody would transfer pte_young() to PageReferenced() for us.
+		 * And rmap walk here is just too costly...
+		 */
+
+		present++;
+
+		if (need_resched()) {
+			cond_resched_rcu();
+			slot = radix_tree_iter_next(&iter);
+		}
+	}
+	rcu_read_unlock();
+
+	if (result == SCAN_SUCCEED) {
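+		/*
+		 * Holes count as "none" ptes: require at least
+		 * HPAGE_PMD_NR - khugepaged_max_ptes_none present pages,
+		 * mirroring the limit used for anonymous collapse.
+		 */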
+		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+			result = SCAN_EXCEED_NONE_PTE;
+		} else {
+			node = khugepaged_find_target_node();
+			collapse_shmem(mm, mapping, start, hpage, node);
+		}
+	}
+
+	/* TODO: tracepoints */
+}
+#else
+static void khugepaged_scan_shmem(struct mm_struct *mm,
+		struct address_space *mapping,
+		pgoff_t start, struct page **hpage)
+{
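+	/*
+	 * Without CONFIG_SHMEM the shmem_file() checks at the call site
+	 * are compile-time false, so this call should be eliminated as
+	 * dead code; BUILD_BUG() catches any build where it is not.
+	 */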
+	BUILD_BUG();
+}
+#endif
+
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 					    struct page **hpage)
 	__releases(&khugepaged_mm_lock)
@@ -1269,6 +1681,8 @@ skip:
 		if (khugepaged_scan.address < hstart)
 			khugepaged_scan.address = hstart;
 		VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+		if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
+			goto skip;
 
 		while (khugepaged_scan.address < hend) {
 			int ret;
@@ -1279,9 +1693,20 @@ skip:
 			VM_BUG_ON(khugepaged_scan.address < hstart ||
 				  khugepaged_scan.address + HPAGE_PMD_SIZE >
 				  hend);
-			ret = khugepaged_scan_pmd(mm, vma,
-					khugepaged_scan.address,
-					hpage);
+			if (shmem_file(vma->vm_file)) {
+				struct file *file = get_file(vma->vm_file);
+				pgoff_t pgoff = linear_page_index(vma,
+						khugepaged_scan.address);
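+				/*
+				 * The shmem scan works on the radix tree, not
+				 * on page tables, so mmap_sem can be dropped;
+				 * the pinned file keeps the mapping alive, and
+				 * ret = 1 tells the caller the semaphore was
+				 * released.
+				 */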
+				up_read(&mm->mmap_sem);
+				ret = 1;
+				khugepaged_scan_shmem(mm, file->f_mapping,
+						pgoff, hpage);
+				fput(file);
+			} else {
+				ret = khugepaged_scan_pmd(mm, vma,
+						khugepaged_scan.address,
+						hpage);
+			}
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
 			progress += HPAGE_PMD_NR;