@@ -34,6 +34,7 @@
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_owner.h>
 #include "internal.h"
 
 int hugetlb_max_hstate __read_mostly;
@@ -1219,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
 	ClearPagePrivate(&page[1]);
 }
 
+/*
+ * Internal hugetlb specific page flag. Do not use outside of the hugetlb
+ * code
+ */
+static inline bool PageHugeTemporary(struct page *page)
+{
+	if (!PageHuge(page))
+		return false;
+
+	return (unsigned long)page[2].mapping == -1U;
+}
+
+static inline void SetPageHugeTemporary(struct page *page)
+{
+	page[2].mapping = (void *)-1U;
+}
+
+static inline void ClearPageHugeTemporary(struct page *page)
+{
+	page[2].mapping = NULL;
+}
+
 void free_huge_page(struct page *page)
 {
 	/*
@@ -1253,7 +1276,11 @@ void free_huge_page(struct page *page)
 	if (restore_reserve)
 		h->resv_huge_pages++;
 
-	if (h->surplus_huge_pages_node[nid]) {
+	if (PageHugeTemporary(page)) {
+		list_del(&page->lru);
+		ClearPageHugeTemporary(page);
+		update_and_free_page(h, page);
+	} else if (h->surplus_huge_pages_node[nid]) {
 		/* remove the page from active list */
 		list_del(&page->lru);
 		update_and_free_page(h, page);
@@ -1507,7 +1534,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 	return rc;
 }
 
-static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+/*
+ * Allocates a fresh surplus page from the page allocator.
+ */
+static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 		int nid, nodemask_t *nmask)
 {
 	struct page *page;
@@ -1571,6 +1601,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
 	return page;
 }
 
+static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+		int nid, nodemask_t *nmask)
+{
+	struct page *page;
+
+	if (hstate_is_gigantic(h))
+		return NULL;
+
+	page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
+	if (!page)
+		return NULL;
+
+	/*
+	 * We do not account these pages as surplus because they are only
+	 * temporary and will be released properly on the last reference
+	 */
+	prep_new_huge_page(h, page, page_to_nid(page));
+	SetPageHugeTemporary(page);
+
+	return page;
+}
+
 /*
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
@@ -1585,17 +1637,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
 	nodemask_t *nodemask;
 
 	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
-	page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+	page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
 	mpol_cond_put(mpol);
 
 	return page;
 }
 
-/*
- * This allocation function is useful in the context where vma is irrelevant.
- * E.g. soft-offlining uses this function because it only cares physical
- * address of error page.
- */
+/* page migration callback function */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1610,12 +1658,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
 	spin_unlock(&hugetlb_lock);
 
 	if (!page)
-		page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
+		page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
 
 	return page;
 }
 
-
+/* page migration callback function */
 struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 		nodemask_t *nmask)
 {
@@ -1633,9 +1681,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	/* No reservations, try to overcommit */
-
-	return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+	return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
 
 /*
@@ -1663,7 +1709,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+		page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
 				NUMA_NO_NODE, NULL);
 		if (!page) {
 			alloc_ok = false;
@@ -2260,7 +2306,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	 * First take pages out of surplus state. Then make up the
 	 * remaining difference by allocating fresh huge pages.
 	 *
-	 * We might race with __alloc_buddy_huge_page() here and be unable
+	 * We might race with __alloc_surplus_huge_page() here and be unable
 	 * to convert a surplus huge page to a normal huge page. That is
 	 * not critical, though, it just means the overall size of the
 	 * pool might be one hugepage larger than it needs to be, but
@@ -2303,7 +2349,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
 	 * By placing pages into the surplus state independent of the
 	 * overcommit value, we are allowing the surplus pool size to
 	 * exceed overcommit. There are few sane options here. Since
-	 * __alloc_buddy_huge_page() is checking the global counter,
+	 * __alloc_surplus_huge_page() is checking the global counter,
 	 * though, we'll note that we're not allowed to exceed surplus
 	 * and won't grow the pool anywhere else. Not until one of the
 	 * sysctls are changed, or the surplus pages go out of use.
@@ -4779,3 +4825,36 @@ void putback_active_hugepage(struct page *page)
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
 }
+
+void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
+{
+	struct hstate *h = page_hstate(oldpage);
+
+	hugetlb_cgroup_migrate(oldpage, newpage);
+	set_page_owner_migrate_reason(newpage, reason);
+
+	/*
+	 * transfer temporary state of the new huge page. This is
+	 * reverse to other transitions because the newpage is going to
+	 * be final while the old one will be freed so it takes over
+	 * the temporary status.
+	 *
+	 * Also note that we have to transfer the per-node surplus state
+	 * here as well otherwise the global surplus count will not match
+	 * the per-node's.
+	 */
+	if (PageHugeTemporary(newpage)) {
+		int old_nid = page_to_nid(oldpage);
+		int new_nid = page_to_nid(newpage);
+
+		SetPageHugeTemporary(oldpage);
+		ClearPageHugeTemporary(newpage);
+
+		spin_lock(&hugetlb_lock);
+		if (h->surplus_huge_pages_node[old_nid]) {
+			h->surplus_huge_pages_node[old_nid]--;
+			h->surplus_huge_pages_node[new_nid]++;
+		}
+		spin_unlock(&hugetlb_lock);
+	}
+}
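
The PageHugeTemporary() helpers added above keep the flag in the mapping field of the second tail page (page[2]) of the compound huge page, using -1 as a sentinel. The userspace sketch below only illustrates that encoding; its stripped-down struct page is a hypothetical stand-in for the kernel's, and it omits the PageHuge() check the kernel helper performs first.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for struct page: only the field used by the
 * temporary-flag encoding is modelled here. */
struct page {
	void *mapping;
};

/* Same expressions as in the patch; the kernel version additionally
 * checks PageHuge() before looking at the tail page. */
static int page_huge_temporary(struct page *page)
{
	return (unsigned long)page[2].mapping == -1U;
}

static void set_page_huge_temporary(struct page *page)
{
	page[2].mapping = (void *)-1U;
}

static void clear_page_huge_temporary(struct page *page)
{
	page[2].mapping = NULL;
}

int main(void)
{
	/* Model a 2MB huge page as 512 page structs; page[0] is the head,
	 * the rest are tail pages whose mapping field is otherwise unused. */
	struct page hugepage[512] = { { NULL } };

	assert(!page_huge_temporary(hugepage));
	set_page_huge_temporary(hugepage);
	assert(page_huge_temporary(hugepage));
	clear_page_huge_temporary(hugepage);
	assert(!page_huge_temporary(hugepage));

	printf("PageHugeTemporary round-trip ok\n");
	return 0;
}

Because only tail-page state is touched, free_huge_page() can test the flag and, for temporary pages, hand the page straight back to the buddy allocator instead of the surplus pool, as the hunk at -1253 above does.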