@@ -1612,6 +1612,9 @@ again:
 	}
 
 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
+	    !zone_is_fair_depleted(zone))
+		zone_set_flag(zone, ZONE_FAIR_DEPLETED);
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
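
[Illustration, not part of the patch] The hunk above makes NR_ALLOC_BATCH the only thing the allocation hot path has to maintain: every allocation of 2^order pages drains the zone's fairness batch, and a per-zone flag records the moment the batch runs out, so later zonelist walks can test a single bit instead of re-reading the vmstat counter. A minimal user-space model of that bookkeeping, with invented names (toy_zone, toy_charge_alloc) and a <= 0 check for simplicity:

/* Toy model of the per-zone fairness batch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	long alloc_batch;	/* pages left in this fairness round */
	bool fair_depleted;	/* set once the batch is used up */
};

static void toy_charge_alloc(struct toy_zone *z, unsigned int order)
{
	z->alloc_batch -= 1L << order;		/* mirrors the -(1 << order) above */
	if (z->alloc_batch <= 0 && !z->fair_depleted)
		z->fair_depleted = true;	/* mirrors ZONE_FAIR_DEPLETED */
}

int main(void)
{
	struct toy_zone z = { .alloc_batch = 8, .fair_depleted = false };

	for (int i = 0; i < 3; i++) {
		toy_charge_alloc(&z, 2);	/* one order-2 allocation = 4 pages */
		printf("batch=%ld depleted=%d\n", z.alloc_batch, z.fair_depleted);
	}
	return 0;
}
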
@@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 
 #endif	/* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+	do {
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+	} while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
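
[Illustration, not part of the patch] reset_alloc_batches() above leans on mod_zone_page_state() taking a delta: adding (target - current) puts the counter back at the target batch of high_wmark_pages(zone) - low_wmark_pages(zone), and the do/while applies this to every zone of the preferred zone's node, from node_zones[0] up to and including preferred_zone, clearing ZONE_FAIR_DEPLETED as it goes. A toy version of just that arithmetic, with invented names:

/* Toy model of the batch-reset arithmetic, not kernel code. */
#include <stdio.h>

static long toy_batch;			/* stands in for NR_ALLOC_BATCH */

static void toy_mod_state(long delta)	/* stands in for mod_zone_page_state() */
{
	toy_batch += delta;
}

static void toy_reset_batch(long high_wmark, long low_wmark)
{
	/* delta = target - current, so the counter lands on the target */
	toy_mod_state(high_wmark - low_wmark - toy_batch);
}

int main(void)
{
	toy_batch = -37;		/* over-drained during the fair pass */
	toy_reset_batch(512, 384);	/* target batch = 512 - 384 = 128 */
	printf("batch after reset = %ld\n", toy_batch);	/* prints 128 */
	return 0;
}
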
@@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
 				(gfp_mask & __GFP_WRITE);
+	int nr_fair_skipped = 0;
+	bool zonelist_rescan;
 
 zonelist_scan:
+	zonelist_rescan = false;
+
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1966,8 +1985,10 @@ zonelist_scan:
 		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				break;
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+			if (zone_is_fair_depleted(zone)) {
+				nr_fair_skipped++;
 				continue;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2073,13 +2094,7 @@ this_zone_full:
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-		/* Disable zlc cache for second zonelist scan */
-		zlc_active = 0;
-		goto zonelist_scan;
-	}
-
-	if (page)
+	if (page) {
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
 		 * necessary to allocate the page. The expectation is
@@ -2088,8 +2103,37 @@ this_zone_full:
 		 * for !PFMEMALLOC purposes.
 		 */
 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+		return page;
+	}
 
-	return page;
+	/*
+	 * The first pass makes sure allocations are spread fairly within the
+	 * local node. However, the local node might have free pages left
+	 * after the fairness batches are exhausted, and remote zones haven't
+	 * even been considered yet. Try once more without fairness, and
+	 * include remote zones now, before entering the slowpath and waking
+	 * kswapd: prefer spilling to a remote zone over swapping locally.
+	 */
+	if (alloc_flags & ALLOC_FAIR) {
+		alloc_flags &= ~ALLOC_FAIR;
+		if (nr_fair_skipped) {
+			zonelist_rescan = true;
+			reset_alloc_batches(preferred_zone);
+		}
+		if (nr_online_nodes > 1)
+			zonelist_rescan = true;
+	}
+
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		zonelist_rescan = true;
+	}
+
+	if (zonelist_rescan)
+		goto zonelist_scan;
+
+	return NULL;
 }
 
 /*
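
[Illustration, not part of the patch] The rewritten tail of get_page_from_freelist() above replaces the caller-side retry with an internal rescan: the first pass honours ALLOC_FAIR and stays on the local node, skipping depleted zones but counting them; if that yields nothing, fairness is dropped, the batches are reset whenever a zone was skipped purely for fairness, and the scan repeats (now also reaching remote nodes) before NULL is returned. A deliberately simplified user-space sketch of that control flow, with invented names and the zlc handling left out:

/* Toy model of the fair-pass/rescan flow, not kernel code. */
#include <stdbool.h>
#include <stddef.h>

struct toy_zone {
	bool local;		/* belongs to the preferred node */
	bool fair_depleted;	/* fairness batch exhausted */
	bool has_free;		/* an allocation from here would succeed */
};

static struct toy_zone *toy_scan(struct toy_zone *zones, size_t n, bool fair,
				 int *nr_fair_skipped)
{
	for (size_t i = 0; i < n; i++) {
		if (fair) {
			if (!zones[i].local)
				break;			/* fair pass is local-only */
			if (zones[i].fair_depleted) {
				(*nr_fair_skipped)++;	/* remember why it was skipped */
				continue;
			}
		}
		if (zones[i].has_free)
			return &zones[i];
	}
	return NULL;
}

static struct toy_zone *toy_alloc(struct toy_zone *zones, size_t n, bool multi_node)
{
	bool fair = true;
	bool rescan;

	do {
		int nr_fair_skipped = 0;
		struct toy_zone *z = toy_scan(zones, n, fair, &nr_fair_skipped);

		if (z)
			return z;

		rescan = false;
		if (fair) {
			fair = false;
			if (nr_fair_skipped)
				rescan = true;	/* batches would be reset here */
			if (multi_node)
				rescan = true;	/* remote zones not scanned yet */
		}
	} while (rescan);

	return NULL;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .local = true,  .fair_depleted = true,  .has_free = true },
		{ .local = false, .fair_depleted = false, .has_free = true },
	};

	/* The fair pass skips the depleted local zone and stops at the
	 * remote one; the retry without fairness then finds a page. */
	struct toy_zone *z = toy_alloc(zones, 2, true);
	return z ? 0 : 1;
}
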
@@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-				enum zone_type high_zoneidx,
-				struct zone *preferred_zone)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		/*
-		 * Only reset the batches of zones that were actually
-		 * considered in the fairness pass, we don't want to
-		 * trash fairness information for zones that are not
-		 * actually part of this zonelist's round-robin cycle.
-		 */
-		if (!zone_local(preferred_zone, zone))
-			continue;
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-	}
-}
-
 static void wake_all_kswapds(unsigned int order,
 			     struct zonelist *zonelist,
 			     enum zone_type high_zoneidx,
@@ -2767,28 +2789,11 @@ retry_cpuset:
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
-retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, classzone_idx, migratetype);
 	if (unlikely(!page)) {
-		/*
-		 * The first pass makes sure allocations are spread
-		 * fairly within the local node. However, the local
-		 * node might have free pages left after the fairness
-		 * batches are exhausted, and remote zones haven't
-		 * even been considered yet. Try once more without
-		 * fairness, and include remote zones now, before
-		 * entering the slowpath and waking kswapd: prefer
-		 * spilling to a remote zone over swapping locally.
-		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			reset_alloc_batches(zonelist, high_zoneidx,
-						preferred_zone);
-			alloc_flags &= ~ALLOC_FAIR;
-			goto retry;
-		}
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not