@@ -1612,6 +1612,9 @@ again:
 	}
 
 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
+	    !zone_is_fair_depleted(zone))
+		zone_set_flag(zone, ZONE_FAIR_DEPLETED);
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
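
[Illustration, not part of the patch] The hunk above makes NR_ALLOC_BATCH the only thing the allocation hot path has to maintain: every allocation of 2^order pages drains the zone's fairness batch, and a per-zone flag records the moment the batch runs out, so later zonelist walks can test a single bit instead of re-reading the vmstat counter. A minimal user-space model of that bookkeeping, with invented names (toy_zone, toy_charge_alloc) and a <= 0 check for simplicity:

/* Toy model of the per-zone fairness batch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	long alloc_batch;	/* pages left in this fairness round */
	bool fair_depleted;	/* set once the batch is used up */
};

static void toy_charge_alloc(struct toy_zone *z, unsigned int order)
{
	z->alloc_batch -= 1L << order;		/* mirrors the -(1 << order) above */
	if (z->alloc_batch <= 0 && !z->fair_depleted)
		z->fair_depleted = true;	/* mirrors ZONE_FAIR_DEPLETED */
}

int main(void)
{
	struct toy_zone z = { .alloc_batch = 8, .fair_depleted = false };

	for (int i = 0; i < 3; i++) {
		toy_charge_alloc(&z, 2);	/* one order-2 allocation = 4 pages */
		printf("batch=%ld depleted=%d\n", z.alloc_batch, z.fair_depleted);
	}
	return 0;
}
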
@@ -1923,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 
 #endif	/* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+	do {
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+		zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+	} while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
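
[Illustration, not part of the patch] reset_alloc_batches() above leans on mod_zone_page_state() taking a delta: adding (target - current) puts the counter back at the target batch of high_wmark_pages(zone) - low_wmark_pages(zone), and the do/while applies this to every zone of the preferred zone's node, from node_zones[0] up to and including preferred_zone, clearing ZONE_FAIR_DEPLETED as it goes. A toy version of just that arithmetic, with invented names:

/* Toy model of the batch-reset arithmetic, not kernel code. */
#include <stdio.h>

static long toy_batch;			/* stands in for NR_ALLOC_BATCH */

static void toy_mod_state(long delta)	/* stands in for mod_zone_page_state() */
{
	toy_batch += delta;
}

static void toy_reset_batch(long high_wmark, long low_wmark)
{
	/* delta = target - current, so the counter lands on the target */
	toy_mod_state(high_wmark - low_wmark - toy_batch);
}

int main(void)
{
	toy_batch = -37;		/* over-drained during the fair pass */
	toy_reset_batch(512, 384);	/* target batch = 512 - 384 = 128 */
	printf("batch after reset = %ld\n", toy_batch);	/* prints 128 */
	return 0;
}
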
@@ -1940,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
 				(gfp_mask & __GFP_WRITE);
+	int nr_fair_skipped = 0;
+	bool zonelist_rescan;
 
 zonelist_scan:
+	zonelist_rescan = false;
+
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1966,8 +1985,10 @@ zonelist_scan:
 		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				break;
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+			if (zone_is_fair_depleted(zone)) {
+				nr_fair_skipped++;
 				continue;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2073,13 +2094,7 @@ this_zone_full:
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-		/* Disable zlc cache for second zonelist scan */
-		zlc_active = 0;
-		goto zonelist_scan;
-	}
-
-	if (page)
+	if (page) {
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
 		 * necessary to allocate the page. The expectation is
@@ -2088,8 +2103,37 @@ this_zone_full:
 		 * for !PFMEMALLOC purposes.
 		 */
 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+		return page;
+	}
 
-	return page;
+	/*
+	 * The first pass makes sure allocations are spread fairly within the
+	 * local node. However, the local node might have free pages left
+	 * after the fairness batches are exhausted, and remote zones haven't
+	 * even been considered yet. Try once more without fairness, and
+	 * include remote zones now, before entering the slowpath and waking
+	 * kswapd: prefer spilling to a remote zone over swapping locally.
+	 */
+	if (alloc_flags & ALLOC_FAIR) {
+		alloc_flags &= ~ALLOC_FAIR;
+		if (nr_fair_skipped) {
+			zonelist_rescan = true;
+			reset_alloc_batches(preferred_zone);
+		}
+		if (nr_online_nodes > 1)
+			zonelist_rescan = true;
+	}
+
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		zonelist_rescan = true;
+	}
+
+	if (zonelist_rescan)
+		goto zonelist_scan;
+
+	return NULL;
 }
 
 /*
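
[Illustration, not part of the patch] The rewritten tail of get_page_from_freelist() above replaces the caller-side retry with an internal rescan: the first pass honours ALLOC_FAIR and stays on the local node, skipping depleted zones but counting them; if that yields nothing, fairness is dropped, the batches are reset whenever a zone was skipped purely for fairness, and the scan repeats (now also reaching remote nodes) before NULL is returned. A deliberately simplified user-space sketch of that control flow, with invented names and the zlc handling left out:

/* Toy model of the fair-pass/rescan flow, not kernel code. */
#include <stdbool.h>
#include <stddef.h>

struct toy_zone {
	bool local;		/* belongs to the preferred node */
	bool fair_depleted;	/* fairness batch exhausted */
	bool has_free;		/* an allocation from here would succeed */
};

static struct toy_zone *toy_scan(struct toy_zone *zones, size_t n, bool fair,
				 int *nr_fair_skipped)
{
	for (size_t i = 0; i < n; i++) {
		if (fair) {
			if (!zones[i].local)
				break;			/* fair pass is local-only */
			if (zones[i].fair_depleted) {
				(*nr_fair_skipped)++;	/* remember why it was skipped */
				continue;
			}
		}
		if (zones[i].has_free)
			return &zones[i];
	}
	return NULL;
}

static struct toy_zone *toy_alloc(struct toy_zone *zones, size_t n, bool multi_node)
{
	bool fair = true;
	bool rescan;

	do {
		int nr_fair_skipped = 0;
		struct toy_zone *z = toy_scan(zones, n, fair, &nr_fair_skipped);

		if (z)
			return z;

		rescan = false;
		if (fair) {
			fair = false;
			if (nr_fair_skipped)
				rescan = true;	/* batches would be reset here */
			if (multi_node)
				rescan = true;	/* remote zones not scanned yet */
		}
	} while (rescan);

	return NULL;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .local = true,  .fair_depleted = true,  .has_free = true },
		{ .local = false, .fair_depleted = false, .has_free = true },
	};

	/* The fair pass skips the depleted local zone and stops at the
	 * remote one; the retry without fairness then finds a page. */
	struct toy_zone *z = toy_alloc(zones, 2, true);
	return z ? 0 : 1;
}
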
@@ -2410,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-				enum zone_type high_zoneidx,
-				struct zone *preferred_zone)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		/*
-		 * Only reset the batches of zones that were actually
-		 * considered in the fairness pass, we don't want to
-		 * trash fairness information for zones that are not
-		 * actually part of this zonelist's round-robin cycle.
-		 */
-		if (!zone_local(preferred_zone, zone))
-			continue;
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-	}
-}
-
 static void wake_all_kswapds(unsigned int order,
 			     struct zonelist *zonelist,
 			     enum zone_type high_zoneidx,
@@ -2767,28 +2789,11 @@ retry_cpuset:
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
-retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, classzone_idx, migratetype);
 	if (unlikely(!page)) {
-		/*
-		 * The first pass makes sure allocations are spread
-		 * fairly within the local node. However, the local
-		 * node might have free pages left after the fairness
-		 * batches are exhausted, and remote zones haven't
-		 * even been considered yet. Try once more without
-		 * fairness, and include remote zones now, before
-		 * entering the slowpath and waking kswapd: prefer
-		 * spilling to a remote zone over swapping locally.
-		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			reset_alloc_batches(zonelist, high_zoneidx,
-						preferred_zone);
-			alloc_flags &= ~ALLOC_FAIR;
-			goto retry;
-		}
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not