@@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	}
 	local_irq_restore(flags);
 }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return false;
-}
 #endif
 
 /*
@@ -1584,12 +1575,7 @@ again:
 					  get_pageblock_migratetype(page));
 	}
 
-	/*
-	 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-	 * aging protocol, so they can't be fair.
-	 */
-	if (!gfp_thisnode_allocation(gfp_flags))
-		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1955,23 +1941,12 @@ zonelist_scan:
 		 * zone size to ensure fair page aging. The zone a
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
-		 *
-		 * Try to stay in local zones in the fastpath. If
-		 * that fails, the slowpath is entered, which will do
-		 * another pass starting with the local zones, but
-		 * ultimately fall back to remote zones that do not
-		 * partake in the fairness round-robin cycle of this
-		 * zonelist.
-		 *
-		 * NOTE: GFP_THISNODE allocations do not partake in
-		 * the kswapd aging protocol, so they can't be fair.
 		 */
-		if ((alloc_flags & ALLOC_WMARK_LOW) &&
-		    !gfp_thisnode_allocation(gfp_mask)) {
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				continue;
+			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+				continue;
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-			     struct zonelist *zonelist,
-			     enum zone_type high_zoneidx,
-			     struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+				enum zone_type high_zoneidx,
+				struct zone *preferred_zone)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		if (!(gfp_mask & __GFP_NO_KSWAPD))
-			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 		/*
 		 * Only reset the batches of zones that were actually
-		 * considered in the fast path, we don't want to
-		 * thrash fairness information for zones that are not
+		 * considered in the fairness pass, we don't want to
+		 * trash fairness information for zones that are not
 		 * actually part of this zonelist's round-robin cycle.
 		 */
 		if (!zone_local(preferred_zone, zone))
 			continue;
 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				    high_wmark_pages(zone) -
-				    low_wmark_pages(zone) -
-				    zone_page_state(zone, NR_ALLOC_BATCH));
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 	}
 }
 
+static void wake_all_kswapds(unsigned int order,
+			     struct zonelist *zonelist,
+			     enum zone_type high_zoneidx,
+			     struct zone *preferred_zone)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (gfp_thisnode_allocation(gfp_mask))
+	if (IS_ENABLED(CONFIG_NUMA) &&
+	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
 restart:
-	prepare_slowpath(gfp_mask, order, zonelist,
-			 high_zoneidx, preferred_zone);
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	struct mem_cgroup *memcg = NULL;
 
 	gfp_mask &= gfp_allowed_mask;
@@ -2753,11 +2737,28 @@ retry_cpuset:
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
+retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
 	if (unlikely(!page)) {
+		/*
+		 * The first pass makes sure allocations are spread
+		 * fairly within the local node. However, the local
+		 * node might have free pages left after the fairness
+		 * batches are exhausted, and remote zones haven't
+		 * even been considered yet. Try once more without
+		 * fairness, and include remote zones now, before
+		 * entering the slowpath and waking kswapd: prefer
+		 * spilling to a remote zone over swapping locally.
+		 */
+		if (alloc_flags & ALLOC_FAIR) {
+			reset_alloc_batches(zonelist, high_zoneidx,
+					    preferred_zone);
+			alloc_flags &= ~ALLOC_FAIR;
+			goto retry;
+		}
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not