@@ -2968,18 +2968,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	} while (memcg);
 }
 
-static bool zone_balanced(struct zone *zone, int order,
-			  unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+			unsigned long balance_gap, int classzone_idx)
 {
-	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-				    balance_gap, classzone_idx))
-		return false;
+	unsigned long mark = high_wmark_pages(zone) + balance_gap;
 
-	if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
-				order, 0, classzone_idx) == COMPACT_SKIPPED)
-		return false;
+	/*
+	 * When checking from pgdat_balanced(), kswapd should stop and sleep
+	 * when it reaches the high order-0 watermark and let kcompactd take
+	 * over. Other callers such as wakeup_kswapd() want to determine the
+	 * true high-order watermark.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+		mark += (1UL << order);
+		order = 0;
+	}
 
-	return true;
+	return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
 }
 
 /*
@@ -3029,7 +3034,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 			continue;
 		}
 
-		if (zone_balanced(zone, order, 0, i))
+		if (zone_balanced(zone, order, false, 0, i))
 			balanced_pages += zone->managed_pages;
 		else if (!order)
 			return false;
@@ -3083,27 +3088,14 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       int classzone_idx,
-			       struct scan_control *sc,
-			       unsigned long *nr_attempted)
+			       struct scan_control *sc)
 {
-	int testorder = sc->order;
 	unsigned long balance_gap;
 	bool lowmem_pressure;
 
 	/* Reclaim above the high watermark. */
 	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
 
-	/*
-	 * Kswapd reclaims only single pages with compaction enabled. Trying
-	 * too hard to reclaim until contiguous free pages have become
-	 * available can hurt performance by evicting too much useful data
-	 * from memory. Do not reclaim more than needed for compaction.
-	 */
-	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
-			compaction_suitable(zone, sc->order, 0, classzone_idx)
-							!= COMPACT_SKIPPED)
-		testorder = 0;
-
 	/*
 	 * We put equal pressure on every zone, unless one zone has way too
 	 * many pages free already. The "too many pages" is defined as the
@@ -3118,15 +3110,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * reclaim is necessary
 	 */
 	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
-	if (!lowmem_pressure && zone_balanced(zone, testorder,
+	if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
 						balance_gap, classzone_idx))
 		return true;
 
 	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
-	/* Account for the number of pages attempted to reclaim */
-	*nr_attempted += sc->nr_to_reclaim;
-
 	clear_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
@@ -3136,7 +3125,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * waits.
 	 */
 	if (zone_reclaimable(zone) &&
-	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+	    zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
 		clear_bit(ZONE_CONGESTED, &zone->flags);
 		clear_bit(ZONE_DIRTY, &zone->flags);
 	}
@@ -3148,7 +3137,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
@@ -3165,8 +3154,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
-							int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 {
 	int i;
 	int end_zone = 0;	/* Inclusive. 0 = ZONE_DMA */
@@ -3183,9 +3171,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long nr_attempted = 0;
 		bool raise_priority = true;
-		bool pgdat_needs_compaction = (order > 0);
 
 		sc.nr_reclaimed = 0;
 
@@ -3220,7 +3206,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				break;
 			}
 
-			if (!zone_balanced(zone, order, 0, 0)) {
+			if (!zone_balanced(zone, order, false, 0, 0)) {
 				end_zone = i;
 				break;
 			} else {
@@ -3236,24 +3222,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		if (i < 0)
 			goto out;
 
-		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			/*
-			 * If any zone is currently balanced then kswapd will
-			 * not call compaction as it is expected that the
-			 * necessary pages are already available.
-			 */
-			if (pgdat_needs_compaction &&
-					zone_watermark_ok(zone, order,
-						low_wmark_pages(zone),
-						*classzone_idx, 0))
-				pgdat_needs_compaction = false;
-		}
-
 		/*
 		 * If we're getting trouble reclaiming, start doing writepage
 		 * even in laptop mode.
@@ -3297,8 +3265,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			 * that that high watermark would be met at 100%
 			 * efficiency.
 			 */
-			if (kswapd_shrink_zone(zone, end_zone,
-					       &sc, &nr_attempted))
+			if (kswapd_shrink_zone(zone, end_zone, &sc))
 				raise_priority = false;
 		}
 
@@ -3311,28 +3278,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				pfmemalloc_watermark_ok(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
-		/*
-		 * Fragmentation may mean that the system cannot be rebalanced
-		 * for high-order allocations in all zones. If twice the
-		 * allocation size has been reclaimed and the zones are still
-		 * not balanced then recheck the watermarks at order-0 to
-		 * prevent kswapd reclaiming excessively. Assume that a
-		 * process requested a high-order can direct reclaim/compact.
-		 */
-		if (order && sc.nr_reclaimed >= 2UL << order)
-			order = sc.order = 0;
-
 		/* Check if kswapd should be suspending */
 		if (try_to_freeze() || kthread_should_stop())
 			break;
 
-		/*
-		 * Compact if necessary and kswapd is reclaiming at least the
-		 * high watermark number of pages as requsted
-		 */
-		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
-			compact_pgdat(pgdat, order);
-
 		/*
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
@@ -3340,20 +3289,18 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1 &&
-			!pgdat_balanced(pgdat, order, *classzone_idx));
+			!pgdat_balanced(pgdat, order, classzone_idx));
 
 out:
 	/*
-	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
-	 * makes a decision on the order we were last reclaiming at. However,
-	 * if another caller entered the allocator slow path while kswapd
-	 * was awake, order will remain at the higher level
+	 * Return the highest zone idx we were reclaiming at so
+	 * prepare_kswapd_sleep() makes the same decisions as here.
 	 */
-	*classzone_idx = end_zone;
-	return order;
+	return end_zone;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+				int classzone_idx, int balanced_classzone_idx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
@@ -3364,7 +3311,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3374,7 +3322,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining,
+						balanced_classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -3395,6 +3344,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 */
 		reset_isolation_suitable(pgdat);
 
+		/*
+		 * We have freed the memory, now we should compact it to make
+		 * allocation of the requested order possible.
+		 */
+		wakeup_kcompactd(pgdat, order, classzone_idx);
+
 		if (!kthread_should_stop())
 			schedule();
 
@@ -3424,7 +3379,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
-	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
 	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
@@ -3457,23 +3411,19 @@ static int kswapd(void *p)
 	set_freezable();
 
 	order = new_order = 0;
-	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		bool ret;
 
 		/*
-		 * If the last balance_pgdat was unsuccessful it's unlikely a
-		 * new request of a similar or harder type will succeed soon
-		 * so consider going to sleep on the basis we reclaimed at
+		 * While we were reclaiming, there might have been another
+		 * wakeup, so check the values.
 		 */
-		if (balanced_order == new_order) {
-			new_order = pgdat->kswapd_max_order;
-			new_classzone_idx = pgdat->classzone_idx;
-			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = pgdat->nr_zones - 1;
-		}
+		new_order = pgdat->kswapd_max_order;
+		new_classzone_idx = pgdat->classzone_idx;
+		pgdat->kswapd_max_order = 0;
+		pgdat->classzone_idx = pgdat->nr_zones - 1;
 
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
@@ -3483,7 +3433,7 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, balanced_order,
+			kswapd_try_to_sleep(pgdat, order, classzone_idx,
 					balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
@@ -3503,9 +3453,8 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			balanced_classzone_idx = classzone_idx;
-			balanced_order = balance_pgdat(pgdat, order,
-						&balanced_classzone_idx);
+			balanced_classzone_idx = balance_pgdat(pgdat, order,
+								classzone_idx);
 		}
 	}
 
@@ -3535,7 +3484,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	}
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
-	if (zone_balanced(zone, order, 0, 0))
+	if (zone_balanced(zone, order, true, 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);