@@ -7,6 +7,7 @@
  *
  * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
  */
+#include <linux/cpu.h>
 #include <linux/swap.h>
 #include <linux/migrate.h>
 #include <linux/compaction.h>
@@ -17,6 +18,8 @@
 #include <linux/balloon_compaction.h>
 #include <linux/page-isolation.h>
 #include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -1736,4 +1739,223 @@ void compaction_unregister_node(struct node *node)
 }
 #endif /* CONFIG_SYSFS && CONFIG_NUMA */
 
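+/*
+ * Has a compaction request arrived since kcompactd last went to sleep?
+ * wakeup_kcompactd() records a non-zero order before waking us and
+ * kcompactd_do_work() resets it once the pending request has been serviced.
+ */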
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+	return pgdat->kcompactd_max_order > 0;
+}
+
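+/*
+ * Can any populated zone in this node, up to and including the requested
+ * classzone, make use of compaction for the pending order?
+ */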
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+	int zoneid;
+	struct zone *zone;
+	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+		zone = &pgdat->node_zones[zoneid];
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+					classzone_idx) == COMPACT_CONTINUE)
+			return true;
+	}
+
+	return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+	/*
+	 * With no special task, compact all zones so that a page of requested
+	 * order is allocatable.
+	 */
+	int zoneid;
+	struct zone *zone;
+	struct compact_control cc = {
+		.order = pgdat->kcompactd_max_order,
+		.classzone_idx = pgdat->kcompactd_classzone_idx,
+		.mode = MIGRATE_SYNC_LIGHT,
+		.ignore_skip_hint = true,
+	};
+	bool success = false;
+
+	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+							cc.classzone_idx);
+	count_vm_event(KCOMPACTD_WAKE);
+
+	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+		int status;
+
+		zone = &pgdat->node_zones[zoneid];
+		if (!populated_zone(zone))
+			continue;
+
+		if (compaction_deferred(zone, cc.order))
+			continue;
+
+		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+						COMPACT_CONTINUE)
+			continue;
+
+		cc.nr_freepages = 0;
+		cc.nr_migratepages = 0;
+		cc.zone = zone;
+		INIT_LIST_HEAD(&cc.freepages);
+		INIT_LIST_HEAD(&cc.migratepages);
+
+		status = compact_zone(zone, &cc);
+
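+		/*
+		 * A page of the requested order is now allocatable from this
+		 * zone (thanks to our compaction or to parallel frees), so
+		 * record the success and stop deferring compaction for it.
+		 */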
+		if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+						cc.classzone_idx, 0)) {
+			success = true;
+			compaction_defer_reset(zone, cc.order, false);
+		} else if (status == COMPACT_COMPLETE) {
+			/*
+			 * We use sync migration mode here, so we defer like
+			 * sync direct compaction does.
+			 */
+			defer_compaction(zone, cc.order);
+		}
+
+		VM_BUG_ON(!list_empty(&cc.freepages));
+		VM_BUG_ON(!list_empty(&cc.migratepages));
+	}
+
+	/*
+	 * Regardless of success, we are done until woken up next. But remember
+	 * the requested order/classzone_idx in case it was higher/tighter than
+	 * our current ones.
+	 */
+	if (pgdat->kcompactd_max_order <= cc.order)
+		pgdat->kcompactd_max_order = 0;
+	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
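+/*
+ * Wake up the per-node kcompactd thread to compact for @order up to
+ * @classzone_idx. Only bother if kcompactd is actually waiting and at
+ * least one zone in the node could benefit from compaction.
+ */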
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+	if (!order)
+		return;
+
+	if (pgdat->kcompactd_max_order < order)
+		pgdat->kcompactd_max_order = order;
+
+	if (pgdat->kcompactd_classzone_idx > classzone_idx)
+		pgdat->kcompactd_classzone_idx = classzone_idx;
+
+	if (!waitqueue_active(&pgdat->kcompactd_wait))
+		return;
+
+	if (!kcompactd_node_suitable(pgdat))
+		return;
+
+	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+							classzone_idx);
+	wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+	struct task_struct *tsk = current;
+
+	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(tsk, cpumask);
+
+	set_freezable();
+
+	pgdat->kcompactd_max_order = 0;
+	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+	while (!kthread_should_stop()) {
+		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+		wait_event_freezable(pgdat->kcompactd_wait,
+				kcompactd_work_requested(pgdat));
+
+		kcompactd_do_work(pgdat);
+	}
+
+	return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are
+ * hot-added.
+ */
+int kcompactd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kcompactd)
+		return 0;
+
+	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+	if (IS_ERR(pgdat->kcompactd)) {
+		pr_err("Failed to start kcompactd on node %d\n", nid);
+		ret = PTR_ERR(pgdat->kcompactd);
+		pgdat->kcompactd = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+	if (kcompactd) {
+		kthread_stop(kcompactd);
+		NODE_DATA(nid)->kcompactd = NULL;
+	}
+}
+
+/*
+ * It's optimal to keep kcompactd threads on the same CPUs as their node's
+ * memory, but not required for correctness. So if the last CPU in a node
+ * goes away, the thread is allowed to run anywhere; when the first CPU of
+ * that node comes back online, restore its CPU binding.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+			void *hcpu)
+{
+	int nid;
+
+	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+		for_each_node_state(nid, N_MEMORY) {
+			pg_data_t *pgdat = NODE_DATA(nid);
+			const struct cpumask *mask;
+
+			mask = cpumask_of_node(pgdat->node_id);
+
+			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+				/* One of our CPUs online: restore mask */
+				set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+		}
+	}
+	return NOTIFY_OK;
+}
+
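+/* Start kcompactd on each memory node at boot and register the CPU hotplug notifier. */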
+static int __init kcompactd_init(void)
+{
+	int nid;
+
+	for_each_node_state(nid, N_MEMORY)
+		kcompactd_run(nid);
+	hotcpu_notifier(cpu_callback, 0);
+	return 0;
+}
+subsys_initcall(kcompactd_init)
+
 #endif /* CONFIG_COMPACTION */