@@ -932,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
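To illustrate the 1/3-of-maximum rule, here is a minimal userspace sketch (not part of the patch); the four-node topology and the per-node CPU fault counts are made up for the example:

/* Standalone illustration of the ACTIVE_NODE_FRACTION test; values are hypothetical. */
#include <stdio.h>

#define ACTIVE_NODE_FRACTION	3

int main(void)
{
	/* Hypothetical per-node CPU fault counts for one numa group. */
	unsigned long faults_cpu[] = { 900, 350, 280, 40 };
	unsigned long max_faults = 0;
	int nid;

	for (nid = 0; nid < 4; nid++)
		if (faults_cpu[nid] > max_faults)
			max_faults = faults_cpu[nid];

	for (nid = 0; nid < 4; nid++) {
		/* Active: more than 1/3 of the busiest node's faults. */
		int active = faults_cpu[nid] * ACTIVE_NODE_FRACTION > max_faults;

		printf("node %d: %lu faults -> %sactive\n",
		       nid, faults_cpu[nid], active ? "" : "in");
	}
	return 0;
}

With a maximum of 900 faults the threshold is 300, so nodes 0 and 1 end up in the pseudo-interleaving set while nodes 2 and 3 do not.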
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
-	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
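The final return statement is the ratio test from the comment, cross-multiplied so it never divides by a possibly-zero fault count. A small userspace sketch of the same comparison may help; the helper name prefer_dst() and the fault counts are invented for the example:

/* Illustration of the cross-multiplied 3/4 hysteresis test; numbers are made up. */
#include <stdbool.h>
#include <stdio.h>

static bool prefer_dst(unsigned long cpu_dst, unsigned long mem_dst,
		       unsigned long cpu_src, unsigned long mem_src)
{
	/* Same as (cpu_dst/mem_dst) * 3/4 > (cpu_src/mem_src), without division. */
	return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
}

int main(void)
{
	/* dst does twice the CPU work of src while holding the same memory: migrate. */
	printf("%d\n", prefer_dst(800, 500, 400, 500));	/* prints 1 */
	/* Equal cpu/mem ratios fall inside the hysteresis band: leave the page alone. */
	printf("%d\n", prefer_dst(500, 500, 400, 400));	/* prints 0 */
	return 0;
}

The 3/4 factor means the destination's CPU-to-memory ratio must exceed the source's by a third before a page moves, which damps ping-ponging between nodes with similar usage.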
@@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * multiple NUMA nodes; in order to better consolidate the group,
 	 * we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-	    nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
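As a worked example with made-up counts: if the first pass finds max_faults == 900 and the per-node CPU fault counts are 900, 350, 280 and 40, the second pass counts the two nodes whose faults, multiplied by ACTIVE_NODE_FRACTION, exceed 900, leaving the group with max_faults_cpu == 900 and active_nodes == 2.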
@@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 		return;
 
 	atomic_set(&grp->refcount, 1);
+	grp->active_nodes = 1;
+	grp->max_faults_cpu = 0;
 	spin_lock_init(&grp->lock);
 	grp->gid = p->pid;
 	/* Second half of the array tracks nids where faults happen */
 	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-	node_set(task_node(current), grp->active_nodes);
-
 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 		grp->faults[i] = p->numa_faults[i];
 
@@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-	    node_isset(cpu_node, p->numa_group->active_nodes) &&
-	    node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+	    numa_is_active_node(cpu_node, ng) &&
+	    numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
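For example (hypothetical workload): once a group is pseudo-interleaved across nodes 0 and 1, with active_nodes == 2 and both nodes passing numa_is_active_node(), a hinting fault taken by a CPU on node 0 against memory on node 1 is accounted as local, so the scan-rate logic sees mostly local faults and keeps backing off instead of treating the settled cross-node accesses as remote traffic.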