|
@@ -925,6 +925,71 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
|
|
|
group->faults_cpu[task_faults_idx(nid, 1)];
|
|
|
}
|
|
|
|
|
|
+/* Handle placement on systems where not all nodes are directly connected. */
|
|
|
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
|
|
|
+ int maxdist, bool task)
|
|
|
+{
|
|
|
+ unsigned long score = 0;
|
|
|
+ int node;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * All nodes are directly connected, and the same distance
|
|
|
+ * from each other. No need for fancy placement algorithms.
|
|
|
+ */
|
|
|
+ if (sched_numa_topology_type == NUMA_DIRECT)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This code is called for each node, introducing N^2 complexity,
|
|
|
+ * which should be ok given the number of nodes rarely exceeds 8.
|
|
|
+ */
|
|
|
+ for_each_online_node(node) {
|
|
|
+ unsigned long faults;
|
|
|
+ int dist = node_distance(nid, node);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * The furthest away nodes in the system are not interesting
|
|
|
+ * for placement; nid was already counted.
|
|
|
+ */
|
|
|
+ if (dist == sched_max_numa_distance || node == nid)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * On systems with a backplane NUMA topology, compare groups
|
|
|
+ * of nodes, and move tasks towards the group with the most
|
|
|
+ * memory accesses. When comparing two nodes at distance
|
|
|
+ * "hoplimit", only nodes closer by than "hoplimit" are part
|
|
|
+ * of each group. Skip other nodes.
|
|
|
+ */
|
|
|
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
|
|
|
+ dist > maxdist)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ /* Add up the faults from nearby nodes. */
|
|
|
+ if (task)
|
|
|
+ faults = task_faults(p, node);
|
|
|
+ else
|
|
|
+ faults = group_faults(p, node);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * On systems with a glueless mesh NUMA topology, there are
|
|
|
+ * no fixed "groups of nodes". Instead, nodes that are not
|
|
|
+ * directly connected bounce traffic through intermediate
|
|
|
+ * nodes; a numa_group can occupy any set of nodes.
|
|
|
+ * The further away a node is, the less the faults count.
|
|
|
+ * This seems to result in good task placement.
|
|
|
+ */
|
|
|
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
|
|
|
+ faults *= (sched_max_numa_distance - dist);
|
|
|
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
|
|
|
+ }
|
|
|
+
|
|
|
+ score += faults;
|
|
|
+ }
|
|
|
+
|
|
|
+ return score;
|
|
|
+}
|
|
|
+
|
|
|
/*
|
|
|
* These return the fraction of accesses done by a particular task, or
|
|
|
* task group, on a particular numa node. The group weight is given a
|
|
@@ -945,6 +1010,8 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
|
|
|
return 0;
|
|
|
|
|
|
faults = task_faults(p, nid);
|
|
|
+ faults += score_nearby_nodes(p, nid, dist, true);
|
|
|
+
|
|
|
return 1000 * faults / total_faults;
|
|
|
}
|
|
|
|
|
@@ -962,6 +1029,8 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
|
|
|
return 0;
|
|
|
|
|
|
faults = group_faults(p, nid);
|
|
|
+ faults += score_nearby_nodes(p, nid, dist, false);
|
|
|
+
|
|
|
return 1000 * faults / total_faults;
|
|
|
}
|
|
|
|
|
@@ -1374,6 +1443,11 @@ static int task_numa_migrate(struct task_struct *p)
|
|
|
continue;
|
|
|
|
|
|
dist = node_distance(env.src_nid, env.dst_nid);
|
|
|
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
|
|
|
+ dist != env.dist) {
|
|
|
+ taskweight = task_weight(p, env.src_nid, dist);
|
|
|
+ groupweight = group_weight(p, env.src_nid, dist);
|
|
|
+ }
|
|
|
|
|
|
/* Only consider nodes where both task and groups benefit */
|
|
|
taskimp = task_weight(p, nid, dist) - taskweight;
|