|
@@ -1659,6 +1659,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
|
|
|
return delta;
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Determine the preferred nid for a task in a numa_group. This needs to
|
|
|
+ * be done in a way that produces consistent results with group_weight,
|
|
|
+ * otherwise workloads might not converge.
|
|
|
+ */
|
|
|
+static int preferred_group_nid(struct task_struct *p, int nid)
|
|
|
+{
|
|
|
+ nodemask_t nodes;
|
|
|
+ int dist;
|
|
|
+
|
|
|
+ /* Direct connections between all NUMA nodes. */
|
|
|
+ if (sched_numa_topology_type == NUMA_DIRECT)
|
|
|
+ return nid;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * On a system with glueless mesh NUMA topology, group_weight
|
|
|
+ * scores nodes according to the number of NUMA hinting faults on
|
|
|
+ * both the node itself, and on nearby nodes.
|
|
|
+ */
|
|
|
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
|
|
|
+ unsigned long score, max_score = 0;
|
|
|
+ int node, max_node = nid;
|
|
|
+
|
|
|
+ dist = sched_max_numa_distance;
|
|
|
+
|
|
|
+ for_each_online_node(node) {
|
|
|
+ score = group_weight(p, node, dist);
|
|
|
+ if (score > max_score) {
|
|
|
+ max_score = score;
|
|
|
+ max_node = node;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return max_node;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Finding the preferred nid in a system with NUMA backplane
|
|
|
+ * interconnect topology is more involved. The goal is to locate
|
|
|
+ * tasks from numa_groups near each other in the system, and
|
|
|
+ * untangle workloads from different sides of the system. This requires
|
|
|
+ * searching down the hierarchy of node groups, recursively searching
|
|
|
+ * inside the highest scoring group of nodes. The nodemask tricks
|
|
|
+ * keep the complexity of the search down.
|
|
|
+ */
|
|
|
+ nodes = node_online_map;
|
|
|
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
|
|
|
+ unsigned long max_faults = 0;
|
|
|
+ nodemask_t max_group;
|
|
|
+ int a, b;
|
|
|
+
|
|
|
+ /* Are there nodes at this distance from each other? */
|
|
|
+ if (!find_numa_distance(dist))
|
|
|
+ continue;
|
|
|
+
|
|
|
+ for_each_node_mask(a, nodes) {
|
|
|
+ unsigned long faults = 0;
|
|
|
+ nodemask_t this_group;
|
|
|
+ nodes_clear(this_group);
|
|
|
+
|
|
|
+ /* Sum group's NUMA faults; includes a==b case. */
|
|
|
+ for_each_node_mask(b, nodes) {
|
|
|
+ if (node_distance(a, b) < dist) {
|
|
|
+ faults += group_faults(p, b);
|
|
|
+ node_set(b, this_group);
|
|
|
+ node_clear(b, nodes);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Remember the top group. */
|
|
|
+ if (faults > max_faults) {
|
|
|
+ max_faults = faults;
|
|
|
+ max_group = this_group;
|
|
|
+ /*
|
|
|
+ * subtle: at the smallest distance there is
|
|
|
+ * just one node left in each "group", the
|
|
|
+ * winner is the preferred nid.
|
|
|
+ */
|
|
|
+ nid = a;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ /* Next round, evaluate the nodes within max_group. */
|
|
|
+ nodes = max_group;
|
|
|
+ }
|
|
|
+ return nid;
|
|
|
+}
|
|
|
+
|
|
|
static void task_numa_placement(struct task_struct *p)
|
|
|
{
|
|
|
int seq, nid, max_nid = -1, max_group_nid = -1;
|
|
@@ -1741,7 +1827,7 @@ static void task_numa_placement(struct task_struct *p)
|
|
|
if (p->numa_group) {
|
|
|
update_numa_active_node_mask(p->numa_group);
|
|
|
spin_unlock_irq(group_lock);
|
|
|
- max_nid = max_group_nid;
|
|
|
+ max_nid = preferred_group_nid(p, max_group_nid);
|
|
|
}
|
|
|
|
|
|
if (max_faults) {
|