@@ -932,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
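To illustrate the 1/3-of-maximum rule, here is a minimal userspace sketch (not part of the patch); the four-node topology and the per-node CPU fault counts are made up for the example:

/* Standalone illustration of the ACTIVE_NODE_FRACTION test; values are hypothetical. */
#include <stdio.h>

#define ACTIVE_NODE_FRACTION	3

int main(void)
{
	/* Hypothetical per-node CPU fault counts for one numa group. */
	unsigned long faults_cpu[] = { 900, 350, 280, 40 };
	unsigned long max_faults = 0;
	int nid;

	for (nid = 0; nid < 4; nid++)
		if (faults_cpu[nid] > max_faults)
			max_faults = faults_cpu[nid];

	for (nid = 0; nid < 4; nid++) {
		/* Active: more than 1/3 of the busiest node's faults. */
		int active = faults_cpu[nid] * ACTIVE_NODE_FRACTION > max_faults;

		printf("node %d: %lu faults -> %sactive\n",
		       nid, faults_cpu[nid], active ? "" : "in");
	}
	return 0;
}

With a maximum of 900 faults the threshold is 300, so nodes 0 and 1 end up in the pseudo-interleaving set while nodes 2 and 3 do not.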
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
-	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
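The final return statement is the ratio test from the comment, cross-multiplied so it never divides by a possibly-zero fault count. A small userspace sketch of the same comparison may help; the helper name prefer_dst() and the fault counts are invented for the example:

/* Illustration of the cross-multiplied 3/4 hysteresis test; numbers are made up. */
#include <stdbool.h>
#include <stdio.h>

static bool prefer_dst(unsigned long cpu_dst, unsigned long mem_dst,
		       unsigned long cpu_src, unsigned long mem_src)
{
	/* Same as (cpu_dst/mem_dst) * 3/4 > (cpu_src/mem_src), without division. */
	return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
}

int main(void)
{
	/* dst does twice the CPU work of src while holding the same memory: migrate. */
	printf("%d\n", prefer_dst(800, 500, 400, 500));	/* prints 1 */
	/* Equal cpu/mem ratios fall inside the hysteresis band: leave the page alone. */
	printf("%d\n", prefer_dst(500, 500, 400, 400));	/* prints 0 */
	return 0;
}

The 3/4 factor means the destination's CPU-to-memory ratio must exceed the source's by a third before a page moves, which damps ping-ponging between nodes with similar usage.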
@@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * multiple NUMA nodes; in order to better consolidate the group,
 	 * we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-	    nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
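As a worked example with made-up counts: if the first pass finds max_faults == 900 and the per-node CPU fault counts are 900, 350, 280 and 40, the second pass counts the two nodes whose faults, multiplied by ACTIVE_NODE_FRACTION, exceed 900, leaving the group with max_faults_cpu == 900 and active_nodes == 2.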
@@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 		return;
 
 	atomic_set(&grp->refcount, 1);
+	grp->active_nodes = 1;
+	grp->max_faults_cpu = 0;
 	spin_lock_init(&grp->lock);
 	grp->gid = p->pid;
 	/* Second half of the array tracks nids where faults happen */
 	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-	node_set(task_node(current), grp->active_nodes);
-
 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 		grp->faults[i] = p->numa_faults[i];
 
@@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-	    node_isset(cpu_node, p->numa_group->active_nodes) &&
-	    node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+	    numa_is_active_node(cpu_node, ng) &&
+	    numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);
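For example (hypothetical workload): once a group is pseudo-interleaved across nodes 0 and 1, with active_nodes == 2 and both nodes passing numa_is_active_node(), a hinting fault taken by a CPU on node 0 against memory on node 1 is accounted as local, so the scan-rate logic sees mostly local faults and keeps backing off instead of treating the settled cross-node accesses as remote traffic.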