@@ -873,7 +873,6 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
- struct list_head task_list;

struct rcu_head rcu;
nodemask_t active_nodes;
@@ -901,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
return p->numa_group ? p->numa_group->gid : 0;
}

-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
- return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;

- return p->numa_faults_memory[task_faults_idx(nid, 0)] +
- p->numa_faults_memory[task_faults_idx(nid, 1)];
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
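The new task_faults_idx() collapses what used to be four separate per-task arrays (memory and cpu faults plus their scan buffers) into one allocation indexed by statistic type. A minimal user-space sketch of the index math follows; the enum values, the two-node machine, and the standalone setup are illustrative assumptions, not the kernel's actual definitions (the real enum is numa_faults_stats in the scheduler headers):

```c
/*
 * Standalone sketch (not kernel code): one flat array holds all four
 * fault statistics once task_faults_idx() takes the stat type.
 */
#include <stdio.h>

#define NR_NUMA_HINT_FAULT_TYPES 2      /* two fault types per node */

enum numa_faults_stats { NUMA_MEM, NUMA_CPU, NUMA_MEMBUF, NUMA_CPUBUF };

static int nr_node_ids = 2;             /* pretend two-node machine */

static int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
        return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

int main(void)
{
        /* NUMA_MEM entries land in [0..3], NUMA_CPU in [4..7], buffers follow. */
        printf("NUMA_MEM    nid=1 priv=1: %d\n", task_faults_idx(NUMA_MEM, 1, 1));
        printf("NUMA_CPU    nid=0 priv=0: %d\n", task_faults_idx(NUMA_CPU, 0, 0));
        printf("NUMA_MEMBUF nid=1 priv=1: %d\n", task_faults_idx(NUMA_MEMBUF, 1, 1));
        return 0;
}
```

With the stat type folded into the index, the averaged counters and the per-scan buffers live in disjoint regions of the same array, which is what lets the hand-carved pointer offsets be deleted later in this patch.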
@@ -920,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;

- return p->numa_group->faults[task_faults_idx(nid, 0)] +
- p->numa_group->faults[task_faults_idx(nid, 1)];
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(nid, 0)] +
- group->faults_cpu[task_faults_idx(nid, 1)];
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
}
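score_nearby_nodes() is what makes task_weight() and group_weight() topology aware: on a backplane system only nodes within maxdist are counted, while on a glueless mesh every other node contributes, scaled down linearly with distance. A rough user-space sketch of just that scaling is below; the SLIT-style distances (local 10, remote 20/30/40 with 40 the maximum) and the fault count are made up for illustration:

```c
/*
 * Sketch of the glueless-mesh weighting only; the backplane case and all
 * kernel types are omitted.
 */
#include <stdio.h>

#define LOCAL_DISTANCE 10

int main(void)
{
        int sched_max_numa_distance = 40;
        int dist[] = { 20, 30, 40 };            /* distances to other nodes */
        unsigned long faults = 1000;            /* faults seen on each node */

        for (int i = 0; i < 3; i++) {
                unsigned long f = faults;

                /* Nodes at the maximum distance are skipped entirely. */
                if (dist[i] == sched_max_numa_distance)
                        continue;

                f *= sched_max_numa_distance - dist[i];
                f /= sched_max_numa_distance - LOCAL_DISTANCE;
                printf("dist %d contributes %lu of %lu faults\n",
                       dist[i], f, faults);
        }
        return 0;
}
```

With these numbers a node one hop away (distance 20) contributes 666 of its 1000 faults and a node two hops away (distance 30) contributes 333, so nearby activity pulls the task noticeably harder than far-away activity.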

/*
@@ -936,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;

- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;

total_faults = p->total_numa_faults;
@@ -948,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
if (!total_faults)
return 0;

- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}

-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
return 0;

- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
+ return 0;
+
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}

bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1089,6 +1174,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;

int imbalance_pct;
+ int dist;

struct task_struct *best_task;
long best_imp;
@@ -1168,6 +1254,7 @@ static void task_numa_compare(struct task_numa_env *env,
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+ int dist = env->dist;

rcu_read_lock();

@@ -1208,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
@@ -1223,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
* instead.
*/
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}

@@ -1326,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;

/*
@@ -1354,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}

- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);

/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);

- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;

+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;

+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1431,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;

/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
return;

/* Periodically retry migrating the task to the preferred node */
@@ -1580,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
return delta;
}

+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ nodes = max_group;
+ }
+ return nid;
+}
+
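preferred_group_nid() turns the flat "node with the most faults" choice into a hierarchical search on backplane systems: first pick the group of nodes with the most group faults, then narrow the search to nodes inside that group, and repeat at ever smaller distances until a single node remains. A toy user-space model of that loop follows; the 4-node distance table, the fault counts, and the use of plain bitmasks instead of nodemask_t are all invented for illustration:

```c
/*
 * Model of the backplane search: two 2-node "islands", distance 10 on-node,
 * 20 inside an island, 40 across the backplane.
 */
#include <stdio.h>

#define NR_NODES 4
#define LOCAL_DISTANCE 10

static const int distance[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};
static const unsigned long faults[NR_NODES] = { 100, 50, 300, 200 };

int main(void)
{
        unsigned int nodes = 0xf;       /* all nodes online */
        int max_dist = 40, nid = 0;

        for (int dist = max_dist; dist > LOCAL_DISTANCE; dist--) {
                unsigned long max_faults = 0;
                unsigned int max_group = 0;

                for (int a = 0; a < NR_NODES; a++) {
                        unsigned long f = 0;
                        unsigned int this_group = 0;

                        if (!(nodes & (1u << a)))
                                continue;

                        /* Group a with every remaining node closer than dist. */
                        for (int b = 0; b < NR_NODES; b++) {
                                if ((nodes & (1u << b)) &&
                                    distance[a][b] < dist) {
                                        f += faults[b];
                                        this_group |= 1u << b;
                                        nodes &= ~(1u << b);
                                }
                        }
                        if (f > max_faults) {
                                max_faults = f;
                                max_group = this_group;
                                nid = a;        /* final round: single node */
                        }
                }
                nodes = max_group;      /* descend into the busiest group */
        }
        printf("preferred nid: %d\n", nid);
        return 0;
}
```

With these numbers the island {2,3} wins with 500 group faults, and node 2 wins inside it with 300, so the model prints "preferred nid: 2", which is the consolidation behaviour the comment above describes.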
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1607,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)

/* Find the node with the highest number of faults */
for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
- int priv, i;
+ int priv;

for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;

- i = task_faults_idx(nid, priv);
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);

/* Decay existing window, copy faults since last scan */
- diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
- fault_types[priv] += p->numa_faults_buffer_memory[i];
- p->numa_faults_buffer_memory[i] = 0;
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;

/*
* Normalize the faults_from, so all tasks in a group
@@ -1628,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
- f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
- f_diff = f_weight - p->numa_faults_cpu[i] / 2;
- p->numa_faults_buffer_cpu[i] = 0;
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;

- p->numa_faults_memory[i] += diff;
- p->numa_faults_cpu[i] += f_diff;
- faults += p->numa_faults_memory[i];
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->faults_cpu[i] += f_diff;
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
+ group_faults += p->numa_group->faults[mem_idx];
}
}
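The per-stat offsets keep the existing decay scheme intact: each scan period, half of the long-term counter is dropped and the freshly buffered faults are added, so recent memory accesses dominate the placement decision. A tiny sketch of that running average, with made-up per-scan fault counts, mirrors the diff = membuf - mem/2 update above:

```c
/* Illustration of the decaying fault average (not kernel code). */
#include <stdio.h>

int main(void)
{
        long mem = 0;                           /* averaged counter */
        long scans[] = { 400, 400, 0, 0 };      /* buffered faults per scan */

        for (int i = 0; i < 4; i++) {
                long diff = scans[i] - mem / 2; /* decay + new faults */
                mem += diff;
                printf("scan %d: mem = %ld\n", i, mem);
        }
        /* prints 400, 600, 300, 150: old faults fade away geometrically */
        return 0;
}
```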

@@ -1662,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = max_group_nid;
+ max_nid = preferred_group_nid(p, max_group_nid);
}

if (max_faults) {
@@ -1705,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,

atomic_set(&grp->refcount, 1);
spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1714,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
node_set(task_node(current), grp->active_nodes);

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] = p->numa_faults_memory[i];
+ grp->faults[i] = p->numa_faults[i];

grp->total_faults = p->total_numa_faults;

- list_add(&p->numa_entry, &grp->task_list);
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
@@ -1773,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock_irq(&my_grp->lock, &grp->lock);

for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults_memory[i];
- grp->faults[i] += p->numa_faults_memory[i];
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;

- list_move(&p->numa_entry, &grp->task_list);
my_grp->nr_tasks--;
grp->nr_tasks++;

@@ -1799,27 +1996,23 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults_memory;
+ void *numa_faults = p->numa_faults;
unsigned long flags;
int i;

if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;

- list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}

- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
- p->numa_faults_cpu= NULL;
- p->numa_faults_buffer_cpu = NULL;
+ p->numa_faults = NULL;
kfree(numa_faults);
}

@@ -1842,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
return;

/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults_memory)) {
- int size = sizeof(*p->numa_faults_memory) *
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

- p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults_memory)
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
return;

- BUG_ON(p->numa_faults_buffer_memory);
- /*
- * The averaged statistics, shared & private, memory & cpu,
- * occupy the first half of the array. The second half of the
- * array is for current counters, which are averaged into the
- * first set by task_numa_placement.
- */
- p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
- p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
- p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1899,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;

- p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
- p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}

@@ -4469,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else {
+ } else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
@@ -4547,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;

- if (p->nr_cpus_allowed == 1)
- return prev_cpu;
-
if (sd_flag & SD_BALANCE_WAKE)
want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));

@@ -5189,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;

- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -5228,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;

- if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return false;

src_nid = cpu_to_node(env->src_cpu);
@@ -6172,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
+ sds->local_stat.group_has_free_capacity) {
sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ sgs->group_type = group_classify(sg, sgs);
+ }

if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;