@@ -887,6 +887,11 @@ struct numa_group {
 	struct rcu_head rcu;
 	nodemask_t active_nodes;
 	unsigned long total_faults;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
 	unsigned long *faults_cpu;
 	unsigned long faults[0];
 };
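(Annotation, not part of the patch.) The comment above ties faults_cpu to the flexible faults[] array at the end of the struct. As a rough userspace sketch, assuming the single-allocation layout used for numa_group elsewhere in kernel/sched/fair.c, faults_cpu simply points at the second half of that array; the struct, field names and sizes below are stand-ins for illustration only:

/*
 * Illustration only (userspace stand-ins, not the kernel code): one
 * allocation holds both counter sets; faults_cpu points into the
 * second half of the flexible faults[] array.
 */
#include <stdlib.h>

struct numa_group_sketch {
	unsigned long total_faults;
	unsigned long *faults_cpu;	/* points into faults[] below */
	unsigned long faults[];		/* 2*nr_nodes memory + 2*nr_nodes cpu */
};

struct numa_group_sketch *alloc_group_sketch(int nr_nodes)
{
	struct numa_group_sketch *grp;

	grp = calloc(1, sizeof(*grp) + 4 * nr_nodes * sizeof(unsigned long));
	if (grp)
		grp->faults_cpu = grp->faults + 2 * nr_nodes;
	return grp;
}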
@@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p,
 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+	u64 runtime, delta, now;
+	/* Use the start of this time slice to avoid calculations. */
+	now = p->se.exec_start;
+	runtime = p->se.sum_exec_runtime;
+
+	if (p->last_task_numa_placement) {
+		delta = runtime - p->last_sum_exec_runtime;
+		*period = now - p->last_task_numa_placement;
+	} else {
+		delta = p->se.avg.runnable_avg_sum;
+		*period = p->se.avg.runnable_avg_period;
+	}
+
+	p->last_sum_exec_runtime = runtime;
+	p->last_task_numa_placement = now;
+
+	return delta;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
 	unsigned long fault_types[2] = { 0, 0 };
+	unsigned long total_faults;
+	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
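(Annotation, not part of the patch.) The value returned by numa_get_avg_runtime() and the interval written through *period behave as a (CPU time used, wall time elapsed) pair for the last placement cycle, and task_numa_placement() later converts their ratio into 16.16 fixed point. A minimal sketch of that conversion with invented sample values; the helper name is hypothetical:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: CPU-use fraction in 16.16 fixed point, mirroring
 * the later "runtime << 16 / (period + 1)" step; the +1 avoids a divide
 * by zero on a task's very first placement pass.
 */
uint64_t cpu_use_fixed16(uint64_t runtime_ns, uint64_t period_ns)
{
	return (runtime_ns << 16) / (period_ns + 1);
}

int main(void)
{
	/* invented sample: 450 ms of CPU time over a 1 s placement interval */
	uint64_t use = cpu_use_fixed16(450000000ULL, 1000000000ULL);

	printf("cpu use = %llu/65536 (~%.2f)\n",
	       (unsigned long long)use, use / 65536.0);
	return 0;
}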
@@ -1459,6 +1494,10 @@ static void task_numa_placement(struct task_struct *p)
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 
+	total_faults = p->numa_faults_locality[0] +
+		       p->numa_faults_locality[1];
+	runtime = numa_get_avg_runtime(p, &period);
+
 	/* If the task is part of a group prevent parallel updates to group stats */
 	if (p->numa_group) {
 		group_lock = &p->numa_group->lock;
@@ -1471,7 +1510,7 @@ static void task_numa_placement(struct task_struct *p)
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
-			long diff, f_diff;
+			long diff, f_diff, f_weight;
 
 			i = task_faults_idx(nid, priv);
 			diff = -p->numa_faults_memory[i];
@@ -1483,8 +1522,18 @@ static void task_numa_placement(struct task_struct *p)
 			fault_types[priv] += p->numa_faults_buffer_memory[i];
 			p->numa_faults_buffer_memory[i] = 0;
 
+			/*
+			 * Normalize the faults_from, so all tasks in a group
+			 * count according to CPU use, instead of by the raw
+			 * number of faults. Tasks with little runtime have
+			 * little over-all impact on throughput, and thus their
+			 * faults are less important.
+			 */
+			f_weight = div64_u64(runtime << 16, period + 1);
+			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+				   (total_faults + 1);
 			p->numa_faults_cpu[i] >>= 1;
-			p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i];
+			p->numa_faults_cpu[i] += f_weight;
 			p->numa_faults_buffer_cpu[i] = 0;
 
 			faults += p->numa_faults_memory[i];