@@ -887,6 +887,11 @@ struct numa_group {
 	struct rcu_head rcu;
 	nodemask_t active_nodes;
 	unsigned long total_faults;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
 	unsigned long *faults_cpu;
 	unsigned long faults[0];
 };
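(Annotation, not part of the patch.) The comment above ties faults_cpu to the flexible faults[] array at the end of the struct. As a rough userspace sketch, assuming the single-allocation layout used for numa_group elsewhere in kernel/sched/fair.c, faults_cpu simply points at the second half of that array; the struct, field names and sizes below are stand-ins for illustration only:

/*
 * Illustration only (userspace stand-ins, not the kernel code): one
 * allocation holds both counter sets; faults_cpu points into the
 * second half of the flexible faults[] array.
 */
#include <stdlib.h>

struct numa_group_sketch {
	unsigned long total_faults;
	unsigned long *faults_cpu;	/* points into faults[] below */
	unsigned long faults[];		/* 2*nr_nodes memory + 2*nr_nodes cpu */
};

struct numa_group_sketch *alloc_group_sketch(int nr_nodes)
{
	struct numa_group_sketch *grp;

	grp = calloc(1, sizeof(*grp) + 4 * nr_nodes * sizeof(unsigned long));
	if (grp)
		grp->faults_cpu = grp->faults + 2 * nr_nodes;
	return grp;
}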
@@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p,
 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+	u64 runtime, delta, now;
+	/* Use the start of this time slice to avoid calculations. */
+	now = p->se.exec_start;
+	runtime = p->se.sum_exec_runtime;
+
+	if (p->last_task_numa_placement) {
+		delta = runtime - p->last_sum_exec_runtime;
+		*period = now - p->last_task_numa_placement;
+	} else {
+		delta = p->se.avg.runnable_avg_sum;
+		*period = p->se.avg.runnable_avg_period;
+	}
+
+	p->last_sum_exec_runtime = runtime;
+	p->last_task_numa_placement = now;
+
+	return delta;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
 	unsigned long fault_types[2] = { 0, 0 };
+	unsigned long total_faults;
+	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
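(Annotation, not part of the patch.) The value returned by numa_get_avg_runtime() and the interval written through *period behave as a (CPU time used, wall time elapsed) pair for the last placement cycle, and task_numa_placement() later converts their ratio into 16.16 fixed point. A minimal sketch of that conversion with invented sample values; the helper name is hypothetical:

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: CPU-use fraction in 16.16 fixed point, mirroring
 * the later "runtime << 16 / (period + 1)" step; the +1 avoids a divide
 * by zero on a task's very first placement pass.
 */
uint64_t cpu_use_fixed16(uint64_t runtime_ns, uint64_t period_ns)
{
	return (runtime_ns << 16) / (period_ns + 1);
}

int main(void)
{
	/* invented sample: 450 ms of CPU time over a 1 s placement interval */
	uint64_t use = cpu_use_fixed16(450000000ULL, 1000000000ULL);

	printf("cpu use = %llu/65536 (~%.2f)\n",
	       (unsigned long long)use, use / 65536.0);
	return 0;
}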
@@ -1459,6 +1494,10 @@ static void task_numa_placement(struct task_struct *p)
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 
+	total_faults = p->numa_faults_locality[0] +
+		       p->numa_faults_locality[1];
+	runtime = numa_get_avg_runtime(p, &period);
+
 	/* If the task is part of a group prevent parallel updates to group stats */
 	if (p->numa_group) {
 		group_lock = &p->numa_group->lock;
@@ -1471,7 +1510,7 @@ static void task_numa_placement(struct task_struct *p)
 		int priv, i;
 
 		for (priv = 0; priv < 2; priv++) {
-			long diff, f_diff;
+			long diff, f_diff, f_weight;
 
 			i = task_faults_idx(nid, priv);
 			diff = -p->numa_faults_memory[i];
@@ -1483,8 +1522,18 @@ static void task_numa_placement(struct task_struct *p)
 			fault_types[priv] += p->numa_faults_buffer_memory[i];
 			p->numa_faults_buffer_memory[i] = 0;
 
+			/*
+			 * Normalize the faults_from, so all tasks in a group
+			 * count according to CPU use, instead of by the raw
+			 * number of faults. Tasks with little runtime have
+			 * little over-all impact on throughput, and thus their
+			 * faults are less important.
+			 */
+			f_weight = div64_u64(runtime << 16, period + 1);
+			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+				   (total_faults + 1);
 			p->numa_faults_cpu[i] >>= 1;
-			p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i];
+			p->numa_faults_cpu[i] += f_weight;
 			p->numa_faults_buffer_cpu[i] = 0;
 
 			faults += p->numa_faults_memory[i];