@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;

- return 0;
+ return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}

-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/

/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;

while (se_depth > pse_depth) {
se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
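
The hunks above drop the per-call depth_se() walk in favour of a cached se->depth and make is_same_group() hand back the shared cfs_rq instead of a boolean. As a rough illustration only, here is a small userspace sketch of that matching walk; the types and names below are invented stand-ins, not the kernel's:

#include <stdio.h>
#include <stddef.h>

/* Toy stand-ins (not the kernel types); names only mirror the patch. */
struct queue { int id; };
struct entity {
        int depth;              /* cached depth, as introduced above */
        struct entity *parent;  /* NULL for a top-level entity       */
        struct queue *cfs_rq;   /* queue this entity is enqueued on  */
};

/* New-style same-group test: return the shared queue or NULL. */
static struct queue *same_group(struct entity *a, struct entity *b)
{
        return a->cfs_rq == b->cfs_rq ? a->cfs_rq : NULL;
}

/* Walk both entities up until they share a queue, like find_matching_se(). */
static void find_matching(struct entity **se, struct entity **pse)
{
        while ((*se)->depth > (*pse)->depth)
                *se = (*se)->parent;
        while ((*pse)->depth > (*se)->depth)
                *pse = (*pse)->parent;
        while (!same_group(*se, *pse)) {
                *se = (*se)->parent;
                *pse = (*pse)->parent;
        }
}

int main(void)
{
        struct queue rootq = { 0 }, aq = { 1 };
        struct entity a  = { 0, NULL, &rootq };
        struct entity b  = { 0, NULL, &rootq };
        struct entity a1 = { 1, &a,   &aq   };
        struct entity *x = &a1, *y = &b;

        find_matching(&x, &y);
        printf("matched on queue %d at depth %d\n", x->cfs_rq->id, x->depth);
        return 0;
}
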
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
struct list_head task_list;

struct rcu_head rcu;
+ nodemask_t active_nodes;
unsigned long total_faults;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
unsigned long faults[0];
};

+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
pid_t task_numa_group_id(struct task_struct *p)
{
return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)

static inline int task_faults_idx(int nid, int priv)
{
- return 2 * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults)
+ if (!p->numa_faults_memory)
return 0;

- return p->numa_faults[task_faults_idx(nid, 0)] +
- p->numa_faults[task_faults_idx(nid, 1)];
+ return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+ p->numa_faults_memory[task_faults_idx(nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
p->numa_group->faults[task_faults_idx(nid, 1)];
}

+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(nid, 0)] +
+ group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
/*
* These return the fraction of accesses done by a particular task, or
* task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
{
unsigned long total_faults;

- if (!p->numa_faults)
+ if (!p->numa_faults_memory)
return 0;

total_faults = p->total_numa_faults;
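
task_faults_idx() above lays the per-node statistics out as a flat array with NR_NUMA_HINT_FAULT_TYPES slots per node (private and shared). A small standalone sketch of that indexing scheme, using a made-up node count instead of nr_node_ids:

#include <stdio.h>

#define NR_NUMA_HINT_FAULT_TYPES 2          /* shared or private        */

static int faults_idx(int nid, int priv)
{
        return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}

int main(void)
{
        enum { NR_NODES = 4 };              /* stand-in for nr_node_ids */
        unsigned long faults[NR_NUMA_HINT_FAULT_TYPES * NR_NODES] = { 0 };
        int nid;

        faults[faults_idx(1, 0)] = 10;      /* private faults on node 1 */
        faults[faults_idx(1, 1)] = 30;      /* shared faults on node 1  */

        for (nid = 0; nid < NR_NODES; nid++) {
                unsigned long total = faults[faults_idx(nid, 0)] +
                                      faults[faults_idx(nid, 1)];
                printf("node %d: %lu faults\n", nid, total);
        }
        return 0;
}

group_faults_cpu() above sums the same two slots, only from the group's faults_cpu array instead of the task's memory-fault array.
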
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
}

+bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = p->numa_group;
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability; the chance of
+ * getting the same result twice in a row, given these samples are
+ * fully independent, is then P(p)^2, provided our sample period is
+ * sufficiently short compared to the usage pattern.
+ *
+ * This quadratic squishes small probabilities, making it less likely
+ * that we act on an unlikely task<->page relation.
+ */
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Do not migrate if the destination is not a node that
+ * is actively used by this numa group.
+ */
+ if (!node_isset(dst_nid, ng->active_nodes))
+ return false;
+
+ /*
+ * Source is a node that is not actively used by this
+ * numa group, while the destination is. Migrate.
+ */
+ if (!node_isset(src_nid, ng->active_nodes))
+ return true;
+
+ /*
+ * Both source and destination are nodes in active
+ * use by this numa group. Maximize memory bandwidth
+ * by migrating from more heavily used groups, to less
+ * heavily used ones, spreading the load around.
+ * Use a 1/4 hysteresis to avoid spurious page movement.
+ */
+ return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
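
should_numa_migrate_memory() combines the two-stage cpupid filter with the active-node checks and the 1/4 hysteresis. A deliberately simplified userspace sketch of that decision flow follows; it uses plain node ids in place of cpupids, drops the private-fault and missing-group shortcuts, and all the fault numbers and arrays are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/* One toy "page" remembers which node last faulted on it, mimicking the
 * cpupid exchange above. Thresholds follow the patch; everything else is
 * made up. */
struct toy_page { int last_nid; };            /* -1 = never faulted       */

static unsigned long grp_faults[4] = { 40, 10, 0, 0 };   /* per node      */
static bool node_active[4]         = { true, true, false, false };

static bool should_migrate(struct toy_page *page, int src_nid, int dst_nid)
{
        int last_nid = page->last_nid;

        page->last_nid = dst_nid;             /* like page_cpupid_xchg_last */

        /* Two-stage filter: only act when two faults in a row agree. */
        if (last_nid != -1 && last_nid != dst_nid)
                return false;

        /* Destination must be a node the group actively runs on. */
        if (!node_active[dst_nid])
                return false;

        /* Source not active but destination is: migrate. */
        if (!node_active[src_nid])
                return true;

        /* Both active: move only with a 1/4 hysteresis. */
        return grp_faults[dst_nid] < grp_faults[src_nid] * 3 / 4;
}

int main(void)
{
        struct toy_page page = { -1 };

        /* A fault from node 0 targeting node 1. */
        printf("first fault -> migrate? %d\n", should_migrate(&page, 0, 1));
        /* A single disagreeing fault is filtered out. */
        printf("stray fault -> migrate? %d\n", should_migrate(&page, 1, 0));
        return 0;
}
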
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
static void numa_migrate_preferred(struct task_struct *p)
{
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;

/* Periodically retry migrating the task to the preferred node */
@@ -1281,6 +1341,38 @@ static void numa_migrate_preferred(struct task_struct *p)
task_numa_migrate(p);
}

+/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations. To prevent flip-flopping and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (!node_isset(nid, numa_group->active_nodes)) {
+ if (faults > max_faults * 6 / 16)
+ node_set(nid, numa_group->active_nodes);
+ } else if (faults < max_faults * 3 / 16)
+ node_clear(nid, numa_group->active_nodes);
+ }
+}
+
/*
* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
* increments. The more local the fault statistics are, the higher the scan
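
update_numa_active_node_mask() only adds a node once it crosses 6/16 of the busiest node's faults and only drops it again below 3/16. A standalone sketch of the same hysteresis, with a plain bool array standing in for nodemask_t and made-up fault counts:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Mirror of the update loop above: add a node above 6/16 of the maximum,
 * drop it only below 3/16, so nodes near the boundary do not flip-flop. */
static void update_active_nodes(const unsigned long faults[NR_NODES],
                                bool active[NR_NODES])
{
        unsigned long max_faults = 0;
        int nid;

        for (nid = 0; nid < NR_NODES; nid++)
                if (faults[nid] > max_faults)
                        max_faults = faults[nid];

        for (nid = 0; nid < NR_NODES; nid++) {
                if (!active[nid]) {
                        if (faults[nid] > max_faults * 6 / 16)
                                active[nid] = true;
                } else if (faults[nid] < max_faults * 3 / 16) {
                        active[nid] = false;
                }
        }
}

int main(void)
{
        unsigned long faults[NR_NODES] = { 160, 70, 25, 0 };
        bool active[NR_NODES] = { true, false, true, false };
        int nid;

        update_active_nodes(faults, active);
        for (nid = 0; nid < NR_NODES; nid++)
                printf("node %d active: %d\n", nid, active[nid]);
        return 0;
}
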
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}

+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+ } else {
+ delta = p->se.avg.runnable_avg_sum;
+ *period = p->se.avg.runnable_avg_period;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
spinlock_t *group_lock = NULL;

seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
p->numa_scan_seq = seq;
p->numa_scan_period_max = task_scan_max(p);

+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
/* If the task is part of a group prevent parallel updates to group stats */
if (p->numa_group) {
group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
unsigned long faults = 0, group_faults = 0;
int priv, i;

- for (priv = 0; priv < 2; priv++) {
- long diff;
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;

i = task_faults_idx(nid, priv);
- diff = -p->numa_faults[i];

/* Decay existing window, copy faults since last scan */
- p->numa_faults[i] >>= 1;
- p->numa_faults[i] += p->numa_faults_buffer[i];
- fault_types[priv] += p->numa_faults_buffer[i];
- p->numa_faults_buffer[i] = 0;
+ diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+ fault_types[priv] += p->numa_faults_buffer_memory[i];
+ p->numa_faults_buffer_memory[i] = 0;

- faults += p->numa_faults[i];
- diff += p->numa_faults[i];
+ /*
+ * Normalize the CPU-locality faults, so all tasks in a group
+ * count according to CPU use, instead of by the raw number
+ * of faults. Tasks with little runtime have little overall
+ * impact on throughput, and thus their faults are less
+ * important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+ p->numa_faults_buffer_cpu[i] = 0;
+
+ p->numa_faults_memory[i] += diff;
+ p->numa_faults_cpu[i] += f_diff;
+ faults += p->numa_faults_memory[i];
p->total_numa_faults += diff;
if (p->numa_group) {
/* safe because we can only change our own group */
p->numa_group->faults[i] += diff;
+ p->numa_group->faults_cpu[i] += f_diff;
p->numa_group->total_faults += diff;
group_faults += p->numa_group->faults[i];
}
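
The loop above folds each scan window into a decaying average (add the new window, subtract half of the old value) and weights the CPU-side samples by how much the task actually ran over the placement period. A standalone arithmetic sketch with invented numbers, using the same u64 fixed-point step as the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Decaying average: diff = new_window - old/2, then old += diff,
         * which is the same as old = old/2 + new_window. */
        long mem_avg = 200, mem_new = 60;
        long diff = mem_new - mem_avg / 2;

        mem_avg += diff;                       /* 100 + 60 = 160 */

        /* CPU-use weighting: scale this window's CPU faults by the fraction
         * of time the task actually ran since the last placement cycle. */
        uint64_t runtime = 40 * 1000000ull;    /* ran 40ms...              */
        uint64_t period  = 100 * 1000000ull;   /* ...out of a 100ms window */
        long cpu_new = 50, total_faults = 100, cpu_avg = 80;

        uint64_t f_weight = (runtime << 16) / (period + 1);
        f_weight = f_weight * cpu_new / (total_faults + 1);
        long f_diff = (long)f_weight - cpu_avg / 2;

        cpu_avg += f_diff;

        printf("memory avg %ld, cpu avg %ld (f_weight %llu)\n",
               mem_avg, cpu_avg, (unsigned long long)f_weight);
        return 0;
}
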
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
|
|
|
update_task_scan_period(p, fault_types[0], fault_types[1]);
|
|
|
|
|
|
if (p->numa_group) {
|
|
|
+ update_numa_active_node_mask(p->numa_group);
|
|
|
/*
|
|
|
* If the preferred task and group nids are different,
|
|
|
* iterate over the nodes again to find the best place.
|
|
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
|
|
|
|
|
if (unlikely(!p->numa_group)) {
|
|
|
unsigned int size = sizeof(struct numa_group) +
|
|
|
- 2*nr_node_ids*sizeof(unsigned long);
|
|
|
+ 4*nr_node_ids*sizeof(unsigned long);
|
|
|
|
|
|
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
|
|
|
if (!grp)
|
|
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
|
|
spin_lock_init(&grp->lock);
|
|
|
INIT_LIST_HEAD(&grp->task_list);
|
|
|
grp->gid = p->pid;
|
|
|
+ /* Second half of the array tracks nids where faults happen */
|
|
|
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
|
|
|
+ nr_node_ids;
|
|
|
+
|
|
|
+ node_set(task_node(current), grp->active_nodes);
|
|
|
|
|
|
- for (i = 0; i < 2*nr_node_ids; i++)
|
|
|
- grp->faults[i] = p->numa_faults[i];
|
|
|
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
|
|
|
+ grp->faults[i] = p->numa_faults_memory[i];
|
|
|
|
|
|
grp->total_faults = p->total_numa_faults;
|
|
|
|
|
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
|
|
|
|
|
double_lock(&my_grp->lock, &grp->lock);
|
|
|
|
|
|
- for (i = 0; i < 2*nr_node_ids; i++) {
|
|
|
- my_grp->faults[i] -= p->numa_faults[i];
|
|
|
- grp->faults[i] += p->numa_faults[i];
|
|
|
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
|
|
|
+ my_grp->faults[i] -= p->numa_faults_memory[i];
|
|
|
+ grp->faults[i] += p->numa_faults_memory[i];
|
|
|
}
|
|
|
my_grp->total_faults -= p->total_numa_faults;
|
|
|
grp->total_faults += p->total_numa_faults;
|
|
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
|
|
|
{
|
|
|
struct numa_group *grp = p->numa_group;
|
|
|
int i;
|
|
|
- void *numa_faults = p->numa_faults;
|
|
|
+ void *numa_faults = p->numa_faults_memory;
|
|
|
|
|
|
if (grp) {
|
|
|
spin_lock(&grp->lock);
|
|
|
- for (i = 0; i < 2*nr_node_ids; i++)
|
|
|
- grp->faults[i] -= p->numa_faults[i];
|
|
|
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
|
|
|
+ grp->faults[i] -= p->numa_faults_memory[i];
|
|
|
grp->total_faults -= p->total_numa_faults;
|
|
|
|
|
|
list_del(&p->numa_entry);
|
|
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
|
|
|
put_numa_group(grp);
|
|
|
}
|
|
|
|
|
|
- p->numa_faults = NULL;
|
|
|
- p->numa_faults_buffer = NULL;
|
|
|
+ p->numa_faults_memory = NULL;
|
|
|
+ p->numa_faults_buffer_memory = NULL;
|
|
|
+ p->numa_faults_cpu = NULL;
|
|
|
+ p->numa_faults_buffer_cpu = NULL;
|
|
|
kfree(numa_faults);
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
* Got a PROT_NONE fault for a page on @node.
|
|
|
*/
|
|
|
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
|
|
|
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
|
|
{
|
|
|
struct task_struct *p = current;
|
|
|
bool migrated = flags & TNF_MIGRATED;
|
|
|
+ int cpu_node = task_node(current);
|
|
|
int priv;
|
|
|
|
|
|
if (!numabalancing_enabled)
|
|
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
return;

/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults)) {
- int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+ if (unlikely(!p->numa_faults_memory)) {
+ int size = sizeof(*p->numa_faults_memory) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

- /* numa_faults and numa_faults_buffer share the allocation */
- p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults)
+ p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults_memory)
return;

- BUG_ON(p->numa_faults_buffer);
- p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+ BUG_ON(p->numa_faults_buffer_memory);
+ /*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+ p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+ p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;

- p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+ p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+ p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
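
The pointer setup above carves one zeroed allocation into four equally sized per-node regions: averaged memory faults, averaged cpu faults, and the two scratch buffers that task_numa_placement() later folds back in. A userspace sketch of that layout, with a made-up node count in place of nr_node_ids:

#include <stdio.h>
#include <stdlib.h>

#define NR_NUMA_HINT_FAULT_TYPES   2
#define NR_NUMA_HINT_FAULT_STATS   (NR_NUMA_HINT_FAULT_TYPES * 2)
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

int main(void)
{
        int nr_node_ids = 4;                  /* stand-in for the real value */
        size_t size = sizeof(unsigned long) *
                      NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;

        /* One zeroed block, then four equally sized per-node regions. */
        unsigned long *faults_memory = calloc(1, size);
        unsigned long *faults_cpu, *faults_buffer_memory, *faults_buffer_cpu;

        if (!faults_memory)
                return 1;

        faults_cpu           = faults_memory + 2 * nr_node_ids;
        faults_buffer_memory = faults_memory + 4 * nr_node_ids;
        faults_buffer_cpu    = faults_memory + 6 * nr_node_ids;

        printf("%zu bytes; regions start at +0, +%td, +%td, +%td longs\n",
               size,
               faults_cpu - faults_memory,
               faults_buffer_memory - faults_memory,
               faults_buffer_cpu - faults_memory);
        free(faults_memory);
        return 0;
}
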
@@ -2219,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
|
|
|
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
|
|
|
}
|
|
|
}
|
|
|
-#else
|
|
|
+
|
|
|
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
|
|
|
+{
|
|
|
+ __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
|
|
|
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
|
|
|
+}
|
|
|
+#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
|
|
|
int force_update) {}
|
|
|
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
|
|
|
struct cfs_rq *cfs_rq) {}
|
|
|
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
|
|
|
-#endif
|
|
|
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
|
|
|
+#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
|
|
static inline void __update_task_entity_contrib(struct sched_entity *se)
|
|
|
{
|
|
@@ -2323,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
|
|
|
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
|
|
|
}
|
|
|
|
|
|
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
|
|
|
-{
|
|
|
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
|
|
|
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
|
|
|
-}
|
|
|
-
|
|
|
/* Add the load generated by se into cfs_rq's child load-average */
|
|
|
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
struct sched_entity *se,
|
|
@@ -2416,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
|
|
|
update_rq_runnable_avg(this_rq, 0);
|
|
|
}
|
|
|
|
|
|
-#else
|
|
|
+static int idle_balance(struct rq *this_rq);
|
|
|
+
|
|
|
+#else /* CONFIG_SMP */
|
|
|
+
|
|
|
static inline void update_entity_load_avg(struct sched_entity *se,
|
|
|
int update_cfs_rq) {}
|
|
|
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
|
|
@@ -2428,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
int sleep) {}
|
|
|
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
|
|
|
int force_update) {}
|
|
|
-#endif
|
|
|
+
|
|
|
+static inline int idle_balance(struct rq *rq)
|
|
|
+{
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+#endif /* CONFIG_SMP */
|
|
|
|
|
|
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
@@ -2578,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
|
|
|
{
|
|
|
for_each_sched_entity(se) {
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- if (cfs_rq->last == se)
|
|
|
- cfs_rq->last = NULL;
|
|
|
- else
|
|
|
+ if (cfs_rq->last != se)
|
|
|
break;
|
|
|
+
|
|
|
+ cfs_rq->last = NULL;
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2589,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
|
|
|
{
|
|
|
for_each_sched_entity(se) {
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- if (cfs_rq->next == se)
|
|
|
- cfs_rq->next = NULL;
|
|
|
- else
|
|
|
+ if (cfs_rq->next != se)
|
|
|
break;
|
|
|
+
|
|
|
+ cfs_rq->next = NULL;
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2600,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
|
|
|
{
|
|
|
for_each_sched_entity(se) {
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- if (cfs_rq->skip == se)
|
|
|
- cfs_rq->skip = NULL;
|
|
|
- else
|
|
|
+ if (cfs_rq->skip != se)
|
|
|
break;
|
|
|
+
|
|
|
+ cfs_rq->skip = NULL;
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -2746,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if it's left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */

/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -2778,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}

-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
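
With this change the running entity, which is not kept in the rbtree, competes with the tree's leftmost entity inside pick_next_entity(). A toy sketch of that comparison follows; the types are invented, the buddy (skip/next/last) handling is left out, and only the vruntime ordering mirrors entity_before():

#include <stdio.h>
#include <stddef.h>

/* Toy entities ordered by vruntime; the running one ("curr") is not kept
 * in the tree, so the pick has to consider it separately, as above. */
struct entity { long vruntime; const char *name; };

static int before(const struct entity *a, const struct entity *b)
{
        return a->vruntime - b->vruntime < 0;
}

/* leftmost may be NULL (empty tree); curr may be NULL (nothing running). */
static struct entity *pick_next(struct entity *leftmost, struct entity *curr)
{
        struct entity *left = leftmost;

        if (!left || (curr && before(curr, left)))
                left = curr;

        return left;      /* the skip/last/next buddy rules are omitted */
}

int main(void)
{
        struct entity a = { 100, "leftmost-in-tree" };
        struct entity c = {  90, "curr" };

        printf("picked: %s\n", pick_next(&a, &c)->name);
        printf("picked: %s\n", pick_next(NULL, &c)->name);
        return 0;
}
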
@@ -3433,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
|
|
|
}
|
|
|
|
|
|
/* conditionally throttle active cfs_rq's from put_prev_entity() */
|
|
|
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
|
|
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
if (!cfs_bandwidth_used())
|
|
|
- return;
|
|
|
+ return false;
|
|
|
|
|
|
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
|
|
|
- return;
|
|
|
+ return false;
|
|
|
|
|
|
/*
|
|
|
* it's possible for a throttled entity to be forced into a running
|
|
|
* state (e.g. set_curr_task), in this case we're finished.
|
|
|
*/
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
- return;
|
|
|
+ return true;
|
|
|
|
|
|
throttle_cfs_rq(cfs_rq);
|
|
|
+ return true;
|
|
|
}
|
|
|
|
|
|
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
|
|
@@ -3558,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
|
|
|
}
|
|
|
|
|
|
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
|
|
|
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
|
|
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
|
|
|
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
|
|
|
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
|
|
|
|
@@ -4213,13 +4400,14 @@ done:
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * sched_balance_self: balance the current task (running on cpu) in domains
|
|
|
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
|
|
|
- * SD_BALANCE_EXEC.
|
|
|
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
|
|
|
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
|
|
|
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
|
|
|
*
|
|
|
- * Balance, ie. select the least loaded group.
|
|
|
+ * Balances load by selecting the idlest cpu in the idlest group, or under
|
|
|
+ * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
|
|
|
*
|
|
|
- * Returns the target CPU number, or the same CPU if no balancing is needed.
|
|
|
+ * Returns the target cpu number.
|
|
|
*
|
|
|
* preempt must be disabled.
|
|
|
*/
|
|
@@ -4494,26 +4682,124 @@ preempt:
|
|
|
set_last_buddy(se);
|
|
|
}
|
|
|
|
|
|
-static struct task_struct *pick_next_task_fair(struct rq *rq)
|
|
|
+static struct task_struct *
|
|
|
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
|
|
|
{
|
|
|
- struct task_struct *p;
|
|
|
struct cfs_rq *cfs_rq = &rq->cfs;
|
|
|
struct sched_entity *se;
|
|
|
+ struct task_struct *p;
|
|
|
+ int new_tasks;
|
|
|
|
|
|
+again:
|
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
if (!cfs_rq->nr_running)
|
|
|
- return NULL;
|
|
|
+ goto idle;
|
|
|
+
|
|
|
+ if (prev->sched_class != &fair_sched_class)
|
|
|
+ goto simple;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
|
|
|
+ * likely that a next task is from the same cgroup as the current.
|
|
|
+ *
|
|
|
+ * Therefore attempt to avoid putting and setting the entire cgroup
|
|
|
+ * hierarchy, only change the part that actually changes.
|
|
|
+ */
|
|
|
+
|
|
|
+ do {
|
|
|
+ struct sched_entity *curr = cfs_rq->curr;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Since we got here without doing put_prev_entity() we also
|
|
|
+ * have to consider cfs_rq->curr. If it is still a runnable
|
|
|
+ * entity, update_curr() will update its vruntime, otherwise
|
|
|
+ * forget we've ever seen it.
|
|
|
+ */
|
|
|
+ if (curr && curr->on_rq)
|
|
|
+ update_curr(cfs_rq);
|
|
|
+ else
|
|
|
+ curr = NULL;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This call to check_cfs_rq_runtime() will do the throttle and
|
|
|
+ * dequeue its entity in the parent(s). Therefore the 'simple'
|
|
|
+ * nr_running test will indeed be correct.
|
|
|
+ */
|
|
|
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
|
|
|
+ goto simple;
|
|
|
+
|
|
|
+ se = pick_next_entity(cfs_rq, curr);
|
|
|
+ cfs_rq = group_cfs_rq(se);
|
|
|
+ } while (cfs_rq);
|
|
|
+
|
|
|
+ p = task_of(se);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Since we haven't yet done put_prev_entity and if the selected task
|
|
|
+ * is a different task than we started out with, try and touch the
|
|
|
+ * least amount of cfs_rqs.
|
|
|
+ */
|
|
|
+ if (prev != p) {
|
|
|
+ struct sched_entity *pse = &prev->se;
|
|
|
+
|
|
|
+ while (!(cfs_rq = is_same_group(se, pse))) {
|
|
|
+ int se_depth = se->depth;
|
|
|
+ int pse_depth = pse->depth;
|
|
|
+
|
|
|
+ if (se_depth <= pse_depth) {
|
|
|
+ put_prev_entity(cfs_rq_of(pse), pse);
|
|
|
+ pse = parent_entity(pse);
|
|
|
+ }
|
|
|
+ if (se_depth >= pse_depth) {
|
|
|
+ set_next_entity(cfs_rq_of(se), se);
|
|
|
+ se = parent_entity(se);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ put_prev_entity(cfs_rq, pse);
|
|
|
+ set_next_entity(cfs_rq, se);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (hrtick_enabled(rq))
|
|
|
+ hrtick_start_fair(rq, p);
|
|
|
+
|
|
|
+ return p;
|
|
|
+simple:
|
|
|
+ cfs_rq = &rq->cfs;
|
|
|
+#endif
|
|
|
+
|
|
|
+ if (!cfs_rq->nr_running)
|
|
|
+ goto idle;
|
|
|
+
|
|
|
+ put_prev_task(rq, prev);
|
|
|
|
|
|
do {
|
|
|
- se = pick_next_entity(cfs_rq);
|
|
|
+ se = pick_next_entity(cfs_rq, NULL);
|
|
|
set_next_entity(cfs_rq, se);
|
|
|
cfs_rq = group_cfs_rq(se);
|
|
|
} while (cfs_rq);
|
|
|
|
|
|
p = task_of(se);
|
|
|
+
|
|
|
if (hrtick_enabled(rq))
|
|
|
hrtick_start_fair(rq, p);
|
|
|
|
|
|
return p;
|
|
|
+
|
|
|
+idle:
|
|
|
+ new_tasks = idle_balance(rq);
|
|
|
+ /*
|
|
|
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
|
|
|
+ * possible for any higher priority task to appear. In that case we
|
|
|
+ * must re-start the pick_next_entity() loop.
|
|
|
+ */
|
|
|
+ if (new_tasks < 0)
|
|
|
+ return RETRY_TASK;
|
|
|
+
|
|
|
+ if (new_tasks > 0)
|
|
|
+ goto again;
|
|
|
+
|
|
|
+ return NULL;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -4751,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
|
|
|
* Is this task likely cache-hot:
|
|
|
*/
|
|
|
static int
|
|
|
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
|
|
|
+task_hot(struct task_struct *p, u64 now)
|
|
|
{
|
|
|
s64 delta;
|
|
|
|
|
@@ -4785,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
|
|
{
|
|
|
int src_nid, dst_nid;
|
|
|
|
|
|
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
|
|
|
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
|
|
|
!(env->sd->flags & SD_NUMA)) {
|
|
|
return false;
|
|
|
}
|
|
@@ -4816,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|
|
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
|
|
|
return false;
|
|
|
|
|
|
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
|
|
|
+ if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
|
|
|
return false;
|
|
|
|
|
|
src_nid = cpu_to_node(env->src_cpu);
|
|
@@ -4912,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
* 2) task is cache cold, or
|
|
|
* 3) too many balance attempts have failed.
|
|
|
*/
|
|
|
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
|
|
|
+ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
|
|
|
if (!tsk_cache_hot)
|
|
|
tsk_cache_hot = migrate_degrades_locality(p, env);
|
|
|
|
|
@@ -5775,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
|
|
|
pwr_now /= SCHED_POWER_SCALE;
|
|
|
|
|
|
/* Amount of load we'd subtract */
|
|
|
- tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
|
|
|
- busiest->group_power;
|
|
|
- if (busiest->avg_load > tmp) {
|
|
|
+ if (busiest->avg_load > scaled_busy_load_per_task) {
|
|
|
pwr_move += busiest->group_power *
|
|
|
min(busiest->load_per_task,
|
|
|
- busiest->avg_load - tmp);
|
|
|
+ busiest->avg_load - scaled_busy_load_per_task);
|
|
|
}
|
|
|
|
|
|
/* Amount of load we'd add */
|
|
@@ -6359,17 +6643,23 @@ out:
|
|
|
* idle_balance is called by schedule() if this_cpu is about to become
|
|
|
* idle. Attempts to pull tasks from other CPUs.
|
|
|
*/
|
|
|
-void idle_balance(int this_cpu, struct rq *this_rq)
|
|
|
+static int idle_balance(struct rq *this_rq)
|
|
|
{
|
|
|
struct sched_domain *sd;
|
|
|
int pulled_task = 0;
|
|
|
unsigned long next_balance = jiffies + HZ;
|
|
|
u64 curr_cost = 0;
|
|
|
+ int this_cpu = this_rq->cpu;
|
|
|
|
|
|
+ idle_enter_fair(this_rq);
|
|
|
+ /*
|
|
|
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
|
|
|
+ * measure the duration of idle_balance() as idle time.
|
|
|
+ */
|
|
|
this_rq->idle_stamp = rq_clock(this_rq);
|
|
|
|
|
|
if (this_rq->avg_idle < sysctl_sched_migration_cost)
|
|
|
- return;
|
|
|
+ goto out;
|
|
|
|
|
|
/*
|
|
|
* Drop the rq->lock, but keep IRQ/preempt disabled.
|
|
@@ -6407,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
|
|
|
interval = msecs_to_jiffies(sd->balance_interval);
|
|
|
if (time_after(next_balance, sd->last_balance + interval))
|
|
|
next_balance = sd->last_balance + interval;
|
|
|
- if (pulled_task) {
|
|
|
- this_rq->idle_stamp = 0;
|
|
|
+ if (pulled_task)
|
|
|
break;
|
|
|
- }
|
|
|
}
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
raw_spin_lock(&this_rq->lock);
|
|
|
|
|
|
+ /*
+ * While browsing the domains, we released the rq lock; a task could
+ * have been enqueued in the meantime.
+ */
|
|
|
+ if (this_rq->cfs.h_nr_running && !pulled_task) {
|
|
|
+ pulled_task = 1;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
|
|
|
/*
|
|
|
* We are going idle. next_balance may be set based on
|
|
@@ -6426,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
|
|
|
|
|
|
if (curr_cost > this_rq->max_idle_balance_cost)
|
|
|
this_rq->max_idle_balance_cost = curr_cost;
|
|
|
+
|
|
|
+out:
|
|
|
+ /* Is there a task of a high priority class? */
|
|
|
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
|
|
|
+ (this_rq->dl.dl_nr_running ||
|
|
|
+ (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
|
|
|
+ pulled_task = -1;
|
|
|
+
|
|
|
+ if (pulled_task) {
|
|
|
+ idle_exit_fair(this_rq);
|
|
|
+ this_rq->idle_stamp = 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ return pulled_task;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -6496,6 +6807,11 @@ out_unlock:
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+static inline int on_null_domain(struct rq *rq)
|
|
|
+{
|
|
|
+ return unlikely(!rcu_dereference_sched(rq->sd));
|
|
|
+}
|
|
|
+
|
|
|
#ifdef CONFIG_NO_HZ_COMMON
|
|
|
/*
|
|
|
* idle load balancing details
|
|
@@ -6550,8 +6866,13 @@ static void nohz_balancer_kick(void)
|
|
|
static inline void nohz_balance_exit_idle(int cpu)
|
|
|
{
|
|
|
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
|
|
|
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
|
|
|
- atomic_dec(&nohz.nr_cpus);
|
|
|
+ /*
+ * Completely isolated CPUs never set their bit in nohz.idle_cpus_mask,
+ * so we must test for it.
+ */
|
|
|
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
|
|
|
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
|
|
|
+ atomic_dec(&nohz.nr_cpus);
|
|
|
+ }
|
|
|
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
|
|
|
}
|
|
|
}
|
|
@@ -6605,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
|
|
|
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
|
|
|
return;
|
|
|
|
|
|
+ /*
|
|
|
+ * If we're a completely isolated CPU, we don't play.
|
|
|
+ */
|
|
|
+ if (on_null_domain(cpu_rq(cpu)))
|
|
|
+ return;
|
|
|
+
|
|
|
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
|
|
|
atomic_inc(&nohz.nr_cpus);
|
|
|
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
|
|
@@ -6867,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
|
|
|
nohz_idle_balance(this_rq, idle);
|
|
|
}
|
|
|
|
|
|
-static inline int on_null_domain(struct rq *rq)
|
|
|
-{
|
|
|
- return !rcu_dereference_sched(rq->sd);
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
|
|
|
*/
|
|
@@ -7036,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
|
|
|
*/
|
|
|
static void switched_to_fair(struct rq *rq, struct task_struct *p)
|
|
|
{
|
|
|
- if (!p->se.on_rq)
|
|
|
+ struct sched_entity *se = &p->se;
|
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
+ /*
+ * Since the real depth could have changed (only the FAIR
+ * class maintains the depth value), reset it properly.
+ */
|
|
|
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
|
|
|
+#endif
|
|
|
+ if (!se->on_rq)
|
|
|
return;
|
|
|
|
|
|
/*
|
|
@@ -7084,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
static void task_move_group_fair(struct task_struct *p, int on_rq)
|
|
|
{
|
|
|
+ struct sched_entity *se = &p->se;
|
|
|
struct cfs_rq *cfs_rq;
|
|
|
+
|
|
|
/*
|
|
|
* If the task was not on the rq at the time of this cgroup movement
|
|
|
* it must have been asleep, sleeping tasks keep their ->vruntime
|
|
@@ -7110,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
|
|
|
* To prevent boost or penalty in the new cfs_rq caused by delta
|
|
|
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
|
|
|
*/
|
|
|
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
|
|
|
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
|
|
|
on_rq = 1;
|
|
|
|
|
|
if (!on_rq)
|
|
|
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
|
|
|
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
|
|
|
set_task_rq(p, task_cpu(p));
|
|
|
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
|
|
|
if (!on_rq) {
|
|
|
- cfs_rq = cfs_rq_of(&p->se);
|
|
|
- p->se.vruntime += cfs_rq->min_vruntime;
|
|
|
+ cfs_rq = cfs_rq_of(se);
|
|
|
+ se->vruntime += cfs_rq->min_vruntime;
|
|
|
#ifdef CONFIG_SMP
|
|
|
/*
|
|
|
* migrate_task_rq_fair() will have removed our previous
|
|
|
* contribution, but we must synchronize for ongoing future
|
|
|
* decay.
|
|
|
*/
|
|
|
- p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
|
|
|
- cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
|
|
|
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
|
|
|
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
|
|
|
#endif
|
|
|
}
|
|
|
}
|
|
@@ -7222,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
|
|
|
if (!se)
|
|
|
return;
|
|
|
|
|
|
- if (!parent)
|
|
|
+ if (!parent) {
|
|
|
se->cfs_rq = &rq->cfs;
|
|
|
- else
|
|
|
+ se->depth = 0;
|
|
|
+ } else {
|
|
|
se->cfs_rq = parent->my_q;
|
|
|
+ se->depth = parent->depth + 1;
|
|
|
+ }
|
|
|
|
|
|
se->my_q = cfs_rq;
|
|
|
/* guarantee group entities always have weight */
|