@@ -1095,6 +1095,34 @@ static void task_numa_assign(struct task_numa_env *env,
 	env->best_cpu = env->dst_cpu;
 }
 
+static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
+				long src_load, long dst_load,
+				struct task_numa_env *env)
+{
+	long imb, old_imb;
+
+	/* We care about the slope of the imbalance, not the direction. */
+	if (dst_load < src_load)
+		swap(dst_load, src_load);
+
+	/* Is the difference below the threshold? */
+	imb = dst_load * 100 - src_load * env->imbalance_pct;
+	if (imb <= 0)
+		return false;
+
+	/*
+	 * The imbalance is above the allowed threshold.
+	 * Compare it with the old imbalance.
+	 */
+	if (orig_dst_load < orig_src_load)
+		swap(orig_dst_load, orig_src_load);
+
+	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+
+	/* Would this change make things worse? */
+	return (old_imb > imb);
+}
+
 /*
  * This checks if the overall compute and NUMA accesses of the system would
  * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1107,7 +1135,8 @@ static void task_numa_compare(struct task_numa_env *env,
 	struct rq *src_rq = cpu_rq(env->src_cpu);
 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
 	struct task_struct *cur;
-	long dst_load, src_load;
+	long orig_src_load, src_load;
+	long orig_dst_load, dst_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
 
@@ -1181,13 +1210,13 @@ static void task_numa_compare(struct task_numa_env *env,
 	 * In the overloaded case, try and keep the load balanced.
 	 */
 balance:
-	dst_load = env->dst_stats.load;
-	src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
+	orig_src_load = env->src_stats.load;
 
 	/* XXX missing power terms */
 	load = task_h_load(env->p);
-	dst_load += load;
-	src_load -= load;
+	dst_load = orig_dst_load + load;
+	src_load = orig_src_load - load;
 
 	if (cur) {
 		load = task_h_load(cur);
@@ -1195,11 +1224,8 @@ balance:
 		src_load += load;
 	}
 
-	/* make src_load the smaller */
-	if (dst_load < src_load)
-		swap(dst_load, src_load);
-
-	if (src_load * env->imbalance_pct < dst_load * 100)
+	if (load_too_imbalanced(orig_src_load, orig_dst_load,
+				src_load, dst_load, env))
 		goto unlock;
 
 assign:
@@ -1301,7 +1327,16 @@ static int task_numa_migrate(struct task_struct *p)
 	if (env.best_cpu == -1)
 		return -EAGAIN;
 
-	sched_setnuma(p, env.dst_nid);
+	/*
+	 * If the task is part of a workload that spans multiple NUMA nodes,
+	 * and is migrating into one of the workload's active nodes, remember
+	 * this node as the task's preferred numa node, so the workload can
+	 * settle down.
+	 * A task that migrated to a second choice node will be better off
+	 * trying for a better one later. Do not set the preferred node here.
+	 */
+	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
+		sched_setnuma(p, env.dst_nid);
 
 	/*
 	 * Reset the scan period if the task is being rescheduled on an
@@ -1326,12 +1361,15 @@ static int task_numa_migrate(struct task_struct *p)
 /* Attempt to migrate a task to a CPU on the preferred node. */
 static void numa_migrate_preferred(struct task_struct *p)
 {
+	unsigned long interval = HZ;
+
 	/* This task has no NUMA fault statistics yet */
 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
 		return;
 
 	/* Periodically retry migrating the task to the preferred node */
-	p->numa_migrate_retry = jiffies + HZ;
+	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+	p->numa_migrate_retry = jiffies + interval;
 
 	/* Success if task is already running on preferred CPU */
 	if (task_node(p) == p->numa_preferred_nid)
@@ -1738,6 +1776,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	struct task_struct *p = current;
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
+	int local = !!(flags & TNF_FAULT_LOCAL);
 	int priv;
 
 	if (!numabalancing_enabled)
@@ -1786,6 +1825,17 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 		task_numa_group(p, last_cpupid, flags, &priv);
 	}
 
+	/*
+	 * If a workload spans multiple NUMA nodes, a shared fault that
+	 * occurs wholly within the set of nodes that the workload is
+	 * actively using should be counted as local. This allows the
+	 * scan rate to slow down when a workload has settled down.
+	 */
+	if (!priv && !local && p->numa_group &&
+	    node_isset(cpu_node, p->numa_group->active_nodes) &&
+	    node_isset(mem_node, p->numa_group->active_nodes))
+		local = 1;
+
 	task_numa_placement(p);
 
 	/*
@@ -1800,7 +1850,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 
 	p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
 	p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
-	p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
+	p->numa_faults_locality[local] += pages;
 }
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -3301,7 +3351,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	}
 
 	if (!se)
-		rq->nr_running -= task_delta;
+		sub_nr_running(rq, task_delta);
 
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
@@ -3352,7 +3402,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	}
 
 	if (!se)
-		rq->nr_running += task_delta;
+		add_nr_running(rq, task_delta);
 
 	/* determine whether we need to wake up potentially idle cpu */
 	if (rq->curr == rq->idle && rq->cfs.nr_running)
@@ -3884,7 +3934,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
-		inc_nr_running(rq);
+		add_nr_running(rq, 1);
 	}
 	hrtick_update(rq);
 }
@@ -3944,7 +3994,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}
 
 	if (!se) {
-		dec_nr_running(rq);
+		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
 	}
 	hrtick_update(rq);
@@ -4015,7 +4065,7 @@ static void record_wakee(struct task_struct *p)
 	 * about the loss.
 	 */
 	if (jiffies > current->wakee_flip_decay_ts + HZ) {
-		current->wakee_flips = 0;
+		current->wakee_flips >>= 1;
 		current->wakee_flip_decay_ts = jiffies;
 	}
 
@@ -4449,10 +4499,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		sd = tmp;
 	}
 
-	if (affine_sd) {
-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-			prev_cpu = cpu;
+	if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
+		prev_cpu = cpu;
 
+	if (sd_flag & SD_BALANCE_WAKE) {
 		new_cpu = select_idle_sibling(p, prev_cpu);
 		goto unlock;
 	}
@@ -4520,6 +4570,9 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 		atomic_long_add(se->avg.load_avg_contrib,
 				&cfs_rq->removed_load);
 	}
+
+	/* We have migrated, no longer consider this task hot */
+	se->exec_start = 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -5070,6 +5123,7 @@ task_hot(struct task_struct *p, u64 now)
 /* Returns true if the destination node has incurred more faults */
 static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 {
+	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
@@ -5083,21 +5137,29 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
-	/* Always encourage migration to the preferred node. */
-	if (dst_nid == p->numa_preferred_nid)
-		return true;
+	if (numa_group) {
+		/* Task is already in the group's interleave set. */
+		if (node_isset(src_nid, numa_group->active_nodes))
+			return false;
+
+		/* Task is moving into the group's interleave set. */
+		if (node_isset(dst_nid, numa_group->active_nodes))
+			return true;
 
-	/* If both task and group weight improve, this move is a winner. */
-	if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
-	    group_weight(p, dst_nid) > group_weight(p, src_nid))
+		return group_faults(p, dst_nid) > group_faults(p, src_nid);
+	}
+
+	/* Encourage migration to the preferred node. */
+	if (dst_nid == p->numa_preferred_nid)
 		return true;
 
-	return false;
+	return task_faults(p, dst_nid) > task_faults(p, src_nid);
 }
 
 
 static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 {
+	struct numa_group *numa_group = rcu_dereference(p->numa_group);
 	int src_nid, dst_nid;
 
 	if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
@@ -5112,16 +5174,23 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
 	if (src_nid == dst_nid)
 		return false;
 
+	if (numa_group) {
+		/* Task is moving within/into the group's interleave set. */
+		if (node_isset(dst_nid, numa_group->active_nodes))
+			return false;
+
+		/* Task is moving out of the group's interleave set. */
+		if (node_isset(src_nid, numa_group->active_nodes))
+			return true;
+
+		return group_faults(p, dst_nid) < group_faults(p, src_nid);
+	}
+
 	/* Migrating away from the preferred node is always bad. */
 	if (src_nid == p->numa_preferred_nid)
 		return true;
 
-	/* If either task or group weight get worse, don't do it. */
-	if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
-	    group_weight(p, dst_nid) < group_weight(p, src_nid))
-		return true;
-
-	return false;
+	return task_faults(p, dst_nid) < task_faults(p, src_nid);
 }
 
 #else
@@ -5564,6 +5633,7 @@ static unsigned long scale_rt_power(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	u64 total, available, age_stamp, avg;
+	s64 delta;
 
 	/*
 	 * Since we're reading these variables without serialization make sure
@@ -5572,7 +5642,11 @@ static unsigned long scale_rt_power(int cpu)
 	age_stamp = ACCESS_ONCE(rq->age_stamp);
 	avg = ACCESS_ONCE(rq->rt_avg);
 
-	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
+	delta = rq_clock(rq) - age_stamp;
+	if (unlikely(delta < 0))
+		delta = 0;
+
+	total = sched_avg_period() + delta;
 
 	if (unlikely(total < avg)) {
 		/* Ensures that power won't end up being negative */
@@ -6640,17 +6714,44 @@ out:
 	return ld_moved;
 }
 
+static inline unsigned long
+get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
+{
+	unsigned long interval = sd->balance_interval;
+
+	if (cpu_busy)
+		interval *= sd->busy_factor;
+
+	/* scale ms to jiffies */
+	interval = msecs_to_jiffies(interval);
+	interval = clamp(interval, 1UL, max_load_balance_interval);
+
+	return interval;
+}
+
+static inline void
+update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
+{
+	unsigned long interval, next;
+
+	interval = get_sd_balance_interval(sd, cpu_busy);
+	next = sd->last_balance + interval;
+
+	if (time_after(*next_balance, next))
+		*next_balance = next;
+}
+
 /*
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
 static int idle_balance(struct rq *this_rq)
 {
+	unsigned long next_balance = jiffies + HZ;
+	int this_cpu = this_rq->cpu;
 	struct sched_domain *sd;
 	int pulled_task = 0;
-	unsigned long next_balance = jiffies + HZ;
 	u64 curr_cost = 0;
-	int this_cpu = this_rq->cpu;
 
 	idle_enter_fair(this_rq);
 
@@ -6660,8 +6761,15 @@ static int idle_balance(struct rq *this_rq)
 	 */
 	this_rq->idle_stamp = rq_clock(this_rq);
 
-	if (this_rq->avg_idle < sysctl_sched_migration_cost)
+	if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+		rcu_read_lock();
+		sd = rcu_dereference_check_sched_domain(this_rq->sd);
+		if (sd)
+			update_next_balance(sd, 0, &next_balance);
+		rcu_read_unlock();
+
 		goto out;
+	}
 
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6671,20 +6779,20 @@ static int idle_balance(struct rq *this_rq)
 	update_blocked_averages(this_cpu);
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
-		unsigned long interval;
 		int continue_balancing = 1;
 		u64 t0, domain_cost;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
-		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
+			update_next_balance(sd, 0, &next_balance);
 			break;
+		}
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			t0 = sched_clock_cpu(this_cpu);
 
-			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
 						   &continue_balancing);
@@ -6696,10 +6804,13 @@ static int idle_balance(struct rq *this_rq)
 			curr_cost += domain_cost;
 		}
 
-		interval = msecs_to_jiffies(sd->balance_interval);
-		if (time_after(next_balance, sd->last_balance + interval))
-			next_balance = sd->last_balance + interval;
-		if (pulled_task)
+		update_next_balance(sd, 0, &next_balance);
+
+		/*
+		 * Stop searching for tasks to pull if there are
+		 * now runnable tasks on this rq.
+		 */
+		if (pulled_task || this_rq->nr_running > 0)
 			break;
 	}
 	rcu_read_unlock();
@@ -6717,20 +6828,13 @@ static int idle_balance(struct rq *this_rq)
 	if (this_rq->cfs.h_nr_running && !pulled_task)
 		pulled_task = 1;
 
-	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
-		/*
-		 * We are going idle. next_balance may be set based on
-		 * a busy processor. So reset next_balance.
-		 */
+out:
+	/* Move the next balance forward */
+	if (time_after(this_rq->next_balance, next_balance))
 		this_rq->next_balance = next_balance;
-	}
 
-out:
 	/* Is there a task of a high priority class? */
-	if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
-	    ((this_rq->stop && this_rq->stop->on_rq) ||
-	     this_rq->dl.dl_nr_running ||
-	     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
 		pulled_task = -1;
 
 	if (pulled_task) {
@@ -7011,16 +7115,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 			break;
 		}
 
-		interval = sd->balance_interval;
-		if (idle != CPU_IDLE)
-			interval *= sd->busy_factor;
-
-		/* scale ms to jiffies */
-		interval = msecs_to_jiffies(interval);
-		interval = clamp(interval, 1UL, max_load_balance_interval);
+		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 
 		need_serialize = sd->flags & SD_SERIALIZE;
-
 		if (need_serialize) {
 			if (!spin_trylock(&balancing))
 				goto out;
@@ -7036,6 +7133,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 			idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
 		}
 		sd->last_balance = jiffies;
+		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
 	}
 	if (need_serialize)
 		spin_unlock(&balancing);