@@ -20,8 +20,8 @@
  * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  */
 
-#include <linux/latencytop.h>
 #include <linux/sched.h>
+#include <linux/latencytop.h>
 #include <linux/cpumask.h>
 #include <linux/cpuidle.h>
 #include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         struct task_struct *p;
-        u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
+        u64 delta;
+
+        delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
 
         if (entity_is_task(se)) {
                 p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
         se->statistics.wait_sum += delta;
         se->statistics.wait_start = 0;
 }
-#else
-static inline void
-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-
-static inline void
-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-}
-#endif
 
 /*
  * Task is being enqueued - update stats:
  */
-static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
         /*
          * Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static inline void
-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
         /*
          * Mark the end of the wait period if dequeueing a
@@ -810,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
          */
         if (se != cfs_rq->curr)
                 update_stats_wait_end(cfs_rq, se);
+
+        if (flags & DEQUEUE_SLEEP) {
+                if (entity_is_task(se)) {
+                        struct task_struct *tsk = task_of(se);
+
+                        if (tsk->state & TASK_INTERRUPTIBLE)
+                                se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
+                        if (tsk->state & TASK_UNINTERRUPTIBLE)
+                                se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+                }
+        }
+
+}
+#else
+static inline void
+update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
 }
 
+static inline void
+update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+}
+
+static inline void
+update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+}
+#endif
+
 /*
  * We are picking a new current task - update its stats:
  */
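Note that update_stats_dequeue() now takes the dequeue flags, so the sleep/block timestamping that used to sit behind an #ifdef in dequeue_entity() lives inside the CONFIG_SCHEDSTATS copy of this helper. Below is a minimal standalone sketch of that flag-gated bookkeeping; the struct, the flag values and now_ns() are illustrative stand-ins for sched_entity, DEQUEUE_SLEEP and rq_clock(), not the kernel's definitions.

#include <stdio.h>
#include <time.h>

#define DEQUEUE_SLEEP           0x01    /* illustrative flag value */
#define TASK_INTERRUPTIBLE      0x01
#define TASK_UNINTERRUPTIBLE    0x02

struct stats {
        unsigned long long sleep_start;
        unsigned long long block_start;
};

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Record when a task started sleeping or blocking, but only on a sleep dequeue. */
static void stats_dequeue(struct stats *st, unsigned int task_state, int flags)
{
        if (!(flags & DEQUEUE_SLEEP))
                return;

        if (task_state & TASK_INTERRUPTIBLE)
                st->sleep_start = now_ns();
        if (task_state & TASK_UNINTERRUPTIBLE)
                st->block_start = now_ns();
}

int main(void)
{
        struct stats st = { 0, 0 };

        stats_dequeue(&st, TASK_UNINTERRUPTIBLE, DEQUEUE_SLEEP);
        printf("sleep_start=%llu block_start=%llu\n", st.sleep_start, st.block_start);
        return 0;
}

Passing the flags down lets the !CONFIG_SCHEDSTATS stubs above compile the whole block away instead of leaving an #ifdef at the call site.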
@@ -907,10 +932,11 @@ struct numa_group {
         spinlock_t lock; /* nr_tasks, tasks */
         int nr_tasks;
         pid_t gid;
+        int active_nodes;
 
         struct rcu_head rcu;
-        nodemask_t active_nodes;
         unsigned long total_faults;
+        unsigned long max_faults_cpu;
         /*
          * Faults_cpu is used to decide whether memory should move
          * towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+        return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
                                         int maxdist, bool task)
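The numa_is_active_node() helper added here treats a node as part of the group's active set when its CPU faults exceed one third of the busiest node's count. A standalone sketch of the same comparison with plain integers follows; is_active_node() and the sample numbers are illustrative, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

/* A node is "active" if it sees more than 1/3 of the busiest node's faults. */
static bool is_active_node(unsigned long node_faults, unsigned long max_faults)
{
        return node_faults * ACTIVE_NODE_FRACTION > max_faults;
}

int main(void)
{
        unsigned long max_faults = 900;
        unsigned long samples[] = { 0, 100, 300, 301, 900 };

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("faults=%lu active=%d\n", samples[i],
                       is_active_node(samples[i], max_faults));
        return 0;
}

Multiplying rather than dividing keeps the comparison exact for small fault counts and avoids a division in the fault path.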
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
                 return true;
 
         /*
-         * Do not migrate if the destination is not a node that
-         * is actively used by this numa group.
+         * Destination node is much more heavily used than the source
+         * node? Allow migration.
          */
-        if (!node_isset(dst_nid, ng->active_nodes))
-                return false;
-
-        /*
-         * Source is a node that is not actively used by this
-         * numa group, while the destination is. Migrate.
-         */
-        if (!node_isset(src_nid, ng->active_nodes))
+        if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+                                        ACTIVE_NODE_FRACTION)
                 return true;
 
         /*
-         * Both source and destination are nodes in active
-         * use by this numa group. Maximize memory bandwidth
-         * by migrating from more heavily used groups, to less
-         * heavily used ones, spreading the load around.
-         * Use a 1/4 hysteresis to avoid spurious page movement.
+         * Distribute memory according to CPU & memory use on each node,
+         * with 3/4 hysteresis to avoid unnecessary memory migrations:
+         *
+         *      faults_cpu(dst)   3   faults_cpu(src)
+         *      --------------- * - > ---------------
+         *      faults_mem(dst)   4   faults_mem(src)
          */
-        return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+        return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+               group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
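The rewritten should_numa_migrate_memory() tail compares per-node CPU/memory fault ratios by cross-multiplying, so no division or divide-by-zero guard is needed. The standalone model below reproduces the two checks with plain unsigned longs; struct node_faults and the sample figures are illustrative only.

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

struct node_faults {
        unsigned long cpu;      /* NUMA hinting faults triggered from this node's CPUs */
        unsigned long mem;      /* group faults against memory on this node */
};

static bool should_migrate(struct node_faults src, struct node_faults dst)
{
        /* Destination node much more heavily used than the source? Migrate. */
        if (dst.cpu > src.cpu * ACTIVE_NODE_FRACTION)
                return true;

        /*
         * The 3/4 hysteresis, written as a cross-multiplication:
         *   dst.cpu / dst.mem * 3/4  >  src.cpu / src.mem
         * becomes
         *   dst.cpu * src.mem * 3  >  src.cpu * dst.mem * 4
         */
        return dst.cpu * src.mem * 3 > src.cpu * dst.mem * 4;
}

int main(void)
{
        struct node_faults src = { .cpu = 400, .mem = 100 };
        struct node_faults dst = { .cpu = 500, .mem = 100 };

        /* dst ratio 5 * 3/4 = 3.75 is below src ratio 4: stay put */
        printf("migrate=%d\n", should_migrate(src, dst));

        dst.cpu = 600;  /* 6 * 3/4 = 4.5 beats 4: migrate */
        printf("migrate=%d\n", should_migrate(src, dst));
        return 0;
}

The hysteresis means memory only moves toward a node whose CPU-to-memory fault ratio is clearly higher, which damps page ping-pong between nodes in the active set.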
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
                 .best_task = NULL,
                 .best_imp = 0,
-                .best_cpu = -1
+                .best_cpu = -1,
         };
         struct sched_domain *sd;
         unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
          * multiple NUMA nodes; in order to better consolidate the group,
          * we need to check other locations.
          */
-        if (env.best_cpu == -1 || (p->numa_group &&
-                        nodes_weight(p->numa_group->active_nodes) > 1)) {
+        if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
                 for_each_online_node(nid) {
                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
                                 continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
          * trying for a better one later. Do not set the preferred node here.
          */
         if (p->numa_group) {
+                struct numa_group *ng = p->numa_group;
+
                 if (env.best_cpu == -1)
                         nid = env.src_nid;
                 else
                         nid = env.dst_nid;
 
-                if (node_isset(nid, p->numa_group->active_nodes))
+                if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
                         sched_setnuma(p, env.dst_nid);
         }
 
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes the workload is actively running on. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
  */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
         unsigned long faults, max_faults = 0;
-        int nid;
+        int nid, active_nodes = 0;
 
         for_each_online_node(nid) {
                 faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
         for_each_online_node(nid) {
                 faults = group_faults_cpu(numa_group, nid);
-                if (!node_isset(nid, numa_group->active_nodes)) {
-                        if (faults > max_faults * 6 / 16)
-                                node_set(nid, numa_group->active_nodes);
-                } else if (faults < max_faults * 3 / 16)
-                        node_clear(nid, numa_group->active_nodes);
+                if (faults * ACTIVE_NODE_FRACTION > max_faults)
+                        active_nodes++;
         }
+
+        numa_group->max_faults_cpu = max_faults;
+        numa_group->active_nodes = active_nodes;
 }
 
 /*
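numa_group_count_active_nodes() now makes two passes: find the busiest node's CPU-fault count, then count how many nodes clear the 1/3 threshold. A standalone sketch over a plain array, without the kernel's node iterators or locking (all names illustrative):

#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

static void count_active_nodes(const unsigned long *faults, int nr_nodes,
                               unsigned long *max_out, int *active_out)
{
        unsigned long max_faults = 0;
        int nid, active_nodes = 0;

        for (nid = 0; nid < nr_nodes; nid++)
                if (faults[nid] > max_faults)
                        max_faults = faults[nid];

        for (nid = 0; nid < nr_nodes; nid++)
                if (faults[nid] * ACTIVE_NODE_FRACTION > max_faults)
                        active_nodes++;

        *max_out = max_faults;
        *active_out = active_nodes;
}

int main(void)
{
        unsigned long faults[] = { 900, 350, 100, 0 };  /* per-node CPU faults */
        unsigned long max;
        int active;

        count_active_nodes(faults, 4, &max, &active);
        printf("max_faults_cpu=%lu active_nodes=%d\n", max, active);   /* 900, 2 */
        return 0;
}

Keeping just max_faults_cpu and an integer count replaces the old nodemask plus its 6/16 add / 3/16 remove hysteresis; membership can then be rechecked cheaply at fault time via numa_is_active_node().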
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
         update_task_scan_period(p, fault_types[0], fault_types[1]);
 
         if (p->numa_group) {
-                update_numa_active_node_mask(p->numa_group);
+                numa_group_count_active_nodes(p->numa_group);
                 spin_unlock_irq(group_lock);
                 max_nid = preferred_group_nid(p, max_group_nid);
         }
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
                         return;
 
                 atomic_set(&grp->refcount, 1);
+                grp->active_nodes = 1;
+                grp->max_faults_cpu = 0;
                 spin_lock_init(&grp->lock);
                 grp->gid = p->pid;
                 /* Second half of the array tracks nids where faults happen */
                 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
                                                 nr_node_ids;
 
-                node_set(task_node(current), grp->active_nodes);
-
                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
                         grp->faults[i] = p->numa_faults[i];
 
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
         bool migrated = flags & TNF_MIGRATED;
         int cpu_node = task_node(current);
         int local = !!(flags & TNF_FAULT_LOCAL);
+        struct numa_group *ng;
         int priv;
 
         if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
          * actively using should be counted as local. This allows the
          * scan rate to slow down when a workload has settled down.
          */
-        if (!priv && !local && p->numa_group &&
-                        node_isset(cpu_node, p->numa_group->active_nodes) &&
-                        node_isset(mem_node, p->numa_group->active_nodes))
+        ng = p->numa_group;
+        if (!priv && !local && ng && ng->active_nodes > 1 &&
+                                numa_is_active_node(cpu_node, ng) &&
+                                numa_is_active_node(mem_node, ng))
                 local = 1;
 
         task_numa_placement(p);
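With the nodemask gone, a shared fault is promoted to "local" only when the group spans more than one active node and both the faulting CPU's node and the memory's node pass the active-node test. A standalone model of that condition (struct group and the numbers are illustrative):

#include <stdbool.h>
#include <stdio.h>

#define ACTIVE_NODE_FRACTION 3

struct group {
        int active_nodes;
        unsigned long max_faults_cpu;
        unsigned long faults_cpu[4];    /* per-node CPU faults, 4 nodes here */
};

static bool node_is_active(const struct group *ng, int nid)
{
        return ng->faults_cpu[nid] * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

/* Count a shared fault inside the group's active set as local. */
static bool count_as_local(const struct group *ng, bool priv, bool local,
                           int cpu_node, int mem_node)
{
        if (local || priv || !ng)
                return local;
        return ng->active_nodes > 1 &&
               node_is_active(ng, cpu_node) && node_is_active(ng, mem_node);
}

int main(void)
{
        struct group ng = {
                .active_nodes = 2,
                .max_faults_cpu = 900,
                .faults_cpu = { 900, 400, 10, 0 },
        };

        printf("local=%d\n", count_as_local(&ng, false, false, 0, 1));  /* 1 */
        printf("local=%d\n", count_as_local(&ng, false, false, 0, 2));  /* 0 */
        return 0;
}

Counting such faults as local is what lets the scan rate back off once a spanning workload has settled onto its active nodes.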
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
 
+static inline void check_schedstat_required(void)
+{
+#ifdef CONFIG_SCHEDSTATS
+        if (schedstat_enabled())
+                return;
+
+        /* Force schedstat enabled if a dependent tracepoint is active */
+        if (trace_sched_stat_wait_enabled()    ||
+            trace_sched_stat_sleep_enabled()   ||
+            trace_sched_stat_iowait_enabled()  ||
+            trace_sched_stat_blocked_enabled() ||
+            trace_sched_stat_runtime_enabled()) {
+                pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+                             "stat_blocked and stat_runtime require the "
+                             "kernel parameter schedstats=enabled or "
+                             "kernel.sched_schedstats=1\n");
+        }
+#endif
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
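check_schedstat_required() only warns; it never flips the schedstats static key by itself. A tiny standalone model of the "warn once if a consumer needs stats that are disabled" pattern, with plain booleans standing in for schedstat_enabled() and the tracepoint *_enabled() checks:

#include <stdbool.h>
#include <stdio.h>

static bool schedstats_on;              /* stands in for schedstat_enabled() */
static bool trace_sleep_on = true;      /* stands in for trace_sched_stat_sleep_enabled() */
static bool trace_wait_on;

static void check_stats_required(void)
{
        static bool warned;

        if (schedstats_on)
                return;

        /* A tracepoint that depends on schedstats is active: warn once. */
        if ((trace_sleep_on || trace_wait_on) && !warned) {
                warned = true;
                fprintf(stderr,
                        "stat tracepoints need schedstats=enabled or kernel.sched_schedstats=1\n");
        }
}

int main(void)
{
        check_stats_required();         /* warns */
        check_stats_required();         /* silent: already warned */
        schedstats_on = true;
        check_stats_required();         /* silent: stats are on */
        return 0;
}

In the patch itself the per-call cost is just a static-branch test, so the wait/sleep statistics bookkeeping in the enqueue/dequeue paths below is skipped entirely while schedstats is off.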
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
         if (flags & ENQUEUE_WAKEUP) {
                 place_entity(cfs_rq, se, 0);
-                enqueue_sleeper(cfs_rq, se);
+                if (schedstat_enabled())
+                        enqueue_sleeper(cfs_rq, se);
         }
 
-        update_stats_enqueue(cfs_rq, se);
-        check_spread(cfs_rq, se);
+        check_schedstat_required();
+        if (schedstat_enabled()) {
+                update_stats_enqueue(cfs_rq, se);
+                check_spread(cfs_rq, se);
+        }
         if (se != cfs_rq->curr)
                 __enqueue_entity(cfs_rq, se);
         se->on_rq = 1;
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         update_curr(cfs_rq);
         dequeue_entity_load_avg(cfs_rq, se);
 
-        update_stats_dequeue(cfs_rq, se);
-        if (flags & DEQUEUE_SLEEP) {
-#ifdef CONFIG_SCHEDSTATS
-                if (entity_is_task(se)) {
-                        struct task_struct *tsk = task_of(se);
-
-                        if (tsk->state & TASK_INTERRUPTIBLE)
-                                se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
-                        if (tsk->state & TASK_UNINTERRUPTIBLE)
-                                se->statistics.block_start = rq_clock(rq_of(cfs_rq));
-                }
-#endif
-        }
+        if (schedstat_enabled())
+                update_stats_dequeue(cfs_rq, se, flags);
 
         clear_buddies(cfs_rq, se);
 
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
                  * a CPU. So account for the time it spent waiting on the
                  * runqueue.
                  */
-                update_stats_wait_end(cfs_rq, se);
+                if (schedstat_enabled())
+                        update_stats_wait_end(cfs_rq, se);
                 __dequeue_entity(cfs_rq, se);
                 update_load_avg(se, 1);
         }
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
          * least twice that of our own weight (i.e. dont track it
          * when there are only lesser-weight tasks around):
          */
-        if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+        if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
                 se->statistics.slice_max = max(se->statistics.slice_max,
                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
         }
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
         /* throttle cfs_rqs exceeding runtime */
         check_cfs_rq_runtime(cfs_rq);
 
-        check_spread(cfs_rq, prev);
+        if (schedstat_enabled()) {
+                check_spread(cfs_rq, prev);
+                if (prev->on_rq)
+                        update_stats_wait_start(cfs_rq, prev);
+        }
+
         if (prev->on_rq) {
-                update_stats_wait_start(cfs_rq, prev);
                 /* Put 'current' back into the tree. */
                 __enqueue_entity(cfs_rq, prev);
                 /* in !on_rq case, update occurred at dequeue */
@@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 
                 /* scale is effectively 1 << i now, and >> i divides by scale */
 
-                old_load = this_rq->cpu_load[i] - tickless_load;
+                old_load = this_rq->cpu_load[i];
                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
-                old_load += tickless_load;
+                if (tickless_load) {
+                        old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
+                        /*
+                         * old_load can never be a negative value because a
+                         * decayed tickless_load cannot be greater than the
+                         * original tickless_load.
+                         */
+                        old_load += tickless_load;
+                }
                 new_load = this_load;
                 /*
                  * Round up the averaging division if load is increasing. This
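The old sequence decayed (cpu_load[i] - tickless_load), and because both operands are unsigned the subtraction wraps whenever tickless_load is larger than the stale cpu_load[i], handing decay_load_missed() an enormous bogus value. The rewrite decays the two terms separately. A standalone sketch, with a simple halve-per-missed-tick decay standing in for decay_load_missed() (that decay model is an assumption for illustration only):

#include <stdio.h>

/* Stand-in for decay_load_missed(): halve the load once per missed tick. */
static unsigned long decay(unsigned long load, unsigned long missed)
{
        while (missed--)
                load /= 2;
        return load;
}

int main(void)
{
        unsigned long cpu_load = 100, tickless_load = 300, missed = 2;

        /* Old ordering: the unsigned subtraction wraps before the decay. */
        unsigned long old_way = decay(cpu_load - tickless_load, missed) + tickless_load;

        /* New ordering: decay each term, then add the undecayed load back. */
        unsigned long new_way = decay(cpu_load, missed);
        new_way -= decay(tickless_load, missed);        /* may wrap temporarily... */
        new_way += tickless_load;                       /* ...but the sum is >= decay(cpu_load) */

        printf("old=%lu new=%lu\n", old_way, new_way);
        return 0;
}

With these numbers the old ordering yields an absurdly large load while the new one gives 250; as the added comment notes, the final value cannot go negative because a decayed tickless_load is never larger than the original.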
@@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
+static void __update_cpu_load_nohz(struct rq *this_rq,
+                                   unsigned long curr_jiffies,
+                                   unsigned long load,
+                                   int active)
+{
+        unsigned long pending_updates;
+
+        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+        if (pending_updates) {
+                this_rq->last_load_update_tick = curr_jiffies;
+                /*
+                 * In the regular NOHZ case, we were idle, this means load 0.
+                 * In the NOHZ_FULL case, we were non-idle, we should consider
+                 * its weighted load.
+                 */
+                __update_cpu_load(this_rq, load, pending_updates, active);
+        }
+}
+
 /*
  * There is no sane way to deal with nohz on smp when using jiffies because the
  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
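__update_cpu_load_nohz() centralises the "how many jiffies were missed, and is there anything to do" logic that update_cpu_load_nohz() and the idle-load path previously duplicated. A standalone model of that pending-updates pattern, with jiffies replaced by a plain counter and the actual load update reduced to an assignment:

#include <stdio.h>

struct rq_model {
        unsigned long last_load_update_tick;
        unsigned long load;
};

/* Fold all ticks missed since the last update into one catch-up update. */
static void update_load_nohz(struct rq_model *rq, unsigned long curr_tick,
                             unsigned long load)
{
        unsigned long pending_updates = curr_tick - rq->last_load_update_tick;

        if (!pending_updates)
                return;

        rq->last_load_update_tick = curr_tick;
        rq->load = load;        /* the kernel calls __update_cpu_load() here */
        printf("caught up %lu missed ticks, load=%lu\n", pending_updates, load);
}

int main(void)
{
        struct rq_model rq = { .last_load_update_tick = 100, .load = 0 };

        update_load_nohz(&rq, 100, 5);  /* nothing pending: no-op */
        update_load_nohz(&rq, 104, 5);  /* 4 missed ticks folded into one update */
        return 0;
}

Idle callers pass a load of 0 (a NOHZ-idle CPU contributes nothing), while the NOHZ_FULL path passes the CPU's current weighted load.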
@@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
-static void update_idle_cpu_load(struct rq *this_rq)
+static void update_cpu_load_idle(struct rq *this_rq)
 {
-        unsigned long curr_jiffies = READ_ONCE(jiffies);
-        unsigned long load = weighted_cpuload(cpu_of(this_rq));
-        unsigned long pending_updates;
-
         /*
          * bail if there's load or we're actually up-to-date.
          */
-        if (load || curr_jiffies == this_rq->last_load_update_tick)
+        if (weighted_cpuload(cpu_of(this_rq)))
                 return;
 
-        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-        this_rq->last_load_update_tick = curr_jiffies;
-
-        __update_cpu_load(this_rq, load, pending_updates, 0);
+        __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
 }
 
 /*
@@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active)
         struct rq *this_rq = this_rq();
         unsigned long curr_jiffies = READ_ONCE(jiffies);
         unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
-        unsigned long pending_updates;
 
         if (curr_jiffies == this_rq->last_load_update_tick)
                 return;
 
         raw_spin_lock(&this_rq->lock);
-        pending_updates = curr_jiffies - this_rq->last_load_update_tick;
-        if (pending_updates) {
-                this_rq->last_load_update_tick = curr_jiffies;
-                /*
-                 * In the regular NOHZ case, we were idle, this means load 0.
-                 * In the NOHZ_FULL case, we were non-idle, we should consider
-                 * its weighted load.
-                 */
-                __update_cpu_load(this_rq, load, pending_updates, active);
-        }
+        __update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
         raw_spin_unlock(&this_rq->lock);
 }
 #endif /* CONFIG_NO_HZ */
@@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
 {
         unsigned long load = weighted_cpuload(cpu_of(this_rq));
         /*
-         * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+         * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
         */
         this_rq->last_load_update_tick = jiffies;
         __update_cpu_load(this_rq, load, 1, 1);
@@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                 if (time_after_eq(jiffies, rq->next_balance)) {
                         raw_spin_lock_irq(&rq->lock);
                         update_rq_clock(rq);
-                        update_idle_cpu_load(rq);
+                        update_cpu_load_idle(rq);
                         raw_spin_unlock_irq(&rq->lock);
                         rebalance_domains(rq, CPU_IDLE);
                 }
@@ -8234,11 +8294,8 @@ void free_fair_sched_group(struct task_group *tg)
         for_each_possible_cpu(i) {
                 if (tg->cfs_rq)
                         kfree(tg->cfs_rq[i]);
-                if (tg->se) {
-                        if (tg->se[i])
-                                remove_entity_load_avg(tg->se[i]);
+                if (tg->se)
                         kfree(tg->se[i]);
-                }
         }
 
         kfree(tg->cfs_rq);
@@ -8286,21 +8343,29 @@ err:
         return 0;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu)
+void unregister_fair_sched_group(struct task_group *tg)
 {
-        struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
+        struct rq *rq;
+        int cpu;
 
-        /*
-         * Only empty task groups can be destroyed; so we can speculatively
-         * check on_list without danger of it being re-added.
-         */
-        if (!tg->cfs_rq[cpu]->on_list)
-                return;
+        for_each_possible_cpu(cpu) {
+                if (tg->se[cpu])
+                        remove_entity_load_avg(tg->se[cpu]);
 
-        raw_spin_lock_irqsave(&rq->lock, flags);
-        list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-        raw_spin_unlock_irqrestore(&rq->lock, flags);
+                /*
+                 * Only empty task groups can be destroyed; so we can speculatively
+                 * check on_list without danger of it being re-added.
+                 */
+                if (!tg->cfs_rq[cpu]->on_list)
+                        continue;
+
+                rq = cpu_rq(cpu);
+
+                raw_spin_lock_irqsave(&rq->lock, flags);
+                list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+                raw_spin_unlock_irqrestore(&rq->lock, flags);
+        }
 }
 
 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
@@ -8382,7 +8447,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
         return 1;
 }
 
-void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+void unregister_fair_sched_group(struct task_group *tg) { }
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
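unregister_fair_sched_group() now iterates every possible CPU itself, detaching the entity load average and unlinking the per-CPU cfs_rq only when it is actually on the leaf list, rather than being invoked once per CPU by the cgroup teardown code; the remove_entity_load_avg() call correspondingly moves here from free_fair_sched_group(). A standalone sketch of the "skip if not linked, otherwise lock and unlink" loop; the array, flags and printouts are illustrative, and the real code takes rq->lock around list_del_leaf_cfs_rq():

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct percpu_cfs {
        bool on_list;   /* is this cfs_rq linked on its runqueue's leaf list? */
};

static void unregister_group(struct percpu_cfs *cfs)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                /*
                 * Only empty task groups are destroyed, so on_list can be
                 * checked speculatively; the lock is only taken for CPUs
                 * that actually need the unlink.
                 */
                if (!cfs[cpu].on_list)
                        continue;

                /* raw_spin_lock_irqsave(&rq->lock, flags) in the kernel */
                cfs[cpu].on_list = false;
                /* raw_spin_unlock_irqrestore(&rq->lock, flags) */
                printf("cpu%d: removed from leaf list\n", cpu);
        }
}

int main(void)
{
        struct percpu_cfs cfs[NR_CPUS] = { { true }, { false }, { true }, { false } };

        unregister_group(cfs);
        return 0;
}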