|
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
+static unsigned long task_h_load(struct task_struct *p);
|
|
|
+
|
|
|
static inline void __update_task_entity_contrib(struct sched_entity *se);
|
|
|
|
|
|
/* Give new task start runnable values to heavy its load in infant time */
|
|
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
/*
|
|
|
- * numa task sample period in ms
|
|
|
+ * Approximate time to scan a full NUMA task in ms. The task scan period is
|
|
|
+ * calculated based on the task's virtual memory size and
|
|
|
+ * numa_balancing_scan_size.
|
|
|
*/
|
|
|
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
|
|
|
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
|
|
|
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
|
|
|
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
|
|
|
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
|
|
|
|
|
|
/* Portion of address space to scan in MB */
|
|
|
unsigned int sysctl_numa_balancing_scan_size = 256;
|
|
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
|
|
|
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
|
|
|
unsigned int sysctl_numa_balancing_scan_delay = 1000;
|
|
|
|
|
|
-static void task_numa_placement(struct task_struct *p)
|
|
|
+/*
|
|
|
+ * After skipping a page migration on a shared page, skip N more numa page
|
|
|
+ * migrations unconditionally. This reduces the number of NUMA migrations
|
|
|
+ * in shared memory workloads, and has the effect of pulling tasks towards
|
|
|
+ * where their memory lives, over pulling the memory towards the task.
|
|
|
+ */
|
|
|
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
|
|
|
+
|
|
|
+static unsigned int task_nr_scan_windows(struct task_struct *p)
|
|
|
+{
|
|
|
+ unsigned long rss = 0;
|
|
|
+ unsigned long nr_scan_pages;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Calculations based on RSS as non-present and empty pages are skipped
|
|
|
+ * by the PTE scanner and NUMA hinting faults should be trapped based
|
|
|
+ * on resident pages
|
|
|
+ */
|
|
|
+ nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
|
|
|
+ rss = get_mm_rss(p->mm);
|
|
|
+ if (!rss)
|
|
|
+ rss = nr_scan_pages;
|
|
|
+
|
|
|
+ rss = round_up(rss, nr_scan_pages);
|
|
|
+ return rss / nr_scan_pages;
|
|
|
+}
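To make the window arithmetic above concrete, here is a minimal userspace sketch of the same calculation; the 4KiB page size, 256MB scan size and 1GiB resident set are assumed example values, not taken from the patch:

#include <stdio.h>

int main(void)
{
	unsigned long scan_size_mb = 256;	/* sysctl_numa_balancing_scan_size default */
	unsigned long page_shift = 12;		/* assume 4KiB pages */
	unsigned long nr_scan_pages = scan_size_mb << (20 - page_shift);
	unsigned long rss = 262144;		/* hypothetical 1GiB resident set, in pages */

	if (!rss)
		rss = nr_scan_pages;
	/* open-coded round_up(rss, nr_scan_pages) */
	rss = ((rss + nr_scan_pages - 1) / nr_scan_pages) * nr_scan_pages;
	printf("scan windows: %lu\n", rss / nr_scan_pages);	/* prints 4 */
	return 0;
}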
|
|
|
+
|
|
|
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
|
|
|
+#define MAX_SCAN_WINDOW 2560
|
|
|
+
|
|
|
+static unsigned int task_scan_min(struct task_struct *p)
|
|
|
+{
|
|
|
+ unsigned int scan, floor;
|
|
|
+ unsigned int windows = 1;
|
|
|
+
|
|
|
+ if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
|
|
|
+ windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
|
|
|
+ floor = 1000 / windows;
|
|
|
+
|
|
|
+ scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
|
|
|
+ return max_t(unsigned int, floor, scan);
|
|
|
+}
|
|
|
+
|
|
|
+static unsigned int task_scan_max(struct task_struct *p)
|
|
|
+{
|
|
|
+ unsigned int smin = task_scan_min(p);
|
|
|
+ unsigned int smax;
|
|
|
+
|
|
|
+ /* Watch for min being lower than max due to floor calculations */
|
|
|
+ smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
|
|
|
+ return max(smin, smax);
|
|
|
+}
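Carrying the hypothetical 4-window task from the previous sketch through task_scan_min()/task_scan_max() with the defaults above (1000ms minimum, 60000ms maximum, 256MB windows, MAX_SCAN_WINDOW of 2560) gives the bounds below; again a plain userspace sketch, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned int scan_size = 256;		/* MB scanned per window */
	unsigned int nr_windows = 4;		/* from the previous sketch */
	unsigned int windows = 1, floor, scan, smin, smax;

	if (scan_size < 2560)			/* MAX_SCAN_WINDOW */
		windows = 2560 / scan_size;	/* at most 10 windows per second */
	floor = 1000 / windows;			/* 100ms floor per window */

	scan = 1000 / nr_windows;		/* scan_period_min spread over the windows */
	smin = scan > floor ? scan : floor;	/* 250ms */
	smax = 60000 / nr_windows;		/* 15000ms */
	if (smax < smin)
		smax = smin;
	printf("per-window scan period: %u..%u ms\n", smin, smax);
	return 0;
}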
|
|
|
+
|
|
|
+/*
|
|
|
+ * Once a preferred node is selected the scheduler balancer will prefer moving
|
|
|
+ * a task to that node for sysctl_numa_balancing_settle_count number of PTE
|
|
|
+ * scans. This will give the process the chance to accumulate more faults on
|
|
|
+ * the preferred node but still allow the scheduler to move the task again if
|
|
|
+ * the node's CPUs are overloaded.
|
|
|
+ */
|
|
|
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
|
|
|
+
|
|
|
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
|
|
|
+{
|
|
|
+ rq->nr_numa_running += (p->numa_preferred_nid != -1);
|
|
|
+ rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
|
|
|
+}
|
|
|
+
|
|
|
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
|
|
|
+{
|
|
|
+ rq->nr_numa_running -= (p->numa_preferred_nid != -1);
|
|
|
+ rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
|
|
|
+}
|
|
|
+
|
|
|
+struct numa_group {
|
|
|
+ atomic_t refcount;
|
|
|
+
|
|
|
+ spinlock_t lock; /* nr_tasks, tasks */
|
|
|
+ int nr_tasks;
|
|
|
+ pid_t gid;
|
|
|
+ struct list_head task_list;
|
|
|
+
|
|
|
+ struct rcu_head rcu;
|
|
|
+ unsigned long total_faults;
|
|
|
+ unsigned long faults[0];
|
|
|
+};
|
|
|
+
|
|
|
+pid_t task_numa_group_id(struct task_struct *p)
|
|
|
+{
|
|
|
+ return p->numa_group ? p->numa_group->gid : 0;
|
|
|
+}
|
|
|
+
|
|
|
+static inline int task_faults_idx(int nid, int priv)
|
|
|
+{
|
|
|
+ return 2 * nid + priv;
|
|
|
+}
|
|
|
+
|
|
|
+static inline unsigned long task_faults(struct task_struct *p, int nid)
|
|
|
+{
|
|
|
+ if (!p->numa_faults)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ return p->numa_faults[task_faults_idx(nid, 0)] +
|
|
|
+ p->numa_faults[task_faults_idx(nid, 1)];
|
|
|
+}
|
|
|
+
|
|
|
+static inline unsigned long group_faults(struct task_struct *p, int nid)
|
|
|
+{
|
|
|
+ if (!p->numa_group)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
|
|
|
+}
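The faults arrays interleave two counters per node, selected by the priv index; a small sketch of that layout for a hypothetical two-node machine (the counts are invented):

#include <stdio.h>

static int task_faults_idx(int nid, int priv)	/* same 2*nid + priv layout as above */
{
	return 2 * nid + priv;
}

int main(void)
{
	unsigned long faults[4] = { 10, 40, 5, 15 };	/* [nid0/priv0, nid0/priv1, nid1/...] */
	int nid;

	for (nid = 0; nid < 2; nid++)
		printf("nid %d: %lu + %lu = %lu faults\n", nid,
		       faults[task_faults_idx(nid, 0)],
		       faults[task_faults_idx(nid, 1)],
		       faults[task_faults_idx(nid, 0)] +
		       faults[task_faults_idx(nid, 1)]);
	return 0;
}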
|
|
|
+
|
|
|
+/*
|
|
|
+ * These return the fraction of accesses done by a particular task, or
|
|
|
+ * task group, on a particular numa node. The group weight is given a
|
|
|
+ * larger multiplier, in order to group tasks together that are almost
|
|
|
+ * evenly spread out between numa nodes.
|
|
|
+ */
|
|
|
+static inline unsigned long task_weight(struct task_struct *p, int nid)
|
|
|
+{
|
|
|
+ unsigned long total_faults;
|
|
|
+
|
|
|
+ if (!p->numa_faults)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ total_faults = p->total_numa_faults;
|
|
|
+
|
|
|
+ if (!total_faults)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ return 1000 * task_faults(p, nid) / total_faults;
|
|
|
+}
|
|
|
+
|
|
|
+static inline unsigned long group_weight(struct task_struct *p, int nid)
|
|
|
{
|
|
|
- int seq;
|
|
|
+ if (!p->numa_group || !p->numa_group->total_faults)
|
|
|
+ return 0;
|
|
|
+
|
|
|
+ return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
|
|
|
+}
|
|
|
+
|
|
|
+static unsigned long weighted_cpuload(const int cpu);
|
|
|
+static unsigned long source_load(int cpu, int type);
|
|
|
+static unsigned long target_load(int cpu, int type);
|
|
|
+static unsigned long power_of(int cpu);
|
|
|
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
|
|
|
+
|
|
|
+/* Cached statistics for all CPUs within a node */
|
|
|
+struct numa_stats {
|
|
|
+ unsigned long nr_running;
|
|
|
+ unsigned long load;
|
|
|
+
|
|
|
+ /* Total compute capacity of CPUs on a node */
|
|
|
+ unsigned long power;
|
|
|
+
|
|
|
+ /* Approximate capacity in terms of runnable tasks on a node */
|
|
|
+ unsigned long capacity;
|
|
|
+ int has_capacity;
|
|
|
+};
|
|
|
+
|
|
|
+/*
|
|
|
+ * XXX borrowed from update_sg_lb_stats
|
|
|
+ */
|
|
|
+static void update_numa_stats(struct numa_stats *ns, int nid)
|
|
|
+{
|
|
|
+ int cpu;
|
|
|
+
|
|
|
+ memset(ns, 0, sizeof(*ns));
|
|
|
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
|
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
|
+
|
|
|
+ ns->nr_running += rq->nr_running;
|
|
|
+ ns->load += weighted_cpuload(cpu);
|
|
|
+ ns->power += power_of(cpu);
|
|
|
+ }
|
|
|
+
|
|
|
+ ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
|
|
|
+ ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
|
|
|
+ ns->has_capacity = (ns->nr_running < ns->capacity);
|
|
|
+}
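For a sense of the units, a sketch of the normalization done in update_numa_stats() for a hypothetical node with four full-power CPUs and three runnable nice-0 tasks; SCHED_POWER_SCALE and the per-task load of 1024 are assumptions here:

#include <stdio.h>

int main(void)
{
	unsigned long scale = 1024;		/* assumed SCHED_POWER_SCALE */
	unsigned long nr_running = 3;
	unsigned long load = 3 * 1024;		/* three nice-0 tasks, assumed weight 1024 each */
	unsigned long power = 4 * scale;	/* four CPUs at full power */
	unsigned long capacity = (power + scale / 2) / scale;	/* DIV_ROUND_CLOSEST */

	printf("load=%lu capacity=%lu has_capacity=%d\n",
	       load * scale / power, capacity, nr_running < capacity);
	return 0;
}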
|
|
|
+
|
|
|
+struct task_numa_env {
|
|
|
+ struct task_struct *p;
|
|
|
+
|
|
|
+ int src_cpu, src_nid;
|
|
|
+ int dst_cpu, dst_nid;
|
|
|
+
|
|
|
+ struct numa_stats src_stats, dst_stats;
|
|
|
+
|
|
|
+ int imbalance_pct, idx;
|
|
|
+
|
|
|
+ struct task_struct *best_task;
|
|
|
+ long best_imp;
|
|
|
+ int best_cpu;
|
|
|
+};
|
|
|
+
|
|
|
+static void task_numa_assign(struct task_numa_env *env,
|
|
|
+ struct task_struct *p, long imp)
|
|
|
+{
|
|
|
+ if (env->best_task)
|
|
|
+ put_task_struct(env->best_task);
|
|
|
+ if (p)
|
|
|
+ get_task_struct(p);
|
|
|
+
|
|
|
+ env->best_task = p;
|
|
|
+ env->best_imp = imp;
|
|
|
+ env->best_cpu = env->dst_cpu;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * This checks if the overall compute and NUMA accesses of the system would
|
|
|
+ * be improved if the source task was migrated to the target dst_cpu, taking
|
|
|
+ * into account that it might be best if the task running on the dst_cpu should
|
|
|
+ * be exchanged with the source task
|
|
|
+ */
|
|
|
+static void task_numa_compare(struct task_numa_env *env,
|
|
|
+ long taskimp, long groupimp)
|
|
|
+{
|
|
|
+ struct rq *src_rq = cpu_rq(env->src_cpu);
|
|
|
+ struct rq *dst_rq = cpu_rq(env->dst_cpu);
|
|
|
+ struct task_struct *cur;
|
|
|
+ long dst_load, src_load;
|
|
|
+ long load;
|
|
|
+ long imp = (groupimp > 0) ? groupimp : taskimp;
|
|
|
+
|
|
|
+ rcu_read_lock();
|
|
|
+ cur = ACCESS_ONCE(dst_rq->curr);
|
|
|
+ if (cur->pid == 0) /* idle */
|
|
|
+ cur = NULL;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * "imp" is the fault differential for the source task between the
|
|
|
+ * source and destination node. Calculate the total differential for
|
|
|
+ * the source task and potential destination task. The more negative
|
|
|
+ * the value is, the more remote accesses would be expected to
|
|
|
+ * be incurred if the tasks were swapped.
|
|
|
+ */
|
|
|
+ if (cur) {
|
|
|
+ /* Skip this swap candidate if it cannot move to the source cpu */
|
|
|
+ if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
|
|
|
+ goto unlock;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If dst and source tasks are in the same NUMA group, or not
|
|
|
+ * in any group then look only at task weights.
|
|
|
+ */
|
|
|
+ if (cur->numa_group == env->p->numa_group) {
|
|
|
+ imp = taskimp + task_weight(cur, env->src_nid) -
|
|
|
+ task_weight(cur, env->dst_nid);
|
|
|
+ /*
|
|
|
+ * Add some hysteresis to prevent swapping the
|
|
|
+ * tasks within a group over tiny differences.
|
|
|
+ */
|
|
|
+ if (cur->numa_group)
|
|
|
+ imp -= imp/16;
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * Compare the group weights. If a task is all by
|
|
|
+ * itself (not part of a group), use the task weight
|
|
|
+ * instead.
|
|
|
+ */
|
|
|
+ if (env->p->numa_group)
|
|
|
+ imp = groupimp;
|
|
|
+ else
|
|
|
+ imp = taskimp;
|
|
|
+
|
|
|
+ if (cur->numa_group)
|
|
|
+ imp += group_weight(cur, env->src_nid) -
|
|
|
+ group_weight(cur, env->dst_nid);
|
|
|
+ else
|
|
|
+ imp += task_weight(cur, env->src_nid) -
|
|
|
+ task_weight(cur, env->dst_nid);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (imp < env->best_imp)
|
|
|
+ goto unlock;
|
|
|
+
|
|
|
+ if (!cur) {
|
|
|
+ /* Is there capacity at our destination? */
|
|
|
+ if (env->src_stats.has_capacity &&
|
|
|
+ !env->dst_stats.has_capacity)
|
|
|
+ goto unlock;
|
|
|
+
|
|
|
+ goto balance;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Balance doesn't matter much if we're running a task per cpu */
|
|
|
+ if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
|
|
|
+ goto assign;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * In the overloaded case, try and keep the load balanced.
|
|
|
+ */
|
|
|
+balance:
|
|
|
+ dst_load = env->dst_stats.load;
|
|
|
+ src_load = env->src_stats.load;
|
|
|
+
|
|
|
+ /* XXX missing power terms */
|
|
|
+ load = task_h_load(env->p);
|
|
|
+ dst_load += load;
|
|
|
+ src_load -= load;
|
|
|
+
|
|
|
+ if (cur) {
|
|
|
+ load = task_h_load(cur);
|
|
|
+ dst_load -= load;
|
|
|
+ src_load += load;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* make src_load the smaller */
|
|
|
+ if (dst_load < src_load)
|
|
|
+ swap(dst_load, src_load);
|
|
|
+
|
|
|
+ if (src_load * env->imbalance_pct < dst_load * 100)
|
|
|
+ goto unlock;
|
|
|
+
|
|
|
+assign:
|
|
|
+ task_numa_assign(env, cur, imp);
|
|
|
+unlock:
|
|
|
+ rcu_read_unlock();
|
|
|
+}
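The load guard at the end of task_numa_compare() only vetoes a move or swap when it would leave the two nodes more than imbalance_pct apart; a small numeric sketch with invented loads and the 112 starting value used by task_numa_migrate() below:

#include <stdio.h>

int main(void)
{
	long src_load = 900, dst_load = 1100;	/* hypothetical loads after the move */
	int imbalance_pct = 112;

	if (dst_load < src_load) {		/* make src_load the smaller */
		long tmp = dst_load;
		dst_load = src_load;
		src_load = tmp;
	}
	if (src_load * imbalance_pct < dst_load * 100)
		printf("rejected: %ld vs %ld is too imbalanced\n", src_load, dst_load);
	else
		printf("accepted\n");
	return 0;
}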
|
|
|
+
|
|
|
+static void task_numa_find_cpu(struct task_numa_env *env,
|
|
|
+ long taskimp, long groupimp)
|
|
|
+{
|
|
|
+ int cpu;
|
|
|
+
|
|
|
+ for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
|
|
|
+ /* Skip this CPU if the source task cannot migrate */
|
|
|
+ if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
|
|
|
+ continue;
|
|
|
+
|
|
|
+ env->dst_cpu = cpu;
|
|
|
+ task_numa_compare(env, taskimp, groupimp);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static int task_numa_migrate(struct task_struct *p)
|
|
|
+{
|
|
|
+ struct task_numa_env env = {
|
|
|
+ .p = p,
|
|
|
+
|
|
|
+ .src_cpu = task_cpu(p),
|
|
|
+ .src_nid = task_node(p),
|
|
|
+
|
|
|
+ .imbalance_pct = 112,
|
|
|
+
|
|
|
+ .best_task = NULL,
|
|
|
+ .best_imp = 0,
|
|
|
+ .best_cpu = -1
|
|
|
+ };
|
|
|
+ struct sched_domain *sd;
|
|
|
+ unsigned long taskweight, groupweight;
|
|
|
+ int nid, ret;
|
|
|
+ long taskimp, groupimp;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Pick the lowest SD_NUMA domain, as that would have the smallest
|
|
|
+ * imbalance and would be the first to start moving tasks about.
|
|
|
+ *
|
|
|
+ * And we want to avoid any moving of tasks about, as that would create
|
|
|
+ * random movement of tasks -- countering the numa conditions we're trying
|
|
|
+ * to satisfy here.
|
|
|
+ */
|
|
|
+ rcu_read_lock();
|
|
|
+ sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
|
|
|
+ env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
|
|
|
+ rcu_read_unlock();
|
|
|
+
|
|
|
+ taskweight = task_weight(p, env.src_nid);
|
|
|
+ groupweight = group_weight(p, env.src_nid);
|
|
|
+ update_numa_stats(&env.src_stats, env.src_nid);
|
|
|
+ env.dst_nid = p->numa_preferred_nid;
|
|
|
+ taskimp = task_weight(p, env.dst_nid) - taskweight;
|
|
|
+ groupimp = group_weight(p, env.dst_nid) - groupweight;
|
|
|
+ update_numa_stats(&env.dst_stats, env.dst_nid);
|
|
|
+
|
|
|
+ /* If the preferred nid has capacity, try to use it. */
|
|
|
+ if (env.dst_stats.has_capacity)
|
|
|
+ task_numa_find_cpu(&env, taskimp, groupimp);
|
|
|
+
|
|
|
+ /* No space available on the preferred nid. Look elsewhere. */
|
|
|
+ if (env.best_cpu == -1) {
|
|
|
+ for_each_online_node(nid) {
|
|
|
+ if (nid == env.src_nid || nid == p->numa_preferred_nid)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ /* Only consider nodes where both task and groups benefit */
|
|
|
+ taskimp = task_weight(p, nid) - taskweight;
|
|
|
+ groupimp = group_weight(p, nid) - groupweight;
|
|
|
+ if (taskimp < 0 && groupimp < 0)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ env.dst_nid = nid;
|
|
|
+ update_numa_stats(&env.dst_stats, env.dst_nid);
|
|
|
+ task_numa_find_cpu(&env, taskimp, groupimp);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /* No better CPU than the current one was found. */
|
|
|
+ if (env.best_cpu == -1)
|
|
|
+ return -EAGAIN;
|
|
|
+
|
|
|
+ sched_setnuma(p, env.dst_nid);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Reset the scan period if the task is being rescheduled on an
|
|
|
+ * alternative node to recheck if the task is now properly placed.
|
|
|
+ */
|
|
|
+ p->numa_scan_period = task_scan_min(p);
|
|
|
+
|
|
|
+ if (env.best_task == NULL) {
|
|
|
+ int ret = migrate_task_to(p, env.best_cpu);
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = migrate_swap(p, env.best_task);
|
|
|
+ put_task_struct(env.best_task);
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/* Attempt to migrate a task to a CPU on the preferred node. */
|
|
|
+static void numa_migrate_preferred(struct task_struct *p)
|
|
|
+{
|
|
|
+ /* This task has no NUMA fault statistics yet */
|
|
|
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
|
|
|
+ return;
|
|
|
+
|
|
|
+ /* Periodically retry migrating the task to the preferred node */
|
|
|
+ p->numa_migrate_retry = jiffies + HZ;
|
|
|
+
|
|
|
+ /* Success if task is already running on preferred CPU */
|
|
|
+ if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
|
|
|
+ return;
|
|
|
+
|
|
|
+ /* Otherwise, try migrate to a CPU on the preferred node */
|
|
|
+ task_numa_migrate(p);
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
|
|
|
+ * increments. The more local the fault statistics are, the higher the scan
|
|
|
+ * period will be for the next scan window. If local/remote ratio is below
|
|
|
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
|
|
|
+ * scan period will decrease
|
|
|
+ */
|
|
|
+#define NUMA_PERIOD_SLOTS 10
|
|
|
+#define NUMA_PERIOD_THRESHOLD 3
|
|
|
+
|
|
|
+/*
|
|
|
+ * Increase the scan period (slow down scanning) if the majority of
|
|
|
+ * our memory is already on our local node, or if the majority of
|
|
|
+ * the page accesses are shared with other processes.
|
|
|
+ * Otherwise, decrease the scan period.
|
|
|
+ */
|
|
|
+static void update_task_scan_period(struct task_struct *p,
|
|
|
+ unsigned long shared, unsigned long private)
|
|
|
+{
|
|
|
+ unsigned int period_slot;
|
|
|
+ int ratio;
|
|
|
+ int diff;
|
|
|
+
|
|
|
+ unsigned long remote = p->numa_faults_locality[0];
|
|
|
+ unsigned long local = p->numa_faults_locality[1];
|
|
|
+
|
|
|
+ /*
|
|
|
+ * If there were no recorded hinting faults then either the task is
|
|
|
+ * completely idle or all activity is in areas that are not of interest
|
|
|
+ * to automatic numa balancing. Scan slower.
|
|
|
+ */
|
|
|
+ if (local + shared == 0) {
|
|
|
+ p->numa_scan_period = min(p->numa_scan_period_max,
|
|
|
+ p->numa_scan_period << 1);
|
|
|
+
|
|
|
+ p->mm->numa_next_scan = jiffies +
|
|
|
+ msecs_to_jiffies(p->numa_scan_period);
|
|
|
|
|
|
- if (!p->mm) /* for example, ksmd faulting in a user's mm */
|
|
|
return;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Prepare to scale scan period relative to the current period.
|
|
|
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
|
|
|
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
|
|
|
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
|
|
|
+ */
|
|
|
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
|
|
|
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
|
|
|
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
|
|
|
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
|
|
|
+ if (!slot)
|
|
|
+ slot = 1;
|
|
|
+ diff = slot * period_slot;
|
|
|
+ } else {
|
|
|
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Scale scan rate increases based on sharing. There is an
|
|
|
+ * inverse relationship between the degree of sharing and
|
|
|
+ * the adjustment made to the scanning period. Broadly
|
|
|
+ * speaking the intent is that there is little point
|
|
|
+ * scanning faster if shared accesses dominate as it may
|
|
|
+ * simply bounce migrations uselessly
|
|
|
+ */
|
|
|
+ period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
|
|
|
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
|
|
|
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
|
|
|
+ }
|
|
|
+
|
|
|
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
|
|
|
+ task_scan_min(p), task_scan_max(p));
|
|
|
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
|
|
|
+}
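A worked sketch of the slot arithmetic in update_task_scan_period() for a hypothetical 1000ms period whose last window was 70% local; the sketch leaves out the extra private/shared scaling that the real code applies to decreases:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3

int main(void)
{
	unsigned int period = 1000;			/* current scan period in ms */
	unsigned long local = 70, remote = 30;		/* last window's hinting faults */
	unsigned int period_slot = (period + NUMA_PERIOD_SLOTS - 1) / NUMA_PERIOD_SLOTS;
	int ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int diff;

	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;	/* mostly local: scan slower */
	} else {
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;	/* scan faster */
	}
	printf("scan period %u -> %u ms before clamping\n", period, period + diff);
	return 0;
}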
|
|
|
+
|
|
|
+static void task_numa_placement(struct task_struct *p)
|
|
|
+{
|
|
|
+ int seq, nid, max_nid = -1, max_group_nid = -1;
|
|
|
+ unsigned long max_faults = 0, max_group_faults = 0;
|
|
|
+ unsigned long fault_types[2] = { 0, 0 };
|
|
|
+ spinlock_t *group_lock = NULL;
|
|
|
+
|
|
|
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
|
|
|
if (p->numa_scan_seq == seq)
|
|
|
return;
|
|
|
p->numa_scan_seq = seq;
|
|
|
+ p->numa_scan_period_max = task_scan_max(p);
|
|
|
+
|
|
|
+ /* If the task is part of a group prevent parallel updates to group stats */
|
|
|
+ if (p->numa_group) {
|
|
|
+ group_lock = &p->numa_group->lock;
|
|
|
+ spin_lock(group_lock);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Find the node with the highest number of faults */
|
|
|
+ for_each_online_node(nid) {
|
|
|
+ unsigned long faults = 0, group_faults = 0;
|
|
|
+ int priv, i;
|
|
|
+
|
|
|
+ for (priv = 0; priv < 2; priv++) {
|
|
|
+ long diff;
|
|
|
+
|
|
|
+ i = task_faults_idx(nid, priv);
|
|
|
+ diff = -p->numa_faults[i];
|
|
|
+
|
|
|
+ /* Decay existing window, copy faults since last scan */
|
|
|
+ p->numa_faults[i] >>= 1;
|
|
|
+ p->numa_faults[i] += p->numa_faults_buffer[i];
|
|
|
+ fault_types[priv] += p->numa_faults_buffer[i];
|
|
|
+ p->numa_faults_buffer[i] = 0;
|
|
|
+
|
|
|
+ faults += p->numa_faults[i];
|
|
|
+ diff += p->numa_faults[i];
|
|
|
+ p->total_numa_faults += diff;
|
|
|
+ if (p->numa_group) {
|
|
|
+ /* safe because we can only change our own group */
|
|
|
+ p->numa_group->faults[i] += diff;
|
|
|
+ p->numa_group->total_faults += diff;
|
|
|
+ group_faults += p->numa_group->faults[i];
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if (faults > max_faults) {
|
|
|
+ max_faults = faults;
|
|
|
+ max_nid = nid;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (group_faults > max_group_faults) {
|
|
|
+ max_group_faults = group_faults;
|
|
|
+ max_group_nid = nid;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
|
|
|
+
|
|
|
+ if (p->numa_group) {
|
|
|
+ /*
|
|
|
+ * If the preferred task and group nids are different,
|
|
|
+ * iterate over the nodes again to find the best place.
|
|
|
+ */
|
|
|
+ if (max_nid != max_group_nid) {
|
|
|
+ unsigned long weight, max_weight = 0;
|
|
|
+
|
|
|
+ for_each_online_node(nid) {
|
|
|
+ weight = task_weight(p, nid) + group_weight(p, nid);
|
|
|
+ if (weight > max_weight) {
|
|
|
+ max_weight = weight;
|
|
|
+ max_nid = nid;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ spin_unlock(group_lock);
|
|
|
+ }
|
|
|
|
|
|
- /* FIXME: Scheduling placement policy hints go here */
|
|
|
+ /* Set the preferred node to the node with the most faults */
|
|
|
+ if (max_faults && max_nid != p->numa_preferred_nid) {
|
|
|
+ /* Update the preferred nid and migrate task if possible */
|
|
|
+ sched_setnuma(p, max_nid);
|
|
|
+ numa_migrate_preferred(p);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static inline int get_numa_group(struct numa_group *grp)
|
|
|
+{
|
|
|
+ return atomic_inc_not_zero(&grp->refcount);
|
|
|
+}
|
|
|
+
|
|
|
+static inline void put_numa_group(struct numa_group *grp)
|
|
|
+{
|
|
|
+ if (atomic_dec_and_test(&grp->refcount))
|
|
|
+ kfree_rcu(grp, rcu);
|
|
|
+}
|
|
|
+
|
|
|
+static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
|
|
+ int *priv)
|
|
|
+{
|
|
|
+ struct numa_group *grp, *my_grp;
|
|
|
+ struct task_struct *tsk;
|
|
|
+ bool join = false;
|
|
|
+ int cpu = cpupid_to_cpu(cpupid);
|
|
|
+ int i;
|
|
|
+
|
|
|
+ if (unlikely(!p->numa_group)) {
|
|
|
+ unsigned int size = sizeof(struct numa_group) +
|
|
|
+ 2*nr_node_ids*sizeof(unsigned long);
|
|
|
+
|
|
|
+ grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
|
|
|
+ if (!grp)
|
|
|
+ return;
|
|
|
+
|
|
|
+ atomic_set(&grp->refcount, 1);
|
|
|
+ spin_lock_init(&grp->lock);
|
|
|
+ INIT_LIST_HEAD(&grp->task_list);
|
|
|
+ grp->gid = p->pid;
|
|
|
+
|
|
|
+ for (i = 0; i < 2*nr_node_ids; i++)
|
|
|
+ grp->faults[i] = p->numa_faults[i];
|
|
|
+
|
|
|
+ grp->total_faults = p->total_numa_faults;
|
|
|
+
|
|
|
+ list_add(&p->numa_entry, &grp->task_list);
|
|
|
+ grp->nr_tasks++;
|
|
|
+ rcu_assign_pointer(p->numa_group, grp);
|
|
|
+ }
|
|
|
+
|
|
|
+ rcu_read_lock();
|
|
|
+ tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
|
|
|
+
|
|
|
+ if (!cpupid_match_pid(tsk, cpupid))
|
|
|
+ goto no_join;
|
|
|
+
|
|
|
+ grp = rcu_dereference(tsk->numa_group);
|
|
|
+ if (!grp)
|
|
|
+ goto no_join;
|
|
|
+
|
|
|
+ my_grp = p->numa_group;
|
|
|
+ if (grp == my_grp)
|
|
|
+ goto no_join;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Only join the other group if it's bigger; if we're the bigger group,
|
|
|
+ * the other task will join us.
|
|
|
+ */
|
|
|
+ if (my_grp->nr_tasks > grp->nr_tasks)
|
|
|
+ goto no_join;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Tie-break on the grp address.
|
|
|
+ */
|
|
|
+ if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
|
|
|
+ goto no_join;
|
|
|
+
|
|
|
+ /* Always join threads in the same process. */
|
|
|
+ if (tsk->mm == current->mm)
|
|
|
+ join = true;
|
|
|
+
|
|
|
+ /* Simple filter to avoid false positives due to PID collisions */
|
|
|
+ if (flags & TNF_SHARED)
|
|
|
+ join = true;
|
|
|
+
|
|
|
+ /* Update priv based on whether false sharing was detected */
|
|
|
+ *priv = !join;
|
|
|
+
|
|
|
+ if (join && !get_numa_group(grp))
|
|
|
+ goto no_join;
|
|
|
+
|
|
|
+ rcu_read_unlock();
|
|
|
+
|
|
|
+ if (!join)
|
|
|
+ return;
|
|
|
+
|
|
|
+ double_lock(&my_grp->lock, &grp->lock);
|
|
|
+
|
|
|
+ for (i = 0; i < 2*nr_node_ids; i++) {
|
|
|
+ my_grp->faults[i] -= p->numa_faults[i];
|
|
|
+ grp->faults[i] += p->numa_faults[i];
|
|
|
+ }
|
|
|
+ my_grp->total_faults -= p->total_numa_faults;
|
|
|
+ grp->total_faults += p->total_numa_faults;
|
|
|
+
|
|
|
+ list_move(&p->numa_entry, &grp->task_list);
|
|
|
+ my_grp->nr_tasks--;
|
|
|
+ grp->nr_tasks++;
|
|
|
+
|
|
|
+ spin_unlock(&my_grp->lock);
|
|
|
+ spin_unlock(&grp->lock);
|
|
|
+
|
|
|
+ rcu_assign_pointer(p->numa_group, grp);
|
|
|
+
|
|
|
+ put_numa_group(my_grp);
|
|
|
+ return;
|
|
|
+
|
|
|
+no_join:
|
|
|
+ rcu_read_unlock();
|
|
|
+ return;
|
|
|
+}
|
|
|
+
|
|
|
+void task_numa_free(struct task_struct *p)
|
|
|
+{
|
|
|
+ struct numa_group *grp = p->numa_group;
|
|
|
+ int i;
|
|
|
+ void *numa_faults = p->numa_faults;
|
|
|
+
|
|
|
+ if (grp) {
|
|
|
+ spin_lock(&grp->lock);
|
|
|
+ for (i = 0; i < 2*nr_node_ids; i++)
|
|
|
+ grp->faults[i] -= p->numa_faults[i];
|
|
|
+ grp->total_faults -= p->total_numa_faults;
|
|
|
+
|
|
|
+ list_del(&p->numa_entry);
|
|
|
+ grp->nr_tasks--;
|
|
|
+ spin_unlock(&grp->lock);
|
|
|
+ rcu_assign_pointer(p->numa_group, NULL);
|
|
|
+ put_numa_group(grp);
|
|
|
+ }
|
|
|
+
|
|
|
+ p->numa_faults = NULL;
|
|
|
+ p->numa_faults_buffer = NULL;
|
|
|
+ kfree(numa_faults);
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
* Got a PROT_NONE fault for a page on @node.
|
|
|
*/
|
|
|
-void task_numa_fault(int node, int pages, bool migrated)
|
|
|
+void task_numa_fault(int last_cpupid, int node, int pages, int flags)
|
|
|
{
|
|
|
struct task_struct *p = current;
|
|
|
+ bool migrated = flags & TNF_MIGRATED;
|
|
|
+ int priv;
|
|
|
|
|
|
if (!numabalancing_enabled)
|
|
|
return;
|
|
|
|
|
|
- /* FIXME: Allocate task-specific structure for placement policy here */
|
|
|
+ /* for example, ksmd faulting in a user's mm */
|
|
|
+ if (!p->mm)
|
|
|
+ return;
|
|
|
+
|
|
|
+ /* Do not worry about placement if exiting */
|
|
|
+ if (p->state == TASK_DEAD)
|
|
|
+ return;
|
|
|
+
|
|
|
+ /* Allocate buffer to track faults on a per-node basis */
|
|
|
+ if (unlikely(!p->numa_faults)) {
|
|
|
+ int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
|
|
|
+
|
|
|
+ /* numa_faults and numa_faults_buffer share the allocation */
|
|
|
+ p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
|
|
|
+ if (!p->numa_faults)
|
|
|
+ return;
|
|
|
+
|
|
|
+ BUG_ON(p->numa_faults_buffer);
|
|
|
+ p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
|
|
|
+ p->total_numa_faults = 0;
|
|
|
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
|
|
|
+ }
|
|
|
|
|
|
/*
|
|
|
- * If pages are properly placed (did not migrate) then scan slower.
|
|
|
- * This is reset periodically in case of phase changes
|
|
|
+ * First accesses are treated as private, otherwise consider accesses
|
|
|
+ * to be private if the accessing pid has not changed
|
|
|
*/
|
|
|
- if (!migrated)
|
|
|
- p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
|
|
|
- p->numa_scan_period + jiffies_to_msecs(10));
|
|
|
+ if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
|
|
|
+ priv = 1;
|
|
|
+ } else {
|
|
|
+ priv = cpupid_match_pid(p, last_cpupid);
|
|
|
+ if (!priv && !(flags & TNF_NO_GROUP))
|
|
|
+ task_numa_group(p, last_cpupid, flags, &priv);
|
|
|
+ }
|
|
|
|
|
|
task_numa_placement(p);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Retry task to preferred node migration periodically, in case it
|
|
|
+ * previously failed, or the scheduler moved us.
|
|
|
+ */
|
|
|
+ if (time_after(jiffies, p->numa_migrate_retry))
|
|
|
+ numa_migrate_preferred(p);
|
|
|
+
|
|
|
+ if (migrated)
|
|
|
+ p->numa_pages_migrated += pages;
|
|
|
+
|
|
|
+ p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
|
|
|
+ p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
|
|
|
}
|
|
|
|
|
|
static void reset_ptenuma_scan(struct task_struct *p)
|
|
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
|
|
|
struct mm_struct *mm = p->mm;
|
|
|
struct vm_area_struct *vma;
|
|
|
unsigned long start, end;
|
|
|
+ unsigned long nr_pte_updates = 0;
|
|
|
long pages;
|
|
|
|
|
|
WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
|
|
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
|
|
|
if (p->flags & PF_EXITING)
|
|
|
return;
|
|
|
|
|
|
- /*
|
|
|
- * We do not care about task placement until a task runs on a node
|
|
|
- * other than the first one used by the address space. This is
|
|
|
- * largely because migrations are driven by what CPU the task
|
|
|
- * is running on. If it's never scheduled on another node, it'll
|
|
|
- * not migrate so why bother trapping the fault.
|
|
|
- */
|
|
|
- if (mm->first_nid == NUMA_PTE_SCAN_INIT)
|
|
|
- mm->first_nid = numa_node_id();
|
|
|
- if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
|
|
|
- /* Are we running on a new node yet? */
|
|
|
- if (numa_node_id() == mm->first_nid &&
|
|
|
- !sched_feat_numa(NUMA_FORCE))
|
|
|
- return;
|
|
|
-
|
|
|
- mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * Reset the scan period if enough time has gone by. Objective is that
|
|
|
- * scanning will be reduced if pages are properly placed. As tasks
|
|
|
- * can enter different phases this needs to be re-examined. Lacking
|
|
|
- * proper tracking of reference behaviour, this blunt hammer is used.
|
|
|
- */
|
|
|
- migrate = mm->numa_next_reset;
|
|
|
- if (time_after(now, migrate)) {
|
|
|
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
|
|
|
- next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
|
|
|
- xchg(&mm->numa_next_reset, next_scan);
|
|
|
+ if (!mm->numa_next_scan) {
|
|
|
+ mm->numa_next_scan = now +
|
|
|
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
|
|
|
if (time_before(now, migrate))
|
|
|
return;
|
|
|
|
|
|
- if (p->numa_scan_period == 0)
|
|
|
- p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
|
|
|
+ if (p->numa_scan_period == 0) {
|
|
|
+ p->numa_scan_period_max = task_scan_max(p);
|
|
|
+ p->numa_scan_period = task_scan_min(p);
|
|
|
+ }
|
|
|
|
|
|
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
|
|
|
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
|
|
|
return;
|
|
|
|
|
|
/*
|
|
|
- * Do not set pte_numa if the current running node is rate-limited.
|
|
|
- * This loses statistics on the fault but if we are unwilling to
|
|
|
- * migrate to this node, it is less likely we can do useful work
|
|
|
+ * Delay this task enough that another task of this mm will likely win
|
|
|
+ * the next time around.
|
|
|
*/
|
|
|
- if (migrate_ratelimited(numa_node_id()))
|
|
|
- return;
|
|
|
+ p->node_stamp += 2 * TICK_NSEC;
|
|
|
|
|
|
start = mm->numa_scan_offset;
|
|
|
pages = sysctl_numa_balancing_scan_size;
|
|
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
|
|
|
vma = mm->mmap;
|
|
|
}
|
|
|
for (; vma; vma = vma->vm_next) {
|
|
|
- if (!vma_migratable(vma))
|
|
|
+ if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
|
|
|
continue;
|
|
|
|
|
|
- /* Skip small VMAs. They are not likely to be of relevance */
|
|
|
- if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
|
|
|
+ /*
|
|
|
+ * Shared library pages mapped by multiple processes are not
|
|
|
+ * migrated as it is expected they are cache replicated. Avoid
|
|
|
+ * hinting faults in read-only file-backed mappings or the vdso
|
|
|
+ * as migrating the pages will be of marginal benefit.
|
|
|
+ */
|
|
|
+ if (!vma->vm_mm ||
|
|
|
+ (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
|
|
|
continue;
|
|
|
|
|
|
do {
|
|
|
start = max(start, vma->vm_start);
|
|
|
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
|
|
|
end = min(end, vma->vm_end);
|
|
|
- pages -= change_prot_numa(vma, start, end);
|
|
|
+ nr_pte_updates += change_prot_numa(vma, start, end);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Scan sysctl_numa_balancing_scan_size but ensure that
|
|
|
+ * at least one PTE is updated so that unused virtual
|
|
|
+ * address space is quickly skipped.
|
|
|
+ */
|
|
|
+ if (nr_pte_updates)
|
|
|
+ pages -= (end - start) >> PAGE_SHIFT;
|
|
|
|
|
|
start = end;
|
|
|
if (pages <= 0)
|
|
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
|
|
|
|
|
|
out:
|
|
|
/*
|
|
|
- * It is possible to reach the end of the VMA list but the last few VMAs are
|
|
|
- * not guaranteed to the vma_migratable. If they are not, we would find the
|
|
|
- * !migratable VMA on the next scan but not reset the scanner to the start
|
|
|
- * so check it now.
|
|
|
+ * It is possible to reach the end of the VMA list but the last few
|
|
|
+ * VMAs are not guaranteed to be vma_migratable. If they are not, we
|
|
|
+ * would find the !migratable VMA on the next scan but not reset the
|
|
|
+ * scanner to the start so check it now.
|
|
|
*/
|
|
|
if (vma)
|
|
|
mm->numa_scan_offset = start;
|
|
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
|
|
|
|
|
|
if (now - curr->node_stamp > period) {
|
|
|
if (!curr->node_stamp)
|
|
|
- curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
|
|
|
- curr->node_stamp = now;
|
|
|
+ curr->numa_scan_period = task_scan_min(curr);
|
|
|
+ curr->node_stamp += period;
|
|
|
|
|
|
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
|
|
|
init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
|
|
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
|
|
|
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
|
|
|
{
|
|
|
}
|
|
|
+
|
|
|
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
|
|
|
+{
|
|
|
+}
|
|
|
+
|
|
|
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
|
|
|
+{
|
|
|
+}
|
|
|
#endif /* CONFIG_NUMA_BALANCING */
|
|
|
|
|
|
static void
|
|
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
if (!parent_entity(se))
|
|
|
update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
|
|
|
#ifdef CONFIG_SMP
|
|
|
- if (entity_is_task(se))
|
|
|
- list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
|
|
|
+ if (entity_is_task(se)) {
|
|
|
+ struct rq *rq = rq_of(cfs_rq);
|
|
|
+
|
|
|
+ account_numa_enqueue(rq, task_of(se));
|
|
|
+ list_add(&se->group_node, &rq->cfs_tasks);
|
|
|
+ }
|
|
|
#endif
|
|
|
cfs_rq->nr_running++;
|
|
|
}
|
|
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
update_load_sub(&cfs_rq->load, se->load.weight);
|
|
|
if (!parent_entity(se))
|
|
|
update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
|
|
|
- if (entity_is_task(se))
|
|
|
+ if (entity_is_task(se)) {
|
|
|
+ account_numa_dequeue(rq_of(cfs_rq), task_of(se));
|
|
|
list_del_init(&se->group_node);
|
|
|
+ }
|
|
|
cfs_rq->nr_running--;
|
|
|
}
|
|
|
|
|
@@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
|
|
|
return static_key_false(&__cfs_bandwidth_used);
|
|
|
}
|
|
|
|
|
|
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
|
|
|
+void cfs_bandwidth_usage_inc(void)
|
|
|
{
|
|
|
- /* only need to count groups transitioning between enabled/!enabled */
|
|
|
- if (enabled && !was_enabled)
|
|
|
- static_key_slow_inc(&__cfs_bandwidth_used);
|
|
|
- else if (!enabled && was_enabled)
|
|
|
- static_key_slow_dec(&__cfs_bandwidth_used);
|
|
|
+ static_key_slow_inc(&__cfs_bandwidth_used);
|
|
|
+}
|
|
|
+
|
|
|
+void cfs_bandwidth_usage_dec(void)
|
|
|
+{
|
|
|
+ static_key_slow_dec(&__cfs_bandwidth_used);
|
|
|
}
|
|
|
#else /* HAVE_JUMP_LABEL */
|
|
|
static bool cfs_bandwidth_used(void)
|
|
@@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
|
|
|
return true;
|
|
|
}
|
|
|
|
|
|
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
|
|
|
+void cfs_bandwidth_usage_inc(void) {}
|
|
|
+void cfs_bandwidth_usage_dec(void) {}
|
|
|
#endif /* HAVE_JUMP_LABEL */
|
|
|
|
|
|
/*
|
|
@@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
cfs_rq->throttled_clock = rq_clock(rq);
|
|
|
raw_spin_lock(&cfs_b->lock);
|
|
|
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
|
|
|
+ if (!cfs_b->timer_active)
|
|
|
+ __start_cfs_bandwidth(cfs_b);
|
|
|
raw_spin_unlock(&cfs_b->lock);
|
|
|
}
|
|
|
|
|
@@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
|
|
if (idle)
|
|
|
goto out_unlock;
|
|
|
|
|
|
+ /*
|
|
|
+ * if we have relooped after returning idle once, we need to update our
|
|
|
+ * status as actually running, so that other cpus doing
|
|
|
+ * __start_cfs_bandwidth will stop trying to cancel us.
|
|
|
+ */
|
|
|
+ cfs_b->timer_active = 1;
|
|
|
+
|
|
|
__refill_cfs_bandwidth_runtime(cfs_b);
|
|
|
|
|
|
if (!throttled) {
|
|
@@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
|
|
|
/* how long we wait to gather additional slack before distributing */
|
|
|
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
|
|
|
|
|
|
-/* are we near the end of the current quota period? */
|
|
|
+/*
|
|
|
+ * Are we near the end of the current quota period?
|
|
|
+ *
|
|
|
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
|
|
|
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
|
|
|
+ * migrate_hrtimers, base is never cleared, so we are fine.
|
|
|
+ */
|
|
|
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
|
|
|
{
|
|
|
struct hrtimer *refresh_timer = &cfs_b->period_timer;
|
|
@@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
|
|
u64 expires;
|
|
|
|
|
|
/* confirm we're still not at a refresh boundary */
|
|
|
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
|
|
|
+ raw_spin_lock(&cfs_b->lock);
|
|
|
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
|
|
|
+ raw_spin_unlock(&cfs_b->lock);
|
|
|
return;
|
|
|
+ }
|
|
|
|
|
|
- raw_spin_lock(&cfs_b->lock);
|
|
|
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
|
|
|
runtime = cfs_b->runtime;
|
|
|
cfs_b->runtime = 0;
|
|
@@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
|
|
* (timer_active==0 becomes visible before the hrtimer call-back
|
|
|
* terminates). In either case we ensure that it's re-programmed
|
|
|
*/
|
|
|
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
|
|
|
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
|
|
|
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
|
|
|
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
|
|
|
raw_spin_unlock(&cfs_b->lock);
|
|
|
- /* ensure cfs_b->lock is available while we wait */
|
|
|
- hrtimer_cancel(&cfs_b->period_timer);
|
|
|
-
|
|
|
+ cpu_relax();
|
|
|
raw_spin_lock(&cfs_b->lock);
|
|
|
/* if someone else restarted the timer then we're done */
|
|
|
if (cfs_b->timer_active)
|
|
@@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
{
|
|
|
struct sched_entity *se = tg->se[cpu];
|
|
|
|
|
|
- if (!tg->parent) /* the trivial, non-cgroup case */
|
|
|
+ if (!tg->parent || !wl) /* the trivial, non-cgroup case */
|
|
|
return wl;
|
|
|
|
|
|
for_each_sched_entity(se) {
|
|
@@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
}
|
|
|
#else
|
|
|
|
|
|
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
|
|
|
- unsigned long wl, unsigned long wg)
|
|
|
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
{
|
|
|
return wl;
|
|
|
}
|
|
@@ -3420,11 +4213,10 @@ done:
|
|
|
* preempt must be disabled.
|
|
|
*/
|
|
|
static int
|
|
|
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
|
|
|
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
|
|
|
{
|
|
|
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
|
|
|
int cpu = smp_processor_id();
|
|
|
- int prev_cpu = task_cpu(p);
|
|
|
int new_cpu = cpu;
|
|
|
int want_affine = 0;
|
|
|
int sync = wake_flags & WF_SYNC;
|
|
@@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
|
|
|
|
|
|
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
|
|
|
|
|
|
+enum fbq_type { regular, remote, all };
|
|
|
+
|
|
|
#define LBF_ALL_PINNED 0x01
|
|
|
#define LBF_NEED_BREAK 0x02
|
|
|
-#define LBF_SOME_PINNED 0x04
|
|
|
+#define LBF_DST_PINNED 0x04
|
|
|
+#define LBF_SOME_PINNED 0x08
|
|
|
|
|
|
struct lb_env {
|
|
|
struct sched_domain *sd;
|
|
@@ -3929,6 +4724,8 @@ struct lb_env {
|
|
|
unsigned int loop;
|
|
|
unsigned int loop_break;
|
|
|
unsigned int loop_max;
|
|
|
+
|
|
|
+ enum fbq_type fbq_type;
|
|
|
};
|
|
|
|
|
|
/*
|
|
@@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
|
|
|
return delta < (s64)sysctl_sched_migration_cost;
|
|
|
}
|
|
|
|
|
|
+#ifdef CONFIG_NUMA_BALANCING
|
|
|
+/* Returns true if the destination node has incurred more faults */
|
|
|
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
|
|
+{
|
|
|
+ int src_nid, dst_nid;
|
|
|
+
|
|
|
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
|
|
|
+ !(env->sd->flags & SD_NUMA)) {
|
|
|
+ return false;
|
|
|
+ }
|
|
|
+
|
|
|
+ src_nid = cpu_to_node(env->src_cpu);
|
|
|
+ dst_nid = cpu_to_node(env->dst_cpu);
|
|
|
+
|
|
|
+ if (src_nid == dst_nid)
|
|
|
+ return false;
|
|
|
+
|
|
|
+ /* Always encourage migration to the preferred node. */
|
|
|
+ if (dst_nid == p->numa_preferred_nid)
|
|
|
+ return true;
|
|
|
+
|
|
|
+ /* If both task and group weight improve, this move is a winner. */
|
|
|
+ if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
|
|
|
+ group_weight(p, dst_nid) > group_weight(p, src_nid))
|
|
|
+ return true;
|
|
|
+
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|
|
+{
|
|
|
+ int src_nid, dst_nid;
|
|
|
+
|
|
|
+ if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
|
|
|
+ return false;
|
|
|
+
|
|
|
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
|
|
|
+ return false;
|
|
|
+
|
|
|
+ src_nid = cpu_to_node(env->src_cpu);
|
|
|
+ dst_nid = cpu_to_node(env->dst_cpu);
|
|
|
+
|
|
|
+ if (src_nid == dst_nid)
|
|
|
+ return false;
|
|
|
+
|
|
|
+ /* Migrating away from the preferred node is always bad. */
|
|
|
+ if (src_nid == p->numa_preferred_nid)
|
|
|
+ return true;
|
|
|
+
|
|
|
+ /* If either task or group weight get worse, don't do it. */
|
|
|
+ if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
|
|
|
+ group_weight(p, dst_nid) < group_weight(p, src_nid))
|
|
|
+ return true;
|
|
|
+
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+#else
|
|
|
+static inline bool migrate_improves_locality(struct task_struct *p,
|
|
|
+ struct lb_env *env)
|
|
|
+{
|
|
|
+ return false;
|
|
|
+}
|
|
|
+
|
|
|
+static inline bool migrate_degrades_locality(struct task_struct *p,
|
|
|
+ struct lb_env *env)
|
|
|
+{
|
|
|
+ return false;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
/*
|
|
|
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
|
|
|
*/
|
|
@@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
|
|
|
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
|
|
|
|
|
|
+ env->flags |= LBF_SOME_PINNED;
|
|
|
+
|
|
|
/*
|
|
|
* Remember if this task can be migrated to any other cpu in
|
|
|
* our sched_group. We may want to revisit it if we couldn't
|
|
@@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
* Also avoid computing new_dst_cpu if we have already computed
|
|
|
* one in current iteration.
|
|
|
*/
|
|
|
- if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
|
|
|
+ if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
|
|
|
return 0;
|
|
|
|
|
|
/* Prevent to re-select dst_cpu via env's cpus */
|
|
|
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
|
|
|
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
|
|
|
- env->flags |= LBF_SOME_PINNED;
|
|
|
+ env->flags |= LBF_DST_PINNED;
|
|
|
env->new_dst_cpu = cpu;
|
|
|
break;
|
|
|
}
|
|
@@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
|
|
|
/*
|
|
|
* Aggressive migration if:
|
|
|
- * 1) task is cache cold, or
|
|
|
- * 2) too many balance attempts have failed.
|
|
|
+ * 1) destination numa is preferred
|
|
|
+ * 2) task is cache cold, or
|
|
|
+ * 3) too many balance attempts have failed.
|
|
|
*/
|
|
|
-
|
|
|
tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
|
|
|
+ if (!tsk_cache_hot)
|
|
|
+ tsk_cache_hot = migrate_degrades_locality(p, env);
|
|
|
+
|
|
|
+ if (migrate_improves_locality(p, env)) {
|
|
|
+#ifdef CONFIG_SCHEDSTATS
|
|
|
+ if (tsk_cache_hot) {
|
|
|
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
|
|
|
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
|
|
|
+ }
|
|
|
+#endif
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
if (!tsk_cache_hot ||
|
|
|
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
|
|
|
|
|
@@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static unsigned long task_h_load(struct task_struct *p);
|
|
|
-
|
|
|
static const unsigned int sched_nr_migrate_break = 32;
|
|
|
|
|
|
/*
|
|
@@ -4291,6 +5173,10 @@ struct sg_lb_stats {
|
|
|
unsigned int group_weight;
|
|
|
int group_imb; /* Is there an imbalance in the group ? */
|
|
|
int group_has_capacity; /* Is there extra capacity in the group? */
|
|
|
+#ifdef CONFIG_NUMA_BALANCING
|
|
|
+ unsigned int nr_numa_running;
|
|
|
+ unsigned int nr_preferred_running;
|
|
|
+#endif
|
|
|
};
|
|
|
|
|
|
/*
|
|
@@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
|
|
|
/**
|
|
|
* get_sd_load_idx - Obtain the load index for a given sched domain.
|
|
|
* @sd: The sched_domain whose load_idx is to be obtained.
|
|
|
- * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
|
|
|
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
|
|
|
*
|
|
|
* Return: The load index.
|
|
|
*/
|
|
@@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
|
|
|
{
|
|
|
struct sched_domain *child = sd->child;
|
|
|
struct sched_group *group, *sdg = sd->groups;
|
|
|
- unsigned long power;
|
|
|
+ unsigned long power, power_orig;
|
|
|
unsigned long interval;
|
|
|
|
|
|
interval = msecs_to_jiffies(sd->balance_interval);
|
|
@@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
- power = 0;
|
|
|
+ power_orig = power = 0;
|
|
|
|
|
|
if (child->flags & SD_OVERLAP) {
|
|
|
/*
|
|
@@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
|
|
|
* span the current group.
|
|
|
*/
|
|
|
|
|
|
- for_each_cpu(cpu, sched_group_cpus(sdg))
|
|
|
- power += power_of(cpu);
|
|
|
+ for_each_cpu(cpu, sched_group_cpus(sdg)) {
|
|
|
+ struct sched_group *sg = cpu_rq(cpu)->sd->groups;
|
|
|
+
|
|
|
+ power_orig += sg->sgp->power_orig;
|
|
|
+ power += sg->sgp->power;
|
|
|
+ }
|
|
|
} else {
|
|
|
/*
|
|
|
* !SD_OVERLAP domains can assume that child groups
|
|
@@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
|
|
|
|
|
|
group = child->groups;
|
|
|
do {
|
|
|
+ power_orig += group->sgp->power_orig;
|
|
|
power += group->sgp->power;
|
|
|
group = group->next;
|
|
|
} while (group != child->groups);
|
|
|
}
|
|
|
|
|
|
- sdg->sgp->power_orig = sdg->sgp->power = power;
|
|
|
+ sdg->sgp->power_orig = power_orig;
|
|
|
+ sdg->sgp->power = power;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
|
|
|
* cpu 3 and leave one of the cpus in the second group unused.
|
|
|
*
|
|
|
* The current solution to this issue is detecting the skew in the first group
|
|
|
- * by noticing it has a cpu that is overloaded while the remaining cpus are
|
|
|
- * idle -- or rather, there's a distinct imbalance in the cpus; see
|
|
|
- * sg_imbalanced().
|
|
|
+ * by noticing the lower domain failed to reach balance and had difficulty
|
|
|
+ * moving tasks due to affinity constraints.
|
|
|
*
|
|
|
* When this is so detected; this group becomes a candidate for busiest; see
|
|
|
- * update_sd_pick_busiest(). And calculcate_imbalance() and
|
|
|
- * find_busiest_group() avoid some of the usual balance conditional to allow it
|
|
|
+ * update_sd_pick_busiest(). And calculate_imbalance() and
|
|
|
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
|
|
|
* to create an effective group imbalance.
|
|
|
*
|
|
|
* This is a somewhat tricky proposition since the next run might not find the
|
|
@@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
|
|
|
* subtle and fragile situation.
|
|
|
*/
|
|
|
|
|
|
-struct sg_imb_stats {
|
|
|
- unsigned long max_nr_running, min_nr_running;
|
|
|
- unsigned long max_cpu_load, min_cpu_load;
|
|
|
-};
|
|
|
-
|
|
|
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
|
|
|
+static inline int sg_imbalanced(struct sched_group *group)
|
|
|
{
|
|
|
- sgi->max_cpu_load = sgi->max_nr_running = 0UL;
|
|
|
- sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
|
|
|
+ return group->sgp->imbalance;
|
|
|
}
|
|
|
|
|
|
-static inline void
|
|
|
-update_sg_imb_stats(struct sg_imb_stats *sgi,
|
|
|
- unsigned long load, unsigned long nr_running)
|
|
|
+/*
|
|
|
+ * Compute the group capacity.
|
|
|
+ *
|
|
|
+ * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
|
|
|
+ * first dividing out the smt factor and computing the actual number of cores
|
|
|
+ * and limit power unit capacity with that.
|
|
|
+ */
|
|
|
+static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
|
|
|
{
|
|
|
- if (load > sgi->max_cpu_load)
|
|
|
- sgi->max_cpu_load = load;
|
|
|
- if (sgi->min_cpu_load > load)
|
|
|
- sgi->min_cpu_load = load;
|
|
|
+ unsigned int capacity, smt, cpus;
|
|
|
+ unsigned int power, power_orig;
|
|
|
|
|
|
- if (nr_running > sgi->max_nr_running)
|
|
|
- sgi->max_nr_running = nr_running;
|
|
|
- if (sgi->min_nr_running > nr_running)
|
|
|
- sgi->min_nr_running = nr_running;
|
|
|
-}
|
|
|
+ power = group->sgp->power;
|
|
|
+ power_orig = group->sgp->power_orig;
|
|
|
+ cpus = group->group_weight;
|
|
|
|
|
|
-static inline int
|
|
|
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
|
|
|
-{
|
|
|
- /*
|
|
|
- * Consider the group unbalanced when the imbalance is larger
|
|
|
- * than the average weight of a task.
|
|
|
- *
|
|
|
- * APZ: with cgroup the avg task weight can vary wildly and
|
|
|
- * might not be a suitable number - should we keep a
|
|
|
- * normalized nr_running number somewhere that negates
|
|
|
- * the hierarchy?
|
|
|
- */
|
|
|
- if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
|
|
|
- (sgi->max_nr_running - sgi->min_nr_running) > 1)
|
|
|
- return 1;
|
|
|
+ /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
|
|
|
+ smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
|
|
|
+ capacity = cpus / smt; /* cores */
|
|
|
|
|
|
- return 0;
|
|
|
+ capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
|
|
|
+ if (!capacity)
|
|
|
+ capacity = fix_small_capacity(env->sd, group);
|
|
|
+
|
|
|
+ return capacity;
|
|
|
}
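A sketch of the "phantom core" correction in sg_capacity() for a hypothetical pair of SMT siblings whose combined raw power is 1178, with SCHED_POWER_SCALE assumed to be 1024: without the smt step the group would advertise room for two tasks, with it only one:

#include <stdio.h>

int main(void)
{
	unsigned int scale = 1024;		/* assumed SCHED_POWER_SCALE */
	unsigned int cpus = 2;			/* two SMT siblings in the group */
	unsigned int power_orig = 1178;		/* hypothetical combined raw power */
	unsigned int power = 1178;
	unsigned int smt, capacity, by_power;

	smt = (scale * cpus + power_orig - 1) / power_orig;	/* DIV_ROUND_UP -> 2 */
	capacity = cpus / smt;					/* 1 real core */
	by_power = (power + scale / 2) / scale;			/* DIV_ROUND_CLOSEST -> 1 */
	if (by_power < capacity)
		capacity = by_power;
	printf("group capacity: %u task(s)\n", capacity);
	return 0;
}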
|
|
|
|
|
|
/**
|
|
@@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|
|
struct sched_group *group, int load_idx,
|
|
|
int local_group, struct sg_lb_stats *sgs)
|
|
|
{
|
|
|
- struct sg_imb_stats sgi;
|
|
|
unsigned long nr_running;
|
|
|
unsigned long load;
|
|
|
int i;
|
|
|
|
|
|
- init_sg_imb_stats(&sgi);
|
|
|
+ memset(sgs, 0, sizeof(*sgs));
|
|
|
|
|
|
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
|
|
|
struct rq *rq = cpu_rq(i);
|
|
@@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|
|
nr_running = rq->nr_running;
|
|
|
|
|
|
/* Bias balancing toward cpus of our domain */
|
|
|
- if (local_group) {
|
|
|
+ if (local_group)
|
|
|
load = target_load(i, load_idx);
|
|
|
- } else {
|
|
|
+ else
|
|
|
load = source_load(i, load_idx);
|
|
|
- update_sg_imb_stats(&sgi, load, nr_running);
|
|
|
- }
|
|
|
|
|
|
sgs->group_load += load;
|
|
|
sgs->sum_nr_running += nr_running;
|
|
|
+#ifdef CONFIG_NUMA_BALANCING
|
|
|
+ sgs->nr_numa_running += rq->nr_numa_running;
|
|
|
+ sgs->nr_preferred_running += rq->nr_preferred_running;
|
|
|
+#endif
|
|
|
sgs->sum_weighted_load += weighted_cpuload(i);
|
|
|
if (idle_cpu(i))
|
|
|
sgs->idle_cpus++;
|
|
|
}
|
|
|
|
|
|
- if (local_group && (env->idle != CPU_NEWLY_IDLE ||
|
|
|
- time_after_eq(jiffies, group->sgp->next_update)))
|
|
|
- update_group_power(env->sd, env->dst_cpu);
|
|
|
-
|
|
|
/* Adjust by relative CPU power of the group */
|
|
|
sgs->group_power = group->sgp->power;
|
|
|
sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
|
|
@@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|
|
if (sgs->sum_nr_running)
|
|
|
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
|
|
|
|
|
|
- sgs->group_imb = sg_imbalanced(sgs, &sgi);
|
|
|
-
|
|
|
- sgs->group_capacity =
|
|
|
- DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
|
|
|
-
|
|
|
- if (!sgs->group_capacity)
|
|
|
- sgs->group_capacity = fix_small_capacity(env->sd, group);
|
|
|
-
|
|
|
sgs->group_weight = group->group_weight;
|
|
|
|
|
|
+ sgs->group_imb = sg_imbalanced(group);
|
|
|
+ sgs->group_capacity = sg_capacity(env, group);
|
|
|
+
|
|
|
if (sgs->group_capacity > sgs->sum_nr_running)
|
|
|
sgs->group_has_capacity = 1;
|
|
|
}
|
|
@@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
|
|
|
return false;
|
|
|
}
|
|
|
|
|
|
+#ifdef CONFIG_NUMA_BALANCING
|
|
|
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
|
|
|
+{
|
|
|
+ if (sgs->sum_nr_running > sgs->nr_numa_running)
|
|
|
+ return regular;
|
|
|
+ if (sgs->sum_nr_running > sgs->nr_preferred_running)
|
|
|
+ return remote;
|
|
|
+ return all;
|
|
|
+}
|
|
|
+
|
|
|
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
|
|
|
+{
|
|
|
+ if (rq->nr_running > rq->nr_numa_running)
|
|
|
+ return regular;
|
|
|
+ if (rq->nr_running > rq->nr_preferred_running)
|
|
|
+ return remote;
|
|
|
+ return all;
|
|
|
+}
|
|
|
+#else
|
|
|
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
|
|
|
+{
|
|
|
+ return all;
|
|
|
+}
|
|
|
+
|
|
|
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
|
|
|
+{
|
|
|
+ return regular;
|
|
|
+}
|
|
|
+#endif /* CONFIG_NUMA_BALANCING */
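A sketch of how the regular/remote/all classification falls out of the two counters kept by account_numa_enqueue(); the runqueue numbers are invented:

#include <stdio.h>

enum fbq_type { regular, remote, all };

static enum fbq_type classify(unsigned int nr_running,
			      unsigned int nr_numa, unsigned int nr_pref)
{
	if (nr_running > nr_numa)
		return regular;	/* some tasks have no NUMA placement yet */
	if (nr_running > nr_pref)
		return remote;	/* NUMA tasks exist, some on the wrong node */
	return all;		/* everything already runs where it prefers */
}

int main(void)
{
	/* four runnable tasks, all with a preferred nid, only two on it */
	printf("type=%d (1 == remote)\n", classify(4, 4, 2));
	return 0;
}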
|
|
|
+
|
|
|
/**
|
|
|
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
|
|
|
* @env: The load balancing environment.
|
|
|
- * @balance: Should we balance.
|
|
|
* @sds: variable to hold the statistics for this sched_domain.
|
|
|
*/
|
|
|
-static inline void update_sd_lb_stats(struct lb_env *env,
|
|
|
- struct sd_lb_stats *sds)
|
|
|
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
|
|
|
{
|
|
|
struct sched_domain *child = env->sd->child;
|
|
|
struct sched_group *sg = env->sd->groups;
|
|
@@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
|
|
|
if (local_group) {
|
|
|
sds->local = sg;
|
|
|
sgs = &sds->local_stat;
|
|
|
+
|
|
|
+ if (env->idle != CPU_NEWLY_IDLE ||
|
|
|
+ time_after_eq(jiffies, sg->sgp->next_update))
|
|
|
+ update_group_power(env->sd, env->dst_cpu);
|
|
|
}
|
|
|
|
|
|
- memset(sgs, 0, sizeof(*sgs));
|
|
|
update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
|
|
|
|
|
|
+ if (local_group)
|
|
|
+ goto next_group;
|
|
|
+
|
|
|
/*
|
|
|
* In case the child domain prefers tasks go to siblings
|
|
|
* first, lower the sg capacity to one so that we'll try
|
|
@@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 		 * heaviest group when it is already under-utilized (possible
 		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling && !local_group &&
-				sds->local && sds->local_stat.group_has_capacity)
+		if (prefer_sibling && sds->local &&
+		    sds->local_stat.group_has_capacity)
 			sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-		/* Now, start updating sd_lb_stats */
-		sds->total_load += sgs->group_load;
-		sds->total_pwr += sgs->group_power;
-
-		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
+		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}
 
+next_group:
+		/* Now, start updating sd_lb_stats */
+		sds->total_load += sgs->group_load;
+		sds->total_pwr += sgs->group_power;
+
 		sg = sg->next;
 	} while (sg != env->sd->groups);
+
+	if (env->sd->flags & SD_NUMA)
+		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 }
 
 /**
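With the accounting moved under the new next_group label, the local group still contributes to the domain-wide totals but is never offered to update_sd_pick_busiest(), and once the loop finishes the busiest group's statistics are what feed fbq_classify_group() on SD_NUMA domains. A stripped-down model of that control flow, with plain integers standing in for the scheduler structures and a toy pick-busiest rule, might look like:

/* Skeleton of the reworked loop: the local group jumps straight to the
 * accounting under next_group, so it is counted in the domain totals but
 * never becomes a busiest candidate.  Illustration only. */
#include <stdio.h>

struct group { int load; int power; int is_local; };

int main(void)
{
	struct group groups[] = { {300, 1024, 1}, {900, 1024, 0}, {500, 1024, 0} };
	int total_load = 0, total_power = 0, busiest_load = 0, busiest = -1;

	for (int i = 0; i < 3; i++) {
		if (groups[i].is_local)
			goto next_group;

		if (groups[i].load > busiest_load) {	/* toy pick_busiest */
			busiest_load = groups[i].load;
			busiest = i;
		}
next_group:
		total_load += groups[i].load;
		total_power += groups[i].power;
	}

	printf("busiest=%d total_load=%d total_power=%d\n",
	       busiest, total_load, total_power);
	return 0;
}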
@@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 	int i;
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-		unsigned long power = power_of(i);
-		unsigned long capacity = DIV_ROUND_CLOSEST(power,
-							   SCHED_POWER_SCALE);
-		unsigned long wl;
+		unsigned long power, capacity, wl;
+		enum fbq_type rt;
 
+		rq = cpu_rq(i);
+		rt = fbq_classify_rq(rq);
+
+		/*
+		 * We classify groups/runqueues into three groups:
+		 *  - regular: there are !numa tasks
+		 *  - remote:  there are numa tasks that run on the 'wrong' node
+		 *  - all:     there is no distinction
+		 *
+		 * In order to avoid migrating ideally placed numa tasks,
+		 * ignore those when there's better options.
+		 *
+		 * If we ignore the actual busiest queue to migrate another
+		 * task, the next balance pass can still reduce the busiest
+		 * queue by moving tasks around inside the node.
+		 *
+		 * If we cannot move enough load due to this classification
+		 * the next pass will adjust the group classification and
+		 * allow migration of more tasks.
+		 *
+		 * Both cases only affect the total convergence complexity.
+		 */
+		if (rt > env->fbq_type)
+			continue;
+
+		power = power_of(i);
+		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
 		/*
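Because the enum ascends regular < remote < all, the rt > env->fbq_type test skips a runqueue only when it is better NUMA-placed than the busiest group's classification allows, for example an all-preferred queue inside a group that still has plain tasks to move. A throwaway table generator makes the nine combinations explicit (the names mirror the kernel's but the program is standalone and illustrative):

/* Print which runqueue classes survive the "rt > env->fbq_type" filter
 * for each possible group classification. */
#include <stdio.h>

enum fbq_type { regular, remote, all };
static const char *name[] = { "regular", "remote", "all" };

int main(void)
{
	for (enum fbq_type group = regular; group <= all; group++)
		for (enum fbq_type rq = regular; rq <= all; rq++)
			printf("group=%-7s rq=%-7s -> %s\n",
			       name[group], name[rq],
			       rq > group ? "skip" : "consider");
	return 0;
}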
@@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			int *continue_balancing)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
+	struct sched_domain *sd_parent = sd->parent;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
@@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.idle = idle,
 		.loop_break = sched_nr_migrate_break,
 		.cpus = cpus,
+		.fbq_type = all,
 	};
 
 	/*
@@ -5268,17 +6202,17 @@ more_balance:
 		 * moreover subsequent load balance cycles should correct the
 		 * excess load moved.
 		 */
-		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+			/* Prevent to re-select dst_cpu via env's cpus */
+			cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
 			env.dst_rq = cpu_rq(env.new_dst_cpu);
 			env.dst_cpu = env.new_dst_cpu;
-			env.flags &= ~LBF_SOME_PINNED;
+			env.flags &= ~LBF_DST_PINNED;
 			env.loop = 0;
 			env.loop_break = sched_nr_migrate_break;
 
-			/* Prevent to re-select dst_cpu via env's cpus */
-			cpumask_clear_cpu(env.dst_cpu, env.cpus);
-
 			/*
 			 * Go back to "more_balance" rather than "redo" since we
 			 * need to continue with same src_cpu.
@@ -5286,6 +6220,18 @@ more_balance:
 			goto more_balance;
 		}
 
+		/*
+		 * We failed to reach balance because of affinity.
+		 */
+		if (sd_parent) {
+			int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+				*group_imbalance = 1;
+			} else if (*group_imbalance)
+				*group_imbalance = 0;
+		}
+
 		/* All tasks on this runqueue were pinned by CPU affinity */
 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
 			cpumask_clear_cpu(cpu_of(busiest), cpus);
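The new block turns "we still had imbalance left but some tasks were pinned" into a hint stored in the parent group's sgp->imbalance, and clears it again once a later pass balances cleanly, so the next level up will try routing load around the affinity constraint. A small userspace model of that hysteresis, using an illustrative flag value rather than the kernel's real LBF_SOME_PINNED bit:

/* Model of the parent-domain imbalance hint raised when pinned tasks
 * prevent the remaining imbalance from being moved. */
#include <stdio.h>

#define LBF_SOME_PINNED	0x08	/* illustrative value, not the kernel's */

static void update_parent_hint(unsigned int flags, long imbalance,
			       int *group_imbalance)
{
	if ((flags & LBF_SOME_PINNED) && imbalance > 0)
		*group_imbalance = 1;
	else if (*group_imbalance)
		*group_imbalance = 0;
}

int main(void)
{
	int hint = 0;

	update_parent_hint(LBF_SOME_PINNED, 512, &hint);	/* raise */
	printf("after pinned failure: %d\n", hint);
	update_parent_hint(0, 0, &hint);			/* clear */
	printf("after clean balance:  %d\n", hint);
	return 0;
}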
@@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
+	u64 curr_cost = 0;
 
 	this_rq->idle_stamp = rq_clock(this_rq);
@@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
 		int continue_balancing = 1;
+		u64 t0, domain_cost;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
+		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+			break;
+
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
+			t0 = sched_clock_cpu(this_cpu);
+
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
 						   &continue_balancing);
+
+			domain_cost = sched_clock_cpu(this_cpu) - t0;
+			if (domain_cost > sd->max_newidle_lb_cost)
+				sd->max_newidle_lb_cost = domain_cost;
+
+			curr_cost += domain_cost;
 		}
 
 		interval = msecs_to_jiffies(sd->balance_interval);
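The idea is to never spend longer newidle-balancing than the CPU is expected to stay idle: the loop bails out once the cost already accumulated plus the next domain's worst observed cost exceeds avg_idle, and each pass that does run refreshes the per-domain worst case from sched_clock_cpu() deltas. A userspace model with made-up nanosecond numbers:

/* Model of the newidle cut-off: balance a domain only while the expected
 * idle time still covers the accumulated cost plus that domain's worst
 * observed cost. */
#include <stdio.h>
#include <stdint.h>

struct domain { uint64_t max_newidle_lb_cost; };

int main(void)
{
	struct domain sds[] = { { 20000 }, { 150000 }, { 900000 } };
	uint64_t avg_idle = 500000;	/* ~0.5ms of expected idle time */
	uint64_t curr_cost = 0;

	for (int i = 0; i < 3; i++) {
		if (avg_idle < curr_cost + sds[i].max_newidle_lb_cost) {
			printf("stop before domain %d\n", i);
			break;
		}
		/* pretend this balance pass cost 50us */
		uint64_t domain_cost = 50000;
		if (domain_cost > sds[i].max_newidle_lb_cost)
			sds[i].max_newidle_lb_cost = domain_cost;
		curr_cost += domain_cost;
		printf("balanced domain %d, curr_cost=%llu\n", i,
		       (unsigned long long)curr_cost);
	}
	return 0;
}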
@@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 		 */
 		this_rq->next_balance = next_balance;
 	}
+
+	if (curr_cost > this_rq->max_idle_balance_cost)
+		this_rq->max_idle_balance_cost = curr_cost;
 }
 
 /*
@@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
-	int need_serialize;
+	int need_serialize, need_decay = 0;
+	u64 max_cost = 0;
 
 	update_blocked_averages(cpu);
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
+		/*
+		 * Decay the newidle max times here because this is a regular
+		 * visit to all the domains. Decay ~1% per second.
+		 */
+		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+			sd->max_newidle_lb_cost =
+				(sd->max_newidle_lb_cost * 253) / 256;
+			sd->next_decay_max_lb_cost = jiffies + HZ;
+			need_decay = 1;
+		}
+		max_cost += sd->max_newidle_lb_cost;
+
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
+		/*
+		 * Stop the load balance at this level. There is another
+		 * CPU in our sched group which is doing load balancing more
+		 * actively.
+		 */
+		if (!continue_balancing) {
+			if (need_decay)
+				continue;
+			break;
+		}
+
 		interval = sd->balance_interval;
 		if (idle != CPU_IDLE)
 			interval *= sd->busy_factor;
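Multiplying by 253/256 once per second removes 3/256, roughly 1.17%, per visit, which is where the "~1%" in the comment comes from; after 60 such visits about half of the recorded cost remains. The integer arithmetic is easy to check in isolation:

/* How quickly the 253/256 per-second factor decays a recorded worst-case
 * newidle balance cost, using the same integer math as the kernel. */
#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000;	/* 1ms worst-case cost */

	for (int sec = 1; sec <= 60; sec++) {
		cost = (cost * 253) / 256;
		if (sec == 1 || sec == 25 || sec == 60)
			printf("after %2ds: %llu ns\n", sec, cost);
	}
	return 0;
}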
@@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
-				 * The LBF_SOME_PINNED logic could have changed
+				 * The LBF_DST_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
 				 * state even if we migrated tasks. Update it.
 				 */
@@ -5704,14 +6690,14 @@ out:
 			next_balance = sd->last_balance + interval;
 			update_next_balance = 1;
 		}
-
+	}
+	if (need_decay) {
 		/*
-		 * Stop the load balance at this level. There is another
-		 * CPU in our sched group which is doing load balancing more
-		 * actively.
+		 * Ensure the rq-wide value also decays but keep it at a
+		 * reasonable floor to avoid funnies with rq->avg_idle.
+		 */
-		if (!continue_balancing)
-			break;
+		rq->max_idle_balance_cost =
+			max((u64)sysctl_sched_migration_cost, max_cost);
 	}
 	rcu_read_unlock();
@@ -6214,7 +7200,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 		se->cfs_rq = parent->my_q;
 
 	se->my_q = cfs_rq;
-	update_load_set(&se->load, 0);
+	/* guarantee group entities always have weight */
+	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
 }
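Group entities used to start with a zero weight until their share was first computed; seeding them with NICE_0_LOAD guarantees that any weight-based scaling that runs before that point sees a sane, non-zero value. A deliberately simplified model of why a zero weight is dangerous, assuming the usual nice-0 weight of 1024 (the kernel scales this up when extra load resolution is enabled); this is not the kernel's calc_delta code:

/* Toy model: runtime deltas are charged to vruntime roughly in proportion
 * to NICE_0_LOAD / weight, so a zero weight would divide by zero before
 * the group's real share was ever calculated. */
#include <stdio.h>

#define NICE_0_LOAD 1024UL	/* illustrative nice-0 weight */

static unsigned long long vruntime_delta(unsigned long long delta_exec,
					 unsigned long weight)
{
	return delta_exec * NICE_0_LOAD / weight;	/* weight == 0 would trap */
}

int main(void)
{
	/* A freshly created group entity now charges vruntime as if it were
	 * a nice-0 task until its real share is computed. */
	printf("%llu\n", vruntime_delta(4000000, NICE_0_LOAD));
	return 0;
}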